@fugood/llama.node 1.3.2 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/CMakeLists.txt +4 -3
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +5 -5
  4. package/src/llama.cpp/CMakeLists.txt +4 -0
  5. package/src/llama.cpp/common/CMakeLists.txt +6 -37
  6. package/src/llama.cpp/common/common.cpp +1 -5
  7. package/src/llama.cpp/common/download.cpp +47 -29
  8. package/src/llama.cpp/common/log.cpp +6 -0
  9. package/src/llama.cpp/common/log.h +2 -0
  10. package/src/llama.cpp/ggml/include/ggml.h +71 -0
  11. package/src/llama.cpp/ggml/src/CMakeLists.txt +16 -0
  12. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +15 -3
  13. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +29 -0
  14. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +283 -0
  15. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +1 -0
  16. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +235 -34
  17. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +289 -277
  18. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +4 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +95 -42
  20. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +16 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +2 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +17 -0
  23. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +10 -0
  24. package/src/llama.cpp/src/CMakeLists.txt +6 -0
  25. package/src/llama.cpp/src/llama-arch.cpp +32 -0
  26. package/src/llama.cpp/src/llama-arch.h +2 -0
  27. package/src/llama.cpp/src/llama-graph.cpp +2 -1
  28. package/src/llama.cpp/src/llama-model.cpp +102 -0
  29. package/src/llama.cpp/src/llama-model.h +2 -0
  30. package/src/llama.cpp/src/llama-sampling.cpp +10 -5
  31. package/src/llama.cpp/src/llama-vocab.cpp +16 -1
  32. package/src/llama.cpp/src/llama-vocab.h +1 -0
  33. package/src/llama.cpp/src/models/afmoe.cpp +187 -0
  34. package/src/llama.cpp/src/models/models.h +4 -0
  35. package/src/llama.cpp/src/unicode.cpp +77 -0
package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp
@@ -5,10 +5,13 @@
  #include <assert.h>
  #include <atomic>
  #include <cfloat>
+ #include <cmath>
+ #include <algorithm>
  #include <stdexcept>
  #include <stdint.h>
  #include <string.h>
  #include <string>
+ #include <vector>
  #if defined(__linux__)
  #include <asm/hwcap.h>
  #include <sys/auxv.h>
@@ -38,8 +41,9 @@
 
  struct ggml_kleidiai_context {
  cpu_feature features;
- ggml_kleidiai_kernels * kernels;
- } static ctx = { CPU_FEATURE_NONE, NULL };
+ ggml_kleidiai_kernels * kernels_q4;
+ ggml_kleidiai_kernels * kernels_q8;
+ } static ctx = { CPU_FEATURE_NONE, NULL, NULL };
 
  static const char* cpu_feature_to_string(cpu_feature f) {
  switch (f) {
@@ -73,10 +77,14 @@ static void init_kleidiai_context(void) {
  if (sme_enabled != 0) {
  ctx.features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE;
  }
- ctx.kernels = ggml_kleidiai_select_kernels_q4_0(ctx.features);
+ ctx.kernels_q4 = ggml_kleidiai_select_kernels_q4_0(ctx.features);
+ ctx.kernels_q8 = ggml_kleidiai_select_kernels_q8_0(ctx.features);
  #ifndef NDEBUG
- if (ctx.kernels) {
- GGML_LOG_DEBUG("kleidiai: using kernel with CPU feature %s\n", cpu_feature_to_string(ctx.kernels->required_cpu));
+ if (ctx.kernels_q4) {
+ GGML_LOG_DEBUG("kleidiai: using q4 kernel with CPU feature %s\n", cpu_feature_to_string(ctx.kernels_q4->required_cpu));
+ }
+ if (ctx.kernels_q8) {
+ GGML_LOG_DEBUG("kleidiai: using q8 kernel with CPU feature %s\n", cpu_feature_to_string(ctx.kernels_q8->required_cpu));
  }
  #endif
  }
@@ -130,6 +138,9 @@ class tensor_traits : public ggml::cpu::tensor_traits {
  if (kernels->rhs_type == GGML_TYPE_Q4_0) {
  if (!lhs_info->packed_size_ex) return false;
  size = lhs_info->packed_size_ex(m, k, QK4_0, mr, kr, sr);
+ } else if (kernels->rhs_type == GGML_TYPE_Q8_0) {
+ if (!lhs_info->packed_size_ex) return false;
+ size = lhs_info->packed_size_ex(m, k, QK8_0, mr, kr, sr);
  } else if (kernels->rhs_type == GGML_TYPE_F16) {
  if (!lhs_info->packed_size_ex || !kernels->rhs_info.packed_size_ex) return false;
  const int64_t lhs_batch_size0 = op->src[1]->ne[2];
@@ -149,11 +160,13 @@ class tensor_traits : public ggml::cpu::tensor_traits {
  if (dst->op == GGML_OP_MUL_MAT) {
  if (dst->src[0]->type == GGML_TYPE_Q4_0) {
  return compute_forward_q4_0(params, dst);
+ } else if (dst->src[0]->type == GGML_TYPE_Q8_0) {
+ return compute_forward_q8_0(params, dst);
  } else if (dst->src[0]->type == GGML_TYPE_F16) {
  return compute_forward_fp16(params, dst);
  }
  } else if (dst->op == GGML_OP_GET_ROWS) {
- if (dst->src[0]->type == GGML_TYPE_Q4_0) {
+ if (dst->src[0]->type == GGML_TYPE_Q4_0 || dst->src[0]->type == GGML_TYPE_Q8_0) {
  return compute_forward_get_rows(params, dst);
  }
  }
@@ -400,19 +413,120 @@ class tensor_traits : public ggml::cpu::tensor_traits {
  return true;
  }
 
- bool compute_forward_get_rows(struct ggml_compute_params * params, struct ggml_tensor * dst) {
- GGML_ASSERT(dst->src[0]->type == GGML_TYPE_Q4_0);
- if (!ctx.kernels) {
+ bool compute_forward_q8_0(struct ggml_compute_params * params, struct ggml_tensor * dst) {
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_Q8_0);
+
+ const ggml_tensor * src0 = dst->src[0];
+ const ggml_tensor * src1 = dst->src[1];
+
+ GGML_TENSOR_BINARY_OP_LOCALS
+
+ ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, dst);
+ if (!kernels) {
  return false;
  }
 
+ bool is_gemv = src1->ne[1] == 1;
+ kernel_info * kernel = is_gemv ? &kernels->gemv : &kernels->gemm;
+ lhs_packing_info * lhs_info = is_gemv ? &kernels->gemv_lhs_info : &kernels->gemm_lhs_info;
+
+ if (!kernel || !lhs_info->get_packed_offset_ex || !lhs_info->pack_func_ex ||
+ !kernel->get_rhs_packed_offset_ex || !kernel->run_kernel_ex || !kernel->get_dst_offset) {
+ return false;
+ }
+
+ const int ith = params->ith;
+ const int nth_raw = params->nth;
+ const int nth = nth_raw > 0 ? nth_raw : 1;
+
+ const size_t k = ne00;
+ const size_t m = ne11;
+ const size_t n = ne01;
+
+ size_t mr = kernel->get_mr();
+ size_t kr = kernel->get_kr();
+ size_t sr = kernel->get_sr();
+
+ const uint8_t * lhs = static_cast<const uint8_t *>(src1->data);
+ uint8_t * lhs_packed = static_cast<uint8_t *>(params->wdata);
+ const uint8_t * rhs_packed = static_cast<const uint8_t *>(src0->data);
+
+ const size_t n_step = kernel->get_n_step();
+ const size_t num_n_per_thread = kai_roundup(kai_roundup(n, nth) / nth, n_step);
+ const size_t n_start = ith * num_n_per_thread;
+
+ size_t n_to_process = 0;
+ if (n_start < n) {
+ n_to_process = num_n_per_thread;
+ if ((n_start + n_to_process) > n) {
+ n_to_process = n - n_start;
+ }
+ }
+
+ const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth;
+ const size_t m_start = ith * num_m_per_thread;
+ size_t m_to_process = num_m_per_thread;
+ if ((m_start + m_to_process) > m) {
+ m_to_process = m - m_start;
+ }
+
+ if (m_start < m) {
+ const size_t src_stride = src1->nb[1];
+ const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, dst->src[1]->nb[1]));
+ const size_t lhs_packed_offset = lhs_info->get_packed_offset_ex(m_start, k, 0, mr, kr, sr);
+ void * lhs_packed_ptr = static_cast<void *>(lhs_packed + lhs_packed_offset);
+
+ lhs_info->pack_func_ex(m_to_process, k, 0, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr);
+ }
+
+ ggml_barrier(params->threadpool);
+
+ const size_t dst_stride = dst->nb[1];
+ const size_t lhs_packed_offset = lhs_info->get_packed_offset_ex(0, k, 0, mr, kr, sr);
+ const size_t rhs_packed_offset = kernel->get_rhs_packed_offset_ex(n_start, k, 0);
+ const size_t dst_offset = kernel->get_dst_offset(0, n_start, dst_stride);
+ const void * rhs_ptr = static_cast<const void *>(rhs_packed + rhs_packed_offset);
+ const void * lhs_ptr = static_cast<const void *>(lhs_packed + lhs_packed_offset);
+ float * dst_ptr = reinterpret_cast<float *>(static_cast<uint8_t *>(dst->data) + dst_offset);
+
+ if (n_to_process > 0) {
+ kernel->run_kernel_ex(m, n_to_process, k, 0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride,
+ sizeof(float), -FLT_MAX, FLT_MAX);
+ }
+
+ return true;
+ }
+
+ bool compute_forward_get_rows(struct ggml_compute_params * params, struct ggml_tensor * dst) {
  const ggml_tensor * src0 = dst->src[0];
  const ggml_tensor * src1 = dst->src[1];
 
  GGML_TENSOR_BINARY_OP_LOCALS
 
- rhs_packing_info * rhs_info = &ctx.kernels->rhs_info;
- kernel_info * kernel = &ctx.kernels->gemm;
+ ggml_kleidiai_kernels * kernels = nullptr;
+ size_t block_len = 0;
+ size_t num_bytes_multiplier = 0;
+
+ if (dst->src[0]->type == GGML_TYPE_Q4_0) {
+ if (!ctx.kernels_q4) {
+ return false;
+ }
+ kernels = ctx.kernels_q4;
+ block_len = QK4_0;
+ num_bytes_multiplier = sizeof(uint16_t);
+ } else if (dst->src[0]->type == GGML_TYPE_Q8_0) {
+ if (!ctx.kernels_q8) {
+ return false;
+ }
+ kernels = ctx.kernels_q8;
+ block_len = QK8_0;
+ num_bytes_multiplier = sizeof(float);
+ } else {
+ return false;
+ }
+
+ rhs_packing_info * rhs_info = &kernels->rhs_info;
+ kernel_info * kernel = &kernels->gemm;
  if (!rhs_info->to_float || !kernel->get_nr) {
  return false;
  }
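The new compute_forward_q8_0 path above splits the output columns across threads using the kai_roundup expression shown in the hunk. The snippet below is a minimal stand-alone sketch of that partitioning only: kai_roundup_sketch is a local stand-in for KleidiAI's kai_roundup (assumed to round its first argument up to a multiple of the second), and the sizes n, nth and n_step are made-up example values rather than anything taken from this package.

#include <algorithm>
#include <cstddef>
#include <cstdio>

// Stand-in for KleidiAI's kai_roundup: rounds v up to a multiple of m (assumed semantics).
static std::size_t kai_roundup_sketch(std::size_t v, std::size_t m) {
    return (v + m - 1) / m * m;
}

int main() {
    // Made-up example sizes: 4096 output columns, 6 worker threads, kernel n_step of 64.
    const std::size_t n = 4096, nth = 6, n_step = 64;
    for (std::size_t ith = 0; ith < nth; ++ith) {
        // Same expression as the diff: per-thread share rounded up to the kernel's n_step.
        const std::size_t num_n_per_thread = kai_roundup_sketch(kai_roundup_sketch(n, nth) / nth, n_step);
        const std::size_t n_start          = ith * num_n_per_thread;
        const std::size_t n_to_process     = n_start < n ? std::min(num_n_per_thread, n - n_start) : 0;
        std::printf("thread %zu: n_start=%zu n_to_process=%zu\n", ith, n_start, n_to_process);
    }
    return 0;
}

With these example sizes each of the first five threads gets 704 columns and the last thread gets the remaining 576; a thread whose n_start falls past n simply does no work, which matches the guard in the hunk.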
@@ -423,8 +537,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
  const size_t block_rows = kernel->get_nr();
  const size_t kr = kernel->get_kr();
 
- const size_t num_bytes_multiplier = sizeof(uint16_t);
- const size_t packed_stride = rhs_info->packed_stride(nc, block_rows, kr, QK4_0);
+ const size_t packed_stride = rhs_info->packed_stride(nc, block_rows, kr, block_len);
 
  const int ith = params->ith;
  const int nth = params->nth;
@@ -439,7 +552,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
  GGML_ASSERT(row_idx >= 0 && row_idx < src0->ne[1]);
 
  float *out = (float *)((char *)dst->data + i * nb1);
- rhs_info->to_float(src0->data, row_idx, nc, out, block_rows, packed_stride, kr, QK4_0, num_bytes_multiplier);
+ rhs_info->to_float(src0->data, row_idx, nc, out, block_rows, packed_stride, kr, block_len, num_bytes_multiplier);
  }
 
  return true;
@@ -447,21 +560,91 @@ class tensor_traits : public ggml::cpu::tensor_traits {
 
  public:
  int repack(struct ggml_tensor * tensor, const void * data, size_t data_size) {
- GGML_ASSERT(tensor->type == GGML_TYPE_Q4_0);
- GGML_ASSERT(ctx.kernels);
  const size_t n = tensor->ne[1];
  const size_t k = tensor->ne[0];
- size_t nr = ctx.kernels->gemm.get_nr();
- size_t kr = ctx.kernels->gemm.get_kr();
- size_t sr = ctx.kernels->gemm.get_sr();
 
- struct kai_rhs_pack_qs4cxs1s0_param params;
- params.lhs_zero_point = 1;
- params.rhs_zero_point = 8;
- ctx.kernels->rhs_info.pack_func_ex(1, n, k, nr, kr, sr, QK4_0, 0, (const uint8_t*)data, nullptr, nullptr, tensor->data, 0, &params);
+ if (tensor->type == GGML_TYPE_Q4_0) {
+ if (!ctx.kernels_q4) {
+ return -1;
+ }
+ size_t nr = ctx.kernels_q4->gemm.get_nr();
+ size_t kr = ctx.kernels_q4->gemm.get_kr();
+ size_t sr = ctx.kernels_q4->gemm.get_sr();
+
+ struct kai_rhs_pack_qs4cxs1s0_param params;
+ params.lhs_zero_point = 1;
+ params.rhs_zero_point = 8;
+ ctx.kernels_q4->rhs_info.pack_func_ex(1, n, k, nr, kr, sr, QK4_0, 0,
+ static_cast<const uint8_t *>(data),
+ nullptr, nullptr, tensor->data, 0, &params);
+ GGML_UNUSED(data_size);
+ return 0;
+ } else if (tensor->type == GGML_TYPE_Q8_0) {
+ if (!ctx.kernels_q8) {
+ return -1;
+ }
+
+ const size_t row_stride = tensor->nb[1];
+ const size_t k_blocks = (k + QK8_0 - 1) / QK8_0;
+
+ std::vector<int8_t> qdata(n * k, 0);
+ std::vector<float> scales(n, 0.0f);
+
+ for (size_t row = 0; row < n; ++row) {
+ const auto * row_blocks = reinterpret_cast<const block_q8_0 *>(
+ static_cast<const uint8_t *>(data) + row * row_stride);
+
+ float max_abs = 0.0f;
+ for (size_t block = 0; block < k_blocks; ++block) {
+ const block_q8_0 & blk = row_blocks[block];
+ const float d = GGML_FP16_TO_FP32(blk.d);
+ for (size_t l = 0; l < QK8_0; ++l) {
+ const size_t linear_idx = block * QK8_0 + l;
+ if (linear_idx >= k) {
+ break;
+ }
+ const float value = d * blk.qs[l];
+ max_abs = std::max(max_abs, std::fabs(value));
+ }
+ }
+
+ float scale = max_abs > 0.0f ? max_abs / 127.0f : 0.0f;
+ scales[row] = scale;
+ const float inv_scale = scale > 0.0f ? 1.0f / scale : 0.0f;
+
+ for (size_t block = 0; block < k_blocks; ++block) {
+ const block_q8_0 & blk = row_blocks[block];
+ const float d = GGML_FP16_TO_FP32(blk.d);
+ for (size_t l = 0; l < QK8_0; ++l) {
+ const size_t linear_idx = block * QK8_0 + l;
+ if (linear_idx >= k) {
+ break;
+ }
+ const float value = d * blk.qs[l];
+ int32_t q = scale > 0.0f ? static_cast<int32_t>(std::lround(value * inv_scale)) : 0;
+ q = std::clamp(q, -127, 127);
+ qdata[row * k + linear_idx] = static_cast<int8_t>(q);
+ }
+ }
+ }
+
+ size_t nr = ctx.kernels_q8->gemm.get_nr();
+ size_t kr = ctx.kernels_q8->gemm.get_kr();
+ size_t sr = ctx.kernels_q8->gemm.get_sr();
+
+ struct kai_rhs_pack_qsi8cx_params params;
+ params.lhs_zero_point = 1;
+ params.scale_multiplier = 1.0f;
+
+ ctx.kernels_q8->rhs_info.pack_func_ex(1, n, k, nr, kr, sr, 0, 0,
+ qdata.data(), nullptr, scales.data(),
+ tensor->data, 0, &params);
+ GGML_UNUSED(data_size);
+ return 0;
+ }
 
- return 0;
  GGML_UNUSED(data_size);
+ return -1;
  }
  };
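The Q8_0 branch added to repack() above converts ggml's block-wise Q8_0 layout (one fp16 scale per QK8_0 values) into the per-row-scale int8 form expected by the qsi8cx packing routine. The sketch below isolates that per-row re-quantization step under the assumption that a row has already been dequantized to float; requantize_row_sketch is an illustrative helper written for this note, not part of llama.cpp or KleidiAI.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Picks one scale per row from the largest absolute value, then re-quantizes every
// element to int8 clamped to [-127, 127], mirroring the loop structure in repack() above.
// Returns the per-row scale the caller would hand to the packing routine.
static float requantize_row_sketch(const std::vector<float> & dequant, std::vector<std::int8_t> & out) {
    float max_abs = 0.0f;
    for (float v : dequant) {
        max_abs = std::max(max_abs, std::fabs(v));
    }
    const float scale     = max_abs > 0.0f ? max_abs / 127.0f : 0.0f;
    const float inv_scale = scale   > 0.0f ? 1.0f / scale     : 0.0f;
    out.resize(dequant.size());
    for (std::size_t i = 0; i < dequant.size(); ++i) {
        const long q = std::lround(dequant[i] * inv_scale);
        out[i] = static_cast<std::int8_t>(std::clamp(q, -127L, 127L));
    }
    return scale;
}

The design choice visible in the hunk is that accuracy is traded per row rather than per 32-value block: a single scale per output channel is what the KleidiAI int8 packing consumes, so the block scales are folded away before packing.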
 
@@ -518,27 +701,45 @@ static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alignment(ggml_backend_b
  }
 
  static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
- GGML_ASSERT(tensor->type == GGML_TYPE_Q4_0);
- GGML_ASSERT(ctx.kernels);
+ GGML_UNUSED(buft);
 
- const size_t n = tensor->ne[1];
- const size_t k = tensor->ne[0];
- const size_t nr = ctx.kernels->gemm.get_nr();
- const size_t kr = ctx.kernels->gemm.get_kr();
+ const size_t n = tensor->ne[1];
+ const size_t k = tensor->ne[0];
+
+ ggml_kleidiai_kernels * kernels = nullptr;
+ size_t block_len = 0;
+
+ if (tensor->type == GGML_TYPE_Q4_0) {
+ GGML_ASSERT(ctx.kernels_q4);
+ kernels = ctx.kernels_q4;
+ block_len = QK4_0;
+ } else if (tensor->type == GGML_TYPE_Q8_0) {
+ GGML_ASSERT(ctx.kernels_q8);
+ kernels = ctx.kernels_q8;
+ block_len = QK8_0;
+ } else {
+ return 0;
+ }
 
- return ctx.kernels->rhs_info.packed_size_ex(n, k, nr, kr, QK4_0);
+ const size_t nr = kernels->gemm.get_nr();
+ const size_t kr = kernels->gemm.get_kr();
+ const size_t packed = kernels->rhs_info.packed_size_ex(n, k, nr, kr, block_len);
+ const size_t raw = ggml_nbytes(tensor);
 
- GGML_UNUSED(buft);
+ return packed > raw ? packed : raw;
  }
 
  namespace ggml::cpu::kleidiai {
  class extra_buffer_type : ggml::cpu::extra_buffer_type {
  bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
  if ((op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_GET_ROWS) &&
- op->src[0]->type == GGML_TYPE_Q4_0 &&
+ (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_Q8_0) &&
  op->src[0]->buffer &&
  (ggml_n_dims(op->src[0]) == 2) &&
- op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type() && ctx.kernels) {
+ op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) {
+ if (((op->src[0]->type == GGML_TYPE_Q4_0) ? ctx.kernels_q4 : ctx.kernels_q8) == nullptr) {
+ return false;
+ }
  if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
  return false;
  }