@fugood/llama.node 1.4.10 → 1.4.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. package/package.json +15 -15
  2. package/scripts/llama.cpp.patch +30 -30
  3. package/src/LlamaContext.cpp +1 -1
  4. package/src/llama.cpp/common/arg.cpp +29 -14
  5. package/src/llama.cpp/common/arg.h +1 -0
  6. package/src/llama.cpp/common/chat-parser.cpp +11 -0
  7. package/src/llama.cpp/common/chat.cpp +32 -3
  8. package/src/llama.cpp/common/chat.h +1 -0
  9. package/src/llama.cpp/common/common.cpp +23 -23
  10. package/src/llama.cpp/common/common.h +1 -1
  11. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  12. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
  13. package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
  14. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  16. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
  17. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
  18. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
  20. package/src/llama.cpp/include/llama.h +13 -4
  21. package/src/llama.cpp/src/CMakeLists.txt +4 -0
  22. package/src/llama.cpp/src/llama-adapter.cpp +12 -3
  23. package/src/llama.cpp/src/llama-adapter.h +7 -1
  24. package/src/llama.cpp/src/llama-arch.cpp +76 -0
  25. package/src/llama.cpp/src/llama-arch.h +7 -0
  26. package/src/llama.cpp/src/llama-chat.cpp +11 -0
  27. package/src/llama.cpp/src/llama-chat.h +1 -0
  28. package/src/llama.cpp/src/llama-context.cpp +22 -21
  29. package/src/llama.cpp/src/llama-hparams.h +4 -3
  30. package/src/llama.cpp/src/llama-kv-cache.h +1 -1
  31. package/src/llama.cpp/src/llama-mmap.cpp +11 -4
  32. package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
  33. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  34. package/src/llama.cpp/src/llama-model.cpp +287 -16
  35. package/src/llama.cpp/src/llama-model.h +13 -2
  36. package/src/llama.cpp/src/llama-sampling.cpp +44 -33
  37. package/src/llama.cpp/src/llama-sampling.h +3 -0
  38. package/src/llama.cpp/src/llama-vocab.cpp +101 -33
  39. package/src/llama.cpp/src/llama-vocab.h +2 -0
  40. package/src/llama.cpp/src/llama.cpp +52 -37
  41. package/src/llama.cpp/src/models/bert.cpp +4 -2
  42. package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
  43. package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
  44. package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
  45. package/src/llama.cpp/src/models/gemma3.cpp +3 -4
  46. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
  47. package/src/llama.cpp/src/models/llama.cpp +19 -6
  48. package/src/llama.cpp/src/models/maincoder.cpp +117 -0
  49. package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
  50. package/src/llama.cpp/src/models/models.h +18 -0
  51. package/src/llama.cpp/src/models/modern-bert.cpp +115 -0
  52. package/src/llama.cpp/src/models/plamo3.cpp +128 -0
  53. package/src/llama.cpp/src/unicode.cpp +23 -14
package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp

@@ -18,6 +18,8 @@
 #include "kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.h"
 #include "kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod.h"
 #include "kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm.h"
+#include "kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm.h"
+#include "kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod.h"
 
 #include "kai_lhs_pack_bf16p2vlx2_f32_sme.h"
 #include "kai_lhs_quant_pack_qsi8d32p_f32.h"
@@ -69,9 +71,9 @@ static inline void kernel_run_fn10(size_t m, size_t n, size_t k, size_t /*bl*/,
 
 template<void(*Fn)(size_t,size_t,size_t,const void*,const void*,float*,size_t,size_t,float,float)>
 static inline void kernel_run_float_fn10(size_t m, size_t n, size_t k, size_t /*bl*/,
-        const void* lhs, const void* rhs, void* dst,
-        size_t dst_stride_row, size_t dst_stride_col,
-        float clamp_min, float clamp_max) {
+        const void* lhs, const void* rhs, void* dst,
+        size_t dst_stride_row, size_t dst_stride_col,
+        float clamp_min, float clamp_max) {
     Fn(m, n, k, lhs, rhs, static_cast<float*>(dst), dst_stride_row, dst_stride_col, clamp_min, clamp_max);
 }
 
@@ -152,8 +154,8 @@ static inline void rhs_pack_fn12(size_t num_groups, size_t n, size_t k, size_t n
 
 template<void(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t,const int8_t*,const float*,const float*,void*,size_t,const struct kai_rhs_pack_qsi8cx_params*)>
 static inline void rhs_pack_scale_fn12(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t /*bl*/,
-        size_t /*rhs_stride*/, const void* rhs, const void* bias, const void* scale,
-        void* rhs_packed, size_t extra_bytes, const void* params) {
+        size_t /*rhs_stride*/, const void* rhs, const void* bias, const void* scale,
+        void* rhs_packed, size_t extra_bytes, const void* params) {
     Fn(num_groups, n, k, nr, kr, sr,
        static_cast<const int8_t*>(rhs),
        static_cast<const float*>(bias),
@@ -524,6 +526,61 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
     },
 #endif
 #else
+#if defined(__ARM_FEATURE_SVE)
+    {
+        /* SVE i8mm GEMM */
+        /* .kern_info = */ {
+            /* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
+            /* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
+            /* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
+            /* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
+            /* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
+            /* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
+            /* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
+            /* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
+            /* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm>,
+            /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm>,
+            /* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm>,
+        },
+        /* .gemm_lhs_info = */ {
+            /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon,
+            /* .get_packed_offset_ex = */ &lhs_offs_fn6<kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon>,
+            /* .packed_size_ex = */ &lhs_ps_fn6<kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p4x8sb_f32_neon>,
+            /* .pack_func_ex = */ &lhs_pack_float_fn10<kai_run_lhs_quant_pack_qsi8d32p4x8sb_f32_neon>,
+        },
+        /* SVE dotprod GEMV */
+        /* .kern_info = */ {
+            /* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
+            /* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
+            /* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
+            /* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
+            /* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
+            /* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
+            /* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
+            /* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
+            /* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod>,
+            /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod>,
+            /* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod>,
+        },
+        /* .gemv_lhs_info = */ {
+            /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
+            /* .get_packed_offset_ex = */ &lhs_offs_fn6<kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32>,
+            /* .packed_size_ex = */ &lhs_ps_fn6<kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32>,
+            /* .pack_func_ex = */ &lhs_pack_float_fn10<kai_run_lhs_quant_pack_qsi8d32p_f32>,
+        },
+        /* .rhs_info = */ {
+            /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+            /* .to_float = */ dequantize_row_qsi4c32pscalef16,
+            /* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
+            /* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
+            /* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
+        },
+        /* .required_cpu = */ CPU_FEATURE_SVE | CPU_FEATURE_I8MM | CPU_FEATURE_DOTPROD,
+        /* .lhs_type = */ GGML_TYPE_F32,
+        /* .rhs_type = */ GGML_TYPE_Q4_0,
+        /* .op_type = */ GGML_TYPE_F32,
+    },
+#endif
 #if defined(__ARM_FEATURE_MATMUL_INT8)
     {
         /* i8mm GEMM */
@@ -578,7 +635,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .rhs_type = */ GGML_TYPE_Q4_0,
         /* .op_type = */ GGML_TYPE_F32,
     },
-#endif
+#endif // __ARM_FEATURE_MATMUL_INT8
 #if defined(__ARM_FEATURE_DOTPROD)
     {
         /* DOTPROD GEMM */
@@ -811,26 +868,27 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, c
     ggml_kleidiai_kernels * kernel = nullptr;
 
     if (tensor->op == GGML_OP_MUL_MAT && tensor->src[0] != nullptr && tensor->src[1] != nullptr) {
-#if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)
-        for (size_t i = 0; i < NELEMS(gemm_gemv_kernels) - 1; ++i) {
-            if ((cpu_features & gemm_gemv_kernels[i].required_cpu) == gemm_gemv_kernels[i].required_cpu &&
-                gemm_gemv_kernels[i].lhs_type == tensor->src[1]->type &&
-                gemm_gemv_kernels[i].rhs_type == tensor->src[0]->type &&
-                gemm_gemv_kernels[i].op_type == tensor->type) {
-                kernel = &gemm_gemv_kernels[i];
-                break;
-            }
-        }
-        if (!kernel) {
-            for (size_t i = 0; i < NELEMS(gemm_gemv_kernels_q8) - 1; ++i) {
-                if ((cpu_features & gemm_gemv_kernels_q8[i].required_cpu) == gemm_gemv_kernels_q8[i].required_cpu &&
-                    gemm_gemv_kernels_q8[i].lhs_type == tensor->src[1]->type &&
-                    gemm_gemv_kernels_q8[i].rhs_type == tensor->src[0]->type &&
-                    gemm_gemv_kernels_q8[i].op_type == tensor->type) {
-                    kernel = &gemm_gemv_kernels_q8[i];
-                    break;
+#if defined(__ARM_FEATURE_SME) || \
+    defined(__ARM_FEATURE_DOTPROD) || \
+    defined(__ARM_FEATURE_MATMUL_INT8) || \
+    defined(__ARM_FEATURE_SVE)
+        auto try_table = [&](auto & table) {
+            for (size_t i = 0; i < NELEMS(table) - 1; ++i) {
+                if ((cpu_features & table[i].required_cpu) == table[i].required_cpu &&
+                    table[i].lhs_type == tensor->src[1]->type &&
+                    table[i].rhs_type == tensor->src[0]->type &&
+                    table[i].op_type == tensor->type) {
+                    kernel = &table[i];
+                    return true;
                 }
             }
+            return false;
+        };
+
+        if (tensor->src[0]->type == GGML_TYPE_Q8_0) {
+            try_table(gemm_gemv_kernels_q8);
+        } else {
+            try_table(gemm_gemv_kernels);
         }
 #else
         GGML_UNUSED(gemm_gemv_kernels);
@@ -845,7 +903,10 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, c
 ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features) {
     ggml_kleidiai_kernels * kernels = nullptr;
 
-#if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)
+#if defined(__ARM_FEATURE_SME) || \
+    defined(__ARM_FEATURE_DOTPROD) || \
+    defined(__ARM_FEATURE_MATMUL_INT8) || \
+    defined(__ARM_FEATURE_SVE)
     for (size_t i = 0; i < NELEMS(gemm_gemv_kernels) - 1; ++i) {
         if ((features & gemm_gemv_kernels[i].required_cpu) == gemm_gemv_kernels[i].required_cpu) {
             kernels = &gemm_gemv_kernels[i];
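The two ggml_kleidiai_select_kernels hunks above rework kernel selection: two copy-pasted search loops over gemm_gemv_kernels and gemm_gemv_kernels_q8 are collapsed into one generic try_table lambda, and the caller dispatches on the weight type (GGML_TYPE_Q8_0 vs everything else). The sketch below shows that table-scan pattern in isolation; KernelEntry, the FEAT_*/TYPE_* constants and the tables are hypothetical stand-ins, not the package's structs (note the real loops also stop at NELEMS(table) - 1, never considering the last array slot).

    // Hypothetical sketch only: KernelEntry, FEAT_*, TYPE_* and the tables below
    // are illustrative stand-ins; the real code scans ggml_kleidiai_kernels.
    #include <cstddef>
    #include <cstdio>

    struct KernelEntry {
        unsigned     required_cpu; // bitmask of CPU features the kernel needs
        int          rhs_type;     // weight (src[0]) type it accepts
        const char * name;
    };

    enum { FEAT_DOTPROD = 1 << 0, FEAT_I8MM = 1 << 1, FEAT_SVE = 1 << 2 };
    enum { TYPE_Q4_0 = 0, TYPE_Q8_0 = 1 };

    // Generic scan, analogous to the try_table lambda: the first entry whose
    // required features are all present and whose weight type matches wins.
    template <size_t N>
    static const KernelEntry * select_kernel(const KernelEntry (&table)[N],
                                             unsigned cpu, int rhs_type) {
        for (size_t i = 0; i < N; ++i) {
            if ((cpu & table[i].required_cpu) == table[i].required_cpu &&
                table[i].rhs_type == rhs_type) {
                return &table[i];
            }
        }
        return nullptr;
    }

    int main() {
        const KernelEntry q4_table[] = {
            { FEAT_SVE | FEAT_I8MM, TYPE_Q4_0, "sve_i8mm_gemm" },
            { FEAT_DOTPROD,         TYPE_Q4_0, "dotprod_gemm"  },
        };
        const KernelEntry q8_table[] = {
            { FEAT_I8MM, TYPE_Q8_0, "i8mm_q8_gemm" },
        };
        unsigned cpu = FEAT_DOTPROD | FEAT_I8MM; // no SVE on this core
        int rhs      = TYPE_Q4_0;
        // Dispatch on the weight type first, as the diff does for GGML_TYPE_Q8_0.
        const KernelEntry * k = (rhs == TYPE_Q8_0) ? select_kernel(q8_table, cpu, rhs)
                                                   : select_kernel(q4_table, cpu, rhs);
        std::printf("%s\n", k ? k->name : "none"); // prints "dotprod_gemm"
        return 0;
    }
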
package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp

@@ -46,13 +46,20 @@ struct ggml_kleidiai_context {
 } static ctx = { CPU_FEATURE_NONE, NULL, NULL };
 
 static const char* cpu_feature_to_string(cpu_feature f) {
-    switch (f) {
-        case CPU_FEATURE_NONE: return "NONE";
-        case CPU_FEATURE_DOTPROD: return "DOTPROD";
-        case CPU_FEATURE_I8MM: return "I8MM";
-        case CPU_FEATURE_SVE: return "SVE";
-        case CPU_FEATURE_SME: return "SME";
-        default: return "UNKNOWN";
+    if (f == CPU_FEATURE_NONE) {
+        return "NONE";
+    } else if ((f & CPU_FEATURE_SME) == CPU_FEATURE_SME) {
+        return "SME";
+    } else if ((f & CPU_FEATURE_SVE) == CPU_FEATURE_SVE) {
+        return "SVE";
+    }
+    else if ((f & CPU_FEATURE_I8MM) == CPU_FEATURE_I8MM) {
+        return "I8MM";
+    } else if ((f & CPU_FEATURE_DOTPROD) == CPU_FEATURE_DOTPROD) {
+        return "DOTPROD";
+    }
+    else {
+        return "UNKNOWN";
     }
 }
 
@@ -68,7 +75,7 @@ static void init_kleidiai_context(void) {
 
     ctx.features = (ggml_cpu_has_dotprod() ? CPU_FEATURE_DOTPROD : CPU_FEATURE_NONE) |
                    (ggml_cpu_has_matmul_int8() ? CPU_FEATURE_I8MM : CPU_FEATURE_NONE) |
-                   (ggml_cpu_has_sve() ? CPU_FEATURE_SVE : CPU_FEATURE_NONE);
+                   ((ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == QK8_0) ? CPU_FEATURE_SVE : CPU_FEATURE_NONE);
 
     if (env_var) {
         sme_enabled = atoi(env_var);
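
Two things change in kleidiai.cpp above. First, cpu_feature_to_string: the feature value is a bitmask (ctx.features is composed with |, and kernel entries check required_cpu with a mask test), so a switch only matches exact single-flag values and would report UNKNOWN for any combination; the new if/else chain instead reports the most capable bit that is set. Second, CPU_FEATURE_SVE is now only advertised when ggml_cpu_get_sve_cnt() equals QK8_0, i.e. when the runtime SVE vector length matches the block size the new SVE kernels assume. A minimal, self-contained illustration of the first point, using hypothetical flag values rather than the package's enum:

    #include <cstdio>

    // Hypothetical feature bits, for illustration only.
    enum cpu_feature_bits {
        FEAT_NONE    = 0,
        FEAT_DOTPROD = 1 << 0,
        FEAT_I8MM    = 1 << 1,
        FEAT_SVE     = 1 << 2,
        FEAT_SME     = 1 << 3,
    };

    // Mirrors the priority order of the new if/else chain: report the most
    // capable feature whose bit is set instead of requiring an exact match.
    static const char * feature_to_string(unsigned f) {
        if (f == FEAT_NONE)                     return "NONE";
        if ((f & FEAT_SME) == FEAT_SME)         return "SME";
        if ((f & FEAT_SVE) == FEAT_SVE)         return "SVE";
        if ((f & FEAT_I8MM) == FEAT_I8MM)       return "I8MM";
        if ((f & FEAT_DOTPROD) == FEAT_DOTPROD) return "DOTPROD";
        return "UNKNOWN";
    }

    int main() {
        unsigned features = FEAT_DOTPROD | FEAT_I8MM | FEAT_SVE;
        // A switch (features) would hit no case here, because the combined
        // value 0b0111 is not any single enumerator; the mask test still
        // reports the strongest capability present.
        std::printf("%s\n", feature_to_string(features)); // prints "SVE"
        return 0;
    }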