@fugood/llama.node 1.4.11 → 1.4.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +31 -31
- package/src/llama.cpp/common/arg.cpp +128 -59
- package/src/llama.cpp/common/arg.h +1 -0
- package/src/llama.cpp/common/chat-parser.cpp +11 -0
- package/src/llama.cpp/common/chat.cpp +36 -7
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +42 -23
- package/src/llama.cpp/common/common.h +11 -1
- package/src/llama.cpp/common/llguidance.cpp +10 -6
- package/src/llama.cpp/common/regex-partial.cpp +13 -13
- package/src/llama.cpp/common/sampling.cpp +58 -14
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
- package/src/llama.cpp/include/llama.h +100 -12
- package/src/llama.cpp/src/CMakeLists.txt +4 -0
- package/src/llama.cpp/src/llama-adapter.cpp +12 -3
- package/src/llama.cpp/src/llama-adapter.h +7 -1
- package/src/llama.cpp/src/llama-arch.cpp +78 -0
- package/src/llama.cpp/src/llama-arch.h +8 -0
- package/src/llama.cpp/src/llama-chat.cpp +11 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +637 -49
- package/src/llama.cpp/src/llama-context.h +43 -1
- package/src/llama.cpp/src/llama-grammar.cpp +40 -13
- package/src/llama.cpp/src/llama-grammar.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +173 -5
- package/src/llama.cpp/src/llama-graph.h +71 -6
- package/src/llama.cpp/src/llama-hparams.cpp +4 -0
- package/src/llama.cpp/src/llama-hparams.h +12 -5
- package/src/llama.cpp/src/llama-kv-cache.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
- package/src/llama.cpp/src/llama-model.cpp +337 -26
- package/src/llama.cpp/src/llama-model.h +13 -2
- package/src/llama.cpp/src/llama-sampling.cpp +1259 -186
- package/src/llama.cpp/src/llama-sampling.h +19 -7
- package/src/llama.cpp/src/llama-vocab.cpp +101 -33
- package/src/llama.cpp/src/llama-vocab.h +2 -0
- package/src/llama.cpp/src/llama.cpp +87 -64
- package/src/llama.cpp/src/models/afmoe.cpp +9 -5
- package/src/llama.cpp/src/models/bert.cpp +4 -2
- package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
- package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
- package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
- package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/gemma3.cpp +3 -4
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
- package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
- package/src/llama.cpp/src/models/llama.cpp +19 -6
- package/src/llama.cpp/src/models/maincoder.cpp +117 -0
- package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
- package/src/llama.cpp/src/models/models.h +18 -0
- package/src/llama.cpp/src/models/modern-bert.cpp +116 -0
- package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/plamo3.cpp +128 -0
- package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
- package/src/llama.cpp/src/unicode.cpp +23 -14
The hunks below are from package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp (+86 -25); the final two hunks are from kleidiai.cpp in the same directory (+15 -8). Removed lines whose content was not captured by the registry's diff view are shown as bare `-` lines.

```diff
@@ -18,6 +18,8 @@
 #include "kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.h"
 #include "kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod.h"
 #include "kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm.h"
+#include "kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm.h"
+#include "kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod.h"
 
 #include "kai_lhs_pack_bf16p2vlx2_f32_sme.h"
 #include "kai_lhs_quant_pack_qsi8d32p_f32.h"
```
```diff
@@ -69,9 +71,9 @@ static inline void kernel_run_fn10(size_t m, size_t n, size_t k, size_t /*bl*/,
 
 template<void(*Fn)(size_t,size_t,size_t,const void*,const void*,float*,size_t,size_t,float,float)>
 static inline void kernel_run_float_fn10(size_t m, size_t n, size_t k, size_t /*bl*/,
-
-
-
+                                         const void* lhs, const void* rhs, void* dst,
+                                         size_t dst_stride_row, size_t dst_stride_col,
+                                         float clamp_min, float clamp_max) {
     Fn(m, n, k, lhs, rhs, static_cast<float*>(dst), dst_stride_row, dst_stride_col, clamp_min, clamp_max);
 }
 
```
```diff
@@ -152,8 +154,8 @@ static inline void rhs_pack_fn12(size_t num_groups, size_t n, size_t k, size_t n
 
 template<void(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t,const int8_t*,const float*,const float*,void*,size_t,const struct kai_rhs_pack_qsi8cx_params*)>
 static inline void rhs_pack_scale_fn12(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t /*bl*/,
-
-
+                                       size_t /*rhs_stride*/, const void* rhs, const void* bias, const void* scale,
+                                       void* rhs_packed, size_t extra_bytes, const void* params) {
     Fn(num_groups, n, k, nr, kr, sr,
        static_cast<const int8_t*>(rhs),
        static_cast<const float*>(bias),
```
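Both wrapper templates above adapt a strongly typed KleidiAI kernel function to a single type-erased signature by baking the target function in as a non-type template parameter, so a heterogeneous set of kernels can share one table-entry type at zero runtime cost. Below is a minimal self-contained sketch of that pattern; every name in it (`demo_kernel`, `run_generic`, `run_fn`) is invented for illustration and is not the package's code.

```cpp
#include <cstddef>
#include <cstdio>

// A concrete "kernel" with a strongly typed signature (stand-in for a kai_run_* function).
static void demo_kernel(size_t m, size_t n, float* dst, float clamp_min, float clamp_max) {
    for (size_t i = 0; i < m * n; ++i) {
        float v = static_cast<float>(i);
        dst[i] = v < clamp_min ? clamp_min : (v > clamp_max ? clamp_max : v);
    }
}

// Generic adapter: the concrete kernel is a non-type template parameter, so the
// cast from the type-erased `void* dst` happens once, inside the wrapper.
template <void (*Fn)(size_t, size_t, float*, float, float)>
static inline void run_generic(size_t m, size_t n, void* dst, float clamp_min, float clamp_max) {
    Fn(m, n, static_cast<float*>(dst), clamp_min, clamp_max);
}

// A kernel table can now hold uniform function pointers for different kernels.
using run_fn = void (*)(size_t, size_t, void*, float, float);

int main() {
    float out[4];
    run_fn run = &run_generic<demo_kernel>;  // same shape as the table entries in the diff
    run(2, 2, out, 0.5f, 2.5f);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 0.5 1 2 2.5
    return 0;
}
```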
```diff
@@ -524,6 +526,61 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
     },
 #endif
 #else
+#if defined(__ARM_FEATURE_SVE)
+    {
+        /* SVE i8mm GEMM */
+        /* .kern_info = */ {
+            /* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
+            /* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
+            /* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
+            /* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
+            /* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
+            /* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
+            /* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
+            /* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
+            /* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm>,
+            /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm>,
+            /* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm>,
+        },
+        /* .gemm_lhs_info = */ {
+            /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon,
+            /* .get_packed_offset_ex = */ &lhs_offs_fn6<kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon>,
+            /* .packed_size_ex = */ &lhs_ps_fn6<kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p4x8sb_f32_neon>,
+            /* .pack_func_ex = */ &lhs_pack_float_fn10<kai_run_lhs_quant_pack_qsi8d32p4x8sb_f32_neon>,
+        },
+        /* SVE dotprod GEMV */
+        /* .kern_info = */ {
+            /* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
+            /* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
+            /* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
+            /* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
+            /* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
+            /* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
+            /* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
+            /* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
+            /* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod>,
+            /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod>,
+            /* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod>,
+        },
+        /* .gemv_lhs_info = */ {
+            /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
+            /* .get_packed_offset_ex = */ &lhs_offs_fn6<kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32>,
+            /* .packed_size_ex = */ &lhs_ps_fn6<kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32>,
+            /* .pack_func_ex = */ &lhs_pack_float_fn10<kai_run_lhs_quant_pack_qsi8d32p_f32>,
+        },
+        /* .rhs_info = */ {
+            /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+            /* .to_float = */ dequantize_row_qsi4c32pscalef16,
+            /* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
+            /* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
+            /* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
+        },
+        /* .required_cpu = */ CPU_FEATURE_SVE | CPU_FEATURE_I8MM | CPU_FEATURE_DOTPROD,
+        /* .lhs_type = */ GGML_TYPE_F32,
+        /* .rhs_type = */ GGML_TYPE_Q4_0,
+        /* .op_type = */ GGML_TYPE_F32,
+    },
+#endif
 #if defined(__ARM_FEATURE_MATMUL_INT8)
     {
         /* i8mm GEMM */
```
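The new entry extends the `gemm_gemv_kernels[]` descriptor table: each element pairs kernel function pointers with a `required_cpu` bitmask and tensor-type fields, and kernel selection scans for the first entry whose requirements the runtime CPU mask satisfies. A minimal sketch of that table-plus-mask idea follows; the type, flag values, and names (`demo_feature`, `demo_kernel_desc`, `select`) are invented for illustration, not taken from the package.

```cpp
#include <cstdio>

enum demo_feature : unsigned {
    FEAT_NONE    = 0,
    FEAT_DOTPROD = 1u << 0,
    FEAT_I8MM    = 1u << 1,
    FEAT_SVE     = 1u << 2,
};

struct demo_kernel_desc {
    unsigned    required_cpu;  // every bit must be present in the runtime mask
    const char* name;
};

// Ordered most-specific first, mirroring how the SVE entry above is listed
// before the plain i8mm and dotprod entries.
static demo_kernel_desc table[] = {
    { FEAT_SVE | FEAT_I8MM | FEAT_DOTPROD, "sve_i8mm" },
    { FEAT_I8MM,                           "i8mm"     },
    { FEAT_DOTPROD,                        "dotprod"  },
};

static const demo_kernel_desc* select(unsigned cpu_features) {
    for (const auto& e : table) {
        // subset test: all required bits must be present
        if ((cpu_features & e.required_cpu) == e.required_cpu) {
            return &e;
        }
    }
    return nullptr;
}

int main() {
    const demo_kernel_desc* k = select(FEAT_DOTPROD | FEAT_I8MM);
    printf("%s\n", k ? k->name : "none");  // prints "i8mm"
    return 0;
}
```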
```diff
@@ -578,7 +635,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .rhs_type = */ GGML_TYPE_Q4_0,
         /* .op_type = */ GGML_TYPE_F32,
     },
-#endif
+#endif // __ARM_FEATURE_MATMUL_INT8
 #if defined(__ARM_FEATURE_DOTPROD)
     {
         /* DOTPROD GEMM */
```
```diff
@@ -811,26 +868,27 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, c
     ggml_kleidiai_kernels * kernel = nullptr;
 
     if (tensor->op == GGML_OP_MUL_MAT && tensor->src[0] != nullptr && tensor->src[1] != nullptr) {
-#if defined(__ARM_FEATURE_SME)
-
-
-
-
-
-
-
-
-
-
-            if ((cpu_features & gemm_gemv_kernels_q8[i].required_cpu) == gemm_gemv_kernels_q8[i].required_cpu &&
-                gemm_gemv_kernels_q8[i].lhs_type == tensor->src[1]->type &&
-                gemm_gemv_kernels_q8[i].rhs_type == tensor->src[0]->type &&
-                gemm_gemv_kernels_q8[i].op_type == tensor->type) {
-                kernel = &gemm_gemv_kernels_q8[i];
-                break;
+#if defined(__ARM_FEATURE_SME) || \
+    defined(__ARM_FEATURE_DOTPROD) || \
+    defined(__ARM_FEATURE_MATMUL_INT8) || \
+    defined(__ARM_FEATURE_SVE)
+        auto try_table = [&](auto & table) {
+            for (size_t i = 0; i < NELEMS(table) - 1; ++i) {
+                if ((cpu_features & table[i].required_cpu) == table[i].required_cpu &&
+                    table[i].lhs_type == tensor->src[1]->type &&
+                    table[i].rhs_type == tensor->src[0]->type &&
+                    table[i].op_type == tensor->type) {
+                    kernel = &table[i];
+                    return true;
                 }
             }
+            return false;
+        };
+
+        if (tensor->src[0]->type == GGML_TYPE_Q8_0) {
+            try_table(gemm_gemv_kernels_q8);
+        } else {
+            try_table(gemm_gemv_kernels);
         }
 #else
         GGML_UNUSED(gemm_gemv_kernels);
```
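This hunk replaces a hard-coded scan of one kernel table with a generic lambda, `try_table`, that can search either table; because the two arrays have different lengths (and therefore different array types), the `auto &` parameter is what lets a single body serve both. Note the bound `NELEMS(table) - 1`, which skips the final table entry. A compact, self-contained sketch of the same pattern, with illustrative names only:

```cpp
#include <cstdio>
#include <cstddef>

struct entry { int lhs_type; const char* name; };

// Two tables of the same element type but different extents.
static entry table_q8[] = { { 8, "q8_kernel" } };
static entry table_q4[] = { { 4, "q4_kernel" }, { 32, "f32_kernel" } };

int main() {
    int wanted = 4;
    const entry* found = nullptr;

    // Generic lambda: `table` binds by reference to either array, and the
    // body is instantiated once per array type, as in the diff above.
    auto try_table = [&](auto& table) {
        for (size_t i = 0; i < sizeof(table) / sizeof(table[0]); ++i) {
            if (table[i].lhs_type == wanted) {
                found = &table[i];
                return true;
            }
        }
        return false;
    };

    if (wanted == 8) {
        try_table(table_q8);
    } else {
        try_table(table_q4);
    }
    printf("%s\n", found ? found->name : "none");  // prints "q4_kernel"
    return 0;
}
```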
```diff
@@ -845,7 +903,10 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, c
 ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features) {
     ggml_kleidiai_kernels * kernels = nullptr;
 
-#if defined(__ARM_FEATURE_SME)
+#if defined(__ARM_FEATURE_SME) || \
+    defined(__ARM_FEATURE_DOTPROD) || \
+    defined(__ARM_FEATURE_MATMUL_INT8) || \
+    defined(__ARM_FEATURE_SVE)
     for (size_t i = 0; i < NELEMS(gemm_gemv_kernels) - 1; ++i) {
         if ((features & gemm_gemv_kernels[i].required_cpu) == gemm_gemv_kernels[i].required_cpu) {
             kernels = &gemm_gemv_kernels[i];
```
package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp:

```diff
@@ -46,13 +46,20 @@ struct ggml_kleidiai_context {
 } static ctx = { CPU_FEATURE_NONE, NULL, NULL };
 
 static const char* cpu_feature_to_string(cpu_feature f) {
-
-
-
-
-
-
-
+    if (f == CPU_FEATURE_NONE) {
+        return "NONE";
+    } else if ((f & CPU_FEATURE_SME) == CPU_FEATURE_SME) {
+        return "SME";
+    } else if ((f & CPU_FEATURE_SVE) == CPU_FEATURE_SVE) {
+        return "SVE";
+    }
+    else if ((f & CPU_FEATURE_I8MM) == CPU_FEATURE_I8MM) {
+        return "I8MM";
+    } else if ((f & CPU_FEATURE_DOTPROD) == CPU_FEATURE_DOTPROD) {
+        return "DOTPROD";
+    }
+    else {
+        return "UNKNOWN";
     }
 }
 
```
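`cpu_feature_to_string` reports the most capable feature first (SME, then SVE, I8MM, DOTPROD) and uses the containment test `(f & FLAG) == FLAG` rather than a plain `f & FLAG`. The two forms differ only when a flag spans more than one bit; the sketch below, with hypothetical flag values that are not the package's, shows where the plain test misleads.

```cpp
#include <cstdio>

enum feat : unsigned {
    F_DOTPROD = 1u << 0,
    F_I8MM    = 1u << 1,
    // hypothetical composite flag requiring two capability bits
    F_SVE     = (1u << 2) | (1u << 3),
};

int main() {
    unsigned f = F_DOTPROD | (1u << 2);    // only half of F_SVE is present
    printf("%d\n", (f & F_SVE) != 0);      // 1 -- truthy, but misleading
    printf("%d\n", (f & F_SVE) == F_SVE);  // 0 -- full-containment test fails
    return 0;
}
```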
```diff
@@ -68,7 +75,7 @@ static void init_kleidiai_context(void) {
 
     ctx.features = (ggml_cpu_has_dotprod() ? CPU_FEATURE_DOTPROD : CPU_FEATURE_NONE) |
                    (ggml_cpu_has_matmul_int8() ? CPU_FEATURE_I8MM : CPU_FEATURE_NONE) |
-                   (ggml_cpu_has_sve()
+                   ((ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == QK8_0) ? CPU_FEATURE_SVE : CPU_FEATURE_NONE);
 
     if (env_var) {
         sme_enabled = atoi(env_var);
```
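The last hunk tightens SVE detection: `CPU_FEATURE_SVE` is now reported only when `ggml_cpu_get_sve_cnt()` equals `QK8_0`. Assuming the count is the SVE vector length in bytes and `QK8_0` is ggml's Q8_0 block size of 32, this restricts the SVE kernels to 256-bit SVE hardware, where one vector register spans exactly one quantization block. A sketch of that gate under those assumptions; `detect_features`, `sve_vl_bytes`, and the bit value are invented for illustration.

```cpp
#include <cstdio>

constexpr int QK8_0 = 32;  // ggml's Q8_0 block size (32 int8 values per block)

static unsigned detect_features(bool has_sve, int sve_vl_bytes) {
    const unsigned CPU_FEATURE_NONE = 0;
    const unsigned CPU_FEATURE_SVE  = 1u << 2;  // illustrative bit position
    // Mirror of the diff: SVE counts only when the vector length matches QK8_0.
    return (has_sve && sve_vl_bytes == QK8_0) ? CPU_FEATURE_SVE : CPU_FEATURE_NONE;
}

int main() {
    printf("%u\n", detect_features(true, 32));  // 256-bit SVE -> feature bit set
    printf("%u\n", detect_features(true, 64));  // 512-bit SVE -> feature bit not set
    return 0;
}
```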