@fugood/llama.node 1.4.10 → 1.4.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +30 -30
- package/src/LlamaContext.cpp +1 -1
- package/src/llama.cpp/common/arg.cpp +29 -14
- package/src/llama.cpp/common/arg.h +1 -0
- package/src/llama.cpp/common/chat-parser.cpp +11 -0
- package/src/llama.cpp/common/chat.cpp +32 -3
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +23 -23
- package/src/llama.cpp/common/common.h +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
- package/src/llama.cpp/include/llama.h +13 -4
- package/src/llama.cpp/src/CMakeLists.txt +4 -0
- package/src/llama.cpp/src/llama-adapter.cpp +12 -3
- package/src/llama.cpp/src/llama-adapter.h +7 -1
- package/src/llama.cpp/src/llama-arch.cpp +76 -0
- package/src/llama.cpp/src/llama-arch.h +7 -0
- package/src/llama.cpp/src/llama-chat.cpp +11 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +22 -21
- package/src/llama.cpp/src/llama-hparams.h +4 -3
- package/src/llama.cpp/src/llama-kv-cache.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +287 -16
- package/src/llama.cpp/src/llama-model.h +13 -2
- package/src/llama.cpp/src/llama-sampling.cpp +44 -33
- package/src/llama.cpp/src/llama-sampling.h +3 -0
- package/src/llama.cpp/src/llama-vocab.cpp +101 -33
- package/src/llama.cpp/src/llama-vocab.h +2 -0
- package/src/llama.cpp/src/llama.cpp +52 -37
- package/src/llama.cpp/src/models/bert.cpp +4 -2
- package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
- package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
- package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
- package/src/llama.cpp/src/models/gemma3.cpp +3 -4
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
- package/src/llama.cpp/src/models/llama.cpp +19 -6
- package/src/llama.cpp/src/models/maincoder.cpp +117 -0
- package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
- package/src/llama.cpp/src/models/models.h +18 -0
- package/src/llama.cpp/src/models/modern-bert.cpp +115 -0
- package/src/llama.cpp/src/models/plamo3.cpp +128 -0
- package/src/llama.cpp/src/unicode.cpp +23 -14
package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp

@@ -18,6 +18,8 @@
 #include "kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.h"
 #include "kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod.h"
 #include "kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm.h"
+#include "kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm.h"
+#include "kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod.h"

 #include "kai_lhs_pack_bf16p2vlx2_f32_sme.h"
 #include "kai_lhs_quant_pack_qsi8d32p_f32.h"
@@ -69,9 +71,9 @@ static inline void kernel_run_fn10(size_t m, size_t n, size_t k, size_t /*bl*/,

 template<void(*Fn)(size_t,size_t,size_t,const void*,const void*,float*,size_t,size_t,float,float)>
 static inline void kernel_run_float_fn10(size_t m, size_t n, size_t k, size_t /*bl*/,
-
-
-
+                                         const void* lhs, const void* rhs, void* dst,
+                                         size_t dst_stride_row, size_t dst_stride_col,
+                                         float clamp_min, float clamp_max) {
     Fn(m, n, k, lhs, rhs, static_cast<float*>(dst), dst_stride_row, dst_stride_col, clamp_min, clamp_max);
 }

@@ -152,8 +154,8 @@ static inline void rhs_pack_fn12(size_t num_groups, size_t n, size_t k, size_t n

 template<void(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t,const int8_t*,const float*,const float*,void*,size_t,const struct kai_rhs_pack_qsi8cx_params*)>
 static inline void rhs_pack_scale_fn12(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t /*bl*/,
-
-
+                                       size_t /*rhs_stride*/, const void* rhs, const void* bias, const void* scale,
+                                       void* rhs_packed, size_t extra_bytes, const void* params) {
     Fn(num_groups, n, k, nr, kr, sr,
        static_cast<const int8_t*>(rhs),
        static_cast<const float*>(bias),
@@ -524,6 +526,61 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
     },
 #endif
 #else
+#if defined(__ARM_FEATURE_SVE)
+    {
+        /* SVE i8mm GEMM */
+        /* .kern_info = */ {
+            /* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
+            /* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
+            /* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
+            /* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
+            /* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
+            /* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
+            /* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
+            /* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
+            /* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm>,
+            /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm>,
+            /* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm>,
+        },
+        /* .gemm_lhs_info = */ {
+            /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon,
+            /* .get_packed_offset_ex = */ &lhs_offs_fn6<kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon>,
+            /* .packed_size_ex = */ &lhs_ps_fn6<kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p4x8sb_f32_neon>,
+            /* .pack_func_ex = */ &lhs_pack_float_fn10<kai_run_lhs_quant_pack_qsi8d32p4x8sb_f32_neon>,
+        },
+        /* SVE dotprod GEMV */
+        /* .kern_info = */ {
+            /* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
+            /* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
+            /* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
+            /* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
+            /* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
+            /* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
+            /* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
+            /* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
+            /* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod>,
+            /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod>,
+            /* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod>,
+        },
+        /* .gemv_lhs_info = */ {
+            /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
+            /* .get_packed_offset_ex = */ &lhs_offs_fn6<kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32>,
+            /* .packed_size_ex = */ &lhs_ps_fn6<kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32>,
+            /* .pack_func_ex = */ &lhs_pack_float_fn10<kai_run_lhs_quant_pack_qsi8d32p_f32>,
+        },
+        /* .rhs_info = */ {
+            /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+            /* .to_float = */ dequantize_row_qsi4c32pscalef16,
+            /* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
+            /* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
+            /* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
+        },
+        /* .required_cpu = */ CPU_FEATURE_SVE | CPU_FEATURE_I8MM | CPU_FEATURE_DOTPROD,
+        /* .lhs_type = */ GGML_TYPE_F32,
+        /* .rhs_type = */ GGML_TYPE_Q4_0,
+        /* .op_type = */ GGML_TYPE_F32,
+    },
+#endif
 #if defined(__ARM_FEATURE_MATMUL_INT8)
     {
         /* i8mm GEMM */
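Note on notation: the /* .field = */ comments in the entry above follow the positional aggregate-initialization idiom used throughout this table; each initializer is positional, and the comment only records which member it fills. A minimal sketch of the idiom (the struct and values here are invented for illustration, not taken from kernels.cpp):

#include <cstdio>

struct entry {
    int         required_cpu;
    const char *name;
};

// Positional aggregate initialization; the comments merely document the target members,
// in the same style as the /* .kern_info = */ and /* .required_cpu = */ annotations above.
static const entry demo = {
    /* .required_cpu = */ 0x7,
    /* .name         = */ "sve_i8mm",
};

int main() {
    std::printf("%s requires mask 0x%x\n", demo.name, demo.required_cpu);
    return 0;
}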
@@ -578,7 +635,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .rhs_type = */ GGML_TYPE_Q4_0,
         /* .op_type = */ GGML_TYPE_F32,
     },
-#endif
+#endif // __ARM_FEATURE_MATMUL_INT8
 #if defined(__ARM_FEATURE_DOTPROD)
     {
         /* DOTPROD GEMM */
@@ -811,26 +868,27 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, c
     ggml_kleidiai_kernels * kernel = nullptr;

     if (tensor->op == GGML_OP_MUL_MAT && tensor->src[0] != nullptr && tensor->src[1] != nullptr) {
-#if defined(__ARM_FEATURE_SME)
-
-
-
-
-
-
-
-
-
-
-
-            if ((cpu_features & gemm_gemv_kernels_q8[i].required_cpu) == gemm_gemv_kernels_q8[i].required_cpu &&
-                gemm_gemv_kernels_q8[i].lhs_type == tensor->src[1]->type &&
-                gemm_gemv_kernels_q8[i].rhs_type == tensor->src[0]->type &&
-                gemm_gemv_kernels_q8[i].op_type == tensor->type) {
-                kernel = &gemm_gemv_kernels_q8[i];
-                break;
+#if defined(__ARM_FEATURE_SME) || \
+    defined(__ARM_FEATURE_DOTPROD) || \
+    defined(__ARM_FEATURE_MATMUL_INT8) || \
+    defined(__ARM_FEATURE_SVE)
+        auto try_table = [&](auto & table) {
+            for (size_t i = 0; i < NELEMS(table) - 1; ++i) {
+                if ((cpu_features & table[i].required_cpu) == table[i].required_cpu &&
+                    table[i].lhs_type == tensor->src[1]->type &&
+                    table[i].rhs_type == tensor->src[0]->type &&
+                    table[i].op_type == tensor->type) {
+                    kernel = &table[i];
+                    return true;
                 }
             }
+            return false;
+        };
+
+        if (tensor->src[0]->type == GGML_TYPE_Q8_0) {
+            try_table(gemm_gemv_kernels_q8);
+        } else {
+            try_table(gemm_gemv_kernels);
         }
 #else
         GGML_UNUSED(gemm_gemv_kernels);
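For readers skimming the hunk above: the new try_table lambda keeps the existing selection rule. An entry is usable only if every bit of its required_cpu mask is present in the runtime feature mask and its lhs/rhs/op types match the tensor, and the first matching entry wins. A standalone sketch of that rule (the enum values, struct, and table contents below are invented for illustration; only the required_cpu field name and the all-bits-present test mirror the diff):

#include <cstddef>
#include <cstdint>
#include <cstdio>

enum : uint32_t {
    FEATURE_DOTPROD = 1u << 0,
    FEATURE_I8MM    = 1u << 1,
    FEATURE_SVE     = 1u << 2,
    FEATURE_SME     = 1u << 3,
};

struct kernel_entry {
    uint32_t    required_cpu;   // all of these bits must be present at runtime
    const char *name;
};

// First-match selection with the "(features & required) == required" subset test.
static const kernel_entry *select_kernel(const kernel_entry *table, size_t n, uint32_t features) {
    for (size_t i = 0; i < n; ++i) {
        if ((features & table[i].required_cpu) == table[i].required_cpu) {
            return &table[i]; // entries are ordered strongest-first, so the first hit is the best fit
        }
    }
    return nullptr;
}

int main() {
    const kernel_entry table[] = {
        { FEATURE_SVE | FEATURE_I8MM | FEATURE_DOTPROD, "sve_i8mm_gemm"  },
        { FEATURE_I8MM | FEATURE_DOTPROD,               "neon_i8mm_gemm" },
        { FEATURE_DOTPROD,                              "dotprod_gemm"   },
    };
    const uint32_t features = FEATURE_DOTPROD | FEATURE_I8MM; // e.g. no SVE at runtime
    const kernel_entry *k = select_kernel(table, sizeof(table) / sizeof(table[0]), features);
    std::printf("selected: %s\n", k ? k->name : "none");
    return 0;
}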
@@ -845,7 +903,10 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, c
 ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features) {
     ggml_kleidiai_kernels * kernels = nullptr;

-#if defined(__ARM_FEATURE_SME)
+#if defined(__ARM_FEATURE_SME) || \
+    defined(__ARM_FEATURE_DOTPROD) || \
+    defined(__ARM_FEATURE_MATMUL_INT8) || \
+    defined(__ARM_FEATURE_SVE)
     for (size_t i = 0; i < NELEMS(gemm_gemv_kernels) - 1; ++i) {
         if ((features & gemm_gemv_kernels[i].required_cpu) == gemm_gemv_kernels[i].required_cpu) {
             kernels = &gemm_gemv_kernels[i];
package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp

@@ -46,13 +46,20 @@ struct ggml_kleidiai_context {
 } static ctx = { CPU_FEATURE_NONE, NULL, NULL };

 static const char* cpu_feature_to_string(cpu_feature f) {
-
-
-
-
-
-
-
+    if (f == CPU_FEATURE_NONE) {
+        return "NONE";
+    } else if ((f & CPU_FEATURE_SME) == CPU_FEATURE_SME) {
+        return "SME";
+    } else if ((f & CPU_FEATURE_SVE) == CPU_FEATURE_SVE) {
+        return "SVE";
+    }
+    else if ((f & CPU_FEATURE_I8MM) == CPU_FEATURE_I8MM) {
+        return "I8MM";
+    } else if ((f & CPU_FEATURE_DOTPROD) == CPU_FEATURE_DOTPROD) {
+        return "DOTPROD";
+    }
+    else {
+        return "UNKNOWN";
     }
 }

@@ -68,7 +75,7 @@ static void init_kleidiai_context(void) {

     ctx.features = (ggml_cpu_has_dotprod() ? CPU_FEATURE_DOTPROD : CPU_FEATURE_NONE) |
                    (ggml_cpu_has_matmul_int8() ? CPU_FEATURE_I8MM : CPU_FEATURE_NONE) |
-                   (ggml_cpu_has_sve()
+                   ((ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == QK8_0) ? CPU_FEATURE_SVE : CPU_FEATURE_NONE);

     if (env_var) {
         sme_enabled = atoi(env_var);
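The last hunk tightens SVE detection: CPU_FEATURE_SVE is now reported only when ggml_cpu_get_sve_cnt() equals QK8_0. A self-contained sketch of the mask-composition pattern used there (the probe functions and constants below are stand-ins, not the ggml CPU-feature API):

#include <cstdint>
#include <cstdio>

enum : uint32_t {
    FEATURE_NONE    = 0u,
    FEATURE_DOTPROD = 1u << 0,
    FEATURE_I8MM    = 1u << 1,
    FEATURE_SVE     = 1u << 2,
};

// Stand-in probes; real detection would query the CPU at runtime.
static bool has_dotprod() { return true;  }
static bool has_i8mm()    { return true;  }
static bool has_sve()     { return false; }
static int  sve_cnt()     { return 0;     }

int main() {
    const int expected_cnt = 32; // plays the role of QK8_0 in the real check
    // Each probe contributes one bit; the SVE bit is only set when the probed
    // count also matches the expected value, mirroring the new condition above.
    const uint32_t features = (has_dotprod() ? FEATURE_DOTPROD : FEATURE_NONE) |
                              (has_i8mm()    ? FEATURE_I8MM    : FEATURE_NONE) |
                              ((has_sve() && sve_cnt() == expected_cnt) ? FEATURE_SVE : FEATURE_NONE);
    std::printf("feature mask: 0x%x\n", features);
    return 0;
}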