@fugood/llama.node 1.1.4 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +17 -13
- package/src/LlamaCompletionWorker.cpp +2 -0
- package/src/LlamaContext.cpp +3 -0
- package/src/llama.cpp/common/arg.cpp +80 -10
- package/src/llama.cpp/common/chat.cpp +52 -8
- package/src/llama.cpp/common/chat.h +7 -2
- package/src/llama.cpp/common/common.cpp +1 -0
- package/src/llama.cpp/common/common.h +16 -6
- package/src/llama.cpp/common/speculative.cpp +135 -54
- package/src/llama.cpp/common/speculative.h +8 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -2
- package/src/llama.cpp/ggml/include/ggml.h +37 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3196 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +207 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +263 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
- package/src/llama.cpp/include/llama.h +9 -4
- package/src/llama.cpp/src/llama-arch.cpp +105 -0
- package/src/llama.cpp/src/llama-arch.h +12 -0
- package/src/llama.cpp/src/llama-batch.cpp +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +33 -1
- package/src/llama.cpp/src/llama-chat.h +2 -0
- package/src/llama.cpp/src/llama-context.cpp +19 -10
- package/src/llama.cpp/src/llama-context.h +4 -1
- package/src/llama.cpp/src/llama-graph.cpp +175 -148
- package/src/llama.cpp/src/llama-graph.h +60 -23
- package/src/llama.cpp/src/llama-hparams.h +5 -3
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +6 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.h +1 -1
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +2 -1
- package/src/llama.cpp/src/llama-memory-hybrid.h +1 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model-loader.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +949 -75
- package/src/llama.cpp/src/llama-model.h +24 -4
- package/src/llama.cpp/src/llama-quant.cpp +40 -4
- package/src/llama.cpp/src/llama-vocab.cpp +49 -1
- package/src/llama.cpp/src/llama-vocab.h +1 -0
package/src/llama.cpp/src/llama-graph.h

@@ -39,6 +39,7 @@ enum llm_ffn_op_type {
     LLM_FFN_SWIGLU,
     LLM_FFN_GEGLU,
     LLM_FFN_REGLU,
+    LLM_FFN_SWIGLU_OAI_MOE,
 };
 
 enum llm_ffn_gate_type {
@@ -144,7 +145,7 @@ public:
 
     ggml_tensor * pos_bucket = nullptr; // I32 [n_batch, n_batch]
 
-    const llama_hparams & hparams;
+    const llama_hparams hparams;
 };
 
 class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {

@@ -158,7 +159,7 @@ public:
 
     ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]
 
-    const llama_hparams & hparams;
+    const llama_hparams hparams;
 
     const llama_kv_cache_unified_context * mctx;
 };

@@ -177,8 +178,8 @@ public:
 
     ggml_tensor * out_ids; // I32 [n_outputs]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 
     const uint32_t n_outputs;
 };

@@ -192,7 +193,7 @@ public:
 
     ggml_tensor * mean; // F32 [n_batch, n_batch]
 
-    const llama_cparams & cparams;
+    const llama_cparams cparams;
 };
 
 class llm_graph_input_cls : public llm_graph_input_i {

@@ -204,7 +205,7 @@ public:
 
     ggml_tensor * cls; // I32 [n_batch]
 
-    const llama_cparams & cparams;
+    const llama_cparams cparams;
 };
 
 class llm_graph_input_rs : public llm_graph_input_i {

@@ -214,7 +215,12 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
-    ggml_tensor * s_copy;
+    ggml_tensor * s_copy;       // I32 [n_rs]
+
+    // views of s_copy, computed once per graph
+    // and shared across layers which use build_rs
+    ggml_tensor * s_copy_main;  // I32 [n_seqs]
+    ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs]
 
     const llama_memory_recurrent_context * mctx;
 };
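Note on the new s_copy_main / s_copy_extra members: a minimal sketch of how such views could be carved out of s_copy with ggml_view_1d. The construction and helper below are assumptions for illustration; only the member names and shapes come from the diff.

#include "ggml.h"

// sketch (assumed construction): split the I32 s_copy tensor of length n_rs
// into the two views named in the header hunk above
static void build_s_copy_views(ggml_context * ctx, ggml_tensor * s_copy,
                               int64_t n_seqs, int64_t n_rs,
                               ggml_tensor ** s_copy_main, ggml_tensor ** s_copy_extra) {
    // first n_seqs entries: the states used by the current ubatch
    *s_copy_main  = ggml_view_1d(ctx, s_copy, n_seqs, 0);
    // remaining n_rs - n_seqs entries: the other recurrent-state slots
    *s_copy_extra = ggml_view_1d(ctx, s_copy, n_rs - n_seqs,
                                 n_seqs * ggml_element_size(s_copy));
}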
@@ -247,8 +253,8 @@ public:
     ggml_tensor * kq_mask     = nullptr; // F32 [n_tokens, n_batch, 1, 1]
     ggml_tensor * kq_mask_cnv = nullptr; //     [n_tokens, n_batch, 1, 1]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 };
 
 class llm_graph_input_attn_kv_unified : public llm_graph_input_i {

@@ -278,8 +284,11 @@ public:
     ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
     ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    // note: these have to be copies because in order to be able to reuse a graph, its inputs
+    //       need to carry these parameters with them. otherwise, they can point to freed
+    //       llm_graph_params from a previous batch, causing stack-use-after-return
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 
     const llama_kv_cache_unified_context * mctx;
 };
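The comment about copies describes a standard C++ lifetime hazard: an input object that stores a reference to the llm_graph_params it was built from can outlive that params object once the graph is reused for a later batch. A self-contained sketch of the hazard with hypothetical types (not llama.cpp code):

#include <memory>

struct params { int n_ctx = 4096; };

struct input_by_ref  { const params & p; }; // dangles once the params it refers to is destroyed
struct input_by_copy { const params   p; }; // carries its own copy, safe to keep around

std::unique_ptr<input_by_copy> make_input() {
    params tmp; // lives only until this function returns
    // input_by_ref bad{tmp};  // keeping 'bad' past the return would be use-after-return
    return std::unique_ptr<input_by_copy>(new input_by_copy{tmp}); // the copy survives the call
}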
@@ -318,8 +327,8 @@ public:
     ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
     ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 
     const llama_kv_cache_unified_iswa_context * mctx;
 };

@@ -415,7 +424,9 @@ struct llm_graph_params {
             (!ubatch.embd && !other.ubatch.embd)
         );
 
-        if (can_reuse_ubatch) {
+        // when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same
+        // the reason is because the set of attention streams would be different for different sequences
+        if (can_reuse_ubatch && ubatch.equal_seqs()) {
             if (!ubatch.data) {
                 // if the old ubatch does not own it's data, then we cannot guarantee that it is still alive, and
                 // therefore we cannot perform the sequence id check. normally should never happen
@@ -609,6 +620,7 @@ struct llm_graph_context {
             llm_ffn_gate_type type_gate,
             int il) const;
 
+    // build MoE FFN without bias tensors
     ggml_tensor * build_moe_ffn(
             ggml_tensor * cur,
             ggml_tensor * gate_inp,

@@ -623,19 +635,29 @@ struct llm_graph_context {
             bool scale_w,
             float w_scale,
             llama_expert_gating_func_type gating_op,
-            int il) const;
+            int il,
+            ggml_tensor * probs_in = nullptr) const;
 
-    ggml_tensor * build_moe_ffn_from_probs(
+    ggml_tensor * build_moe_ffn(
             ggml_tensor * cur,
-            ggml_tensor * probs,
+            ggml_tensor * gate_inp,
+            ggml_tensor * gate_inp_b,
             ggml_tensor * up_exps,
+            ggml_tensor * up_exps_b,
             ggml_tensor * gate_exps,
+            ggml_tensor * gate_exps_b,
             ggml_tensor * down_exps,
+            ggml_tensor * down_exps_b,
             ggml_tensor * exp_probs_b,
             int64_t n_expert,
             int64_t n_expert_used,
+            llm_ffn_op_type type_op,
+            bool norm_w,
+            bool scale_w,
+            float w_scale,
             llama_expert_gating_func_type gating_op,
-            int il) const;
+            int il,
+            ggml_tensor * probs_in = nullptr) const;
 
     //
     // inputs
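One plausible reading of the two build_moe_ffn declarations (an assumption, not shown in this diff) is that the bias-free overload simply forwards to the bias-aware one with null *_b tensors:

// sketch of an assumed implementation in llama-graph.cpp; the signature matches
// the header hunk above, the body is illustrative only
ggml_tensor * llm_graph_context::build_moe_ffn(
        ggml_tensor * cur,
        ggml_tensor * gate_inp,
        ggml_tensor * up_exps,
        ggml_tensor * gate_exps,
        ggml_tensor * down_exps,
        ggml_tensor * exp_probs_b,
        int64_t n_expert,
        int64_t n_expert_used,
        llm_ffn_op_type type_op,
        bool norm_w,
        bool scale_w,
        float w_scale,
        llama_expert_gating_func_type gating_op,
        int il,
        ggml_tensor * probs_in) const {
    return build_moe_ffn(cur,
            gate_inp,  /*gate_inp_b  =*/ nullptr,
            up_exps,   /*up_exps_b   =*/ nullptr,
            gate_exps, /*gate_exps_b =*/ nullptr,
            down_exps, /*down_exps_b =*/ nullptr,
            exp_probs_b, n_expert, n_expert_used,
            type_op, norm_w, scale_w, w_scale, gating_op, il, probs_in);
}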
@@ -663,6 +685,7 @@ struct llm_graph_context {
             ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
             ggml_tensor * kq_b,
             ggml_tensor * kq_mask,
+            ggml_tensor * sinks,
             ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
             float kq_scale) const;
 

@@ -709,6 +732,20 @@ struct llm_graph_context {
             float kq_scale,
             int il) const;
 
+    // TODO: temporary to keep the diff small. after the code is public will refactor to simplify this
+    ggml_tensor * build_attn_with_sinks(
+            llm_graph_input_attn_kv_unified_iswa * inp,
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
+            ggml_tensor * kq_b,
+            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+            ggml_tensor * sinks, // [n_head_q]
+            float kq_scale,
+            int il) const;
+
     llm_graph_input_attn_cross * build_attn_inp_cross() const;
 
     ggml_tensor * build_attn(
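build_attn_with_sinks threads an extra per-head sinks tensor into the attention call. As commonly described for models that use attention sinks, the sink is a learned logit that takes part in the softmax normalization but contributes no value row, damping the weights of all real positions. A scalar sketch of that idea (illustration only, not the ggml implementation):

#include <algorithm>
#include <cmath>
#include <vector>

// softmax over one head's attention scores plus a single sink logit;
// the sink's probability mass is simply discarded
std::vector<float> softmax_with_sink(const std::vector<float> & scores, float sink) {
    float max_val = sink;
    for (float s : scores) max_val = std::max(max_val, s);

    float denom = std::exp(sink - max_val); // the sink joins the denominator
    for (float s : scores) denom += std::exp(s - max_val);

    std::vector<float> probs(scores.size());
    for (size_t i = 0; i < scores.size(); ++i) {
        probs[i] = std::exp(scores[i] - max_val) / denom;
    }
    return probs; // sums to less than 1; the remainder went to the sink
}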
@@ -727,7 +764,6 @@ struct llm_graph_context {
     // recurrent
     //
 
-    // TODO: avoid notion of "kv"
     // TODO: move this implementation to llama_memory_recurrent.
     //       this is analogous to llama_kv_cache_unified::cpy_k / cpy_v
     //       when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. would likely need to split the

@@ -735,12 +771,13 @@ struct llm_graph_context {
     //       `llama_memory_recurrent`
     ggml_tensor * build_rs(
             ggml_tensor * s,
-            ggml_tensor * state_copy,
+            ggml_tensor * state_copy_main,
+            ggml_tensor * state_copy_extra,
             int32_t state_size,
             int32_t n_seqs,
-            uint32_t n_kv,
-            uint32_t kv_head,
-            uint32_t kv_size,
+            uint32_t n_rs,
+            uint32_t rs_head,
+            uint32_t rs_size,
             int32_t rs_zero,
             const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
 
package/src/llama.cpp/src/llama-hparams.h

@@ -9,9 +9,10 @@
 #define LLAMA_MAX_EXPERTS 384 // Kimi-K2
 
 enum llama_expert_gating_func_type {
-    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE    = 0,
-    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
-    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE           = 0,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX        = 1,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID        = 2,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT = 3, // applied to the router weights instead of the logits
 };
 
 enum llama_swa_type {
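The new SOFTMAX_WEIGHT mode, per its comment, applies the softmax to the selected router weights rather than to the full logit vector. A scalar sketch of how the three non-trivial modes could differ (an illustration of the enum semantics, not the llama.cpp graph code):

#include <algorithm>
#include <cmath>
#include <vector>

static void softmax_inplace(std::vector<float> & v) {
    float m = *std::max_element(v.begin(), v.end());
    float sum = 0.0f;
    for (float & x : v) { x = std::exp(x - m); sum += x; }
    for (float & x : v) { x /= sum; }
}

// given all router logits and the indices of the already-selected experts,
// produce mixing weights for the given gating mode (values follow the enum above)
std::vector<float> expert_weights(const std::vector<float> & logits,
                                  const std::vector<int> & selected, int gating_func) {
    std::vector<float> w;
    if (gating_func == 1) {            // SOFTMAX: normalize all logits, then gather
        std::vector<float> p(logits);
        softmax_inplace(p);
        for (int i : selected) w.push_back(p[i]);
    } else if (gating_func == 2) {     // SIGMOID: squash each selected logit independently
        for (int i : selected) w.push_back(1.0f / (1.0f + std::exp(-logits[i])));
    } else if (gating_func == 3) {     // SOFTMAX_WEIGHT: softmax over the selected weights only
        for (int i : selected) w.push_back(logits[i]);
        softmax_inplace(w);
    } else {                           // NONE: use the logits as-is
        for (int i : selected) w.push_back(logits[i]);
    }
    return w;
}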
@@ -73,6 +74,7 @@ struct llama_hparams {
     bool expert_weights_norm = false;
     uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
     uint32_t moe_every_n_layers = 0;
+    uint32_t nextn_predict_layers = 0;
 
     float f_norm_eps;
     float f_norm_rms_eps;
package/src/llama.cpp/src/llama-kv-cache-unified.cpp

@@ -39,6 +39,10 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     if (model.arch == LLM_ARCH_GEMMA3N) {
         n_layer_cache = 20;
     }
+    if (model.arch == LLM_ARCH_GLM4_MOE) {
+        // GLM-4.5: Only process up to last layer, skip final NextN layer
+        n_layer_cache = hparams.n_layer - hparams.nextn_predict_layers;
+    }
 
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
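For a concrete, purely hypothetical configuration: a GLM-4.5-style model reporting n_layer = 47 with nextn_predict_layers = 1 would get K/V cells for 46 layers, leaving the trailing NextN (multi-token prediction) layer uncached:

// illustrative numbers only, not read from any actual GGUF header
uint32_t n_layer              = 47;
uint32_t nextn_predict_layers = 1;
uint32_t n_layer_cache        = n_layer - nextn_predict_layers; // 46 layers are cached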
@@ -183,7 +187,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     const size_t memory_size_k = size_k_bytes();
     const size_t memory_size_v = size_v_bytes();
 
-    LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
+    LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
             (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max, n_stream,
             ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
             ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));

@@ -193,7 +197,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
 
     const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-    supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : false;
+    supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : supports_set_rows;
 
     if (!supports_set_rows) {
         // ref: https://github.com/ggml-org/llama.cpp/pull/14363
package/src/llama.cpp/src/llama-memory-hybrid.cpp

@@ -25,6 +25,7 @@ llama_memory_hybrid::llama_memory_hybrid(
     /* common */
     uint32_t n_seq_max,
     bool offload,
+    bool unified,
     /* layer filters */
     layer_filter_cb && filter_attn,
     layer_filter_cb && filter_recr) :

@@ -38,7 +39,7 @@ llama_memory_hybrid::llama_memory_hybrid(
         type_v,
         v_trans,
         offload,
-
+        unified,
         kv_size,
         n_seq_max,
         n_pad,
package/src/llama.cpp/src/llama-model.cpp

@@ -35,6 +35,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
     case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
     case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
     case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
+    case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE";
     case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
     case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
     case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
package/src/llama.cpp/src/llama-model-loader.h

@@ -58,8 +58,9 @@ struct llama_model_loader {
         }
     };
 
-    static const int TENSOR_NOT_REQUIRED = 1;
-    static const int TENSOR_DUPLICATED   = 2;
+    static const int TENSOR_NOT_REQUIRED = 1 << 0;
+    static const int TENSOR_DUPLICATED   = 1 << 1;
+    static const int TENSOR_SKIP         = 1 << 2;
 
     int n_kv = 0;
     int n_tensors = 0;