@fugood/llama.node 1.1.4 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +17 -13
- package/src/LlamaCompletionWorker.cpp +2 -0
- package/src/LlamaContext.cpp +3 -0
- package/src/llama.cpp/common/arg.cpp +80 -10
- package/src/llama.cpp/common/chat.cpp +52 -8
- package/src/llama.cpp/common/chat.h +7 -2
- package/src/llama.cpp/common/common.cpp +1 -0
- package/src/llama.cpp/common/common.h +16 -6
- package/src/llama.cpp/common/speculative.cpp +135 -54
- package/src/llama.cpp/common/speculative.h +8 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -2
- package/src/llama.cpp/ggml/include/ggml.h +37 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3196 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +207 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +263 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
- package/src/llama.cpp/include/llama.h +9 -4
- package/src/llama.cpp/src/llama-arch.cpp +105 -0
- package/src/llama.cpp/src/llama-arch.h +12 -0
- package/src/llama.cpp/src/llama-batch.cpp +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +33 -1
- package/src/llama.cpp/src/llama-chat.h +2 -0
- package/src/llama.cpp/src/llama-context.cpp +19 -10
- package/src/llama.cpp/src/llama-context.h +4 -1
- package/src/llama.cpp/src/llama-graph.cpp +175 -148
- package/src/llama.cpp/src/llama-graph.h +60 -23
- package/src/llama.cpp/src/llama-hparams.h +5 -3
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +6 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.h +1 -1
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +2 -1
- package/src/llama.cpp/src/llama-memory-hybrid.h +1 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model-loader.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +949 -75
- package/src/llama.cpp/src/llama-model.h +24 -4
- package/src/llama.cpp/src/llama-quant.cpp +40 -4
- package/src/llama.cpp/src/llama-vocab.cpp +49 -1
- package/src/llama.cpp/src/llama-vocab.h +1 -0
package/src/llama.cpp/src/llama-model.h (+24 -4):

@@ -101,8 +101,10 @@ enum llm_type {
     LLM_TYPE_A13B,
     LLM_TYPE_21B_A3B, // Ernie MoE small
     LLM_TYPE_30B_A3B,
+    LLM_TYPE_106B_A12B, // GLM-4.5-Air
     LLM_TYPE_235B_A22B,
     LLM_TYPE_300B_A47B, // Ernie MoE big
+    LLM_TYPE_355B_A32B, // GLM-4.5
     LLM_TYPE_E2B,
     LLM_TYPE_E4B,
 };
@@ -166,6 +168,15 @@ struct llama_layer_shortconv {
     struct ggml_tensor * out_proj = nullptr;
 };
 
+struct llama_layer_nextn {
+    struct ggml_tensor * eh_proj = nullptr;
+    struct ggml_tensor * embed_tokens = nullptr;
+    struct ggml_tensor * enorm = nullptr;
+    struct ggml_tensor * hnorm = nullptr;
+    struct ggml_tensor * shared_head_head = nullptr;
+    struct ggml_tensor * shared_head_norm = nullptr;
+};
+
 struct llama_layer {
     // normalization
     struct ggml_tensor * attn_norm = nullptr;
@@ -241,10 +252,14 @@ struct llama_layer {
     struct ggml_tensor * ffn_up_enc = nullptr;
 
     // ff MoE
-    struct ggml_tensor * ffn_gate_inp = nullptr;
-    struct ggml_tensor * ffn_gate_exps = nullptr;
-    struct ggml_tensor * ffn_down_exps = nullptr;
-    struct ggml_tensor * ffn_up_exps = nullptr;
+    struct ggml_tensor * ffn_gate_inp = nullptr;
+    struct ggml_tensor * ffn_gate_exps = nullptr;
+    struct ggml_tensor * ffn_down_exps = nullptr;
+    struct ggml_tensor * ffn_up_exps = nullptr;
+    struct ggml_tensor * ffn_gate_inp_b = nullptr;
+    struct ggml_tensor * ffn_gate_exps_b = nullptr;
+    struct ggml_tensor * ffn_down_exps_b = nullptr;
+    struct ggml_tensor * ffn_up_exps_b = nullptr;
 
     // ff shared expert (shexp)
     struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
@@ -349,11 +364,16 @@ struct llama_layer {
     struct ggml_tensor * laurel_r = nullptr;
     struct ggml_tensor * laurel_post_norm = nullptr;
 
+    // openai-moe
+    struct ggml_tensor * attn_sinks = nullptr;
+
     struct llama_layer_posnet posnet;
 
     struct llama_layer_convnext convnext;
 
     struct llama_layer_shortconv shortconv;
+
+    struct llama_layer_nextn nextn;
 };
 
 struct llama_model {
package/src/llama.cpp/src/llama-quant.cpp (+40 -4):

@@ -211,7 +211,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
     const int64_t nx = tensor->ne[0];
     const int64_t qk_k = ggml_blck_size(new_type);
 
-    if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
+    if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
+        new_type = GGML_TYPE_Q8_0;
+    }
+    else if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
         new_type = GGML_TYPE_Q8_0;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
@@ -223,6 +226,14 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                 new_type = GGML_TYPE_Q6_K;
             }
         }
+    } else if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
+        // MoE tensors -> MXFP4
+        // other tensors -> Q8_0
+        if (tensor->ne[2] > 1) {
+            new_type = GGML_TYPE_MXFP4;
+        } else {
+            new_type = GGML_TYPE_Q8_0;
+        }
     } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
         if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
             new_type = qs.params->token_embedding_type;
@@ -533,6 +544,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
         case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
 
+        case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: default_type = GGML_TYPE_MXFP4; break;
+
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K_S:
         case LLAMA_FTYPE_MOSTLY_Q2_K: default_type = GGML_TYPE_Q2_K; break;
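For reference, the new file type is selected through llama.cpp's existing public quantization entry point like any other ftype. The sketch below is not part of the package; it assumes only `llama_model_quantize` and `llama_model_quantize_default_params` from `llama.h` plus the `LLAMA_FTYPE_MOSTLY_MXFP4_MOE` value added above, and the GGUF paths are placeholders.

```cpp
// Hedged sketch: quantize a GGUF model to the MXFP4 MoE file type.
// Paths are placeholders; error handling is minimal.
#include "llama.h"
#include <cstdio>

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_MXFP4_MOE; // MoE expert tensors -> MXFP4, the rest -> Q8_0
    params.nthread = 8;

    // Returns 0 on success.
    if (llama_model_quantize("model-f16.gguf", "model-mxfp4.gguf", &params) != 0) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}
```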
@@ -875,9 +888,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
         // get more optimal quantization type based on the tensor shape, layer, etc.
         if (!params->pure && ggml_is_quantized(default_type)) {
+            int fallback = qs.n_fallback;
             new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
-            // unless the user specifies a type
-            if (params->tensor_types) {
+            // unless the user specifies a type, and the tensor geometry will not require fallback quantisation
+            if (params->tensor_types && qs.n_fallback - fallback == 0) {
                 const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
                 const std::string tensor_name(tensor->name);
                 for (const auto & [tname, qtype] : tensor_types) {
@@ -890,7 +904,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                     }
                 }
             }
-
            if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
                new_type = params->token_embedding_type;
            }
@@ -984,6 +997,29 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                    const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
 
                    new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+
+                   // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
+#if 1
+                   if (new_type == GGML_TYPE_MXFP4) {
+                       auto * x = f32_data_03;
+
+                       //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
+                       std::vector<float> deq(nrows*n_per_row);
+                       const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
+                       qtype->to_float(new_data_03, deq.data(), deq.size());
+
+                       double err = 0.0f;
+                       for (int i = 0; i < (int) deq.size(); ++i) {
+                           err += fabsf(deq[i] - x[i]);
+                           //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
+                           if (deq[i] != x[i]) {
+                               LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
+                           }
+                       }
+                       //LLAMA_LOG_INFO("err = %f\n", err);
+                       GGML_ASSERT(err == 0.00000);
+                   }
+#endif
                }
                LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
            }
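The block above is a temporary in-tree assertion that the MXFP4 encoding of MoE tensors round-trips losslessly. The same pattern can be exercised in isolation; the sketch below is an illustration (not from the package) that assumes ggml's public `ggml_quantize_chunk`, `ggml_row_size`, and `ggml_get_type_traits` helpers, with input values deliberately drawn from the FP4 (E2M1) value set so a lossless implementation should report zero error.

```cpp
// Hedged sketch: quantize one row of floats to MXFP4, dequantize it via the
// ggml type traits, and report the accumulated absolute error.
#include "ggml.h"
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int64_t n_per_row = 256; // multiple of the MXFP4 block size
    const float fp4_vals[8] = {0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f};

    std::vector<float> src(n_per_row);
    for (int64_t i = 0; i < n_per_row; ++i) {
        src[i] = fp4_vals[i % 8];
    }

    // Quantize a single row (no importance matrix).
    std::vector<uint8_t> q(ggml_row_size(GGML_TYPE_MXFP4, n_per_row));
    ggml_quantize_chunk(GGML_TYPE_MXFP4, src.data(), q.data(), 0, 1, n_per_row, nullptr);

    // Dequantize through the same to_float hook the patch uses.
    std::vector<float> deq(n_per_row);
    const ggml_type_traits * traits = ggml_get_type_traits(GGML_TYPE_MXFP4);
    traits->to_float(q.data(), deq.data(), n_per_row);

    double err = 0.0;
    for (int64_t i = 0; i < n_per_row; ++i) {
        err += std::fabs(deq[i] - src[i]);
    }
    printf("total abs error: %f\n", err);
    return err == 0.0 ? 0 : 1;
}
```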
package/src/llama.cpp/src/llama-vocab.cpp (+49 -1):

@@ -307,6 +307,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
+           case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE:
                regex_exprs = {
                    "\\p{N}{1,3}",
                    "[一-龥-ゟ゠-ヿ]+",
@@ -1855,7 +1856,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                tokenizer_pre == "gigachat" ||
                tokenizer_pre == "jina-v2-es" ||
                tokenizer_pre == "jina-v2-de" ||
-               tokenizer_pre == "a.x-4.0") {
+               tokenizer_pre == "a.x-4.0" ||
+               tokenizer_pre == "mellum") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
            } else if (
                tokenizer_pre == "jina-v1-en" ||
@@ -1964,6 +1966,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                tokenizer_pre == "hunyuan") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
                clean_spaces = false;
+           } else if (
+               tokenizer_pre == "hunyuan-dense") {
+               pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE;
+               clean_spaces = false;
            } else if (
                tokenizer_pre == "kimi-k2") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
@@ -2185,6 +2191,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    || t.first == "<|fim▁begin|>" // DeepSeek
                    || t.first == "<PRE>"
                    || t.first == "▁<PRE>" // CodeLlama
+                   || t.first == "<|code_prefix|>" // GLM-4.5
                    ) {
                special_fim_pre_id = t.second;
                if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2204,6 +2211,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    || t.first == "<|fim▁hole|>" // DeepSeek
                    || t.first == "<SUF>"
                    || t.first == "▁<SUF>" // CodeLlama
+                   || t.first == "<|code_suffix|>" // GLM-4.5
                    ) {
                special_fim_suf_id = t.second;
                if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2223,6 +2231,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    || t.first == "<|fim▁end|>" // DeepSeek
                    || t.first == "<MID>"
                    || t.first == "▁<MID>" // CodeLlama
+                   || t.first == "<|code_middle|>" // GLM-4.5
                    ) {
                special_fim_mid_id = t.second;
                if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2305,6 +2314,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    || t.first == "<|eot_id|>"
                    || t.first == "<|im_end|>"
                    || t.first == "<|end|>"
+                   || t.first == "<|return|>" // o200k_harmony
+                   || t.first == "<|call|>" // o200k_harmony
                    || t.first == "<end_of_turn>"
                    || t.first == "<|endoftext|>"
                    || t.first == "<|eom_id|>"
@@ -2328,6 +2339,13 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            }
        }
 
+       // @ngxson : quick hack for gpt-oss, always render these tokens
+       for (const auto & t : token_to_id) {
+           if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>") {
+               id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
+           }
+       }
+
        // sanity checks
        if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eos_id) == 0) {
            special_eog_ids.insert(special_eos_id);
@@ -2343,6 +2361,36 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            special_eog_ids.insert(special_eom_id);
            LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
        }
+
+       // TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
+       // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
+       // we remove the "<|end|>" token from the EOG list
+       {
+           bool has_return = false;
+           bool has_call = false;
+           bool has_end = false;
+
+           llama_token end_id = LLAMA_TOKEN_NULL;
+
+           LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
+           for (auto tid : special_eog_ids) {
+               LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str());
+
+               if (id_to_token[tid].text == "<|return|>") {
+                   has_return = true;
+               } else if (id_to_token[tid].text == "<|call|>") {
+                   has_call = true;
+               } else if (id_to_token[tid].text == "<|end|>") {
+                   has_end = true;
+                   end_id = tid;
+               }
+           }
+
+           if (has_return && has_call && has_end) {
+               special_eog_ids.erase(end_id);
+               LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
+           }
+       }
    }
 
    // build special tokens cache
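Callers do not need to track these token lists themselves: the adjusted EOG set is what the public vocab API reports. A minimal sketch (not from the package), assuming only `llama_model_get_vocab` and `llama_vocab_is_eog` from `llama.h`:

```cpp
// Hedged sketch: decide whether a sampled token should stop generation.
// With the workaround above, "<|end|>" is no longer treated as EOG for
// vocabularies that also define "<|return|>" and "<|call|>".
#include "llama.h"

bool should_stop(const llama_model * model, llama_token tok) {
    const llama_vocab * vocab = llama_model_get_vocab(model);
    return llama_vocab_is_eog(vocab, tok);
}
```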