@fugood/llama.node 1.1.4 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +17 -13
  4. package/src/LlamaCompletionWorker.cpp +2 -0
  5. package/src/LlamaContext.cpp +3 -0
  6. package/src/llama.cpp/common/arg.cpp +80 -10
  7. package/src/llama.cpp/common/chat.cpp +52 -8
  8. package/src/llama.cpp/common/chat.h +7 -2
  9. package/src/llama.cpp/common/common.cpp +1 -0
  10. package/src/llama.cpp/common/common.h +16 -6
  11. package/src/llama.cpp/common/speculative.cpp +135 -54
  12. package/src/llama.cpp/common/speculative.h +8 -1
  13. package/src/llama.cpp/ggml/CMakeLists.txt +4 -2
  14. package/src/llama.cpp/ggml/include/ggml.h +37 -1
  15. package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -1
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3196 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +20 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +207 -9
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -7
  23. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
  24. package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
  25. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +263 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
  28. package/src/llama.cpp/include/llama.h +9 -4
  29. package/src/llama.cpp/src/llama-arch.cpp +105 -0
  30. package/src/llama.cpp/src/llama-arch.h +12 -0
  31. package/src/llama.cpp/src/llama-batch.cpp +1 -1
  32. package/src/llama.cpp/src/llama-chat.cpp +33 -1
  33. package/src/llama.cpp/src/llama-chat.h +2 -0
  34. package/src/llama.cpp/src/llama-context.cpp +19 -10
  35. package/src/llama.cpp/src/llama-context.h +4 -1
  36. package/src/llama.cpp/src/llama-graph.cpp +175 -148
  37. package/src/llama.cpp/src/llama-graph.h +60 -23
  38. package/src/llama.cpp/src/llama-hparams.h +5 -3
  39. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +6 -2
  40. package/src/llama.cpp/src/llama-kv-cache-unified.h +1 -1
  41. package/src/llama.cpp/src/llama-memory-hybrid.cpp +2 -1
  42. package/src/llama.cpp/src/llama-memory-hybrid.h +1 -0
  43. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  44. package/src/llama.cpp/src/llama-model-loader.h +3 -2
  45. package/src/llama.cpp/src/llama-model.cpp +949 -75
  46. package/src/llama.cpp/src/llama-model.h +24 -4
  47. package/src/llama.cpp/src/llama-quant.cpp +40 -4
  48. package/src/llama.cpp/src/llama-vocab.cpp +49 -1
  49. package/src/llama.cpp/src/llama-vocab.h +1 -0
package/src/llama.cpp/src/llama-model.h

@@ -101,8 +101,10 @@ enum llm_type {
     LLM_TYPE_A13B,
     LLM_TYPE_21B_A3B, // Ernie MoE small
     LLM_TYPE_30B_A3B,
+    LLM_TYPE_106B_A12B, // GLM-4.5-Air
     LLM_TYPE_235B_A22B,
     LLM_TYPE_300B_A47B, // Ernie MoE big
+    LLM_TYPE_355B_A32B, // GLM-4.5
     LLM_TYPE_E2B,
     LLM_TYPE_E4B,
 };
@@ -166,6 +168,15 @@ struct llama_layer_shortconv {
     struct ggml_tensor * out_proj = nullptr;
 };
 
+struct llama_layer_nextn {
+    struct ggml_tensor * eh_proj          = nullptr;
+    struct ggml_tensor * embed_tokens     = nullptr;
+    struct ggml_tensor * enorm            = nullptr;
+    struct ggml_tensor * hnorm            = nullptr;
+    struct ggml_tensor * shared_head_head = nullptr;
+    struct ggml_tensor * shared_head_norm = nullptr;
+};
+
 struct llama_layer {
     // normalization
     struct ggml_tensor * attn_norm = nullptr;
@@ -241,10 +252,14 @@ struct llama_layer {
     struct ggml_tensor * ffn_up_enc = nullptr;
 
     // ff MoE
-    struct ggml_tensor * ffn_gate_inp  = nullptr;
-    struct ggml_tensor * ffn_gate_exps = nullptr;
-    struct ggml_tensor * ffn_down_exps = nullptr;
-    struct ggml_tensor * ffn_up_exps   = nullptr;
+    struct ggml_tensor * ffn_gate_inp    = nullptr;
+    struct ggml_tensor * ffn_gate_exps   = nullptr;
+    struct ggml_tensor * ffn_down_exps   = nullptr;
+    struct ggml_tensor * ffn_up_exps     = nullptr;
+    struct ggml_tensor * ffn_gate_inp_b  = nullptr;
+    struct ggml_tensor * ffn_gate_exps_b = nullptr;
+    struct ggml_tensor * ffn_down_exps_b = nullptr;
+    struct ggml_tensor * ffn_up_exps_b   = nullptr;
 
     // ff shared expert (shexp)
     struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
@@ -349,11 +364,16 @@ struct llama_layer {
     struct ggml_tensor * laurel_r         = nullptr;
     struct ggml_tensor * laurel_post_norm = nullptr;
 
+    // openai-moe
+    struct ggml_tensor * attn_sinks = nullptr;
+
     struct llama_layer_posnet posnet;
 
     struct llama_layer_convnext convnext;
 
     struct llama_layer_shortconv shortconv;
+
+    struct llama_layer_nextn nextn;
 };
 
 struct llama_model {
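
Note: the new attn_sinks tensor is tagged for openai-moe (gpt-oss). As a rough, standalone illustration of the attention-sink idea it appears to support (an extra learned logit per head that absorbs probability mass during softmax), and not the actual llama.cpp implementation:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Toy sketch only: softmax over one head's attention scores with an extra
    // learned "sink" logit appended. The sink contributes to the denominator and
    // its share of the mass is then discarded, so the remaining weights may sum
    // to less than 1, letting the head effectively attend to "nothing".
    std::vector<float> softmax_with_sink(const std::vector<float> & scores, float sink_logit) {
        float max_v = sink_logit;
        for (float s : scores) max_v = std::max(max_v, s);

        double denom = std::exp(double(sink_logit) - max_v);
        for (float s : scores) denom += std::exp(double(s) - max_v);

        std::vector<float> out(scores.size());
        for (size_t i = 0; i < scores.size(); ++i) {
            out[i] = float(std::exp(double(scores[i]) - max_v) / denom);
        }
        return out;
    }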
package/src/llama.cpp/src/llama-quant.cpp

@@ -211,7 +211,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
     const int64_t nx = tensor->ne[0];
     const int64_t qk_k = ggml_blck_size(new_type);
 
-    if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
+    if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
+        new_type = GGML_TYPE_Q8_0;
+    }
+    else if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
         new_type = GGML_TYPE_Q8_0;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
@@ -223,6 +226,14 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                 new_type = GGML_TYPE_Q6_K;
             }
         }
+    } else if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
+        // MoE tensors -> MXFP4
+        // other tensors -> Q8_0
+        if (tensor->ne[2] > 1) {
+            new_type = GGML_TYPE_MXFP4;
+        } else {
+            new_type = GGML_TYPE_Q8_0;
+        }
     } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
         if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
             new_type = qs.params->token_embedding_type;
@@ -533,6 +544,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
         case LLAMA_FTYPE_ALL_F32:     default_type = GGML_TYPE_F32;  break;
 
+        case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: default_type = GGML_TYPE_MXFP4; break;
+
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K_S:
         case LLAMA_FTYPE_MOSTLY_Q2_K: default_type = GGML_TYPE_Q2_K; break;
@@ -875,9 +888,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
         // get more optimal quantization type based on the tensor shape, layer, etc.
         if (!params->pure && ggml_is_quantized(default_type)) {
+            int fallback = qs.n_fallback;
             new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
-            // unless the user specifies a type
-            if (params->tensor_types) {
+            // unless the user specifies a type, and the tensor geometry will not require fallback quantisation
+            if (params->tensor_types && qs.n_fallback - fallback == 0) {
                 const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
                 const std::string tensor_name(tensor->name);
                 for (const auto & [tname, qtype] : tensor_types) {
@@ -890,7 +904,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                 }
             }
         }
-
         if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
             new_type = params->token_embedding_type;
         }
@@ -984,6 +997,29 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                 const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
 
                 new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+
+                // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
+#if 1
+                if (new_type == GGML_TYPE_MXFP4) {
+                    auto * x = f32_data_03;
+
+                    //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
+                    std::vector<float> deq(nrows*n_per_row);
+                    const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
+                    qtype->to_float(new_data_03, deq.data(), deq.size());
+
+                    double err = 0.0f;
+                    for (int i = 0; i < (int) deq.size(); ++i) {
+                        err += fabsf(deq[i] - x[i]);
+                        //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
+                        if (deq[i] != x[i]) {
+                            LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
+                        }
+                    }
+                    //LLAMA_LOG_INFO("err = %f\n", err);
+                    GGML_ASSERT(err == 0.00000);
+                }
+#endif
             }
             LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
         }
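
Note: the MXFP4_MOE branches above boil down to a simple split: stacked per-expert tensors (3D, ne[2] > 1) are quantized to MXFP4, and everything else falls back to Q8_0. A minimal, hypothetical sketch of that selection rule (toy enum and helper name, not the real ggml types or llama.cpp API):

    #include <cstdint>
    #include <cstdio>

    // Toy stand-ins for GGML_TYPE_Q8_0 / GGML_TYPE_MXFP4.
    enum toy_type { TOY_Q8_0, TOY_MXFP4 };

    // Mirrors the MXFP4_MOE rule in llama_tensor_get_type above: stacked expert
    // tensors carry one slice per expert in the third dimension, so ne[2] > 1
    // selects MXFP4; all other tensors stay at Q8_0.
    static toy_type pick_mxfp4_moe_type(const int64_t ne[3]) {
        return ne[2] > 1 ? TOY_MXFP4 : TOY_Q8_0;
    }

    int main() {
        const int64_t expert_ffn[3] = {2880, 2880, 32}; // hypothetical: 32 stacked experts
        const int64_t attn_proj[3]  = {2880, 2880, 1};  // ordinary 2D weight
        std::printf("experts -> %s\n", pick_mxfp4_moe_type(expert_ffn) == TOY_MXFP4 ? "MXFP4" : "Q8_0");
        std::printf("attn    -> %s\n", pick_mxfp4_moe_type(attn_proj)  == TOY_MXFP4 ? "MXFP4" : "Q8_0");
        return 0;
    }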
package/src/llama.cpp/src/llama-vocab.cpp

@@ -307,6 +307,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                 };
                 break;
             case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
+            case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE:
                 regex_exprs = {
                     "\\p{N}{1,3}",
                     "[一-龥぀-ゟ゠-ヿ]+",
@@ -1855,7 +1856,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "gigachat"   ||
                 tokenizer_pre == "jina-v2-es" ||
                 tokenizer_pre == "jina-v2-de" ||
-                tokenizer_pre == "a.x-4.0") {
+                tokenizer_pre == "a.x-4.0" ||
+                tokenizer_pre == "mellum") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
         } else if (
                 tokenizer_pre == "jina-v1-en" ||
@@ -1964,6 +1966,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "hunyuan") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
             clean_spaces = false;
+        } else if (
+                tokenizer_pre == "hunyuan-dense") {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE;
+            clean_spaces = false;
         } else if (
                 tokenizer_pre == "kimi-k2") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
@@ -2185,6 +2191,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|fim▁begin|>" // DeepSeek
                     || t.first == "<PRE>"
                     || t.first == "▁<PRE>" // CodeLlama
+                    || t.first == "<|code_prefix|>" // GLM-4.5
                     ) {
                 special_fim_pre_id = t.second;
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2204,6 +2211,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|fim▁hole|>" // DeepSeek
                     || t.first == "<SUF>"
                     || t.first == "▁<SUF>" // CodeLlama
+                    || t.first == "<|code_suffix|>" // GLM-4.5
                     ) {
                 special_fim_suf_id = t.second;
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2223,6 +2231,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|fim▁end|>" // DeepSeek
                     || t.first == "<MID>"
                     || t.first == "▁<MID>" // CodeLlama
+                    || t.first == "<|code_middle|>" // GLM-4.5
                     ) {
                 special_fim_mid_id = t.second;
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2305,6 +2314,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|eot_id|>"
                     || t.first == "<|im_end|>"
                     || t.first == "<|end|>"
+                    || t.first == "<|return|>" // o200k_harmony
+                    || t.first == "<|call|>"   // o200k_harmony
                     || t.first == "<end_of_turn>"
                     || t.first == "<|endoftext|>"
                     || t.first == "<|eom_id|>"
@@ -2328,6 +2339,13 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             }
         }
 
+        // @ngxson : quick hack for gpt-oss, always render these tokens
+        for (const auto & t : token_to_id) {
+            if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>") {
+                id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
+            }
+        }
+
         // sanity checks
         if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eos_id) == 0) {
             special_eog_ids.insert(special_eos_id);
@@ -2343,6 +2361,36 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             special_eog_ids.insert(special_eom_id);
             LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
         }
+
+        // TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
+        // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
+        // we remove the "<|end|>" token from the EOG list
+        {
+            bool has_return = false;
+            bool has_call   = false;
+            bool has_end    = false;
+
+            llama_token end_id = LLAMA_TOKEN_NULL;
+
+            LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
+            for (auto tid : special_eog_ids) {
+                LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str());
+
+                if (id_to_token[tid].text == "<|return|>") {
+                    has_return = true;
+                } else if (id_to_token[tid].text == "<|call|>") {
+                    has_call = true;
+                } else if (id_to_token[tid].text == "<|end|>") {
+                    has_end = true;
+                    end_id = tid;
+                }
+            }
+
+            if (has_return && has_call && has_end) {
+                special_eog_ids.erase(end_id);
+                LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
+            }
+        }
     }
 
     // build special tokens cache
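
Note: the o200k_harmony workaround above reduces to a set adjustment: when the end-of-generation set already contains both <|return|> and <|call|>, <|end|> is treated as an ordinary end-of-message marker and dropped from the set. A minimal standalone sketch of that rule (the types and helper name are illustrative, not the llama.cpp API):

    #include <set>
    #include <string>
    #include <unordered_map>

    // Hypothetical helper mirroring the EOG-list adjustment above: if both
    // "<|return|>" and "<|call|>" are end-of-generation tokens, "<|end|>" is
    // demoted to a plain turn separator and removed from the EOG set.
    void adjust_harmony_eog(std::set<int> & eog_ids,
                            const std::unordered_map<int, std::string> & id_to_text) {
        int end_id = -1;
        bool has_return = false, has_call = false;

        for (int id : eog_ids) {
            const auto it = id_to_text.find(id);
            if (it == id_to_text.end()) continue;
            if      (it->second == "<|return|>") has_return = true;
            else if (it->second == "<|call|>")   has_call = true;
            else if (it->second == "<|end|>")    end_id = id;
        }

        if (has_return && has_call && end_id != -1) {
            eog_ids.erase(end_id); // "<|end|>" still ends a message, but not the whole generation
        }
    }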
package/src/llama.cpp/src/llama-vocab.h

@@ -46,6 +46,7 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_SEED_CODER    = 35,
     LLAMA_VOCAB_PRE_TYPE_HUNYUAN       = 36,
     LLAMA_VOCAB_PRE_TYPE_KIMI_K2       = 37,
+    LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
 };
 
 struct LLM_KV;