@fugood/llama.node 1.1.5 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +17 -13
  3. package/src/LlamaCompletionWorker.cpp +2 -0
  4. package/src/llama.cpp/common/arg.cpp +28 -11
  5. package/src/llama.cpp/common/chat.cpp +46 -2
  6. package/src/llama.cpp/common/chat.h +7 -2
  7. package/src/llama.cpp/common/common.h +3 -2
  8. package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
  9. package/src/llama.cpp/ggml/include/ggml.h +37 -1
  10. package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -1
  11. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
  12. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
  13. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +6 -0
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +207 -9
  16. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -7
  17. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
  20. package/src/llama.cpp/include/llama.h +1 -0
  21. package/src/llama.cpp/src/llama-arch.cpp +65 -0
  22. package/src/llama.cpp/src/llama-arch.h +10 -0
  23. package/src/llama.cpp/src/llama-chat.cpp +13 -0
  24. package/src/llama.cpp/src/llama-chat.h +1 -0
  25. package/src/llama.cpp/src/llama-context.cpp +8 -8
  26. package/src/llama.cpp/src/llama-graph.cpp +118 -9
  27. package/src/llama.cpp/src/llama-graph.h +38 -0
  28. package/src/llama.cpp/src/llama-hparams.h +5 -3
  29. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +4 -0
  30. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  31. package/src/llama.cpp/src/llama-model-loader.h +3 -2
  32. package/src/llama.cpp/src/llama-model.cpp +499 -4
  33. package/src/llama.cpp/src/llama-model.h +24 -4
  34. package/src/llama.cpp/src/llama-quant.cpp +37 -1
  35. package/src/llama.cpp/src/llama-vocab.cpp +42 -0
package/src/llama.cpp/src/llama-quant.cpp
@@ -211,7 +211,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             const int64_t nx = tensor->ne[0];
             const int64_t qk_k = ggml_blck_size(new_type);
 
-            if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
+                new_type = GGML_TYPE_Q8_0;
+            }
+            else if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
                 new_type = GGML_TYPE_Q8_0;
             }
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
@@ -223,6 +226,14 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                 new_type = GGML_TYPE_Q6_K;
             }
         }
+    } else if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
+        // MoE tensors -> MXFP4
+        // other tensors -> Q8_0
+        if (tensor->ne[2] > 1) {
+            new_type = GGML_TYPE_MXFP4;
+        } else {
+            new_type = GGML_TYPE_Q8_0;
+        }
     } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
         if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
             new_type = qs.params->token_embedding_type;
@@ -533,6 +544,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
         case LLAMA_FTYPE_ALL_F32:     default_type = GGML_TYPE_F32;  break;
 
+        case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: default_type = GGML_TYPE_MXFP4; break;
+
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K_S:
         case LLAMA_FTYPE_MOSTLY_Q2_K:    default_type = GGML_TYPE_Q2_K;    break;
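
Note: the new file type is selected through the existing quantization API rather than a new entry point. A minimal sketch of driving it from user code, assuming the LLAMA_FTYPE_MOSTLY_MXFP4_MOE enum value referenced in the hunks above is exported in this package's llama.h (the input/output paths are placeholders):

// sketch, not part of the diff
#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 3) {
        fprintf(stderr, "usage: %s <input.gguf> <output.gguf>\n", argv[0]);
        return 1;
    }

    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_MXFP4_MOE; // MoE tensors -> MXFP4, everything else -> Q8_0
    params.nthread = 4;

    // returns 0 on success
    const uint32_t rc = llama_model_quantize(argv[1], argv[2], &params);
    return rc == 0 ? 0 : 1;
}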
@@ -984,6 +997,29 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                 const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
 
                 new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+
+                // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
+                #if 1
+                if (new_type == GGML_TYPE_MXFP4) {
+                    auto * x = f32_data_03;
+
+                    //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
+                    std::vector<float> deq(nrows*n_per_row);
+                    const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
+                    qtype->to_float(new_data_03, deq.data(), deq.size());
+
+                    double err = 0.0f;
+                    for (int i = 0; i < (int) deq.size(); ++i) {
+                        err += fabsf(deq[i] - x[i]);
+                        //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
+                        if (deq[i] != x[i]) {
+                            LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
+                        }
+                    }
+                    //LLAMA_LOG_INFO("err = %f\n", err);
+                    GGML_ASSERT(err == 0.00000);
+                }
+                #endif
             }
 
             LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
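
The temporary sanity check above reaches into quantizer internals. The same round-trip idea can be expressed with public ggml entry points only (ggml_quantize_chunk, ggml_get_type_traits, ggml_row_size). The helper below is an illustrative sketch, not code from the diff, and it assumes GGML_TYPE_MXFP4 is available in this ggml snapshot:

// sketch: quantize -> dequantize -> mean absolute error, for any ggml type
#include "ggml.h"
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

static double round_trip_mae(ggml_type type, const std::vector<float> & src, int64_t nrows, int64_t n_per_row) {
    // n_per_row must be a multiple of the type's block size (32 for MXFP4)
    std::vector<uint8_t> quant(ggml_row_size(type, n_per_row) * nrows);
    ggml_quantize_chunk(type, src.data(), quant.data(), /*start=*/0, nrows, n_per_row, /*imatrix=*/nullptr);

    std::vector<float> deq(src.size());
    ggml_get_type_traits(type)->to_float(quant.data(), deq.data(), (int64_t) deq.size());

    double err = 0.0;
    for (size_t i = 0; i < src.size(); ++i) {
        err += std::fabs((double) deq[i] - (double) src[i]);
    }
    return err / (double) src.size();
}

int main() {
    const int64_t nrows = 4, n_per_row = 256;
    std::vector<float> data(nrows * n_per_row);
    for (size_t i = 0; i < data.size(); ++i) {
        data[i] = 0.25f * (float)(i % 16) - 2.0f; // values representable in a few mantissa bits
    }
    printf("MXFP4 round-trip MAE: %g\n", round_trip_mae(GGML_TYPE_MXFP4, data, nrows, n_per_row));
    return 0;
}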
package/src/llama.cpp/src/llama-vocab.cpp
@@ -2191,6 +2191,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<|fim▁begin|>" // DeepSeek
                         || t.first == "<PRE>"
                         || t.first == "▁<PRE>" // CodeLlama
+                        || t.first == "<|code_prefix|>" // GLM-4.5
                         ) {
                     special_fim_pre_id = t.second;
                     if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2210,6 +2211,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<|fim▁hole|>" // DeepSeek
                         || t.first == "<SUF>"
                         || t.first == "▁<SUF>" // CodeLlama
+                        || t.first == "<|code_suffix|>" // GLM-4.5
                         ) {
                     special_fim_suf_id = t.second;
                     if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2229,6 +2231,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<|fim▁end|>" // DeepSeek
                         || t.first == "<MID>"
                         || t.first == "▁<MID>" // CodeLlama
+                        || t.first == "<|code_middle|>" // GLM-4.5
                         ) {
                     special_fim_mid_id = t.second;
                     if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
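
The three hunks above only teach the vocab loader to recognize the GLM-4.5 spellings of the FIM markers; consumers keep using the llama_vocab_fim_pre/suf/mid getters from llama.h. A hedged sketch of assembling a prefix-suffix-middle prompt with them (build_fim_prompt is an illustrative name, and production code should also check each id against LLAMA_TOKEN_NULL):

// sketch, not part of the diff
#include "llama.h"
#include <string>
#include <vector>

static std::vector<llama_token> build_fim_prompt(
        const llama_vocab * vocab, const std::string & prefix, const std::string & suffix) {
    auto tokenize = [&](const std::string & text) {
        std::vector<llama_token> out(text.size() + 8);
        const int32_t n = llama_tokenize(vocab, text.c_str(), (int32_t) text.size(),
                                         out.data(), (int32_t) out.size(),
                                         /*add_special=*/false, /*parse_special=*/false);
        out.resize(n > 0 ? n : 0);
        return out;
    };

    std::vector<llama_token> tokens;
    tokens.push_back(llama_vocab_fim_pre(vocab)); // e.g. <|code_prefix|> for GLM-4.5
    const auto pre = tokenize(prefix);
    tokens.insert(tokens.end(), pre.begin(), pre.end());
    tokens.push_back(llama_vocab_fim_suf(vocab)); // e.g. <|code_suffix|>
    const auto suf = tokenize(suffix);
    tokens.insert(tokens.end(), suf.begin(), suf.end());
    tokens.push_back(llama_vocab_fim_mid(vocab)); // e.g. <|code_middle|>
    return tokens;
}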
@@ -2311,6 +2314,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|eot_id|>"
                     || t.first == "<|im_end|>"
                     || t.first == "<|end|>"
+                    || t.first == "<|return|>" // o200k_harmony
+                    || t.first == "<|call|>"   // o200k_harmony
                     || t.first == "<end_of_turn>"
                     || t.first == "<|endoftext|>"
                     || t.first == "<|eom_id|>"
@@ -2334,6 +2339,13 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             }
         }
 
+        // @ngxson : quick hack for gpt-oss, always render these tokens
+        for (const auto & t : token_to_id) {
+            if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>") {
+                id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
+            }
+        }
+
         // sanity checks
         if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eos_id) == 0) {
             special_eog_ids.insert(special_eos_id);
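
The attribute override above matters because the detokenizer renders LLAMA_TOKEN_ATTR_USER_DEFINED tokens even when special-token rendering is disabled, which is what the gpt-oss channel markers need in order to survive into the chat parser. An illustrative sketch (the helper name piece is not from the diff):

// sketch, not part of the diff
#include "llama.h"
#include <string>

static std::string piece(const llama_vocab * vocab, llama_token tok, bool render_special) {
    char buf[256];
    const int32_t n = llama_token_to_piece(vocab, tok, buf, (int32_t) sizeof(buf), /*lstrip=*/0, render_special);
    return n > 0 ? std::string(buf, n) : std::string();
}

// With this patch, piece(vocab, channel_tok, /*render_special=*/false) yields "<|channel|>"
// for a gpt-oss vocab, where a plain control token would otherwise be suppressed.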
@@ -2349,6 +2361,36 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             special_eog_ids.insert(special_eom_id);
             LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
         }
+
+        // TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
+        // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
+        // we remove the "<|end|>" token from the EOG list
+        {
+            bool has_return = false;
+            bool has_call   = false;
+            bool has_end    = false;
+
+            llama_token end_id = LLAMA_TOKEN_NULL;
+
+            LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
+            for (auto tid : special_eog_ids) {
+                LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str());
+
+                if (id_to_token[tid].text == "<|return|>") {
+                    has_return = true;
+                } else if (id_to_token[tid].text == "<|call|>") {
+                    has_call = true;
+                } else if (id_to_token[tid].text == "<|end|>") {
+                    has_end = true;
+                    end_id = tid;
+                }
+            }
+
+            if (has_return && has_call && has_end) {
+                special_eog_ids.erase(end_id);
+                LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
+            }
+        }
     }
 
     // build special tokens cache
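
Downstream, this EOG bookkeeping is consumed through llama_vocab_is_eog(). With the workaround above, a standard sampling loop over a gpt-oss (o200k_harmony) vocabulary keeps generating through "<|end|>" and only stops at "<|return|>" or "<|call|>". A minimal illustrative loop (names are not from the diff):

// sketch, not part of the diff
#include "llama.h"

static void generate(llama_context * ctx, const llama_vocab * vocab, llama_sampler * smpl, int n_max) {
    for (int i = 0; i < n_max; ++i) {
        llama_token tok = llama_sampler_sample(smpl, ctx, -1);
        if (llama_vocab_is_eog(vocab, tok)) {
            break; // stops on <|return|> / <|call|>, but no longer on <|end|>, for o200k_harmony vocabs
        }
        // ... stream the rendered piece, then feed the token back
        llama_batch batch = llama_batch_get_one(&tok, 1);
        llama_decode(ctx, batch);
    }
}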