@fugood/llama.node 1.1.4 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +17 -13
  4. package/src/LlamaCompletionWorker.cpp +2 -0
  5. package/src/LlamaContext.cpp +3 -0
  6. package/src/llama.cpp/common/arg.cpp +80 -10
  7. package/src/llama.cpp/common/chat.cpp +52 -8
  8. package/src/llama.cpp/common/chat.h +7 -2
  9. package/src/llama.cpp/common/common.cpp +1 -0
  10. package/src/llama.cpp/common/common.h +16 -6
  11. package/src/llama.cpp/common/speculative.cpp +135 -54
  12. package/src/llama.cpp/common/speculative.h +8 -1
  13. package/src/llama.cpp/ggml/CMakeLists.txt +4 -2
  14. package/src/llama.cpp/ggml/include/ggml.h +37 -1
  15. package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -1
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3196 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +20 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +207 -9
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -7
  23. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
  24. package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
  25. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +263 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
  28. package/src/llama.cpp/include/llama.h +9 -4
  29. package/src/llama.cpp/src/llama-arch.cpp +105 -0
  30. package/src/llama.cpp/src/llama-arch.h +12 -0
  31. package/src/llama.cpp/src/llama-batch.cpp +1 -1
  32. package/src/llama.cpp/src/llama-chat.cpp +33 -1
  33. package/src/llama.cpp/src/llama-chat.h +2 -0
  34. package/src/llama.cpp/src/llama-context.cpp +19 -10
  35. package/src/llama.cpp/src/llama-context.h +4 -1
  36. package/src/llama.cpp/src/llama-graph.cpp +175 -148
  37. package/src/llama.cpp/src/llama-graph.h +60 -23
  38. package/src/llama.cpp/src/llama-hparams.h +5 -3
  39. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +6 -2
  40. package/src/llama.cpp/src/llama-kv-cache-unified.h +1 -1
  41. package/src/llama.cpp/src/llama-memory-hybrid.cpp +2 -1
  42. package/src/llama.cpp/src/llama-memory-hybrid.h +1 -0
  43. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  44. package/src/llama.cpp/src/llama-model-loader.h +3 -2
  45. package/src/llama.cpp/src/llama-model.cpp +949 -75
  46. package/src/llama.cpp/src/llama-model.h +24 -4
  47. package/src/llama.cpp/src/llama-quant.cpp +40 -4
  48. package/src/llama.cpp/src/llama-vocab.cpp +49 -1
  49. package/src/llama.cpp/src/llama-vocab.h +1 -0

--- a/package/src/llama.cpp/src/llama-batch.cpp
+++ b/package/src/llama.cpp/src/llama-batch.cpp
@@ -59,7 +59,7 @@ bool llama_batch_allocr::init(
     for (int32_t i = 0; i < batch.n_tokens; ++i) {
         for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
             if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= (llama_seq_id) n_seq_max)) {
-                LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max);
+                LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d >= %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max);
                 return false;
             }
         }
--- a/package/src/llama.cpp/src/llama-chat.cpp
+++ b/package/src/llama.cpp/src/llama-chat.cpp
@@ -66,6 +66,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
     { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
     { "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
+    { "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
+    { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
     { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
 };
 
@@ -193,6 +195,10 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_DOTS1;
     } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
         return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
+    } else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
+        return LLM_CHAT_TEMPLATE_OPENAI_MOE;
+    } else if (tmpl_contains("<|hy_place▁holder▁no▁2|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
+        return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
     } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
         return LLM_CHAT_TEMPLATE_KIMI_K2;
     }
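
The two templates added above are registered by name and also detected from marker tokens in the template text. As a quick way to confirm the registration, here is a minimal sketch (not part of this diff) that lists the built-in template names through the public llama.h API; after this change the output should include gpt-oss and hunyuan-dense:

    #include <cstdio>
    #include <vector>
    #include "llama.h"

    int main() {
        // a first call with no output buffer returns the number of built-in templates
        const int32_t n = llama_chat_builtin_templates(nullptr, 0);

        std::vector<const char *> names(n);
        llama_chat_builtin_templates(names.data(), names.size());

        for (const char * name : names) {
            printf("%s\n", name); // e.g. "gpt-oss", "hunyuan-dense", "kimi-k2", ...
        }
        return 0;
    }
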
@@ -698,11 +704,37 @@ int32_t llm_chat_apply_template(
             if (role == "system") {
                 ss << "<|startoftext|>" << message->content << "<|extra_4|>";
             } else if (role == "assistant") {
-                ss << "<|startoftext|>" << message->content << "<|eos|>";
+                ss << message->content << "<|eos|>";
             } else {
                 ss << "<|startoftext|>" << message->content << "<|extra_0|>";
             }
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_OPENAI_MOE) {
+        // OpenAI MoE (based on Harmony chat template)
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|start|>" << role << "<|message|>" << message->content;
+            ss << (role == "assistant" ? "<|return|>" : "<|end|>");
+        }
+        if (add_ass) {
+            ss << "<|start|>assistant";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_DENSE) {
+        // tencent/Hunyuan-4B-Instruct
+        for (size_t i = 0; i < chat.size(); i++) {
+            std::string role(chat[i]->role);
+            if (i == 0) {
+                if (role == "system") {
+                    ss << chat[i]->content << "<|hy_place▁holder▁no▁3|>";
+                }
+            }
+
+            if (role == "assistant") {
+                ss << "<|hy_Assistant|>" << chat[i]->content << "<|hy_place▁holder▁no▁2|>";
+            } else if (role == "user") {
+                ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>";
+            }
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
         // moonshotai/Kimi-K2-Instruct
         for (auto message : chat) {
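
To see what the new OPENAI_MOE branch produces, here is a minimal sketch (not part of this diff) that renders a two-message chat with the gpt-oss template via llama_chat_apply_template; the messages and buffer size are illustrative:

    #include <cstdio>
    #include <string>
    #include <vector>
    #include "llama.h"

    int main() {
        std::vector<llama_chat_message> msgs = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello!" },
        };

        std::string buf(1024, '\0');
        const int32_t n = llama_chat_apply_template(
            "gpt-oss", msgs.data(), msgs.size(), /*add_ass=*/true,
            buf.data(), (int32_t) buf.size());

        if (n > 0 && n <= (int32_t) buf.size()) {
            buf.resize(n);
            // per the branch above, this is a single string of the form:
            //   <|start|>system<|message|>You are a helpful assistant.<|end|>
            //   <|start|>user<|message|>Hello!<|end|><|start|>assistant
            printf("%s\n", buf.c_str());
        }
        return 0;
    }

Per the added code, assistant turns end with <|return|> rather than <|end|>, and add_ass appends the bare <|start|>assistant prefix for generation.
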
--- a/package/src/llama.cpp/src/llama-chat.h
+++ b/package/src/llama.cpp/src/llama-chat.h
@@ -46,6 +46,8 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_SMOLVLM,
     LLM_CHAT_TEMPLATE_DOTS1,
     LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
+    LLM_CHAT_TEMPLATE_OPENAI_MOE,
+    LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
     LLM_CHAT_TEMPLATE_KIMI_K2,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
--- a/package/src/llama.cpp/src/llama-context.cpp
+++ b/package/src/llama.cpp/src/llama-context.cpp
@@ -105,7 +105,7 @@ llama_context::llama_context(
 
     {
         const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
+        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : supports_set_rows;
 
         if (!supports_set_rows && !cparams.kv_unified) {
             LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
@@ -113,6 +113,15 @@ llama_context::llama_context(
         }
     }
 
+    {
+        const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
+        graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;
+
+        if (graph_reuse_disable) {
+            LLAMA_LOG_WARN("%s: graph reuse disabled\n", __func__);
+        }
+    }
+
     const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
 
     LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
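
Both of these environment variables are read once, in the llama_context constructor, so they must be in place before the context is created. A minimal sketch (not part of this diff; the model path is illustrative) of disabling graph reuse from application code:

    #include <cstdlib>
    #include "llama.h"

    int main() {
        // must happen before the context is created; use _putenv_s on Windows
        setenv("LLAMA_GRAPH_REUSE_DISABLE", "1", /*overwrite=*/1);

        llama_backend_init();

        llama_model_params   mparams = llama_model_default_params();
        llama_context_params cparams = llama_context_default_params();

        llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (model == nullptr) {
            return 1;
        }

        llama_context * ctx = llama_init_from_model(model, cparams);
        // with the flag set, process_ubatch() skips res->can_reuse(gparams)
        // and rebuilds the compute graph for every ubatch (see the next hunk)

        llama_free(ctx);
        llama_model_free(model);
        llama_backend_free();
        return 0;
    }
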
@@ -716,7 +725,7 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
     // in order to correctly reuse a graph, it's full topology has to be uniquely determined by these parameters
     const auto gparams = graph_params(res, ubatch, mctx, gtype);
 
-    if (res->can_reuse(gparams)) {
+    if (!graph_reuse_disable && res->can_reuse(gparams)) {
         //LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
 
         n_reused++;
@@ -777,7 +786,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
     const auto & hparams = model.hparams;
 
     const int64_t n_embd = hparams.n_embd;
-    const int32_t n_vocab = model.vocab.n_tokens();
+    const int64_t n_vocab = model.vocab.n_tokens();
 
     // note: during encode, we always pass the full sequence starting from pos = 0
     if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
@@ -950,7 +959,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     const auto & vocab = model.vocab;
     const auto & hparams = model.hparams;
 
-    const int32_t n_vocab = vocab.n_tokens();
+    const int64_t n_vocab = vocab.n_tokens();
     const int64_t n_embd = hparams.n_embd;
 
     // when computing embeddings, all tokens are output
@@ -1319,21 +1328,21 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 }
 
 void llama_context::output_reorder() {
-    const uint32_t n_vocab = model.vocab.n_tokens();
+    const uint64_t n_vocab = model.vocab.n_tokens();
     const uint64_t n_embd = model.hparams.n_embd;
 
-    for (uint32_t s = 0; s < output_swaps.size(); ++s) {
-        const uint32_t i0 = output_swaps[s].i0;
-        const uint32_t i1 = output_swaps[s].i1;
+    for (size_t s = 0; s < output_swaps.size(); ++s) {
+        const uint64_t i0 = output_swaps[s].i0;
+        const uint64_t i1 = output_swaps[s].i1;
 
         if (logits_size > 0) {
-            for (uint32_t k = 0; k < n_vocab; k++) {
+            for (uint64_t k = 0; k < n_vocab; k++) {
                 std::swap(logits[i0*n_vocab + k], logits[i1*n_vocab + k]);
             }
         }
 
         if (embd_size > 0) {
-            for (uint32_t k = 0; k < n_embd; k++) {
+            for (uint64_t k = 0; k < n_embd; k++) {
                 std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
             }
         }
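
The 32-bit to 64-bit widenings in this and the two preceding hunks guard the offset arithmetic on the logits and embeddings buffers, which is indexed as i*n_vocab + k. With hypothetical but plausible figures, a 262 144-entry vocabulary and 20 000 output rows put the last row at offset 262 144 × 20 000 ≈ 5.2 × 10^9, beyond both the signed (≈ 2.1 × 10^9) and unsigned (≈ 4.3 × 10^9) 32-bit ranges, so computing it with 32-bit operands would wrap before the result reached the wider destination type.
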
--- a/package/src/llama.cpp/src/llama-context.h
+++ b/package/src/llama.cpp/src/llama-context.h
@@ -289,7 +289,10 @@ private:
 
     // env: LLAMA_SET_ROWS (temporary)
     // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-    bool supports_set_rows = false;
+    bool supports_set_rows = true;
+
+    // env: LLAMA_GRAPH_REUSE_DISABLE
+    bool graph_reuse_disable = false;
 
     // perf
     mutable int64_t t_start_us = 0;