@fugood/llama.node 1.1.4 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +17 -13
- package/src/LlamaCompletionWorker.cpp +2 -0
- package/src/LlamaContext.cpp +3 -0
- package/src/llama.cpp/common/arg.cpp +80 -10
- package/src/llama.cpp/common/chat.cpp +52 -8
- package/src/llama.cpp/common/chat.h +7 -2
- package/src/llama.cpp/common/common.cpp +1 -0
- package/src/llama.cpp/common/common.h +16 -6
- package/src/llama.cpp/common/speculative.cpp +135 -54
- package/src/llama.cpp/common/speculative.h +8 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -2
- package/src/llama.cpp/ggml/include/ggml.h +37 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3196 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +207 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +263 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
- package/src/llama.cpp/include/llama.h +9 -4
- package/src/llama.cpp/src/llama-arch.cpp +105 -0
- package/src/llama.cpp/src/llama-arch.h +12 -0
- package/src/llama.cpp/src/llama-batch.cpp +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +33 -1
- package/src/llama.cpp/src/llama-chat.h +2 -0
- package/src/llama.cpp/src/llama-context.cpp +19 -10
- package/src/llama.cpp/src/llama-context.h +4 -1
- package/src/llama.cpp/src/llama-graph.cpp +175 -148
- package/src/llama.cpp/src/llama-graph.h +60 -23
- package/src/llama.cpp/src/llama-hparams.h +5 -3
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +6 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.h +1 -1
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +2 -1
- package/src/llama.cpp/src/llama-memory-hybrid.h +1 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model-loader.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +949 -75
- package/src/llama.cpp/src/llama-model.h +24 -4
- package/src/llama.cpp/src/llama-quant.cpp +40 -4
- package/src/llama.cpp/src/llama-vocab.cpp +49 -1
- package/src/llama.cpp/src/llama-vocab.h +1 -0
package/src/llama.cpp/src/llama-batch.cpp

@@ -59,7 +59,7 @@ bool llama_batch_allocr::init(
     for (int32_t i = 0; i < batch.n_tokens; ++i) {
         for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
             if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= (llama_seq_id) n_seq_max)) {
-                LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d
+                LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d >= %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max);
                 return false;
             }
         }
package/src/llama.cpp/src/llama-chat.cpp

@@ -66,6 +66,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
     { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
     { "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
+    { "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
+    { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
     { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
 };

@@ -193,6 +195,10 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_DOTS1;
     } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
         return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
+    } else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
+        return LLM_CHAT_TEMPLATE_OPENAI_MOE;
+    } else if (tmpl_contains("<|hy_place▁holder▁no▁2|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
+        return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
     } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
         return LLM_CHAT_TEMPLATE_KIMI_K2;
     }

@@ -698,11 +704,37 @@ int32_t llm_chat_apply_template(
             if (role == "system") {
                 ss << "<|startoftext|>" << message->content << "<|extra_4|>";
             } else if (role == "assistant") {
-                ss <<
+                ss << message->content << "<|eos|>";
             } else {
                 ss << "<|startoftext|>" << message->content << "<|extra_0|>";
             }
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_OPENAI_MOE) {
+        // OpenAI MoE (based on Harmony chat template)
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|start|>" << role << "<|message|>" << message->content;
+            ss << (role == "assistant" ? "<|return|>" : "<|end|>");
+        }
+        if (add_ass) {
+            ss << "<|start|>assistant";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_DENSE) {
+        // tencent/Hunyuan-4B-Instruct
+        for (size_t i = 0; i < chat.size(); i++) {
+            std::string role(chat[i]->role);
+            if (i == 0) {
+                if (role == "system") {
+                    ss << chat[i]->content << "<|hy_place▁holder▁no▁3|>";
+                }
+            }
+
+            if (role == "assistant") {
+                ss << "<|hy_Assistant|>" << chat[i]->content << "<|hy_place▁holder▁no▁2|>";
+            } else if (role == "user") {
+                ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>";
+            }
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
         // moonshotai/Kimi-K2-Instruct
         for (auto message : chat) {
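For reference, the prompt string produced by the new LLM_CHAT_TEMPLATE_OPENAI_MOE branch can be seen with a small standalone sketch that mirrors its loop. The simplified `msg` struct and the hard-coded messages are illustrative stand-ins, not llama.cpp types:

```cpp
// Standalone sketch mirroring the OPENAI_MOE (Harmony-style) branch added above.
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

struct msg { std::string role, content; };  // stand-in for llama chat messages

int main() {
    std::vector<msg> chat = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!" },
    };
    const bool add_ass = true;  // append the assistant prefix, as add_ass does in the diff

    std::ostringstream ss;
    for (const auto & m : chat) {
        ss << "<|start|>" << m.role << "<|message|>" << m.content;
        ss << (m.role == "assistant" ? "<|return|>" : "<|end|>");
    }
    if (add_ass) {
        ss << "<|start|>assistant";
    }

    // Prints:
    // <|start|>system<|message|>You are a helpful assistant.<|end|><|start|>user<|message|>Hello!<|end|><|start|>assistant
    std::cout << ss.str() << std::endl;
    return 0;
}
```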
package/src/llama.cpp/src/llama-context.cpp

@@ -105,7 +105,7 @@ llama_context::llama_context(

     {
         const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) :
+        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : supports_set_rows;

         if (!supports_set_rows && !cparams.kv_unified) {
             LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);

@@ -113,6 +113,15 @@ llama_context::llama_context(
         }
     }

+    {
+        const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
+        graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;
+
+        if (graph_reuse_disable) {
+            LLAMA_LOG_WARN("%s: graph reuse disabled\n", __func__);
+        }
+    }
+
     const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;

     LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
@@ -716,7 +725,7 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
     // in order to correctly reuse a graph, it's full topology has to be uniquely determined by these parameters
     const auto gparams = graph_params(res, ubatch, mctx, gtype);

-    if (res->can_reuse(gparams)) {
+    if (!graph_reuse_disable && res->can_reuse(gparams)) {
         //LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);

         n_reused++;

@@ -777,7 +786,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
     const auto & hparams = model.hparams;

     const int64_t n_embd = hparams.n_embd;
-    const
+    const int64_t n_vocab = model.vocab.n_tokens();

     // note: during encode, we always pass the full sequence starting from pos = 0
     if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {

@@ -950,7 +959,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     const auto & vocab = model.vocab;
     const auto & hparams = model.hparams;

-    const
+    const int64_t n_vocab = vocab.n_tokens();
     const int64_t n_embd = hparams.n_embd;

     // when computing embeddings, all tokens are output

@@ -1319,21 +1328,21 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 }

 void llama_context::output_reorder() {
-    const
+    const uint64_t n_vocab = model.vocab.n_tokens();
     const uint64_t n_embd = model.hparams.n_embd;

-    for (
-    const
-    const
+    for (size_t s = 0; s < output_swaps.size(); ++s) {
+        const uint64_t i0 = output_swaps[s].i0;
+        const uint64_t i1 = output_swaps[s].i1;

         if (logits_size > 0) {
-            for (
+            for (uint64_t k = 0; k < n_vocab; k++) {
                 std::swap(logits[i0*n_vocab + k], logits[i1*n_vocab + k]);
             }
         }

         if (embd_size > 0) {
-            for (
+            for (uint64_t k = 0; k < n_embd; k++) {
                 std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
             }
         }
package/src/llama.cpp/src/llama-context.h

@@ -289,7 +289,10 @@ private:

     // env: LLAMA_SET_ROWS (temporary)
     // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-    bool supports_set_rows =
+    bool supports_set_rows = true;
+
+    // env: LLAMA_GRAPH_REUSE_DISABLE
+    bool graph_reuse_disable = false;

     // perf
     mutable int64_t t_start_us = 0;
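Both LLAMA_SET_ROWS and the new LLAMA_GRAPH_REUSE_DISABLE follow the same getenv pattern shown in the llama-context.cpp hunks: the environment variable overrides the member's compiled-in default only when it is actually set. A minimal standalone sketch of that idiom follows; the variable name and the printf output are illustrative, not part of llama.cpp:

```cpp
// Sketch of the getenv toggle idiom used for LLAMA_GRAPH_REUSE_DISABLE above.
#include <cstdio>
#include <cstdlib>

int main() {
    // Default mirrors the new declaration in llama-context.h above.
    bool graph_reuse_disable = false;

    // Override only when the variable is present, e.g. LLAMA_GRAPH_REUSE_DISABLE=1.
    const char * env = std::getenv("LLAMA_GRAPH_REUSE_DISABLE");
    graph_reuse_disable = env ? (std::atoi(env) != 0) : graph_reuse_disable;

    std::printf("graph reuse %s\n", graph_reuse_disable ? "disabled" : "enabled");
    return 0;
}
```

Run with LLAMA_GRAPH_REUSE_DISABLE=1 in the environment, this prints "graph reuse disabled", matching the warning the llama_context constructor logs when the flag is active.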