@fugood/llama.node 1.1.5 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +17 -13
- package/src/LlamaCompletionWorker.cpp +2 -0
- package/src/llama.cpp/common/arg.cpp +28 -11
- package/src/llama.cpp/common/chat.cpp +46 -2
- package/src/llama.cpp/common/chat.h +7 -2
- package/src/llama.cpp/common/common.h +3 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
- package/src/llama.cpp/ggml/include/ggml.h +37 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +207 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
- package/src/llama.cpp/include/llama.h +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +65 -0
- package/src/llama.cpp/src/llama-arch.h +10 -0
- package/src/llama.cpp/src/llama-chat.cpp +13 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +8 -8
- package/src/llama.cpp/src/llama-graph.cpp +118 -9
- package/src/llama.cpp/src/llama-graph.h +38 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -3
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +4 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model-loader.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +499 -4
- package/src/llama.cpp/src/llama-model.h +24 -4
- package/src/llama.cpp/src/llama-quant.cpp +37 -1
- package/src/llama.cpp/src/llama-vocab.cpp +42 -0
package/src/llama.cpp/src/llama-quant.cpp
@@ -211,7 +211,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             const int64_t nx = tensor->ne[0];
             const int64_t qk_k = ggml_blck_size(new_type);
 
-            if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
+                new_type = GGML_TYPE_Q8_0;
+            }
+            else if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
                 new_type = GGML_TYPE_Q8_0;
             }
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
@@ -223,6 +226,14 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                 new_type = GGML_TYPE_Q6_K;
             }
         }
+    } else if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
+        // MoE tensors -> MXFP4
+        // other tensors -> Q8_0
+        if (tensor->ne[2] > 1) {
+            new_type = GGML_TYPE_MXFP4;
+        } else {
+            new_type = GGML_TYPE_Q8_0;
+        }
     } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
         if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
             new_type = qs.params->token_embedding_type;
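The hunk above gives the new LLAMA_FTYPE_MOSTLY_MXFP4_MOE file type a very small selection rule: tensors stacked along the expert dimension keep MXFP4, everything else falls back to Q8_0. A condensed, hedged restatement of that rule as a standalone helper; the function name is illustrative and does not exist in the package, and it relies only on ggml_tensor and the ggml type enums:

    #include "ggml.h"

    // Illustrative helper (not part of llama.cpp): mirrors the rule added above.
    // MoE expert weights are stored as 3D tensors stacked along ne[2], so
    // ne[2] > 1 identifies them; they stay in MXFP4 while all other tensors
    // are kept at Q8_0.
    static ggml_type mxfp4_moe_pick_type(const ggml_tensor * t) {
        return t->ne[2] > 1 ? GGML_TYPE_MXFP4 : GGML_TYPE_Q8_0;
    }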
@@ -533,6 +544,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
         case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
 
+        case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: default_type = GGML_TYPE_MXFP4; break;
+
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K_S:
         case LLAMA_FTYPE_MOSTLY_Q2_K: default_type = GGML_TYPE_Q2_K; break;
@@ -984,6 +997,29 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
 
             new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+
+            // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
+#if 1
+            if (new_type == GGML_TYPE_MXFP4) {
+                auto * x = f32_data_03;
+
+                //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
+                std::vector<float> deq(nrows*n_per_row);
+                const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
+                qtype->to_float(new_data_03, deq.data(), deq.size());
+
+                double err = 0.0f;
+                for (int i = 0; i < (int) deq.size(); ++i) {
+                    err += fabsf(deq[i] - x[i]);
+                    //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
+                    if (deq[i] != x[i]) {
+                        LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
+                    }
+                }
+                //LLAMA_LOG_INFO("err = %f\n", err);
+                GGML_ASSERT(err == 0.00000);
+            }
+#endif
         }
         LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
     }
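The block added above is a temporary sanity check: for gpt-oss weights that are natively MXFP4, dequantizing the freshly quantized data must reproduce the source floats exactly. A minimal standalone sketch of the same round trip using only public ggml calls; the buffer sizes and test pattern are assumptions for illustration, and for arbitrary inputs the error is non-zero (it is exactly zero only when the source values already lie on the MXFP4 grid):

    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    #include "ggml.h"

    int main() {
        // Illustrative sizes: one row, a multiple of the MXFP4 block size (32).
        const int64_t nrows     = 1;
        const int64_t n_per_row = 256;

        std::vector<float> src(nrows * n_per_row);
        for (size_t i = 0; i < src.size(); ++i) {
            src[i] = 0.25f * (float)(i % 16) - 2.0f;   // arbitrary test pattern
        }

        // quantize to MXFP4, then dequantize back via the type traits
        std::vector<uint8_t> q(nrows * ggml_row_size(GGML_TYPE_MXFP4, n_per_row));
        ggml_quantize_chunk(GGML_TYPE_MXFP4, src.data(), q.data(), 0, nrows, n_per_row, nullptr);

        std::vector<float> deq(src.size());
        const ggml_type_traits * qtype = ggml_get_type_traits(GGML_TYPE_MXFP4);
        qtype->to_float(q.data(), deq.data(), (int64_t) deq.size());

        double err = 0.0;
        for (size_t i = 0; i < src.size(); ++i) {
            err += fabs(deq[i] - src[i]);
        }
        printf("total abs round-trip error = %f\n", err);
        return 0;
    }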
package/src/llama.cpp/src/llama-vocab.cpp
@@ -2191,6 +2191,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|fim▁begin|>" // DeepSeek
                     || t.first == "<PRE>"
                     || t.first == "▁<PRE>" // CodeLlama
+                    || t.first == "<|code_prefix|>" // GLM-4.5
                     ) {
                 special_fim_pre_id = t.second;
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2210,6 +2211,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|fim▁hole|>" // DeepSeek
                     || t.first == "<SUF>"
                     || t.first == "▁<SUF>" // CodeLlama
+                    || t.first == "<|code_suffix|>" // GLM-4.5
                     ) {
                 special_fim_suf_id = t.second;
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2229,6 +2231,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|fim▁end|>" // DeepSeek
                     || t.first == "<MID>"
                     || t.first == "▁<MID>" // CodeLlama
+                    || t.first == "<|code_middle|>" // GLM-4.5
                     ) {
                 special_fim_mid_id = t.second;
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2311,6 +2314,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|eot_id|>"
                     || t.first == "<|im_end|>"
                     || t.first == "<|end|>"
+                    || t.first == "<|return|>" // o200k_harmony
+                    || t.first == "<|call|>" // o200k_harmony
                     || t.first == "<end_of_turn>"
                     || t.first == "<|endoftext|>"
                     || t.first == "<|eom_id|>"
@@ -2334,6 +2339,13 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             }
         }
 
+        // @ngxson : quick hack for gpt-oss, always render these tokens
+        for (const auto & t : token_to_id) {
+            if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>") {
+                id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
+            }
+        }
+
         // sanity checks
         if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eos_id) == 0) {
             special_eog_ids.insert(special_eos_id);
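The attribute flip above matters for detokenization: tokens marked LLAMA_TOKEN_ATTR_USER_DEFINED are rendered even when special-token output is disabled, so downstream harmony parsers can see the channel markers in the text stream. A hedged sketch of that effect using the public llama.h API; the vocab pointer and token id are assumed to come from an already loaded gpt-oss model:

    #include <cstdio>

    #include "llama.h"

    // Illustrative only: 'vocab' and 'channel_tok' are assumed to come from a
    // loaded gpt-oss model. With the USER_DEFINED attribute set by the hack
    // above, the piece is returned even though special rendering is off.
    static void print_channel_marker(const llama_vocab * vocab, llama_token channel_tok) {
        char buf[64];
        const int32_t n = llama_token_to_piece(vocab, channel_tok, buf, sizeof(buf), /*lstrip=*/0, /*special=*/false);
        if (n > 0) {
            printf("%.*s\n", n, buf);   // expected to print "<|channel|>"
        }
    }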
@@ -2349,6 +2361,36 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             special_eog_ids.insert(special_eom_id);
             LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
         }
+
+        // TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
+        // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
+        // we remove the "<|end|>" token from the EOG list
+        {
+            bool has_return = false;
+            bool has_call = false;
+            bool has_end = false;
+
+            llama_token end_id = LLAMA_TOKEN_NULL;
+
+            LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
+            for (auto tid : special_eog_ids) {
+                LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str());
+
+                if (id_to_token[tid].text == "<|return|>") {
+                    has_return = true;
+                } else if (id_to_token[tid].text == "<|call|>") {
+                    has_call = true;
+                } else if (id_to_token[tid].text == "<|end|>") {
+                    has_end = true;
+                    end_id = tid;
+                }
+            }
+
+            if (has_return && has_call && has_end) {
+                special_eog_ids.erase(end_id);
+                LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
+            }
+        }
     }
 
     // build special tokens cache
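With <|return|> and <|call|> in the EOG set and <|end|> removed from it, a gpt-oss generation loop keeps producing tokens across <|end|>-delimited channel messages and only stops at a true end of response or a tool call. A minimal, hedged sketch of the stop condition a caller would use; it relies only on the public llama_vocab_is_eog call, and the helper name is illustrative:

    #include "llama.h"

    // Illustrative helper (not from the package): the generation loop stops on
    // any token the vocab marks as EOG. After the workaround above this means
    // <|return|> or <|call|> for gpt-oss, but not the per-message <|end|>.
    static bool generation_should_stop(const llama_vocab * vocab, llama_token tok) {
        return llama_vocab_is_eog(vocab, tok);
    }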