@fugood/llama.node 1.4.10 → 1.4.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +30 -30
- package/src/LlamaContext.cpp +1 -1
- package/src/llama.cpp/common/arg.cpp +29 -14
- package/src/llama.cpp/common/arg.h +1 -0
- package/src/llama.cpp/common/chat-parser.cpp +11 -0
- package/src/llama.cpp/common/chat.cpp +32 -3
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +23 -23
- package/src/llama.cpp/common/common.h +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
- package/src/llama.cpp/include/llama.h +13 -4
- package/src/llama.cpp/src/CMakeLists.txt +4 -0
- package/src/llama.cpp/src/llama-adapter.cpp +12 -3
- package/src/llama.cpp/src/llama-adapter.h +7 -1
- package/src/llama.cpp/src/llama-arch.cpp +76 -0
- package/src/llama.cpp/src/llama-arch.h +7 -0
- package/src/llama.cpp/src/llama-chat.cpp +11 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +22 -21
- package/src/llama.cpp/src/llama-hparams.h +4 -3
- package/src/llama.cpp/src/llama-kv-cache.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +287 -16
- package/src/llama.cpp/src/llama-model.h +13 -2
- package/src/llama.cpp/src/llama-sampling.cpp +44 -33
- package/src/llama.cpp/src/llama-sampling.h +3 -0
- package/src/llama.cpp/src/llama-vocab.cpp +101 -33
- package/src/llama.cpp/src/llama-vocab.h +2 -0
- package/src/llama.cpp/src/llama.cpp +52 -37
- package/src/llama.cpp/src/models/bert.cpp +4 -2
- package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
- package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
- package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
- package/src/llama.cpp/src/models/gemma3.cpp +3 -4
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
- package/src/llama.cpp/src/models/llama.cpp +19 -6
- package/src/llama.cpp/src/models/maincoder.cpp +117 -0
- package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
- package/src/llama.cpp/src/models/models.h +18 -0
- package/src/llama.cpp/src/models/modern-bert.cpp +115 -0
- package/src/llama.cpp/src/models/plamo3.cpp +128 -0
- package/src/llama.cpp/src/unicode.cpp +23 -14
package/src/llama.cpp/src/llama-sampling.cpp

@@ -421,39 +421,6 @@ void llama_sampler_free(struct llama_sampler * smpl) {
     delete smpl;
 }

-llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) {
-    const auto * logits = llama_get_logits_ith(ctx, idx);
-
-    const llama_model * model = llama_get_model(ctx);
-    const llama_vocab * vocab = llama_model_get_vocab(model);
-
-    const int n_vocab = llama_vocab_n_tokens(vocab);
-
-    // TODO: do not allocate each time
-    std::vector<llama_token_data> cur;
-    cur.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-    }
-
-    llama_token_data_array cur_p = {
-        /* .data = */ cur.data(),
-        /* .size = */ cur.size(),
-        /* .selected = */ -1,
-        /* .sorted = */ false,
-    };
-
-    llama_sampler_apply(smpl, &cur_p);
-
-    GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size);
-
-    auto token = cur_p.data[cur_p.selected].id;
-
-    llama_sampler_accept(smpl, token);
-
-    return token;
-}
-
 // sampler chain

 static const char * llama_sampler_chain_name(const struct llama_sampler * /*smpl*/) {

@@ -527,12 +494,56 @@ struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_param
         /* .ctx = */ new llama_sampler_chain {
             /* .params = */ params,
             /* .samplers = */ {},
+            /* .cur = */ {},
             /* .t_sample_us = */ 0,
             /* .n_sample = */ 0,
         }
     );
 }

+llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) {
+    const auto * logits = llama_get_logits_ith(ctx, idx);
+
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    const int n_vocab = llama_vocab_n_tokens(vocab);
+
+    // use pre-allocated buffer from chain if available, otherwise allocate locally
+    std::vector<llama_token_data> * cur_ptr;
+    std::vector<llama_token_data> cur_local;
+
+    if (smpl->iface == &llama_sampler_chain_i) {
+        auto * chain = (llama_sampler_chain *) smpl->ctx;
+        cur_ptr = &chain->cur;
+    } else {
+        cur_ptr = &cur_local;
+    }
+
+    auto & cur = *cur_ptr;
+    cur.resize(n_vocab);
+    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+        cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+    }
+
+    llama_token_data_array cur_p = {
+        /* .data = */ cur.data(),
+        /* .size = */ cur.size(),
+        /* .selected = */ -1,
+        /* .sorted = */ false,
+    };
+
+    llama_sampler_apply(smpl, &cur_p);
+
+    GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size);
+
+    auto token = cur_p.data[cur_p.selected].id;
+
+    llama_sampler_accept(smpl, token);
+
+    return token;
+}
+
 void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) {
     auto * p = (llama_sampler_chain *) chain->ctx;
     p->samplers.push_back(smpl);
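Note: the re-added implementation above reuses the sampler chain's pre-allocated `cur` buffer instead of rebuilding the candidate vector on every call. A minimal usage sketch against the public llama.h sampler API (the sampler choices and the `-1` index are illustrative, not taken from this package):

    // build the chain once and keep it for the whole generation;
    // the reused candidate buffer lives inside the chain context
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

    // inside the decode loop, after each llama_decode(ctx, batch):
    llama_token tok = llama_sampler_sample(chain, ctx, -1); // -1 = logits of the last output

    // once generation is finished:
    llama_sampler_free(chain);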
package/src/llama.cpp/src/llama-vocab.cpp

@@ -314,6 +314,12 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_YOUTU:
+                regex_exprs = {
+                    "[가-힣ㄱ-ㆎ]+|[!…“”‘’—:;,、-〿︰-﹏]+|[ㄅ-ㄯ]+|[一-龥-ゟ゠-ヿ]+",
+                    "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
                 regex_exprs = {
                     "[\r\n]",

@@ -355,6 +361,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
             case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
             case LLAMA_VOCAB_PRE_TYPE_QWEN2:
             case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
+            case LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN:
                 regex_exprs = {
                     // original regex from tokenizer.json
                     // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"

@@ -1860,6 +1867,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "deepseek-v3") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
             clean_spaces = false;
+        } else if (
+                tokenizer_pre == "youtu") {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_YOUTU;
+            clean_spaces = false;
+            ignore_merges = true;
         } else if (
                 tokenizer_pre == "falcon") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;

@@ -1878,7 +1890,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "jina-v2-es" ||
                 tokenizer_pre == "jina-v2-de" ||
                 tokenizer_pre == "a.x-4.0" ||
-                tokenizer_pre == "mellum") {
+                tokenizer_pre == "mellum" ||
+                tokenizer_pre == "modern-bert" ) {
             pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
         } else if (
                 tokenizer_pre == "jina-v1-en" ||

@@ -2014,6 +2027,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "minimax-m2") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
             clean_spaces = false;
+        } else if (
+                tokenizer_pre == "solar-open") {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
+            clean_spaces = false;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }

@@ -2186,6 +2203,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         // for now, we apply this workaround to find the tokens based on their text

         for (const auto & t : token_to_id) {
+            auto & attr = id_to_token[t.second].attr;
+
             // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
             if (special_eot_id == LLAMA_TOKEN_NULL) {
                 if (false

@@ -2201,10 +2220,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<end_of_utterance>" // smoldocling
                    ) {
                     special_eot_id = t.second;
-                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.second, t.first.c_str());
-                        id_to_token[t.second].attr = (llama_token_attr) (id_to_token[t.second].attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
                     }
                 }
             }

@@ -2215,10 +2234,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<|eom_id|>"
                    ) {
                     special_eom_id = t.second;
-                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.second, t.first.c_str());
-                        id_to_token[t.second].attr = (llama_token_attr) (id_to_token[t.second].attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
                     }
                 }
             }

@@ -2235,10 +2254,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<|code_prefix|>" // GLM-4.5
                    ) {
                     special_fim_pre_id = t.second;
-                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.second, t.first.c_str());
-                        id_to_token[t.second].attr = (llama_token_attr) (id_to_token[t.second].attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
                     }
                 }
             }

@@ -2255,10 +2274,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<|code_suffix|>" // GLM-4.5
                    ) {
                     special_fim_suf_id = t.second;
-                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.second, t.first.c_str());
-                        id_to_token[t.second].attr = (llama_token_attr) (id_to_token[t.second].attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
                     }
                 }
             }

@@ -2275,10 +2294,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<|code_middle|>" // GLM-4.5
                    ) {
                     special_fim_mid_id = t.second;
-                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.second, t.first.c_str());
-                        id_to_token[t.second].attr = (llama_token_attr) (id_to_token[t.second].attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
                     }
                 }
             }

@@ -2292,10 +2311,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<PAD>"
                    ) {
                     special_fim_pad_id = t.second;
-                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.second, t.first.c_str());
-                        id_to_token[t.second].attr = (llama_token_attr) (id_to_token[t.second].attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
                     }
                 }
             }

@@ -2310,6 +2329,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<reponame>" // Granite
                    ) {
                     special_fim_rep_id = t.second;
-                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.second, t.first.c_str());
-                        id_to_token[t.second].attr = (llama_token_attr) (id_to_token[t.second].attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
                     }
                 }
             }

@@ -2324,15 +2343,41 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<|file_sep|>" // Qwen
                    ) {
                     special_fim_sep_id = t.second;
-                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.second, t.first.c_str());
-                        id_to_token[t.second].attr = (llama_token_attr) (id_to_token[t.second].attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
                     }
                 }
             }
         }

+        // auto-detect unused tokens: e.g. control tokens with the word "unused"
+        // ideally, these tokens should be marked as unused during conversion
+        {
+            uint32_t n_unused = 0;
+
+            for (const auto & t : token_to_id) {
+                auto & attr = id_to_token[t.second].attr;
+
+                if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    continue;
+                }
+
+                if ((attr & LLAMA_TOKEN_ATTR_UNUSED) == 0) {
+                    if (strstr(t.first.c_str(), "unused") != NULL) {
+                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_UNUSED);
+                    }
+                }
+
+                if (attr & LLAMA_TOKEN_ATTR_UNUSED) {
+                    n_unused++;
+                }
+            }
+
+            LLAMA_LOG_INFO("%s: %u unused tokens\n", __func__, n_unused);
+        }
+
         // maintain a list of tokens that cause end-of-generation
         // this is currently determined based on the token text, which is obviously not ideal
         // ref: https://github.com/ggerganov/llama.cpp/issues/9606

@@ -2351,12 +2396,16 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         }

         for (const auto & t : token_to_id) {
+            auto & attr = id_to_token[t.second].attr;
+
             if (false
                     || t.first == "<|eot_id|>"
                     || t.first == "<|im_end|>"
                     || t.first == "<|end|>"
                     || t.first == "<|return|>" // o200k_harmony
                     || t.first == "<|call|>" // o200k_harmony
+                    || t.first == "<|flush|>" // solar-open
+                    || t.first == "<|calls|>" // solar-open
                     || t.first == "<end_of_turn>"
                     || t.first == "<|endoftext|>"
                     || t.first == "<|eom_id|>"

@@ -2366,24 +2415,28 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<end_of_utterance>" // smoldocling
                ) {
                 special_eog_ids.insert(t.second);
-                if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                     LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                         __func__, t.second, t.first.c_str());
-                    id_to_token[t.second].attr = (llama_token_attr) (id_to_token[t.second].attr | LLAMA_TOKEN_ATTR_CONTROL);
+                    attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
                 }
             } else {
-
-
-
-
+                if (attr & LLAMA_TOKEN_ATTR_CONTROL && !(attr & LLAMA_TOKEN_ATTR_UNUSED)) {
+                    // token is control, but not marked as EOG -> print a debug log
+                    if (special_eog_ids.count(t.second) == 0) {
+                        LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
+                            __func__, t.second, t.first.c_str());
+                    }
+                }
             }
         }
     }

         // @ngxson : quick hack for gpt-oss, always render these tokens
         for (const auto & t : token_to_id) {
+            auto & attr = id_to_token[t.second].attr;
+
             if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
-                id_to_token[t.second].attr = (llama_token_attr) (id_to_token[t.second].attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
+                attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
             }
         }

@@ -2403,34 +2456,42 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
         }

-        // TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
-        // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
+        // TODO: workaround for o200k_harmony and solar-open tokenizer: the "<|end|>" token should not be EOG
+        // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens ("<|calls|>" and "<|flush|>" for solar-open),
        // we remove the "<|end|>" token from the EOG list
        {
            bool has_return = false;
            bool has_call = false;
            bool has_end = false;
+           bool has_flush = false;

            llama_token end_id = LLAMA_TOKEN_NULL;

            LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
            for (auto tid : special_eog_ids) {
-
+               auto & text = id_to_token[tid].text;
+
+               LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, text.c_str());

-               if (
+               if (text == "<|return|>") {
                    has_return = true;
-               } else if (
+               } else if (text == "<|call|>" || text == "<|calls|>") {
                    has_call = true;
-               } else if (
+               } else if (text == "<|flush|>") {
+                   has_flush = true;
+               } else if (text == "<|end|>") {
                    has_end = true;
                    end_id = tid;
                }
            }

-           if (has_return && has_call && has_end) {
+           if ((has_return && has_call && has_end) || (has_call && has_flush && has_end)) {
                special_eog_ids.erase(end_id);
-
-
+
+               auto & attr = id_to_token[end_id].attr;
+               attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
+
+               LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
            }
        }
    }

@@ -2528,6 +2589,13 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
                 _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
             }
+        } else if (_contains_any(model_name, {"modern-bert"})) {
+            if (token_to_id.count("[MASK]") == 0 ) {
+                LLAMA_LOG_WARN("%s: Mask token missing in vocab!\n", __func__);
+            }
+            else {
+                _set_token_attr("[MASK]", LLAMA_TOKEN_ATTR_LSTRIP, true);
+            }
         }
     }
 }
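The end-of-generation bookkeeping above (including the new solar-open `<|calls|>`/`<|flush|>` entries) is what the public llama_vocab_is_eog() check draws from. A short, illustrative sketch of how a caller typically consumes it; `model`, `ctx`, and `chain` are assumed to already exist:

    const llama_vocab * vocab = llama_model_get_vocab(model);

    llama_token tok = llama_sampler_sample(chain, ctx, -1);
    if (llama_vocab_is_eog(vocab, tok)) {
        // stop decoding: tok is one of the end-of-generation tokens collected above,
        // e.g. <|eot_id|>, <|im_end|>, or the solar-open <|calls|>/<|flush|> pair
    }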
package/src/llama.cpp/src/llama.cpp

@@ -140,6 +140,10 @@ enum layer_fraction_t {
 };
 // this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue

+class llama_params_fit_exception : public std::runtime_error {
+    using std::runtime_error::runtime_error;
+};
+
 static void llama_params_fit_impl(
         const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
         float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,

@@ -181,12 +185,11 @@ static void llama_params_fit_impl(
         }
     }

-    int64_t
+    int64_t sum_free = 0;
     int64_t sum_projected_free = 0;
     int64_t min_projected_free = INT64_MAX;
     int64_t sum_projected_used = 0;
     int64_t sum_projected_model = 0;
-    int64_t sum_projected_ctx = 0;

     if (nd > 1) {
         LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);

@@ -197,12 +200,11 @@ static void llama_params_fit_impl(
         const int64_t projected_used = dmd.mb.total();
         const int64_t projected_free = dmd.free - projected_used;

-
+        sum_free += dmd.free;
         sum_projected_used += projected_used;
         sum_projected_free += projected_free;
         min_projected_free = std::min(min_projected_free, projected_free);
         sum_projected_model += dmd.mb.model;
-        sum_projected_ctx += dmd.mb.context;

         if (nd > 1) {
             LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",

@@ -210,10 +212,9 @@ static void llama_params_fit_impl(
                 projected_free >= 0 ? "surplus" : "deficit");
         }
     }
-    assert(
-    assert(sum_projected_used >= sum_projected_ctx);
+    assert(sum_free >= 0 && sum_projected_used >= 0);
     LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
-        __func__, sum_projected_used/MiB,
+        __func__, sum_projected_used/MiB, sum_free/MiB);
     if (min_projected_free >= margin) {
         if (nd == 1) {
             LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",

@@ -236,9 +237,7 @@ static void llama_params_fit_impl(
         __func__, margin/MiB, -global_surplus/MiB);
     if (cparams->n_ctx == 0) {
         if (hp_nct > n_ctx_min) {
-
-
-            int64_t memory_reduction = -global_surplus;
+            int64_t sum_used_target = sum_free - nd*margin_s;
             if (nd > 1) {
                 // for multiple devices we need to be more conservative in terms of how much context we think can fit:
                 // - for dense models only whole layers can be assigned to devices

@@ -246,24 +245,34 @@ static void llama_params_fit_impl(
                 // - on average we expect a waste of 0.5 layers/tensors per device
                 // - use slightly more than the expected average for nd devices to be safe
                 const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
-
+                sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
             }

-
-            cparams->n_ctx =
-
-
-
-
-
-
-
-
+            int64_t sum_projected_used_min_ctx = 0;
+            cparams->n_ctx = n_ctx_min;
+            const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
+            for (const auto & dmd : dmds_min_ctx) {
+                sum_projected_used_min_ctx += dmd.mb.total();
+            }
+            if (sum_used_target > sum_projected_used_min_ctx) {
+                // linear interpolation between minimum and maximum context size:
+                cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
+                    / (sum_projected_used - sum_projected_used_min_ctx);
+                cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
+
+                const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
+                const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
+                LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
+                    __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
                 if (nd == 1) {
                     LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
                     return;
                 }
                 LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
+            } else {
+                const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
+                LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
+                    __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
             }
         } else {
             LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
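To make the interpolation above concrete, a worked example with made-up numbers (illustrative only, not measurements from this package): with n_ctx_min = 4096, a model default hp_nct = 131072, a projected use of 9 GiB at the minimum context, 17 GiB at the full context, and sum_used_target = 13 GiB, the interpolation gives 4096 + (131072 - 4096) * (13 - 9) / (17 - 9) = 4096 + 63488 = 67584; since 67584 is already a multiple of 256, the CUDA rounding step leaves it unchanged.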
@@ -276,28 +285,28 @@ static void llama_params_fit_impl(
     }

     if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
-        throw
+        throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
     }
     if (nd > 1) {
         if (!tensor_split) {
-            throw
+            throw llama_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
         }
         if (mparams->tensor_split) {
             for (size_t id = 0; id < nd; id++) {
                 if (mparams->tensor_split[id] != 0.0f) {
-                    throw
+                    throw llama_params_fit_exception("model_params::tensor_split already set by user, abort");
                 }
             }
         }
         if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            throw
+            throw llama_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
         }
     }
     if (!tensor_buft_overrides) {
-        throw
+        throw llama_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
     }
     if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
-        throw
+        throw llama_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
     }

     // step 3: iteratively fill the back to front with "dense" layers

@@ -380,8 +389,8 @@ static void llama_params_fit_impl(
                 tensor_buft_overrides[itbo].buft = nullptr;
                 itbo++;
                 mparams.tensor_buft_overrides = tensor_buft_overrides;
-                throw
-                    + std::to_string(ntbo) + " is insufficient for model
+                throw llama_params_fit_exception("llama_max_tensor_buft_overrides() == "
+                    + std::to_string(ntbo) + " is insufficient for model");
             }
             tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
             tensor_buft_overrides[itbo].buft = overflow_bufts[id];

@@ -503,6 +512,9 @@ static void llama_params_fit_impl(
             if (mem_high[id] > targets[id]) {
                 assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
                 uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
+                if (hp_nex > 0 && size_t(id) == nd - 1) {
+                    delta--;
+                }
                 LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
                 while (delta > 1) {
                     uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);

@@ -638,7 +650,7 @@ static void llama_params_fit_impl(
             ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
             LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
             std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
-            if (mem_test[id] < targets[id]) {
+            if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                 ngl_per_device = ngl_per_device_test;
                 mem = mem_test;
                 id_dense_start = id_dense_start_test;

@@ -648,7 +660,7 @@ static void llama_params_fit_impl(
             ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
             LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
             mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
-            if (mem_test[id] < targets[id]) {
+            if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                 ngl_per_device = ngl_per_device_test;
                 mem = mem_test;
                 id_dense_start = id_dense_start_test;

@@ -659,7 +671,7 @@ static void llama_params_fit_impl(
             ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
             LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
             mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
-            if (mem_test[id] < targets[id]) {
+            if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                 ngl_per_device = ngl_per_device_test;
                 mem = mem_test;
                 id_dense_start = id_dense_start_test;

@@ -678,22 +690,25 @@ static void llama_params_fit_impl(
     set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
 }

-
+enum llama_params_fit_status llama_params_fit(
         const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
         float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
         size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
     const int64_t t0_us = llama_time_us();
-
+    llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
     try {
         llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
         LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
-    } catch (const
+    } catch (const llama_params_fit_exception & e) {
         LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
-
+        status = LLAMA_PARAMS_FIT_STATUS_FAILURE;
+    } catch (const std::runtime_error & e) {
+        LLAMA_LOG_ERROR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
+        status = LLAMA_PARAMS_FIT_STATUS_ERROR;
     }
     const int64_t t1_us = llama_time_us();
     LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
-    return
+    return status;
 }

 struct llama_sampler_chain_params llama_sampler_chain_default_params() {
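A hedged usage sketch of the reworked entry point, assuming the llama_params_fit declaration and the LLAMA_PARAMS_FIT_STATUS_* values added to include/llama.h in this release match the definitions above; the model path, margin, and minimum context size are placeholders:

    #include <vector>
    #include "llama.h"

    // sketch: try to fit model/context params to free device memory before loading;
    // mparams/cparams are expected to start from llama_model_default_params() /
    // llama_context_default_params()
    static bool try_fit_params(const char * path, llama_model_params & mparams, llama_context_params & cparams) {
        // the fitted mparams may end up pointing into these buffers, so they must
        // outlive the subsequent model load
        static std::vector<float> tensor_split(llama_max_devices(), 0.0f);
        static std::vector<llama_model_tensor_buft_override> tbo(llama_max_tensor_buft_overrides());

        const llama_params_fit_status status = llama_params_fit(
            path, &mparams, &cparams,
            tensor_split.data(), tbo.data(),
            /*margin_s  =*/ 1024ull*1024*1024,  // assumed: free-memory margin in bytes
            /*n_ctx_min =*/ 4096,
            GGML_LOG_LEVEL_INFO);

        // FAILURE: parameters were already pinned by the user and left unchanged (see the exceptions above)
        // ERROR:   an unexpected runtime error occurred while probing device memory
        return status == LLAMA_PARAMS_FIT_STATUS_SUCCESS;
    }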