@fugood/llama.node 1.4.10 → 1.4.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. package/package.json +15 -15
  2. package/scripts/llama.cpp.patch +30 -30
  3. package/src/LlamaContext.cpp +1 -1
  4. package/src/llama.cpp/common/arg.cpp +29 -14
  5. package/src/llama.cpp/common/arg.h +1 -0
  6. package/src/llama.cpp/common/chat-parser.cpp +11 -0
  7. package/src/llama.cpp/common/chat.cpp +32 -3
  8. package/src/llama.cpp/common/chat.h +1 -0
  9. package/src/llama.cpp/common/common.cpp +23 -23
  10. package/src/llama.cpp/common/common.h +1 -1
  11. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  12. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
  13. package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
  14. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  16. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
  17. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
  18. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
  20. package/src/llama.cpp/include/llama.h +13 -4
  21. package/src/llama.cpp/src/CMakeLists.txt +4 -0
  22. package/src/llama.cpp/src/llama-adapter.cpp +12 -3
  23. package/src/llama.cpp/src/llama-adapter.h +7 -1
  24. package/src/llama.cpp/src/llama-arch.cpp +76 -0
  25. package/src/llama.cpp/src/llama-arch.h +7 -0
  26. package/src/llama.cpp/src/llama-chat.cpp +11 -0
  27. package/src/llama.cpp/src/llama-chat.h +1 -0
  28. package/src/llama.cpp/src/llama-context.cpp +22 -21
  29. package/src/llama.cpp/src/llama-hparams.h +4 -3
  30. package/src/llama.cpp/src/llama-kv-cache.h +1 -1
  31. package/src/llama.cpp/src/llama-mmap.cpp +11 -4
  32. package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
  33. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  34. package/src/llama.cpp/src/llama-model.cpp +287 -16
  35. package/src/llama.cpp/src/llama-model.h +13 -2
  36. package/src/llama.cpp/src/llama-sampling.cpp +44 -33
  37. package/src/llama.cpp/src/llama-sampling.h +3 -0
  38. package/src/llama.cpp/src/llama-vocab.cpp +101 -33
  39. package/src/llama.cpp/src/llama-vocab.h +2 -0
  40. package/src/llama.cpp/src/llama.cpp +52 -37
  41. package/src/llama.cpp/src/models/bert.cpp +4 -2
  42. package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
  43. package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
  44. package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
  45. package/src/llama.cpp/src/models/gemma3.cpp +3 -4
  46. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
  47. package/src/llama.cpp/src/models/llama.cpp +19 -6
  48. package/src/llama.cpp/src/models/maincoder.cpp +117 -0
  49. package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
  50. package/src/llama.cpp/src/models/models.h +18 -0
  51. package/src/llama.cpp/src/models/modern-bert.cpp +115 -0
  52. package/src/llama.cpp/src/models/plamo3.cpp +128 -0
  53. package/src/llama.cpp/src/unicode.cpp +23 -14
package/src/llama.cpp/src/llama-sampling.cpp

@@ -421,39 +421,6 @@ void llama_sampler_free(struct llama_sampler * smpl) {
  delete smpl;
  }

- llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) {
- const auto * logits = llama_get_logits_ith(ctx, idx);
-
- const llama_model * model = llama_get_model(ctx);
- const llama_vocab * vocab = llama_model_get_vocab(model);
-
- const int n_vocab = llama_vocab_n_tokens(vocab);
-
- // TODO: do not allocate each time
- std::vector<llama_token_data> cur;
- cur.reserve(n_vocab);
- for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
- cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
- }
-
- llama_token_data_array cur_p = {
- /* .data = */ cur.data(),
- /* .size = */ cur.size(),
- /* .selected = */ -1,
- /* .sorted = */ false,
- };
-
- llama_sampler_apply(smpl, &cur_p);
-
- GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size);
-
- auto token = cur_p.data[cur_p.selected].id;
-
- llama_sampler_accept(smpl, token);
-
- return token;
- }
-
  // sampler chain

  static const char * llama_sampler_chain_name(const struct llama_sampler * /*smpl*/) {
@@ -527,12 +494,56 @@ struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_param
  /* .ctx = */ new llama_sampler_chain {
  /* .params = */ params,
  /* .samplers = */ {},
+ /* .cur = */ {},
  /* .t_sample_us = */ 0,
  /* .n_sample = */ 0,
  }
  );
  }

+ llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) {
+ const auto * logits = llama_get_logits_ith(ctx, idx);
+
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
+ const int n_vocab = llama_vocab_n_tokens(vocab);
+
+ // use pre-allocated buffer from chain if available, otherwise allocate locally
+ std::vector<llama_token_data> * cur_ptr;
+ std::vector<llama_token_data> cur_local;
+
+ if (smpl->iface == &llama_sampler_chain_i) {
+ auto * chain = (llama_sampler_chain *) smpl->ctx;
+ cur_ptr = &chain->cur;
+ } else {
+ cur_ptr = &cur_local;
+ }
+
+ auto & cur = *cur_ptr;
+ cur.resize(n_vocab);
+ for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+ cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+ }
+
+ llama_token_data_array cur_p = {
+ /* .data = */ cur.data(),
+ /* .size = */ cur.size(),
+ /* .selected = */ -1,
+ /* .sorted = */ false,
+ };
+
+ llama_sampler_apply(smpl, &cur_p);
+
+ GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size);
+
+ auto token = cur_p.data[cur_p.selected].id;
+
+ llama_sampler_accept(smpl, token);
+
+ return token;
+ }
+
  void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) {
  auto * p = (llama_sampler_chain *) chain->ctx;
  p->samplers.push_back(smpl);
package/src/llama.cpp/src/llama-sampling.h

@@ -16,6 +16,9 @@ struct llama_sampler_chain {

  std::vector<struct llama_sampler *> samplers;

+ // pre-allocated buffer for llama_sampler_sample to avoid repeated allocations
+ std::vector<llama_token_data> cur;
+
  // timing

  mutable int64_t t_sample_us;
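For reference, the relocated llama_sampler_sample is normally driven through a sampler chain, which is what makes the new pre-allocated cur buffer pay off across repeated calls. A minimal sketch using the upstream llama.cpp sampler API; the context setup and the particular samplers chosen here are illustrative assumptions, not part of this diff:

#include "llama.h"

// Sketch only: assumes `ctx` was created elsewhere (llama_model_load_from_file +
// llama_init_from_model) and that llama_decode() has already produced logits.
static void generate_n(llama_context * ctx, int n) {
    llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.8f));
    llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

    for (int i = 0; i < n; i++) {
        // because smpl is a chain, llama_sampler_sample reuses chain->cur (added above)
        // instead of allocating a fresh n_vocab-sized candidate vector for every token
        const llama_token tok = llama_sampler_sample(smpl, ctx, -1);
        // ... feed `tok` back via llama_decode() before the next iteration ...
        (void) tok;
    }

    llama_sampler_free(smpl);
}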
package/src/llama.cpp/src/llama-vocab.cpp

@@ -314,6 +314,12 @@ struct llm_tokenizer_bpe : llm_tokenizer {
  "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
  };
  break;
+ case LLAMA_VOCAB_PRE_TYPE_YOUTU:
+ regex_exprs = {
+ "[가-힣ㄱ-ㆎ]+|[!…“”‘’—:;,、-〿︰-﹏]+|[ㄅ-ㄯ]+|[一-龥぀-ゟ゠-ヿ]+",
+ "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
  case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
  regex_exprs = {
  "[\r\n]",
@@ -355,6 +361,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
  case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
  case LLAMA_VOCAB_PRE_TYPE_QWEN2:
  case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
+ case LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN:
  regex_exprs = {
  // original regex from tokenizer.json
  // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
@@ -1860,6 +1867,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  tokenizer_pre == "deepseek-v3") {
  pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
  clean_spaces = false;
+ } else if (
+ tokenizer_pre == "youtu") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_YOUTU;
+ clean_spaces = false;
+ ignore_merges = true;
  } else if (
  tokenizer_pre == "falcon") {
  pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
@@ -1878,7 +1890,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  tokenizer_pre == "jina-v2-es" ||
  tokenizer_pre == "jina-v2-de" ||
  tokenizer_pre == "a.x-4.0" ||
- tokenizer_pre == "mellum") {
+ tokenizer_pre == "mellum" ||
+ tokenizer_pre == "modern-bert" ) {
  pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
  } else if (
  tokenizer_pre == "jina-v1-en" ||
@@ -2014,6 +2027,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  tokenizer_pre == "minimax-m2") {
  pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
  clean_spaces = false;
+ } else if (
+ tokenizer_pre == "solar-open") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
+ clean_spaces = false;
  } else {
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
  }
@@ -2186,6 +2203,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  // for now, we apply this workaround to find the tokens based on their text

  for (const auto & t : token_to_id) {
+ auto & attr = id_to_token[t.second].attr;
+
  // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
  if (special_eot_id == LLAMA_TOKEN_NULL) {
  if (false
@@ -2201,10 +2220,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  || t.first == "<end_of_utterance>" // smoldocling
  ) {
  special_eot_id = t.second;
- if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
  LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
  __func__, t.second, t.first.c_str());
- id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
  }
  }
  }
@@ -2215,10 +2234,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  || t.first == "<|eom_id|>"
  ) {
  special_eom_id = t.second;
- if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
  LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
  __func__, t.second, t.first.c_str());
- id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
  }
  }
  }
@@ -2235,10 +2254,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  || t.first == "<|code_prefix|>" // GLM-4.5
  ) {
  special_fim_pre_id = t.second;
- if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
  LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
  __func__, t.second, t.first.c_str());
- id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
  }
  }
  }
@@ -2255,10 +2274,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  || t.first == "<|code_suffix|>" // GLM-4.5
  ) {
  special_fim_suf_id = t.second;
- if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
  LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
  __func__, t.second, t.first.c_str());
- id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
  }
  }
  }
@@ -2275,10 +2294,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  || t.first == "<|code_middle|>" // GLM-4.5
  ) {
  special_fim_mid_id = t.second;
- if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
  LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
  __func__, t.second, t.first.c_str());
- id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
  }
  }
  }
@@ -2292,10 +2311,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  || t.first == "<PAD>"
  ) {
  special_fim_pad_id = t.second;
- if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
  LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
  __func__, t.second, t.first.c_str());
- id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
  }
  }
  }
@@ -2310,10 +2329,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  || t.first == "<reponame>" // Granite
  ) {
  special_fim_rep_id = t.second;
- if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
  LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
  __func__, t.second, t.first.c_str());
- id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
  }
  }
  }
@@ -2324,15 +2343,41 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  || t.first == "<|file_sep|>" // Qwen
  ) {
  special_fim_sep_id = t.second;
- if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
  LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
  __func__, t.second, t.first.c_str());
- id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
  }
  }
  }
  }

+ // auto-detect unused tokens: e.g. control tokens with the word "unused"
+ // ideally, these tokens should be marked as unused during conversion
+ {
+ uint32_t n_unused = 0;
+
+ for (const auto & t : token_to_id) {
+ auto & attr = id_to_token[t.second].attr;
+
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ continue;
+ }
+
+ if ((attr & LLAMA_TOKEN_ATTR_UNUSED) == 0) {
+ if (strstr(t.first.c_str(), "unused") != NULL) {
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_UNUSED);
+ }
+ }
+
+ if (attr & LLAMA_TOKEN_ATTR_UNUSED) {
+ n_unused++;
+ }
+ }
+
+ LLAMA_LOG_INFO("%s: %u unused tokens\n", __func__, n_unused);
+ }
+
  // maintain a list of tokens that cause end-of-generation
  // this is currently determined based on the token text, which is obviously not ideal
  // ref: https://github.com/ggerganov/llama.cpp/issues/9606
@@ -2351,12 +2396,16 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  }

  for (const auto & t : token_to_id) {
+ auto & attr = id_to_token[t.second].attr;
+
  if (false
  || t.first == "<|eot_id|>"
  || t.first == "<|im_end|>"
  || t.first == "<|end|>"
  || t.first == "<|return|>" // o200k_harmony
  || t.first == "<|call|>" // o200k_harmony
+ || t.first == "<|flush|>" // solar-open
+ || t.first == "<|calls|>" // solar-open
  || t.first == "<end_of_turn>"
  || t.first == "<|endoftext|>"
  || t.first == "<|eom_id|>"
@@ -2366,24 +2415,28 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  || t.first == "<end_of_utterance>" // smoldocling
  ) {
  special_eog_ids.insert(t.second);
- if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
  LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
  __func__, t.second, t.first.c_str());
- id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
  }
  } else {
- // token is control, but not marked as EOG -> print a debug log
- if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) {
- LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
- __func__, t.second, t.first.c_str());
+ if (attr & LLAMA_TOKEN_ATTR_CONTROL && !(attr & LLAMA_TOKEN_ATTR_UNUSED)) {
+ // token is control, but not marked as EOG -> print a debug log
+ if (special_eog_ids.count(t.second) == 0) {
+ LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
+ __func__, t.second, t.first.c_str());
+ }
  }
  }
  }

  // @ngxson : quick hack for gpt-oss, always render these tokens
  for (const auto & t : token_to_id) {
+ auto & attr = id_to_token[t.second].attr;
+
  if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
- id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
  }
  }

@@ -2403,34 +2456,42 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
  }

- // TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
- // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
+ // TODO: workaround for o200k_harmony and solar-open tokenizer: the "<|end|>" token should not be EOG
+ // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens ("<|calls|>" and "<|flush|>" for solar-open),
  // we remove the "<|end|>" token from the EOG list
  {
  bool has_return = false;
  bool has_call = false;
  bool has_end = false;
+ bool has_flush = false;

  llama_token end_id = LLAMA_TOKEN_NULL;

  LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
  for (auto tid : special_eog_ids) {
- LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str());
+ auto & text = id_to_token[tid].text;
+
+ LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, text.c_str());

- if (id_to_token[tid].text == "<|return|>") {
+ if (text == "<|return|>") {
  has_return = true;
- } else if (id_to_token[tid].text == "<|call|>") {
+ } else if (text == "<|call|>" || text == "<|calls|>") {
  has_call = true;
- } else if (id_to_token[tid].text == "<|end|>") {
+ } else if (text == "<|flush|>") {
+ has_flush = true;
+ } else if (text == "<|end|>") {
  has_end = true;
  end_id = tid;
  }
  }

- if (has_return && has_call && has_end) {
+ if ((has_return && has_call && has_end) || (has_call && has_flush && has_end)) {
  special_eog_ids.erase(end_id);
- id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
- LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
+
+ auto & attr = id_to_token[end_id].attr;
+ attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
+
+ LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
  }
  }
  }
@@ -2528,6 +2589,13 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
  _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
  }
+ } else if (_contains_any(model_name, {"modern-bert"})) {
+ if (token_to_id.count("[MASK]") == 0 ) {
+ LLAMA_LOG_WARN("%s: Mask token missing in vocab!\n", __func__);
+ }
+ else {
+ _set_token_attr("[MASK]", LLAMA_TOKEN_ATTR_LSTRIP, true);
+ }
  }
  }
  }
package/src/llama.cpp/src/llama-vocab.h

@@ -51,6 +51,8 @@ enum llama_vocab_pre_type {
  LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
  LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41,
  LLAMA_VOCAB_PRE_TYPE_AFMOE = 42,
+ LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
+ LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
  };

  struct LLM_KV;
package/src/llama.cpp/src/llama.cpp

@@ -140,6 +140,10 @@ enum layer_fraction_t {
  };
  // this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue

+ class llama_params_fit_exception : public std::runtime_error {
+ using std::runtime_error::runtime_error;
+ };
+
  static void llama_params_fit_impl(
  const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
  float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
@@ -181,12 +185,11 @@ static void llama_params_fit_impl(
  }
  }

- int64_t sum_total = 0;
+ int64_t sum_free = 0;
  int64_t sum_projected_free = 0;
  int64_t min_projected_free = INT64_MAX;
  int64_t sum_projected_used = 0;
  int64_t sum_projected_model = 0;
- int64_t sum_projected_ctx = 0;

  if (nd > 1) {
  LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
@@ -197,12 +200,11 @@ static void llama_params_fit_impl(
  const int64_t projected_used = dmd.mb.total();
  const int64_t projected_free = dmd.free - projected_used;

- sum_total += dmd.total;
+ sum_free += dmd.free;
  sum_projected_used += projected_used;
  sum_projected_free += projected_free;
  min_projected_free = std::min(min_projected_free, projected_free);
  sum_projected_model += dmd.mb.model;
- sum_projected_ctx += dmd.mb.context;

  if (nd > 1) {
  LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
@@ -210,10 +212,9 @@ static void llama_params_fit_impl(
  projected_free >= 0 ? "surplus" : "deficit");
  }
  }
- assert(sum_total >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0);
- assert(sum_projected_used >= sum_projected_ctx);
+ assert(sum_free >= 0 && sum_projected_used >= 0);
  LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
- __func__, sum_projected_used/MiB, sum_total/MiB);
+ __func__, sum_projected_used/MiB, sum_free/MiB);
  if (min_projected_free >= margin) {
  if (nd == 1) {
  LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
@@ -236,9 +237,7 @@ static void llama_params_fit_impl(
  __func__, margin/MiB, -global_surplus/MiB);
  if (cparams->n_ctx == 0) {
  if (hp_nct > n_ctx_min) {
- const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct;
-
- int64_t memory_reduction = -global_surplus;
+ int64_t sum_used_target = sum_free - nd*margin_s;
  if (nd > 1) {
  // for multiple devices we need to be more conservative in terms of how much context we think can fit:
  // - for dense models only whole layers can be assigned to devices
@@ -246,24 +245,34 @@ static void llama_params_fit_impl(
  // - on average we expect a waste of 0.5 layers/tensors per device
  // - use slightly more than the expected average for nd devices to be safe
  const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
- memory_reduction += (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
+ sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
  }

- uint32_t ctx_reduction = std::min(uint32_t((memory_reduction + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min);
- cparams->n_ctx = hp_nct - ctx_reduction;
- cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
-
- ctx_reduction = hp_nct - cparams->n_ctx;
- memory_reduction = ctx_reduction * bytes_per_ctx;
- global_surplus += memory_reduction;
- LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
- __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
- if (global_surplus >= 0) {
+ int64_t sum_projected_used_min_ctx = 0;
+ cparams->n_ctx = n_ctx_min;
+ const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
+ for (const auto & dmd : dmds_min_ctx) {
+ sum_projected_used_min_ctx += dmd.mb.total();
+ }
+ if (sum_used_target > sum_projected_used_min_ctx) {
+ // linear interpolation between minimum and maximum context size:
+ cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
+ / (sum_projected_used - sum_projected_used_min_ctx);
+ cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
+
+ const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
+ const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
+ LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
+ __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
  if (nd == 1) {
  LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
  return;
  }
  LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
+ } else {
+ const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
+ LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
+ __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
  }
  } else {
  LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
@@ -276,28 +285,28 @@ static void llama_params_fit_impl(
  }

  if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
- throw std::runtime_error("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
+ throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
  }
  if (nd > 1) {
  if (!tensor_split) {
- throw std::runtime_error("did not provide a buffer to write the tensor_split to, abort");
+ throw llama_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
  }
  if (mparams->tensor_split) {
  for (size_t id = 0; id < nd; id++) {
  if (mparams->tensor_split[id] != 0.0f) {
- throw std::runtime_error("model_params::tensor_split already set by user, abort");
+ throw llama_params_fit_exception("model_params::tensor_split already set by user, abort");
  }
  }
  }
  if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
- throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
+ throw llama_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
  }
  }
  if (!tensor_buft_overrides) {
- throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
+ throw llama_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
  }
  if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
- throw std::runtime_error("model_params::tensor_buft_overrides already set by user, abort");
+ throw llama_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
  }

  // step 3: iteratively fill the back to front with "dense" layers
@@ -380,8 +389,8 @@ static void llama_params_fit_impl(
  tensor_buft_overrides[itbo].buft = nullptr;
  itbo++;
  mparams.tensor_buft_overrides = tensor_buft_overrides;
- throw std::runtime_error("llama_params_fit_n_tensor_buft_overrides() == "
- + std::to_string(ntbo) + " is insufficient for model\n");
+ throw llama_params_fit_exception("llama_max_tensor_buft_overrides() == "
+ + std::to_string(ntbo) + " is insufficient for model");
  }
  tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
  tensor_buft_overrides[itbo].buft = overflow_bufts[id];
@@ -503,6 +512,9 @@ static void llama_params_fit_impl(
  if (mem_high[id] > targets[id]) {
  assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
  uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
+ if (hp_nex > 0 && size_t(id) == nd - 1) {
+ delta--;
+ }
  LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
  while (delta > 1) {
  uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
@@ -638,7 +650,7 @@ static void llama_params_fit_impl(
  ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
  LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
  std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
- if (mem_test[id] < targets[id]) {
+ if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
  ngl_per_device = ngl_per_device_test;
  mem = mem_test;
  id_dense_start = id_dense_start_test;
@@ -648,7 +660,7 @@ static void llama_params_fit_impl(
  ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
  LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
  mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
- if (mem_test[id] < targets[id]) {
+ if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
  ngl_per_device = ngl_per_device_test;
  mem = mem_test;
  id_dense_start = id_dense_start_test;
@@ -659,7 +671,7 @@ static void llama_params_fit_impl(
  ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
  LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
  mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
- if (mem_test[id] < targets[id]) {
+ if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
  ngl_per_device = ngl_per_device_test;
  mem = mem_test;
  id_dense_start = id_dense_start_test;
@@ -678,22 +690,25 @@ static void llama_params_fit_impl(
  set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
  }

- bool llama_params_fit(
+ enum llama_params_fit_status llama_params_fit(
  const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
  float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
  size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
  const int64_t t0_us = llama_time_us();
- bool ok = true;
+ llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
  try {
  llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
  LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
- } catch (const std::runtime_error & e) {
+ } catch (const llama_params_fit_exception & e) {
  LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
- ok = false;
+ status = LLAMA_PARAMS_FIT_STATUS_FAILURE;
+ } catch (const std::runtime_error & e) {
+ LLAMA_LOG_ERROR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
+ status = LLAMA_PARAMS_FIT_STATUS_ERROR;
  }
  const int64_t t1_us = llama_time_us();
  LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
- return ok;
+ return status;
  }

  struct llama_sampler_chain_params llama_sampler_chain_default_params() {
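As a usage note, llama_params_fit now reports one of three statuses instead of a bool. A minimal caller sketch based on the definition above; the public declaration lives in include/llama.h (changed in this release but not shown in this section), and the buffer sizes, margin value, and the assumption that llama_max_tensor_buft_overrides() (referenced in the hunk above) returns the number of override slots to provide are illustrative, not confirmed by this diff:

#include <vector>
#include "llama.h"

// Sketch only: adjusts mparams/cparams in place before the model is actually loaded.
static bool fit_params(const char * path_model,
                       llama_model_params & mparams, llama_context_params & cparams) {
    float tensor_split[16] = {0}; // one weight per device; 16 is an arbitrary upper bound for this sketch
    std::vector<llama_model_tensor_buft_override> tbo(llama_max_tensor_buft_overrides());

    const enum llama_params_fit_status st = llama_params_fit(
        path_model, &mparams, &cparams, tensor_split, tbo.data(),
        /*margin_s  =*/ 1024u * 1024 * 1024, // free-memory margin to leave per device (assumed to be bytes)
        /*n_ctx_min =*/ 4096,
        GGML_LOG_LEVEL_INFO);

    switch (st) {
        case LLAMA_PARAMS_FIT_STATUS_SUCCESS: return true;  // params adjusted, proceed to load the model
        case LLAMA_PARAMS_FIT_STATUS_FAILURE: return false; // e.g. user already pinned n_gpu_layers / tensor_split
        case LLAMA_PARAMS_FIT_STATUS_ERROR:                 // unexpected std::runtime_error while probing devices
        default:                              return false;
    }
}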