@fugood/llama.node 1.4.11 → 1.4.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/package.json +15 -15
  2. package/scripts/llama.cpp.patch +31 -31
  3. package/src/llama.cpp/common/arg.cpp +128 -59
  4. package/src/llama.cpp/common/arg.h +1 -0
  5. package/src/llama.cpp/common/chat-parser.cpp +11 -0
  6. package/src/llama.cpp/common/chat.cpp +36 -7
  7. package/src/llama.cpp/common/chat.h +1 -0
  8. package/src/llama.cpp/common/common.cpp +42 -23
  9. package/src/llama.cpp/common/common.h +11 -1
  10. package/src/llama.cpp/common/llguidance.cpp +10 -6
  11. package/src/llama.cpp/common/regex-partial.cpp +13 -13
  12. package/src/llama.cpp/common/sampling.cpp +58 -14
  13. package/src/llama.cpp/common/sampling.h +3 -1
  14. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  15. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
  16. package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
  17. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  18. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  19. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
  20. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
  21. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
  23. package/src/llama.cpp/include/llama.h +100 -12
  24. package/src/llama.cpp/src/CMakeLists.txt +4 -0
  25. package/src/llama.cpp/src/llama-adapter.cpp +12 -3
  26. package/src/llama.cpp/src/llama-adapter.h +7 -1
  27. package/src/llama.cpp/src/llama-arch.cpp +78 -0
  28. package/src/llama.cpp/src/llama-arch.h +8 -0
  29. package/src/llama.cpp/src/llama-chat.cpp +11 -0
  30. package/src/llama.cpp/src/llama-chat.h +1 -0
  31. package/src/llama.cpp/src/llama-context.cpp +637 -49
  32. package/src/llama.cpp/src/llama-context.h +43 -1
  33. package/src/llama.cpp/src/llama-grammar.cpp +40 -13
  34. package/src/llama.cpp/src/llama-grammar.h +2 -0
  35. package/src/llama.cpp/src/llama-graph.cpp +173 -5
  36. package/src/llama.cpp/src/llama-graph.h +71 -6
  37. package/src/llama.cpp/src/llama-hparams.cpp +4 -0
  38. package/src/llama.cpp/src/llama-hparams.h +12 -5
  39. package/src/llama.cpp/src/llama-kv-cache.h +1 -1
  40. package/src/llama.cpp/src/llama-mmap.cpp +11 -4
  41. package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
  42. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  43. package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
  44. package/src/llama.cpp/src/llama-model.cpp +337 -26
  45. package/src/llama.cpp/src/llama-model.h +13 -2
  46. package/src/llama.cpp/src/llama-sampling.cpp +1259 -186
  47. package/src/llama.cpp/src/llama-sampling.h +19 -7
  48. package/src/llama.cpp/src/llama-vocab.cpp +101 -33
  49. package/src/llama.cpp/src/llama-vocab.h +2 -0
  50. package/src/llama.cpp/src/llama.cpp +87 -64
  51. package/src/llama.cpp/src/models/afmoe.cpp +9 -5
  52. package/src/llama.cpp/src/models/bert.cpp +4 -2
  53. package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
  54. package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
  55. package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
  56. package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
  57. package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
  58. package/src/llama.cpp/src/models/gemma3.cpp +3 -4
  59. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
  60. package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
  61. package/src/llama.cpp/src/models/llama.cpp +19 -6
  62. package/src/llama.cpp/src/models/maincoder.cpp +117 -0
  63. package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
  64. package/src/llama.cpp/src/models/models.h +18 -0
  65. package/src/llama.cpp/src/models/modern-bert.cpp +116 -0
  66. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
  67. package/src/llama.cpp/src/models/plamo3.cpp +128 -0
  68. package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
  69. package/src/llama.cpp/src/unicode.cpp +23 -14
package/src/llama.cpp/src/llama-sampling.h

@@ -14,7 +14,19 @@ struct llama_grammar;
 struct llama_sampler_chain {
     llama_sampler_chain_params params;
 
-    std::vector<struct llama_sampler *> samplers;
+    // has .backend_init() been called?
+    bool is_init = false;
+
+    struct info {
+        bool is_backend;
+
+        llama_sampler * ptr;
+    };
+
+    std::vector<info> samplers;
+
+    // pre-allocated buffer for llama_sampler_sample to avoid repeated allocations
+    std::vector<llama_token_data> cur;
 
     // timing
 
@@ -24,9 +36,9 @@ struct llama_sampler_chain {
 };
 
 struct llama_sampler * llama_sampler_init_dry_testing(
-        int32_t context_size,
-        float dry_multiplier,
-        float dry_base,
-        int32_t dry_allowed_length,
-        int32_t dry_penalty_last_n,
-        const std::vector<std::vector<llama_token>>& seq_breakers);
+        int32_t context_size,
+        float dry_multiplier,
+        float dry_base,
+        int32_t dry_allowed_length,
+        int32_t dry_penalty_last_n,
+        const std::vector<std::vector<llama_token>> & seq_breakers);
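Note: the struct above backs the public sampler-chain API in llama.h. Each llama_sampler_chain_add() call appends one `info` entry (now tagged with whether that sampler runs on the backend), and the new `cur` buffer is reused by llama_sampler_sample() across calls instead of being reallocated per token. A minimal usage sketch of that public API; the particular chain composed here is illustrative, not taken from this diff:

    #include "llama.h"

    // build a sampler chain; each add() becomes one `info` entry internally
    llama_sampler * make_chain() {
        llama_sampler_chain_params params = llama_sampler_chain_default_params();
        llama_sampler * chain = llama_sampler_chain_init(params);

        llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
        llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));
        llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

        return chain; // free with llama_sampler_free()
    }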
package/src/llama.cpp/src/llama-vocab.cpp

@@ -314,6 +314,12 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_YOUTU:
+                regex_exprs = {
+                    "[가-힣ㄱ-ㆎ]+|[!…“”‘’—:;,、-〿︰-﹏]+|[ㄅ-ㄯ]+|[一-龥぀-ゟ゠-ヿ]+",
+                    "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
                 regex_exprs = {
                     "[\r\n]",
@@ -355,6 +361,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
             case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
             case LLAMA_VOCAB_PRE_TYPE_QWEN2:
             case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
+            case LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN:
                 regex_exprs = {
                     // original regex from tokenizer.json
                     // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
@@ -1860,6 +1867,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "deepseek-v3") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
             clean_spaces = false;
+        } else if (
+                tokenizer_pre == "youtu") {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_YOUTU;
+            clean_spaces = false;
+            ignore_merges = true;
         } else if (
                 tokenizer_pre == "falcon") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
@@ -1878,7 +1890,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "jina-v2-es" ||
                 tokenizer_pre == "jina-v2-de" ||
                 tokenizer_pre == "a.x-4.0" ||
-                tokenizer_pre == "mellum") {
+                tokenizer_pre == "mellum" ||
+                tokenizer_pre == "modern-bert" ) {
             pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
         } else if (
                 tokenizer_pre == "jina-v1-en" ||
@@ -2014,6 +2027,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "minimax-m2") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
             clean_spaces = false;
+        } else if (
+                tokenizer_pre == "solar-open") {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
+            clean_spaces = false;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
@@ -2186,6 +2203,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         // for now, we apply this workaround to find the tokens based on their text
 
         for (const auto & t : token_to_id) {
+            auto & attr = id_to_token[t.second].attr;
+
             // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
             if (special_eot_id == LLAMA_TOKEN_NULL) {
                 if (false
@@ -2201,10 +2220,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<end_of_utterance>" // smoldocling
                    ) {
                     special_eot_id = t.second;
-                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.second, t.first.c_str());
-                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
                     }
                 }
             }
@@ -2215,10 +2234,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<|eom_id|>"
                    ) {
                     special_eom_id = t.second;
-                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.second, t.first.c_str());
-                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
                     }
                 }
             }
@@ -2235,10 +2254,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<|code_prefix|>" // GLM-4.5
                    ) {
                     special_fim_pre_id = t.second;
-                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.second, t.first.c_str());
-                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
                     }
                 }
             }
@@ -2255,10 +2274,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<|code_suffix|>" // GLM-4.5
                    ) {
                     special_fim_suf_id = t.second;
-                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.second, t.first.c_str());
-                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
                     }
                 }
             }
@@ -2275,10 +2294,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<|code_middle|>" // GLM-4.5
                    ) {
                     special_fim_mid_id = t.second;
-                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.second, t.first.c_str());
-                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
                     }
                 }
             }
@@ -2292,10 +2311,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<PAD>"
                    ) {
                     special_fim_pad_id = t.second;
-                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.second, t.first.c_str());
-                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
                     }
                 }
             }
@@ -2310,10 +2329,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<reponame>" // Granite
                    ) {
                     special_fim_rep_id = t.second;
-                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.second, t.first.c_str());
-                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
                     }
                 }
             }
@@ -2324,15 +2343,41 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<|file_sep|>" // Qwen
                    ) {
                     special_fim_sep_id = t.second;
-                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.second, t.first.c_str());
-                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
                     }
                 }
             }
         }
 
+        // auto-detect unused tokens: e.g. control tokens with the word "unused"
+        // ideally, these tokens should be marked as unused during conversion
+        {
+            uint32_t n_unused = 0;
+
+            for (const auto & t : token_to_id) {
+                auto & attr = id_to_token[t.second].attr;
+
+                if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    continue;
+                }
+
+                if ((attr & LLAMA_TOKEN_ATTR_UNUSED) == 0) {
+                    if (strstr(t.first.c_str(), "unused") != NULL) {
+                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_UNUSED);
+                    }
+                }
+
+                if (attr & LLAMA_TOKEN_ATTR_UNUSED) {
+                    n_unused++;
+                }
+            }
+
+            LLAMA_LOG_INFO("%s: %u unused tokens\n", __func__, n_unused);
+        }
+
         // maintain a list of tokens that cause end-of-generation
         // this is currently determined based on the token text, which is obviously not ideal
         // ref: https://github.com/ggerganov/llama.cpp/issues/9606
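Note: the auto-detection above only flips attribute bits; callers can observe the result through the public vocab API. A small sketch, using the llama_vocab_get_attr() accessor from llama.h (the helper function itself is illustrative):

    #include "llama.h"

    // skip control tokens the loader auto-marked as unused,
    // e.g. Gemma-style "<unused0>"-type placeholders
    static bool token_is_unused(const llama_vocab * vocab, llama_token tok) {
        return (llama_vocab_get_attr(vocab, tok) & LLAMA_TOKEN_ATTR_UNUSED) != 0;
    }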
@@ -2351,12 +2396,16 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         }
 
         for (const auto & t : token_to_id) {
+            auto & attr = id_to_token[t.second].attr;
+
             if (false
                     || t.first == "<|eot_id|>"
                     || t.first == "<|im_end|>"
                     || t.first == "<|end|>"
                     || t.first == "<|return|>" // o200k_harmony
                     || t.first == "<|call|>" // o200k_harmony
+                    || t.first == "<|flush|>" // solar-open
+                    || t.first == "<|calls|>" // solar-open
                     || t.first == "<end_of_turn>"
                     || t.first == "<|endoftext|>"
                     || t.first == "<|eom_id|>"
@@ -2366,24 +2415,28 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<end_of_utterance>" // smoldocling
                ) {
                 special_eog_ids.insert(t.second);
-                if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                     LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                         __func__, t.second, t.first.c_str());
-                    id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
                 }
             } else {
-                // token is control, but not marked as EOG -> print a debug log
-                if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) {
-                    LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
-                        __func__, t.second, t.first.c_str());
+                if (attr & LLAMA_TOKEN_ATTR_CONTROL && !(attr & LLAMA_TOKEN_ATTR_UNUSED)) {
+                    // token is control, but not marked as EOG -> print a debug log
+                    if (special_eog_ids.count(t.second) == 0) {
+                        LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
+                            __func__, t.second, t.first.c_str());
+                    }
                 }
             }
         }
 
         // @ngxson : quick hack for gpt-oss, always render these tokens
         for (const auto & t : token_to_id) {
+            auto & attr = id_to_token[t.second].attr;
+
             if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
-                id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
+                attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
             }
         }
 
@@ -2403,34 +2456,42 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
         }
 
-        // TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
-        // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
+        // TODO: workaround for o200k_harmony and solar-open tokenizer: the "<|end|>" token should not be EOG
+        // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens ("<|calls|>" and "<|flush|>" for solar-open),
         // we remove the "<|end|>" token from the EOG list
         {
             bool has_return = false;
             bool has_call   = false;
             bool has_end    = false;
+            bool has_flush  = false;
 
             llama_token end_id = LLAMA_TOKEN_NULL;
 
             LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
             for (auto tid : special_eog_ids) {
-                LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str());
+                auto & text = id_to_token[tid].text;
+
+                LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, text.c_str());
 
-                if (id_to_token[tid].text == "<|return|>") {
+                if (text == "<|return|>") {
                     has_return = true;
-                } else if (id_to_token[tid].text == "<|call|>") {
+                } else if (text == "<|call|>" || text == "<|calls|>") {
                     has_call = true;
-                } else if (id_to_token[tid].text == "<|end|>") {
+                } else if (text == "<|flush|>") {
+                    has_flush = true;
+                } else if (text == "<|end|>") {
                     has_end = true;
                     end_id = tid;
                 }
             }
 
-            if (has_return && has_call && has_end) {
+            if ((has_return && has_call && has_end) || (has_call && has_flush && has_end)) {
                 special_eog_ids.erase(end_id);
-                id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
-                LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
+
+                auto & attr = id_to_token[end_id].attr;
+                attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_USER_DEFINED);
+
+                LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
             }
         }
     }
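Note: the curated special_eog_ids set is what generation loops consult through llama_vocab_is_eog(); after the workaround above, "<|end|>" is demoted to a user-defined token and no longer stops generation for vocabs that also define "<|return|>"/"<|call|>" (or "<|calls|>"/"<|flush|>"). A sketch of the caller side (the helper name is illustrative):

    #include "llama.h"

    // typical stop check inside a decode loop
    static bool should_stop(const llama_vocab * vocab, llama_token tok) {
        return llama_vocab_is_eog(vocab, tok);
    }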
@@ -2528,6 +2589,13 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
                 _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
             }
+        } else if (_contains_any(model_name, {"modern-bert"})) {
+            if (token_to_id.count("[MASK]") == 0 ) {
+                LLAMA_LOG_WARN("%s: Mask token missing in vocab!\n", __func__);
+            }
+            else {
+                _set_token_attr("[MASK]", LLAMA_TOKEN_ATTR_LSTRIP, true);
+            }
         }
     }
 }
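Note: LLAMA_TOKEN_ATTR_LSTRIP makes a special token absorb the whitespace to its left during tokenization, so input like "the [MASK]." should not leave a stray space fragment before the mask token. A hedged sketch of how a caller might tokenize such input with llama_tokenize() from llama.h (the wrapper itself is illustrative):

    #include "llama.h"

    #include <algorithm>
    #include <string>
    #include <vector>

    // tokenize with special-token parsing enabled so "[MASK]" is matched as a
    // single token and, with LSTRIP set, strips the space before it
    static std::vector<llama_token> tokenize(const llama_vocab * vocab, const std::string & text) {
        std::vector<llama_token> tokens(text.size() + 8);
        const int32_t n = llama_tokenize(vocab, text.c_str(), (int32_t) text.size(),
                                         tokens.data(), (int32_t) tokens.size(),
                                         /*add_special=*/true, /*parse_special=*/true);
        tokens.resize(std::max<int32_t>(n, 0));
        return tokens;
    }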
package/src/llama.cpp/src/llama-vocab.h

@@ -51,6 +51,8 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
     LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2      = 41,
     LLAMA_VOCAB_PRE_TYPE_AFMOE           = 42,
+    LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN      = 43,
+    LLAMA_VOCAB_PRE_TYPE_YOUTU           = 44,
 };
 
 struct LLM_KV;