@fugood/llama.node 0.3.9 → 0.3.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.js +2 -2
  18. package/lib/binding.ts +47 -8
  19. package/lib/index.js +21 -1
  20. package/lib/index.ts +31 -1
  21. package/package.json +12 -3
  22. package/src/LlamaCompletionWorker.cpp +33 -6
  23. package/src/LlamaCompletionWorker.h +3 -1
  24. package/src/LlamaContext.cpp +336 -28
  25. package/src/LlamaContext.h +2 -0
  26. package/src/common.hpp +19 -2
  27. package/src/llama.cpp/.github/workflows/build.yml +289 -107
  28. package/src/llama.cpp/.github/workflows/close-issue.yml +1 -1
  29. package/src/llama.cpp/.github/workflows/docker.yml +2 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +25 -2
  31. package/src/llama.cpp/CMakeLists.txt +10 -19
  32. package/src/llama.cpp/cmake/build-info.cmake +1 -1
  33. package/src/llama.cpp/common/CMakeLists.txt +32 -0
  34. package/src/llama.cpp/common/arg.cpp +66 -16
  35. package/src/llama.cpp/common/chat-template.hpp +515 -0
  36. package/src/llama.cpp/common/chat.cpp +966 -0
  37. package/src/llama.cpp/common/chat.hpp +52 -0
  38. package/src/llama.cpp/common/common.cpp +159 -36
  39. package/src/llama.cpp/common/common.h +56 -14
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +46 -66
  41. package/src/llama.cpp/common/json-schema-to-grammar.h +15 -1
  42. package/src/llama.cpp/common/llguidance.cpp +270 -0
  43. package/src/llama.cpp/common/log.cpp +1 -10
  44. package/src/llama.cpp/common/log.h +10 -0
  45. package/src/llama.cpp/common/minja.hpp +2868 -0
  46. package/src/llama.cpp/common/sampling.cpp +22 -1
  47. package/src/llama.cpp/common/sampling.h +3 -0
  48. package/src/llama.cpp/docs/build.md +54 -9
  49. package/src/llama.cpp/examples/export-lora/export-lora.cpp +12 -2
  50. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +1 -1
  51. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  52. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +59 -0
  53. package/src/llama.cpp/examples/llava/clip.cpp +133 -14
  54. package/src/llama.cpp/examples/llava/clip.h +2 -0
  55. package/src/llama.cpp/examples/llava/llava.cpp +22 -8
  56. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +9 -1
  57. package/src/llama.cpp/examples/main/main.cpp +26 -25
  58. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +136 -137
  59. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +18 -4
  60. package/src/llama.cpp/examples/run/run.cpp +224 -69
  61. package/src/llama.cpp/examples/server/server.cpp +252 -81
  62. package/src/llama.cpp/examples/server/utils.hpp +73 -21
  63. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +6 -4
  64. package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt +11 -0
  65. package/src/llama.cpp/ggml/CMakeLists.txt +78 -1
  66. package/src/llama.cpp/ggml/include/ggml.h +1 -1
  67. package/src/llama.cpp/ggml/src/CMakeLists.txt +21 -4
  68. package/src/llama.cpp/ggml/src/ggml-alloc.c +1 -13
  69. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +91 -78
  70. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +7 -7
  71. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -1
  72. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
  73. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +46 -0
  74. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +16 -1
  75. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +28 -8
  77. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +5 -7
  78. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +33 -23
  79. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +1 -5
  80. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +323 -121
  81. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
  82. package/src/llama.cpp/ggml/src/ggml.c +23 -13
  83. package/src/llama.cpp/include/llama.h +14 -1
  84. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +112 -0
  85. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +46 -0
  86. package/src/llama.cpp/src/CMakeLists.txt +1 -1
  87. package/src/llama.cpp/src/llama-arch.cpp +7 -2
  88. package/src/llama.cpp/src/llama-arch.h +3 -1
  89. package/src/llama.cpp/src/llama-chat.cpp +11 -2
  90. package/src/llama.cpp/src/llama-chat.h +1 -0
  91. package/src/llama.cpp/src/llama-grammar.cpp +86 -6
  92. package/src/llama.cpp/src/llama-grammar.h +22 -1
  93. package/src/llama.cpp/src/llama-mmap.cpp +1 -0
  94. package/src/llama.cpp/src/llama-model-loader.cpp +1 -1
  95. package/src/llama.cpp/src/llama-model.cpp +76 -6
  96. package/src/llama.cpp/src/llama-sampling.cpp +47 -4
  97. package/src/llama.cpp/src/llama-vocab.cpp +10 -4
  98. package/src/llama.cpp/src/llama.cpp +181 -123
  99. package/src/llama.cpp/tests/CMakeLists.txt +4 -0
  100. package/src/llama.cpp/tests/test-backend-ops.cpp +158 -57
  101. package/src/llama.cpp/tests/test-chat-template.cpp +154 -31
  102. package/src/llama.cpp/tests/test-chat.cpp +607 -0
  103. package/src/llama.cpp/tests/test-grammar-integration.cpp +2 -2
  104. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +1140 -0
  105. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -1
  106. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +0 -32
package/src/llama.cpp/src/llama-model.cpp

@@ -1093,8 +1093,20 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
-                    case 28: type = LLM_TYPE_6B; break;
-                    case 40: type = LLM_TYPE_9B; break;
+                    case 28: {
+                        if (hparams.n_head(0) == 16) {
+                            type = LLM_TYPE_1_5B;
+                        } else {
+                            type = LLM_TYPE_6B;
+                        }
+                    } break;
+                    case 40: {
+                        if (hparams.n_head(0) == 24) {
+                            type = LLM_TYPE_4B;
+                        } else {
+                            type = LLM_TYPE_9B;
+                        }
+                    } break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -1263,6 +1275,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

     const bool use_mmap_buffer = true;

+    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, use_mmap_buffer ? "true" : "false");
+
     // build a list of buffer types for the CPU and GPU devices
     pimpl->cpu_buft_list = make_cpu_buft_list(devices);
     for (auto * dev : devices) {
@@ -1303,10 +1317,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
         if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
+            LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(cpu_dev));
             return {cpu_dev, &pimpl->cpu_buft_list};
         }
         const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
         auto * dev = devices.at(layer_gpu);
+        LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(dev));
         return {dev, &pimpl->gpu_buft_list.at(dev)};
     };

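The hunk above only adds debug logging, but it also shows how load_tensors picks a device per layer: std::upper_bound over the cumulative tensor-split fractions. Below is a standalone sketch of that mapping; the split fractions and layer counts are made-up values, in llama.cpp they come from the tensor-split options and the registered backend devices.

```cpp
// Standalone sketch of the layer-to-device mapping used in get_layer_buft_list above.
// The splits, i_gpu_start and act_gpu_layers values here are hypothetical.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<float> splits = {0.5f, 1.0f}; // cumulative shares for 2 hypothetical GPUs
    const int n_layer        = 32;
    const int i_gpu_start    = 4;  // layers before this stay on the CPU
    const int act_gpu_layers = 28; // layers actually offloaded

    for (int il = 0; il < n_layer; ++il) {
        if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
            printf("layer %3d -> CPU\n", il);
            continue;
        }
        // same upper_bound computation as in the hunk above
        const int layer_gpu = std::upper_bound(splits.begin(), splits.end(),
                                  float(il - i_gpu_start) / act_gpu_layers) - splits.begin();
        printf("layer %3d -> GPU%d\n", il, layer_gpu);
    }
    return 0;
}
```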
@@ -2203,6 +2219,50 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                     }
                 } break;
+            case LLM_ARCH_PHIMOE:
+                {
+                    const int64_t n_embd_head = n_embd / n_head;
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
+                    output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), { n_vocab }, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0);
+
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        if (layer.wqkv == nullptr) {
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
+
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
+
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
+                        }
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0);
+
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
+                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+                        layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                    }
+                } break;
             case LLM_ARCH_PLAMO:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3022,9 +3082,17 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         auto & layer = layers[i];

                         layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);

-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+                        if (layer.wqkv == nullptr) {
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        }

                         layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

@@ -3911,8 +3979,10 @@ uint64_t llama_model_size(const struct llama_model * model) {
    return model->size();
}

-const char * llama_model_chat_template(const struct llama_model * model) {
-    const auto & it = model->gguf_kv.find(LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE));
+const char * llama_model_chat_template(const struct llama_model * model, const char * name) {
+    const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE_N)
+        : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
+    const auto & it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
        return nullptr;
    }
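llama_model_chat_template() now takes a second name argument, presumably surfaced through include/llama.h (+14 -1 in the file list above). A hedged caller-side sketch follows: nullptr keeps the previous behaviour of returning the default chat template, while a non-null name looks up a named template key. The "tool_use" name is only an illustrative example and may not exist in a given GGUF; the model handle is assumed to be already loaded.

```cpp
// Hypothetical usage of the updated llama_model_chat_template() signature shown above.
#include "llama.h"
#include <cstdio>

static void print_templates(const llama_model * model) {
    const char * tmpl_default = llama_model_chat_template(model, /* name */ nullptr);
    const char * tmpl_named   = llama_model_chat_template(model, "tool_use");

    printf("default template:  %s\n", tmpl_default ? tmpl_default : "(none)");
    printf("tool_use template: %s\n", tmpl_named   ? tmpl_named   : "(none)");
}
```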
package/src/llama.cpp/src/llama-sampling.cpp

@@ -1433,13 +1433,30 @@ static void llama_sampler_grammar_apply(struct llama_sampler * smpl, llama_token
        }
    }

+// Fwd declare to break reset --> init_impl --> llama_sampler_grammar_i --> reset cycle.
+static struct llama_sampler * llama_sampler_init_grammar_impl(
+        const struct llama_vocab * vocab,
+        const char * grammar_str,
+        const char * grammar_root,
+        bool lazy,
+        const char ** trigger_words,
+        size_t num_trigger_words,
+        const llama_token * trigger_tokens,
+        size_t num_trigger_tokens);
+
static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
    auto * ctx = (llama_sampler_grammar *) smpl->ctx;
    if (!ctx->grammar) {
        return;
    }

-    auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str());
+    std::vector<const char *> trigger_words;
+    for (auto & word : ctx->grammar->trigger_words) {
+        trigger_words.push_back(word.c_str());
+    }
+    auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str(),
+                                                 ctx->grammar->lazy, trigger_words.data(), trigger_words.size(),
+                                                 ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());

    llama_grammar_free_impl(ctx->grammar);
    ctx->grammar = grammar_new;
@@ -1448,7 +1465,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) {
    const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;

-    auto * result = llama_sampler_init_grammar(ctx->vocab, nullptr, nullptr);
+    auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0);

    // copy the state
    {
@@ -1484,7 +1501,15 @@ static struct llama_sampler_i llama_sampler_grammar_i = {
    /* .free = */ llama_sampler_grammar_free,
};

-struct llama_sampler * llama_sampler_init_grammar(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root) {
+static struct llama_sampler * llama_sampler_init_grammar_impl(
+        const struct llama_vocab * vocab,
+        const char * grammar_str,
+        const char * grammar_root,
+        bool lazy,
+        const char ** trigger_words,
+        size_t num_trigger_words,
+        const llama_token * trigger_tokens,
+        size_t num_trigger_tokens) {
    auto * ctx = new llama_sampler_grammar;

    if (grammar_str != nullptr && grammar_str[0] != '\0') {
@@ -1492,7 +1517,7 @@ struct llama_sampler * llama_sampler_init_grammar(const struct llama_vocab * voc
        /* .vocab = */ vocab,
        /* .grammar_str = */ grammar_str,
        /* .grammar_root = */ grammar_root,
-        /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root),
+        /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens),
    };
} else {
    *ctx = {
@@ -1509,6 +1534,24 @@ struct llama_sampler * llama_sampler_init_grammar(const struct llama_vocab * voc
    };
}

+struct llama_sampler * llama_sampler_init_grammar(
+        const struct llama_vocab * vocab,
+        const char * grammar_str,
+        const char * grammar_root) {
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0);
+}
+
+struct llama_sampler * llama_sampler_init_grammar_lazy(
+        const struct llama_vocab * vocab,
+        const char * grammar_str,
+        const char * grammar_root,
+        const char ** trigger_words,
+        size_t num_trigger_words,
+        const llama_token * trigger_tokens,
+        size_t num_trigger_tokens) {
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens);
+}
+
// penalties

struct llama_sampler_penalties {
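The two public constructors above split grammar sampling into an eager and a lazy variant. Below is a hedged usage sketch of llama_sampler_init_grammar_lazy(), where the grammar is only enforced once a trigger word (or trigger token) appears in the output; the GBNF string and the "<tool_call>" trigger are placeholders rather than values taken from this diff, and the vocab is assumed to come from a loaded model.

```cpp
// Hypothetical caller of the llama_sampler_init_grammar_lazy() entry point added above.
#include "llama.h"

static llama_sampler * make_lazy_grammar_sampler(const llama_vocab * vocab) {
    static const char * grammar_str     = "root ::= \"yes\" | \"no\""; // toy GBNF grammar
    static const char * trigger_words[] = { "<tool_call>" };

    // the grammar stays inactive until one of the trigger words is generated
    return llama_sampler_init_grammar_lazy(
        vocab, grammar_str, "root",
        trigger_words, /* num_trigger_words  */ 1,
        /* trigger_tokens     */ nullptr,
        /* num_trigger_tokens */ 0);
}
```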
package/src/llama.cpp/src/llama-vocab.cpp

@@ -1245,8 +1245,13 @@ struct llama_vocab::impl {

    std::vector<llama_token> cache_special_tokens;
    std::vector<std::string> cache_token_to_piece; // llama_token_to_piece(special = true);
-
-    std::map<std::pair<std::string, std::string>, int> bpe_ranks;
+    struct pair_hash {
+        size_t operator()(const std::pair<std::string, std::string> & p) const {
+            return std::hash<std::string>{}(p.first) ^ //create some hash for pair
+                   (std::hash<std::string>{}(p.second) << 1);
+        }
+    };
+    std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> bpe_ranks;

    // set of all tokens that cause "end of generation"
    std::set<llama_token> special_eog_ids;
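The bpe_ranks change above swaps an ordered std::map for an std::unordered_map keyed by a string pair, which requires the custom pair_hash functor because the standard library provides no hash for std::pair. A minimal standalone illustration of the same pattern follows; the merge pairs and ranks are made up, whereas llama.cpp fills bpe_ranks from the GGUF merges.

```cpp
// Minimal illustration of the pair_hash + unordered_map pattern from the hunk above.
#include <cstdio>
#include <string>
#include <unordered_map>
#include <utility>

struct pair_hash {
    size_t operator()(const std::pair<std::string, std::string> & p) const {
        // combine the two string hashes, same scheme as in the hunk above
        return std::hash<std::string>{}(p.first) ^ (std::hash<std::string>{}(p.second) << 1);
    }
};

int main() {
    std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> bpe_ranks = {
        {{"t", "h"},  0},
        {{"th", "e"}, 1},
    };

    const auto it = bpe_ranks.find({"t", "h"});
    printf("rank(t, h) = %d\n", it == bpe_ranks.end() ? -1 : it->second);
    return 0;
}
```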
@@ -1523,7 +1528,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                pre_type = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
                clean_spaces = false;
            } else if (
-                tokenizer_pre == "qwen2") {
+                tokenizer_pre == "qwen2" ||
+                tokenizer_pre == "deepseek-r1-qwen") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
                clean_spaces = false;
            } else if (
@@ -1686,7 +1692,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
            linefeed_id = ids[0];
        } else {
-            const std::vector<int> ids = tokenize("\xC4\x8A", false); // U+010A
+            const std::vector<int> ids = tokenize("\n", false);

            //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
            if (ids.empty()) {