@fugood/llama.node 0.3.13 → 0.3.15

This diff shows the published contents of these package versions as they appear in their public registries and is provided for informational purposes only.
Files changed (184)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +89 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/CMakeLists.txt +9 -1
  25. package/src/llama.cpp/cmake/common.cmake +2 -0
  26. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  27. package/src/llama.cpp/common/arg.cpp +132 -13
  28. package/src/llama.cpp/common/chat.cpp +960 -266
  29. package/src/llama.cpp/common/chat.h +135 -0
  30. package/src/llama.cpp/common/common.cpp +33 -174
  31. package/src/llama.cpp/common/common.h +27 -67
  32. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  33. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  34. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  35. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  36. package/src/llama.cpp/common/sampling.cpp +45 -7
  37. package/src/llama.cpp/common/speculative.cpp +10 -9
  38. package/src/llama.cpp/common/speculative.h +1 -1
  39. package/src/llama.cpp/docs/build.md +45 -7
  40. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
  41. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
  42. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
  43. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  44. package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
  45. package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
  46. package/src/llama.cpp/examples/infill/infill.cpp +2 -2
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  48. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  50. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  51. package/src/llama.cpp/examples/llava/clip.h +19 -3
  52. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  53. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  54. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  55. package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
  56. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  57. package/src/llama.cpp/examples/main/main.cpp +79 -34
  58. package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
  59. package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
  60. package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
  61. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  62. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
  63. package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +196 -108
  67. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
  68. package/src/llama.cpp/examples/server/server.cpp +113 -101
  69. package/src/llama.cpp/examples/server/utils.hpp +94 -105
  70. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  74. package/src/llama.cpp/examples/tts/tts.cpp +263 -151
  75. package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
  76. package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
  77. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  79. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  80. package/src/llama.cpp/ggml/include/ggml.h +29 -1
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
  82. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  83. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  84. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  85. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
  87. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
  88. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  89. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
  90. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  91. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  102. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
  103. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
  104. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  105. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  106. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  107. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
  108. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
  109. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  110. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
  111. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  112. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  113. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
  117. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  118. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
  124. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
  125. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
  127. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
  128. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
  129. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
  130. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
  132. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  134. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
  135. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
  139. package/src/llama.cpp/ggml/src/ggml.c +93 -5
  140. package/src/llama.cpp/include/llama.h +105 -27
  141. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  142. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  143. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  144. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  145. package/src/llama.cpp/requirements.txt +1 -0
  146. package/src/llama.cpp/src/CMakeLists.txt +5 -2
  147. package/src/llama.cpp/src/llama-adapter.cpp +19 -20
  148. package/src/llama.cpp/src/llama-adapter.h +11 -9
  149. package/src/llama.cpp/src/llama-arch.cpp +123 -16
  150. package/src/llama.cpp/src/llama-arch.h +19 -0
  151. package/src/llama.cpp/src/llama-batch.h +2 -2
  152. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  153. package/src/llama.cpp/src/llama-context.cpp +2253 -1222
  154. package/src/llama.cpp/src/llama-context.h +214 -77
  155. package/src/llama.cpp/src/llama-cparams.h +1 -0
  156. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  157. package/src/llama.cpp/src/llama-grammar.h +12 -3
  158. package/src/llama.cpp/src/llama-graph.cpp +1662 -0
  159. package/src/llama.cpp/src/llama-graph.h +574 -0
  160. package/src/llama.cpp/src/llama-hparams.cpp +8 -0
  161. package/src/llama.cpp/src/llama-hparams.h +9 -0
  162. package/src/llama.cpp/src/llama-io.cpp +15 -0
  163. package/src/llama.cpp/src/llama-io.h +35 -0
  164. package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
  165. package/src/llama.cpp/src/llama-kv-cache.h +178 -109
  166. package/src/llama.cpp/src/llama-memory.cpp +1 -0
  167. package/src/llama.cpp/src/llama-memory.h +21 -0
  168. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  169. package/src/llama.cpp/src/llama-model.cpp +8230 -122
  170. package/src/llama.cpp/src/llama-model.h +34 -1
  171. package/src/llama.cpp/src/llama-quant.cpp +10 -1
  172. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  173. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  174. package/src/llama.cpp/src/llama.cpp +51 -9837
  175. package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
  176. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  177. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  178. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  179. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  180. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  181. package/src/llama.cpp/common/chat.hpp +0 -55
  182. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
  183. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
  184. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/src/llama-model.h

@@ -2,7 +2,9 @@
 
 #include "llama.h"
 #include "llama-arch.h"
+#include "llama-graph.h"
 #include "llama-hparams.h"
+#include "llama-memory.h"
 #include "llama-vocab.h"
 
 #include <memory>
@@ -10,6 +12,8 @@
 #include <unordered_map>
 #include <vector>
 
+struct llama_cparams;
+struct llama_ubatch;
 struct llama_model_loader;
 
 // available models
@@ -25,6 +29,7 @@ enum llm_type {
     LLM_TYPE_109M,
     LLM_TYPE_137M,
     LLM_TYPE_160M,
+    LLM_TYPE_190M,
     LLM_TYPE_220M,
     LLM_TYPE_250M,
     LLM_TYPE_270M,
@@ -41,6 +46,7 @@ enum llm_type {
     LLM_TYPE_1_6B,
     LLM_TYPE_2B,
     LLM_TYPE_2_8B,
+    LLM_TYPE_2_9B,
     LLM_TYPE_3B,
     LLM_TYPE_4B,
     LLM_TYPE_6B,
@@ -256,6 +262,20 @@ struct llama_layer {
     struct ggml_tensor * time_mix_receptance_b = nullptr;
     struct ggml_tensor * time_mix_gate = nullptr;
 
+    // rwkv7
+    struct ggml_tensor * time_mix_w0 = nullptr;
+    struct ggml_tensor * time_mix_a0 = nullptr;
+    struct ggml_tensor * time_mix_a1 = nullptr;
+    struct ggml_tensor * time_mix_a2 = nullptr;
+    struct ggml_tensor * time_mix_v0 = nullptr;
+    struct ggml_tensor * time_mix_v1 = nullptr;
+    struct ggml_tensor * time_mix_v2 = nullptr;
+    struct ggml_tensor * time_mix_g1 = nullptr;
+    struct ggml_tensor * time_mix_g2 = nullptr;
+    struct ggml_tensor * time_mix_k_k = nullptr;
+    struct ggml_tensor * time_mix_k_a = nullptr;
+    struct ggml_tensor * time_mix_r_k = nullptr;
+
     struct ggml_tensor * time_mix_ln = nullptr;
     struct ggml_tensor * time_mix_ln_b = nullptr;
     struct ggml_tensor * time_mix_output = nullptr;
@@ -347,7 +367,7 @@ struct llama_model {
     std::string desc() const;
 
     size_t size() const;
-    size_t max_nodes() const;
+    size_t n_tensors() const;
     size_t n_devices() const;
 
     // total number of parameters in the model
@@ -362,9 +382,22 @@
 
     const struct ggml_tensor * get_tensor(const char * name) const;
 
+    // TODO: move this to new llm_arch_model_i interface
+    llama_memory_i * create_memory() const; // TODO: params
+
+    // TODO: move this to new llm_arch_model_i interface
+    llm_graph_result_ptr build_graph(
+            const llm_graph_params & params,
+                       ggml_cgraph * gf,
+                    llm_graph_type   type) const;
+
 private:
     struct impl;
     std::unique_ptr<impl> pimpl;
 };
 
 const char * llm_type_name(llm_type type);
+
+// For internal test use
+// TODO: remove
+const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model);
package/src/llama.cpp/src/llama-quant.cpp

@@ -756,10 +756,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // NOTE: can't use LLM_TN here because the layer number is not known
         quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
 
-        // do not quantize RWKV's time_mix_first tensors
+        // do not quantize RWKV's small yet 2D weights
         quantize &= name.find("time_mix_first.weight") == std::string::npos;
+        quantize &= name.find("time_mix_w0.weight") == std::string::npos;
         quantize &= name.find("time_mix_w1.weight") == std::string::npos;
         quantize &= name.find("time_mix_w2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_v0.weight") == std::string::npos;
+        quantize &= name.find("time_mix_v1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_v2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_a0.weight") == std::string::npos;
+        quantize &= name.find("time_mix_a1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_a2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_g1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_g2.weight") == std::string::npos;
         quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
         quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
         quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
package/src/llama.cpp/src/llama-sampling.cpp

@@ -1449,7 +1449,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
                      const char ** trigger_words,
                             size_t num_trigger_words,
                const llama_token * trigger_tokens,
-                            size_t num_trigger_tokens);
+                            size_t num_trigger_tokens,
+                     const char ** trigger_patterns,
+                            size_t num_trigger_patterns);
 
 static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
     auto * ctx = (llama_sampler_grammar *) smpl->ctx;
@@ -1457,12 +1459,14 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
         return;
     }
 
-    std::vector<const char *> trigger_words;
-    for (auto & word : ctx->grammar->trigger_words) {
-        trigger_words.push_back(word.c_str());
+    std::vector<const char *> trigger_patterns_c;
+    trigger_patterns_c.reserve(ctx->grammar->trigger_patterns.size());
+    for (auto & trigger_pattern : ctx->grammar->trigger_patterns) {
+        trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
     }
+
     auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str(),
-            ctx->grammar->lazy, trigger_words.data(), trigger_words.size(),
+            ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
             ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
 
     llama_grammar_free_impl(ctx->grammar);
@@ -1472,7 +1476,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
 static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;
 
-    auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0);
+    auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0, nullptr, 0);
 
     // copy the state
     {
@@ -1516,15 +1520,33 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
                      const char ** trigger_words,
                             size_t num_trigger_words,
                const llama_token * trigger_tokens,
-                            size_t num_trigger_tokens) {
+                            size_t num_trigger_tokens,
+                     const char ** trigger_patterns,
+                            size_t num_trigger_patterns) {
     auto * ctx = new llama_sampler_grammar;
 
     if (grammar_str != nullptr && grammar_str[0] != '\0') {
+        // TODO: remove trigger_words support.
+        if (trigger_words != nullptr && num_trigger_words > 0) {
+            GGML_ASSERT(trigger_patterns == nullptr && num_trigger_patterns == 0);
+            std::string trigger_pattern("[\\s\\S]*?(");
+            for (size_t i = 0; i < num_trigger_words; ++i) {
+                static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
+                if (i > 0) {
+                    trigger_pattern += "|";
+                }
+                trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
+            }
+            trigger_pattern += ")[\\s\\S]*";
+            auto trigger_pattern_c = trigger_pattern.c_str();
+            trigger_patterns = &trigger_pattern_c;
+            num_trigger_patterns = 1;
+        }
         *ctx = {
             /* .vocab        = */ vocab,
             /* .grammar_str  = */ grammar_str,
             /* .grammar_root = */ grammar_root,
-            /* .grammar      = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens),
+            /* .grammar      = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
         };
     } else {
         *ctx = {
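
Aside: the backward-compatibility branch above folds the legacy trigger words into a single regex, escaping any regex metacharacters and wrapping the alternatives in "[\s\S]*?( ... )[\s\S]*". A minimal standalone sketch of that conversion, using "<tool_call>" as an illustrative trigger word (not a value taken from this diff):

    #include <iostream>
    #include <regex>
    #include <string>

    int main() {
        const char * trigger_words[] = { "<tool_call>" };
        const size_t num_trigger_words = 1;
        // same escaping as in llama_sampler_init_grammar_impl above
        static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
        std::string trigger_pattern("[\\s\\S]*?(");
        for (size_t i = 0; i < num_trigger_words; ++i) {
            if (i > 0) {
                trigger_pattern += "|";
            }
            trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
        }
        trigger_pattern += ")[\\s\\S]*";
        // "<tool_call>" contains no metacharacters, so it passes through
        // unchanged; prints: [\s\S]*?(<tool_call>)[\s\S]*
        std::cout << trigger_pattern << "\n";
        return 0;
    }
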
@@ -1545,7 +1567,7 @@ struct llama_sampler * llama_sampler_init_grammar(
         const struct llama_vocab * vocab,
                       const char * grammar_str,
                       const char * grammar_root) {
-    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0);
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0, nullptr, 0);
 }
 
 struct llama_sampler * llama_sampler_init_grammar_lazy(
@@ -1556,7 +1578,18 @@ struct llama_sampler * llama_sampler_init_grammar_lazy(
                             size_t num_trigger_words,
                const llama_token * trigger_tokens,
                             size_t num_trigger_tokens) {
-    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens);
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens, nullptr, 0);
+}
+
+struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
+        const struct llama_vocab * vocab,
+                      const char * grammar_str,
+                      const char * grammar_root,
+                     const char ** trigger_patterns,
+                            size_t num_trigger_patterns,
+               const llama_token * trigger_tokens,
+                            size_t num_trigger_tokens) {
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, nullptr, 0, trigger_tokens, num_trigger_tokens, trigger_patterns, num_trigger_patterns);
 }
 
 // penalties
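
For API consumers, the counterpart of this change is the new llama_sampler_init_grammar_lazy_patterns entry point (see the include/llama.h entry in the file list). A hedged usage sketch, assuming an already-initialized llama_vocab * vocab and a GBNF grammar string grammar_str (both illustrative placeholders; error handling elided):

    // the sampler stays dormant until generated text matches one of the
    // patterns; the first capture group marks where the grammar kicks in
    const char * patterns[] = { "[\\s\\S]*?(<tool_call>)[\\s\\S]*" };
    struct llama_sampler * smpl = llama_sampler_init_grammar_lazy_patterns(
            vocab, grammar_str, "root",
            patterns, /* num_trigger_patterns = */ 1,
            /* trigger_tokens = */ nullptr, /* num_trigger_tokens = */ 0);
    // ... add to a sampler chain and sample as usual, then:
    llama_sampler_free(smpl);
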
package/src/llama.cpp/src/llama-vocab.cpp

@@ -16,6 +16,7 @@
 #include <queue>
 #include <set>
 #include <unordered_map>
+#include <cctype>
 
 //
 // helpers
@@ -392,6 +393,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_GPT4O:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -1592,6 +1600,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         } else if (
                 tokenizer_pre == "megrez") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+        } else if (
+                tokenizer_pre == "gpt-4o") {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
+            clean_spaces = false;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
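
The "gpt-4o" string matched above is read from the model's GGUF metadata (key tokenizer.ggml.pre). A hedged sketch of inspecting that key with the public gguf API, assuming the gguf.h interface bundled at this revision (error handling elided):

    #include "gguf.h"
    #include <cstdio>

    int main(int argc, char ** argv) {
        // open the GGUF file without allocating tensor data
        struct gguf_init_params params = { /* .no_alloc = */ true, /* .ctx = */ nullptr };
        struct gguf_context * gctx = gguf_init_from_file(argv[1], params);
        const int64_t kid = gguf_find_key(gctx, "tokenizer.ggml.pre");
        if (kid >= 0) {
            // e.g. "gpt-4o" for a model using the new pre-tokenizer
            printf("pre-tokenizer: %s\n", gguf_get_val_str(gctx, kid));
        }
        gguf_free(gctx);
        return 0;
    }
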