@fugood/llama.node 0.3.15 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203)
  1. package/CMakeLists.txt +3 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +5 -0
  19. package/package.json +1 -1
  20. package/src/LlamaCompletionWorker.cpp +8 -0
  21. package/src/LlamaCompletionWorker.h +1 -0
  22. package/src/LlamaContext.cpp +3 -2
  23. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +70 -27
  25. package/src/llama.cpp/.github/workflows/docker.yml +6 -6
  26. package/src/llama.cpp/.github/workflows/server.yml +7 -11
  27. package/src/llama.cpp/CMakeLists.txt +23 -1
  28. package/src/llama.cpp/common/CMakeLists.txt +6 -3
  29. package/src/llama.cpp/common/arg.cpp +809 -105
  30. package/src/llama.cpp/common/arg.h +9 -0
  31. package/src/llama.cpp/common/chat.cpp +1 -1
  32. package/src/llama.cpp/common/common.cpp +31 -521
  33. package/src/llama.cpp/common/common.h +17 -36
  34. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  35. package/src/llama.cpp/common/llguidance.cpp +30 -47
  36. package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
  37. package/src/llama.cpp/common/minja/minja.hpp +119 -93
  38. package/src/llama.cpp/common/sampling.cpp +3 -0
  39. package/src/llama.cpp/docs/build.md +122 -7
  40. package/src/llama.cpp/examples/CMakeLists.txt +0 -9
  41. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
  43. package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
  44. package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
  45. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
  46. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
  48. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
  50. package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
  51. package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
  52. package/src/llama.cpp/examples/llava/clip.h +39 -22
  53. package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +64 -52
  55. package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
  56. package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
  57. package/src/llama.cpp/examples/llava/mtmd.h +168 -0
  58. package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
  59. package/src/llama.cpp/examples/main/main.cpp +16 -5
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
  64. package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
  65. package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
  66. package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
  67. package/src/llama.cpp/examples/run/run.cpp +14 -28
  68. package/src/llama.cpp/examples/server/httplib.h +313 -247
  69. package/src/llama.cpp/examples/server/server.cpp +243 -139
  70. package/src/llama.cpp/examples/server/utils.hpp +51 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  74. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  75. package/src/llama.cpp/examples/tts/tts.cpp +14 -9
  76. package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
  77. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  78. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  79. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  80. package/src/llama.cpp/ggml/include/ggml.h +66 -99
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -8
  82. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  83. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  84. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  85. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  87. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  88. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  89. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  90. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
  91. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  93. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2413 -228
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1004 -13516
  99. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
  101. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
  102. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
  103. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  106. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  107. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  108. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  109. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
  110. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  111. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  112. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  114. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  115. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
  116. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  117. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
  118. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
  119. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +127 -33
  120. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  124. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +29 -293
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +12 -43
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +210 -286
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  136. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  137. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  138. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  139. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
  140. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +692 -126
  141. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +21 -10
  143. package/src/llama.cpp/ggml/src/ggml.c +141 -245
  144. package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
  145. package/src/llama.cpp/include/llama.h +30 -11
  146. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  147. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  148. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  149. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  150. package/src/llama.cpp/requirements/requirements-all.txt +2 -0
  151. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  152. package/src/llama.cpp/src/CMakeLists.txt +3 -2
  153. package/src/llama.cpp/src/llama-adapter.cpp +37 -1
  154. package/src/llama.cpp/src/llama-arch.cpp +161 -17
  155. package/src/llama.cpp/src/llama-arch.h +16 -0
  156. package/src/llama.cpp/src/llama-chat.cpp +82 -17
  157. package/src/llama.cpp/src/llama-chat.h +6 -2
  158. package/src/llama.cpp/src/llama-context.cpp +108 -92
  159. package/src/llama.cpp/src/llama-context.h +1 -2
  160. package/src/llama.cpp/src/llama-graph.cpp +189 -119
  161. package/src/llama.cpp/src/llama-graph.h +26 -6
  162. package/src/llama.cpp/src/llama-hparams.h +13 -0
  163. package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
  164. package/src/llama.cpp/src/llama-kv-cache.h +41 -115
  165. package/src/llama.cpp/src/llama-memory.h +1 -1
  166. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  167. package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
  168. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  169. package/src/llama.cpp/src/llama-model.cpp +1544 -291
  170. package/src/llama.cpp/src/llama-model.h +13 -1
  171. package/src/llama.cpp/src/llama-quant.cpp +29 -8
  172. package/src/llama.cpp/src/llama-sampling.cpp +7 -1
  173. package/src/llama.cpp/src/llama-vocab.cpp +44 -6
  174. package/src/llama.cpp/src/llama.cpp +1 -1
  175. package/src/llama.cpp/tests/CMakeLists.txt +43 -30
  176. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  177. package/src/llama.cpp/tests/test-backend-ops.cpp +139 -57
  178. package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
  179. package/src/llama.cpp/tests/test-chat.cpp +12 -2
  180. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  181. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  182. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  183. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  184. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  185. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  186. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  187. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  188. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  189. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  190. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  191. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  192. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  193. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  194. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  195. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  196. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  197. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  198. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  199. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  200. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  201. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  202. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  203. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
package/src/llama.cpp/common/common.h
@@ -121,10 +121,6 @@ struct common_grammar_trigger {
     common_grammar_trigger_type type;
     std::string value;
     llama_token token = LLAMA_TOKEN_NULL;
-
-    // T can only be nlohmann::ordered_json
-    template <class T> T to_json() const;
-    template <class T> static common_grammar_trigger from_json(const T & in);
 };
 
 // sampling parameters
@@ -184,6 +180,13 @@ struct common_params_sampling {
     std::string print() const;
 };
 
+struct common_params_model {
+    std::string path = ""; // model local path // NOLINT
+    std::string url = ""; // model url to download // NOLINT
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+};
+
 struct common_params_speculative {
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
 
@@ -197,19 +200,11 @@ struct common_params_speculative {
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
 
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
-
-    std::string model = ""; // draft model for speculative decoding // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
+    struct common_params_model model;
 };
 
 struct common_params_vocoder {
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
-
-    std::string model = ""; // model path // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
+    struct common_params_model model;
 
     std::string speaker_file = ""; // speaker file path // NOLINT
 
@@ -267,12 +262,10 @@ struct common_params {
     struct common_params_speculative speculative;
     struct common_params_vocoder vocoder;
 
-    std::string model = ""; // model path // NOLINT
+    struct common_params_model model;
+
     std::string model_alias = ""; // model alias // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
     std::string hf_token = ""; // HF token // NOLINT
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
     std::string prompt = ""; // NOLINT
     std::string system_prompt = ""; // NOLINT
     std::string prompt_file = ""; // store the external prompt file name // NOLINT
@@ -286,6 +279,7 @@ struct common_params {
     std::vector<std::string> in_files;   // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
 
     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
     std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
@@ -347,7 +341,9 @@ struct common_params {
     common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
 
     // multimodal models (see examples/llava)
-    std::string mmproj = ""; // path to multimodal projector // NOLINT
+    struct common_params_model mmproj;
+    bool mmproj_use_gpu = true; // use GPU for multimodal model
+    bool no_mmproj = false; // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
 
     // embedding
@@ -546,26 +542,11 @@ struct llama_model_params common_model_params_to_llama ( common_params
 struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
-struct llama_model * common_load_model_from_url(
-    const std::string & model_url,
-    const std::string & local_path,
-    const std::string & hf_token,
-    const struct llama_model_params & params);
-
-struct llama_model * common_load_model_from_hf(
-    const std::string & repo,
-    const std::string & remote_path,
-    const std::string & local_path,
-    const std::string & hf_token,
-    const struct llama_model_params & params);
-
-std::pair<std::string, std::string> common_get_hf_file(
-    const std::string & hf_repo_with_tag,
-    const std::string & hf_token);
-
 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
 
+std::string get_model_endpoint();
+
 //
 // Batch utils
 //
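The main header change above folds the flat model path/URL/HF fields into the new common_params_model struct, reused for the main model, the speculative draft model, the vocoder, and (newly) the multimodal projector; the standalone common_load_model_from_url / common_load_model_from_hf / common_get_hf_file declarations leave the public header, with get_model_endpoint() exposed instead. A minimal migration sketch for downstream code, using only the fields shown in the hunks (all paths and values are placeholders):

    #include "common.h"

    static void configure(common_params & params) {
        params.model.path    = "models/model.gguf";               // was params.model
        params.model.url     = "https://example.com/model.gguf";  // was params.model_url
        params.model.hf_repo = "org/repo";                        // was params.hf_repo
        params.model.hf_file = "model.gguf";                      // was params.hf_file

        params.speculative.model.path = "models/draft.gguf";    // was params.speculative.model
        params.vocoder.model.path     = "models/vocoder.gguf";  // was params.vocoder.model
        params.mmproj.path            = "models/mmproj.gguf";   // was the plain string params.mmproj
    }
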
package/src/llama.cpp/common/json-schema-to-grammar.cpp
@@ -16,6 +16,9 @@ using json = nlohmann::ordered_json;
 static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
     auto has_max = max_items != std::numeric_limits<int>::max();
 
+    if (max_items == 0) {
+        return "";
+    }
     if (min_items == 0 && max_items == 1) {
         return item_rule + "?";
     }
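The new guard short-circuits zero-size repetitions so they emit an empty rule rather than falling through to the general min/max logic. Illustration only, with hypothetical arguments:

    // e.g. generated for a JSON schema array with "maxItems": 0
    std::string rule = build_repetition("item", /*min_items=*/0, /*max_items=*/0);
    // rule == "": the repetition matches only the empty sequence
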
package/src/llama.cpp/common/llguidance.cpp
@@ -11,25 +11,24 @@ struct llama_sampler_llg {
     std::string grammar_kind;
     std::string grammar_data;
     LlgTokenizer * tokenizer;
-    LlgConstraint * grammar;
-    LlgMaskResult llg_res;
-    bool has_llg_res;
+    LlgMatcher * grammar;
 };
 
-static LlgConstraint * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
-                                             const char * grammar_data) {
+static LlgMatcher * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
+                                          const char * grammar_data) {
     LlgConstraintInit cinit;
     llg_constraint_init_set_defaults(&cinit, tokenizer);
     const char * log_level = getenv("LLGUIDANCE_LOG_LEVEL");
     if (log_level && *log_level) {
         cinit.log_stderr_level = atoi(log_level);
     }
-    auto c = llg_new_constraint_any(&cinit, grammar_kind, grammar_data);
-    if (llg_get_error(c)) {
-        LOG_ERR("llg error: %s\n", llg_get_error(c));
-        llg_free_constraint(c);
+    auto c = llg_new_matcher(&cinit, grammar_kind, grammar_data);
+    if (llg_matcher_get_error(c)) {
+        LOG_ERR("llg error: %s\n", llg_matcher_get_error(c));
+        llg_free_matcher(c);
         return nullptr;
     }
+
     return c;
 }
 
@@ -40,39 +39,29 @@ static const char * llama_sampler_llg_name(const llama_sampler * /*smpl*/) {
 static void llama_sampler_llg_accept_impl(llama_sampler * smpl, llama_token token) {
     auto * ctx = (llama_sampler_llg *) smpl->ctx;
     if (ctx->grammar) {
-        LlgCommitResult res;
-        llg_commit_token(ctx->grammar, token, &res);
-        ctx->has_llg_res = false;
+        llg_matcher_consume_token(ctx->grammar, token);
     }
 }
 
 static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_llg *) smpl->ctx;
     if (ctx->grammar) {
-        if (!ctx->has_llg_res) {
-            if (llg_compute_mask(ctx->grammar, &ctx->llg_res) == 0) {
-                ctx->has_llg_res = true;
+        const uint32_t * mask = llg_matcher_get_mask(ctx->grammar);
+        if (mask == nullptr) {
+            if (llg_matcher_compute_mask(ctx->grammar) == 0) {
+                mask = llg_matcher_get_mask(ctx->grammar);
             } else {
-                LOG_ERR("llg error: %s\n", llg_get_error(ctx->grammar));
-                llg_free_constraint(ctx->grammar);
+                LOG_ERR("llg error: %s\n", llg_matcher_get_error(ctx->grammar));
+                llg_free_matcher(ctx->grammar);
                 ctx->grammar = nullptr;
+                return;
             }
         }
-        if (ctx->has_llg_res) {
-            if (ctx->llg_res.is_stop) {
-                for (size_t i = 0; i < cur_p->size; ++i) {
-                    if (!llama_vocab_is_eog(ctx->vocab, cur_p->data[i].id)) {
-                        cur_p->data[i].logit = -INFINITY;
-                    }
-                }
-            } else {
-                const uint32_t * mask = ctx->llg_res.sample_mask;
-                for (size_t i = 0; i < cur_p->size; ++i) {
-                    auto token = cur_p->data[i].id;
-                    if ((mask[token / 32] & (1 << (token % 32))) == 0) {
-                        cur_p->data[i].logit = -INFINITY;
-                    }
-                }
+
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            auto token = cur_p->data[i].id;
+            if ((mask[token / 32] & (1 << (token % 32))) == 0) {
+                cur_p->data[i].logit = -INFINITY;
             }
         }
     }
@@ -80,14 +69,9 @@ static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array
 
 static void llama_sampler_llg_reset(llama_sampler * smpl) {
     auto * ctx = (llama_sampler_llg *) smpl->ctx;
-    if (!ctx->grammar) {
-        return;
+    if (ctx->grammar) {
+        llg_matcher_reset(ctx->grammar);
     }
-
-    auto * grammar_new = llama_sampler_llg_new(ctx->tokenizer, ctx->grammar_kind.c_str(), ctx->grammar_data.c_str());
-    llg_free_constraint(ctx->grammar);
-    ctx->grammar = grammar_new;
-    ctx->has_llg_res = false;
 }
 
 static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
@@ -102,7 +86,7 @@ static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
     if (ctx->grammar) {
         result_ctx->grammar_kind = ctx->grammar_kind;
         result_ctx->grammar_data = ctx->grammar_data;
-        result_ctx->grammar = llg_clone_constraint(ctx->grammar);
+        result_ctx->grammar = llg_clone_matcher(ctx->grammar);
         result_ctx->tokenizer = llg_clone_tokenizer(ctx->tokenizer);
     }
 }
@@ -114,7 +98,7 @@ static void llama_sampler_llg_free(llama_sampler * smpl) {
     const auto * ctx = (llama_sampler_llg *) smpl->ctx;
 
     if (ctx->grammar) {
-        llg_free_constraint(ctx->grammar);
+        llg_free_matcher(ctx->grammar);
         llg_free_tokenizer(ctx->tokenizer);
     }
 
@@ -239,9 +223,11 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
             /* .grammar_data = */ grammar_data,
             /* .tokenizer    = */ tokenizer,
             /* .grammar      = */ llama_sampler_llg_new(tokenizer, grammar_kind, grammar_data),
-            /* .llg_res      = */ {},
-            /* .has_llg_res  = */ false,
         };
+        if (ctx->grammar) {
+            GGML_ASSERT(((size_t) llama_vocab_n_tokens(vocab) + 31) / 32 * 4 ==
+                        llg_matcher_get_mask_byte_size(ctx->grammar));
+        }
     } else {
         *ctx = {
             /* .vocab        = */ vocab,
@@ -249,15 +235,12 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
             /* .grammar_data = */ {},
             /* .tokenizer    = */ nullptr,
             /* .grammar      = */ nullptr,
-            /* .llg_res      = */ {},
-            /* .has_llg_res  = */ false,
         };
     }
 
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_llg_i,
-        /* .ctx   = */ ctx
-    );
+        /* .ctx   = */ ctx);
 }
 
 #else
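The sampler moves from the LlgConstraint / LlgMaskResult pair to the stateful LlgMatcher API: the matcher caches the token bitmask internally (so the llg_res / has_llg_res fields disappear) and supports an in-place llg_matcher_reset, where the old reset path rebuilt the whole constraint. A hedged lifecycle sketch using only the calls visible in the hunks; the include name and the token argument type are assumptions:

    #include <cstdint>
    #include <llguidance.h>  // assumed header name for the llguidance C API

    static void matcher_lifecycle(LlgTokenizer * tokenizer, const char * kind,
                                  const char * data, uint32_t sampled_token) {
        LlgConstraintInit cinit;
        llg_constraint_init_set_defaults(&cinit, tokenizer);
        LlgMatcher * m = llg_new_matcher(&cinit, kind, data);
        if (llg_matcher_get_error(m)) {
            llg_free_matcher(m);
            return;
        }
        // per decode step: compute the mask lazily; the matcher caches it
        if (llg_matcher_get_mask(m) == nullptr && llg_matcher_compute_mask(m) == 0) {
            const uint32_t * mask = llg_matcher_get_mask(m);
            (void) mask; // a token is allowed iff bit (token % 32) of mask[token / 32] is set
        }
        llg_matcher_consume_token(m, sampled_token); // advance; invalidates the cached mask
        llg_matcher_reset(m);                        // rewind to the start state
        llg_free_matcher(m);
    }

The GGML_ASSERT added in llama_sampler_init_llg encodes the expected mask size: the vocab rounded up to whole 32-bit words, i.e. ((n_tokens + 31) / 32) * 4 bytes.
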
package/src/llama.cpp/common/minja/chat-template.hpp
@@ -9,10 +9,19 @@
 #pragma once
 
 #include "minja.hpp"
-#include <json.hpp>
+
+#include <chrono>
+#include <cstddef>
+#include <cstdio>
+#include <exception>
+#include <iomanip>
+#include <memory>
+#include <sstream>
 #include <string>
 #include <vector>
 
+#include <json.hpp>
+
 using json = nlohmann::ordered_json;
 
 namespace minja {
@@ -425,7 +434,7 @@ class chat_template {
                 auto obj = json {
                     {"tool_calls", tool_calls},
                 };
-                if (!content.is_null() && content != "") {
+                if (!content.is_null() && !content.empty()) {
                     obj["content"] = content;
                 }
                 message["content"] = obj.dump(2);
@@ -435,13 +444,12 @@ class chat_template {
             if (polyfill_tool_responses && role == "tool") {
                 message["role"] = "user";
                 auto obj = json {
-                    {"tool_response", {
-                        {"content", message.at("content")},
-                    }},
+                    {"tool_response", json::object()},
                 };
                 if (message.contains("name")) {
-                    obj["tool_response"]["name"] = message.at("name");
+                    obj["tool_response"]["tool"] = message.at("name");
                 }
+                obj["tool_response"]["content"] = message.at("content");
                 if (message.contains("tool_call_id")) {
                     obj["tool_response"]["tool_call_id"] = message.at("tool_call_id");
                 }
@@ -510,7 +518,7 @@ class chat_template {
     static nlohmann::ordered_json add_system(const nlohmann::ordered_json & messages, const std::string & system_prompt) {
         json messages_with_system = messages;
 
-        if (messages_with_system.size() > 0 && messages_with_system[0].at("role") == "system") {
+        if (!messages_with_system.empty() && messages_with_system[0].at("role") == "system") {
             std::string existing_system = messages_with_system.at(0).at("content");
             messages_with_system[0] = json {
                 {"role", "system"},