@fugood/llama.node 0.3.12 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +2 -1
  18. package/package.json +1 -1
  19. package/src/LlamaCompletionWorker.cpp +14 -0
  20. package/src/LlamaContext.cpp +110 -79
  21. package/src/LlamaContext.h +1 -1
  22. package/src/common.hpp +1 -2
  23. package/src/llama.cpp/.github/workflows/build.yml +95 -13
  24. package/src/llama.cpp/.github/workflows/docker.yml +2 -0
  25. package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  27. package/src/llama.cpp/common/CMakeLists.txt +23 -6
  28. package/src/llama.cpp/common/arg.cpp +292 -14
  29. package/src/llama.cpp/common/chat.cpp +1128 -315
  30. package/src/llama.cpp/common/chat.h +135 -0
  31. package/src/llama.cpp/common/common.cpp +27 -171
  32. package/src/llama.cpp/common/common.h +41 -73
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  34. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  35. package/src/llama.cpp/common/llguidance.cpp +3 -3
  36. package/src/llama.cpp/common/log.cpp +1 -0
  37. package/src/llama.cpp/common/log.h +2 -1
  38. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
  39. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
  40. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  41. package/src/llama.cpp/common/sampling.cpp +93 -49
  42. package/src/llama.cpp/common/speculative.cpp +6 -5
  43. package/src/llama.cpp/common/speculative.h +1 -1
  44. package/src/llama.cpp/docs/build.md +47 -9
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  47. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  48. package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
  49. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
  50. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  52. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  53. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  54. package/src/llama.cpp/examples/llava/clip.h +19 -3
  55. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  56. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  57. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  58. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  59. package/src/llama.cpp/examples/main/main.cpp +73 -28
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +115 -79
  67. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/server/httplib.h +381 -292
  69. package/src/llama.cpp/examples/server/server.cpp +134 -128
  70. package/src/llama.cpp/examples/server/utils.hpp +95 -106
  71. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  72. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  73. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  74. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  75. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  76. package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
  77. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
  79. package/src/llama.cpp/ggml/include/ggml.h +6 -2
  80. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  81. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  82. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  83. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  84. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  85. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  86. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  87. package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
  88. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  89. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  90. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
  96. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
  102. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  103. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  104. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  105. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  106. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  107. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
  109. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  110. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  111. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  112. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  115. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  116. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  117. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  121. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
  124. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  125. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  128. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
  129. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
  130. package/src/llama.cpp/ggml/src/ggml.c +9 -4
  131. package/src/llama.cpp/include/llama.h +32 -14
  132. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  133. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  134. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  135. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  136. package/src/llama.cpp/requirements.txt +1 -0
  137. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  138. package/src/llama.cpp/src/llama-arch.h +1 -0
  139. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  140. package/src/llama.cpp/src/llama-grammar.cpp +183 -183
  141. package/src/llama.cpp/src/llama-grammar.h +13 -4
  142. package/src/llama.cpp/src/llama-impl.h +6 -6
  143. package/src/llama.cpp/src/llama-kv-cache.h +2 -1
  144. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  145. package/src/llama.cpp/src/llama-mmap.h +1 -0
  146. package/src/llama.cpp/src/llama-model.cpp +70 -6
  147. package/src/llama.cpp/src/llama-sampling.cpp +174 -67
  148. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  149. package/src/llama.cpp/src/llama.cpp +154 -5
  150. package/src/llama.cpp/src/unicode.cpp +9 -2
  151. package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
  152. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  153. package/src/llama.cpp/tests/test-chat.cpp +691 -325
  154. package/src/llama.cpp/tests/test-gguf.cpp +4 -4
  155. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  156. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  157. package/src/llama.cpp/tests/test-sampling.cpp +15 -0
  158. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  159. package/src/llama.cpp/common/chat.hpp +0 -52
@@ -345,194 +345,194 @@ const char * llama_grammar_parser::parse_sequence(
345
345
  size_t last_sym_start = rule.size();
346
346
  const char * pos = src;
347
347
 
348
- auto handle_repetitions = [&](int min_times, int max_times) {
348
+ auto handle_repetitions = [&](int min_times, int max_times) {
349
349
 
350
- if (last_sym_start == rule.size()) {
351
- throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
352
- }
350
+ if (last_sym_start == rule.size()) {
351
+ throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
352
+ }
353
353
 
354
- // apply transformation to previous symbol (last_sym_start to end) according to
355
- // the following rewrite rules:
356
- // S{m,n} --> S S S (m times) S'(n-m)
357
- // S'(x) ::= S S'(x-1) |
358
- // (... n-m definitions of these S' rules ...)
359
- // S'(1) ::= S |
360
- // S{m,} --> S S S (m times) S'
361
- // S' ::= S S' |
362
- // S* --> S{0,}
363
- // --> S' ::= S S' |
364
- // S+ --> S{1,}
365
- // --> S S'
366
- // S' ::= S S' |
367
- // S? --> S{0,1}
368
- // --> S'
369
- // S' ::= S |
370
-
371
- llama_grammar_rule prev_rule(rule.begin() + last_sym_start, rule.end());
372
- if (min_times == 0) {
373
- rule.resize(last_sym_start);
374
- } else {
375
- // Repeat the previous elements (min_times - 1) times
376
- for (int i = 1; i < min_times; i++) {
377
- rule.insert(rule.end(), prev_rule.begin(), prev_rule.end());
378
- }
354
+ // apply transformation to previous symbol (last_sym_start to end) according to
355
+ // the following rewrite rules:
356
+ // S{m,n} --> S S S (m times) S'(n-m)
357
+ // S'(x) ::= S S'(x-1) |
358
+ // (... n-m definitions of these S' rules ...)
359
+ // S'(1) ::= S |
360
+ // S{m,} --> S S S (m times) S'
361
+ // S' ::= S S' |
362
+ // S* --> S{0,}
363
+ // --> S' ::= S S' |
364
+ // S+ --> S{1,}
365
+ // --> S S'
366
+ // S' ::= S S' |
367
+ // S? --> S{0,1}
368
+ // --> S'
369
+ // S' ::= S |
370
+
371
+ llama_grammar_rule prev_rule(rule.begin() + last_sym_start, rule.end());
372
+ if (min_times == 0) {
373
+ rule.resize(last_sym_start);
374
+ } else {
375
+ // Repeat the previous elements (min_times - 1) times
376
+ for (int i = 1; i < min_times; i++) {
377
+ rule.insert(rule.end(), prev_rule.begin(), prev_rule.end());
379
378
  }
379
+ }
380
380
 
381
- uint32_t last_rec_rule_id = 0;
382
- auto n_opt = max_times < 0 ? 1 : max_times - min_times;
381
+ uint32_t last_rec_rule_id = 0;
382
+ auto n_opt = max_times < 0 ? 1 : max_times - min_times;
383
383
 
384
- llama_grammar_rule rec_rule(prev_rule);
385
- for (int i = 0; i < n_opt; i++) {
386
- rec_rule.resize(prev_rule.size());
387
- uint32_t rec_rule_id = generate_symbol_id( rule_name);
388
- if (i > 0 || max_times < 0) {
389
- rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
390
- }
391
- rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
392
- rec_rule.push_back({LLAMA_GRETYPE_END, 0});
393
- add_rule( rec_rule_id, rec_rule);
394
- last_rec_rule_id = rec_rule_id;
384
+ llama_grammar_rule rec_rule(prev_rule);
385
+ for (int i = 0; i < n_opt; i++) {
386
+ rec_rule.resize(prev_rule.size());
387
+ uint32_t rec_rule_id = generate_symbol_id( rule_name);
388
+ if (i > 0 || max_times < 0) {
389
+ rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
395
390
  }
396
- if (n_opt > 0) {
397
- rule.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
398
- }
399
- };
391
+ rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
392
+ rec_rule.push_back({LLAMA_GRETYPE_END, 0});
393
+ add_rule( rec_rule_id, rec_rule);
394
+ last_rec_rule_id = rec_rule_id;
395
+ }
396
+ if (n_opt > 0) {
397
+ rule.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
398
+ }
399
+ };
400
400
 
401
- while (*pos) {
402
- if (*pos == '"') { // literal string
403
- pos++;
404
- last_sym_start = rule.size();
405
- while (*pos != '"') {
406
- if (!*pos) {
407
- throw std::runtime_error("unexpected end of input");
408
- }
409
- auto char_pair = parse_char(pos);
410
- pos = char_pair.second;
411
- rule.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
401
+ while (*pos) {
402
+ if (*pos == '"') { // literal string
403
+ pos++;
404
+ last_sym_start = rule.size();
405
+ while (*pos != '"') {
406
+ if (!*pos) {
407
+ throw std::runtime_error("unexpected end of input");
412
408
  }
413
- pos = parse_space(pos + 1, is_nested);
414
- } else if (*pos == '[') { // char range(s)
409
+ auto char_pair = parse_char(pos);
410
+ pos = char_pair.second;
411
+ rule.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
412
+ }
413
+ pos = parse_space(pos + 1, is_nested);
414
+ } else if (*pos == '[') { // char range(s)
415
+ pos++;
416
+ enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
417
+ if (*pos == '^') {
415
418
  pos++;
416
- enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
417
- if (*pos == '^') {
418
- pos++;
419
- start_type = LLAMA_GRETYPE_CHAR_NOT;
419
+ start_type = LLAMA_GRETYPE_CHAR_NOT;
420
+ }
421
+ last_sym_start = rule.size();
422
+ while (*pos != ']') {
423
+ if (!*pos) {
424
+ throw std::runtime_error("unexpected end of input");
420
425
  }
421
- last_sym_start = rule.size();
422
- while (*pos != ']') {
423
- if (!*pos) {
426
+ auto char_pair = parse_char(pos);
427
+ pos = char_pair.second;
428
+ enum llama_gretype type = last_sym_start < rule.size()
429
+ ? LLAMA_GRETYPE_CHAR_ALT
430
+ : start_type;
431
+
432
+ rule.push_back({type, char_pair.first});
433
+ if (pos[0] == '-' && pos[1] != ']') {
434
+ if (!pos[1]) {
424
435
  throw std::runtime_error("unexpected end of input");
425
436
  }
426
- auto char_pair = parse_char(pos);
427
- pos = char_pair.second;
428
- enum llama_gretype type = last_sym_start < rule.size()
429
- ? LLAMA_GRETYPE_CHAR_ALT
430
- : start_type;
431
-
432
- rule.push_back({type, char_pair.first});
433
- if (pos[0] == '-' && pos[1] != ']') {
434
- if (!pos[1]) {
435
- throw std::runtime_error("unexpected end of input");
436
- }
437
- auto endchar_pair = parse_char(pos + 1);
438
- pos = endchar_pair.second;
439
- rule.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
440
- }
441
- }
442
- pos = parse_space(pos + 1, is_nested);
443
- } else if (is_word_char(*pos)) { // rule reference
444
- const char * name_end = parse_name(pos);
445
- uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
446
- pos = parse_space(name_end, is_nested);
447
- last_sym_start = rule.size();
448
- rule.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
449
- } else if (*pos == '(') { // grouping
450
- // parse nested alternates into synthesized rule
451
- pos = parse_space(pos + 1, true);
452
- uint32_t sub_rule_id = generate_symbol_id(rule_name);
453
- pos = parse_alternates(pos, rule_name, sub_rule_id, true);
454
- last_sym_start = rule.size();
455
- // output reference to synthesized rule
456
- rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
457
- if (*pos != ')') {
458
- throw std::runtime_error(std::string("expecting ')' at ") + pos);
437
+ auto endchar_pair = parse_char(pos + 1);
438
+ pos = endchar_pair.second;
439
+ rule.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
459
440
  }
441
+ }
442
+ pos = parse_space(pos + 1, is_nested);
443
+ } else if (is_word_char(*pos)) { // rule reference
444
+ const char * name_end = parse_name(pos);
445
+ uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
446
+ pos = parse_space(name_end, is_nested);
447
+ last_sym_start = rule.size();
448
+ rule.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
449
+ } else if (*pos == '(') { // grouping
450
+ // parse nested alternates into synthesized rule
451
+ pos = parse_space(pos + 1, true);
452
+ uint32_t sub_rule_id = generate_symbol_id(rule_name);
453
+ pos = parse_alternates(pos, rule_name, sub_rule_id, true);
454
+ last_sym_start = rule.size();
455
+ // output reference to synthesized rule
456
+ rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
457
+ if (*pos != ')') {
458
+ throw std::runtime_error(std::string("expecting ')' at ") + pos);
459
+ }
460
+ pos = parse_space(pos + 1, is_nested);
461
+ } else if (*pos == '.') { // any char
462
+ last_sym_start = rule.size();
463
+ rule.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
464
+ pos = parse_space(pos + 1, is_nested);
465
+ } else if (*pos == '*') {
466
+ pos = parse_space(pos + 1, is_nested);
467
+ handle_repetitions(0, -1);
468
+ } else if (*pos == '+') {
469
+ pos = parse_space(pos + 1, is_nested);
470
+ handle_repetitions(1, -1);
471
+ } else if (*pos == '?') {
472
+ pos = parse_space(pos + 1, is_nested);
473
+ handle_repetitions(0, 1);
474
+ } else if (*pos == '{') {
475
+ pos = parse_space(pos + 1, is_nested);
476
+
477
+ if (!is_digit_char(*pos)) {
478
+ throw std::runtime_error(std::string("expecting an int at ") + pos);
479
+ }
480
+ const char * int_end = parse_int(pos);
481
+ int min_times = std::stoul(std::string(pos, int_end - pos));
482
+ pos = parse_space(int_end, is_nested);
483
+
484
+ int max_times = -1;
485
+
486
+ if (*pos == '}') {
487
+ max_times = min_times;
460
488
  pos = parse_space(pos + 1, is_nested);
461
- } else if (*pos == '.') { // any char
462
- last_sym_start = rule.size();
463
- rule.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
464
- pos = parse_space(pos + 1, is_nested);
465
- } else if (*pos == '*') {
466
- pos = parse_space(pos + 1, is_nested);
467
- handle_repetitions(0, -1);
468
- } else if (*pos == '+') {
469
- pos = parse_space(pos + 1, is_nested);
470
- handle_repetitions(1, -1);
471
- } else if (*pos == '?') {
472
- pos = parse_space(pos + 1, is_nested);
473
- handle_repetitions(0, 1);
474
- } else if (*pos == '{') {
489
+ } else if (*pos == ',') {
475
490
  pos = parse_space(pos + 1, is_nested);
476
491
 
477
- if (!is_digit_char(*pos)) {
478
- throw std::runtime_error(std::string("expecting an int at ") + pos);
492
+ if (is_digit_char(*pos)) {
493
+ const char * int_end = parse_int(pos);
494
+ max_times = std::stoul(std::string(pos, int_end - pos));
495
+ pos = parse_space(int_end, is_nested);
479
496
  }
480
- const char * int_end = parse_int(pos);
481
- int min_times = std::stoul(std::string(pos, int_end - pos));
482
- pos = parse_space(int_end, is_nested);
483
-
484
- int max_times = -1;
485
-
486
- if (*pos == '}') {
487
- max_times = min_times;
488
- pos = parse_space(pos + 1, is_nested);
489
- } else if (*pos == ',') {
490
- pos = parse_space(pos + 1, is_nested);
491
-
492
- if (is_digit_char(*pos)) {
493
- const char * int_end = parse_int(pos);
494
- max_times = std::stoul(std::string(pos, int_end - pos));
495
- pos = parse_space(int_end, is_nested);
496
- }
497
497
 
498
- if (*pos != '}') {
499
- throw std::runtime_error(std::string("expecting '}' at ") + pos);
500
- }
501
- pos = parse_space(pos + 1, is_nested);
502
- } else {
503
- throw std::runtime_error(std::string("expecting ',' at ") + pos);
498
+ if (*pos != '}') {
499
+ throw std::runtime_error(std::string("expecting '}' at ") + pos);
504
500
  }
505
- handle_repetitions(min_times, max_times);
501
+ pos = parse_space(pos + 1, is_nested);
506
502
  } else {
507
- break;
503
+ throw std::runtime_error(std::string("expecting ',' at ") + pos);
508
504
  }
505
+ handle_repetitions(min_times, max_times);
506
+ } else {
507
+ break;
509
508
  }
510
- return pos;
511
509
  }
510
+ return pos;
511
+ }
512
512
 
513
513
  const char * llama_grammar_parser::parse_rule(const char * src) {
514
- const char * name_end = parse_name(src);
515
- const char * pos = parse_space(name_end, false);
516
- size_t name_len = name_end - src;
517
- uint32_t rule_id = get_symbol_id(src, name_len);
518
- const std::string name(src, name_len);
519
-
520
- if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
521
- throw std::runtime_error(std::string("expecting ::= at ") + pos);
522
- }
523
- pos = parse_space(pos + 3, true);
514
+ const char * name_end = parse_name(src);
515
+ const char * pos = parse_space(name_end, false);
516
+ size_t name_len = name_end - src;
517
+ uint32_t rule_id = get_symbol_id(src, name_len);
518
+ const std::string name(src, name_len);
519
+
520
+ if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
521
+ throw std::runtime_error(std::string("expecting ::= at ") + pos);
522
+ }
523
+ pos = parse_space(pos + 3, true);
524
524
 
525
- pos = parse_alternates(pos, name, rule_id, false);
525
+ pos = parse_alternates(pos, name, rule_id, false);
526
526
 
527
- if (*pos == '\r') {
528
- pos += pos[1] == '\n' ? 2 : 1;
529
- } else if (*pos == '\n') {
530
- pos++;
531
- } else if (*pos) {
532
- throw std::runtime_error(std::string("expecting newline or end at ") + pos);
533
- }
534
- return parse_space(pos, true);
527
+ if (*pos == '\r') {
528
+ pos += pos[1] == '\n' ? 2 : 1;
529
+ } else if (*pos == '\n') {
530
+ pos++;
531
+ } else if (*pos) {
532
+ throw std::runtime_error(std::string("expecting newline or end at ") + pos);
535
533
  }
534
+ return parse_space(pos, true);
535
+ }
536
536
 
537
537
  bool llama_grammar_parser::parse(const char * src) {
538
538
  try {
@@ -969,7 +969,7 @@ struct llama_grammar * llama_grammar_init_impl(
969
969
  /* .awaiting_trigger = */ false,
970
970
  /* .trigger_buffer = */ "",
971
971
  /* .trigger_tokens = */ {},
972
- /* .trigger_words = */ {},
972
+ /* .trigger_patterns = */ {},
973
973
  };
974
974
  }
975
975
 
@@ -978,19 +978,15 @@ struct llama_grammar * llama_grammar_init_impl(
978
978
  const char * grammar_str,
979
979
  const char * grammar_root,
980
980
  bool lazy,
981
- const char ** trigger_words,
982
- size_t num_trigger_words,
981
+ const char ** trigger_patterns,
982
+ size_t num_trigger_patterns,
983
983
  const llama_token * trigger_tokens,
984
984
  size_t num_trigger_tokens) {
985
985
  llama_grammar_parser parser;
986
986
 
987
987
  // if there is a grammar, parse it
988
- if (!parser.parse(grammar_str)) {
989
- return nullptr;
990
- }
991
-
992
- // will be empty (default) if there are parse errors
993
- if (parser.rules.empty()) {
988
+ // rules will be empty (default) if there are parse errors
989
+ if (!parser.parse(grammar_str) || parser.rules.empty()) {
994
990
  fprintf(stderr, "%s: failed to parse grammar\n", __func__);
995
991
  return nullptr;
996
992
  }
@@ -1054,14 +1050,16 @@ struct llama_grammar * llama_grammar_init_impl(
1054
1050
  } while (true);
1055
1051
 
1056
1052
  std::vector<llama_token> vec_trigger_tokens;
1057
- std::vector<std::string> vec_trigger_words;
1053
+ std::vector<llama_grammar_trigger_pattern> vec_trigger_patterns;
1058
1054
  for (size_t i = 0; i < num_trigger_tokens; i++) {
1059
1055
  GGML_ASSERT(trigger_tokens != nullptr);
1060
1056
  vec_trigger_tokens.push_back(trigger_tokens[i]);
1061
1057
  }
1062
- for (size_t i = 0; i < num_trigger_words; i++) {
1063
- GGML_ASSERT(trigger_words != nullptr);
1064
- vec_trigger_words.push_back(trigger_words[i]);
1058
+ for (size_t i = 0; i < num_trigger_patterns; i++) {
1059
+ GGML_ASSERT(trigger_patterns != nullptr);
1060
+ auto & trigger = vec_trigger_patterns.emplace_back();
1061
+ trigger.pattern = trigger_patterns[i];
1062
+ trigger.regex = std::regex(trigger.pattern);
1065
1063
  }
1066
1064
 
1067
1065
  // Important: vec_rules has to be moved here, not copied, because stacks contains
@@ -1076,7 +1074,7 @@ struct llama_grammar * llama_grammar_init_impl(
1076
1074
  /* .awaiting_trigger = */ lazy,
1077
1075
  /* .trigger_buffer = */ "",
1078
1076
  std::move(vec_trigger_tokens),
1079
- std::move(vec_trigger_words),
1077
+ std::move(vec_trigger_patterns),
1080
1078
  };
1081
1079
  }
1082
1080
 
@@ -1089,7 +1087,7 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
1089
1087
  }
1090
1088
 
1091
1089
  struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) {
1092
- llama_grammar * result = new llama_grammar {
1090
+ auto * result = new llama_grammar {
1093
1091
  grammar.vocab,
1094
1092
  grammar.rules,
1095
1093
  grammar.stacks,
@@ -1098,7 +1096,7 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
1098
1096
  grammar.awaiting_trigger,
1099
1097
  grammar.trigger_buffer,
1100
1098
  grammar.trigger_tokens,
1101
- grammar.trigger_words,
1099
+ grammar.trigger_patterns,
1102
1100
  };
1103
1101
 
1104
1102
  // redirect elements in stacks to point to new rules
@@ -1173,20 +1171,22 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
1173
1171
  LLAMA_LOG_DEBUG("Grammar triggered on token %u (`%s`)", token, piece.c_str());
1174
1172
  return;
1175
1173
  } else {
1176
- // TODO: consider a smarter incremental substring search algorithm (store last position to search from).
1177
1174
  grammar.trigger_buffer += piece;
1178
- for (const auto & word : grammar.trigger_words) {
1179
- auto pos = grammar.trigger_buffer.find(word);
1180
- if (pos != std::string::npos) {
1175
+
1176
+ std::smatch match;
1177
+ for (const auto & trigger_pattern : grammar.trigger_patterns) {
1178
+ if (std::regex_match(grammar.trigger_buffer, match, trigger_pattern.regex)) {
1181
1179
  grammar.awaiting_trigger = false;
1182
- auto constrained_str = grammar.trigger_buffer.substr(pos);
1180
+ // get from the first match to the end of the string
1181
+ auto constrained_str = grammar.trigger_buffer.substr(match.position(1));
1182
+ // std::string constrained_str(match[1].first, grammar.trigger_buffer.end());
1183
1183
  grammar.trigger_buffer.clear();
1184
1184
  llama_grammar_accept_str(grammar, constrained_str);
1185
- LLAMA_LOG_DEBUG("Grammar triggered on word `%s`", word.c_str());
1185
+ LLAMA_LOG_DEBUG("Grammar triggered on regex: '%s'\n", constrained_str.c_str());
1186
1186
  return;
1187
1187
  }
1188
1188
  }
1189
- LLAMA_LOG_DEBUG("Grammar still awaiting trigger after token %d (`%s`) (buffer: `%s`)\n", token, piece.c_str(), grammar.trigger_buffer.c_str());
1189
+ LLAMA_LOG_DEBUG("Grammar still awaiting trigger after token %d (`%s`)\n", token, piece.c_str());
1190
1190
  return;
1191
1191
  }
1192
1192
  }
@@ -3,6 +3,7 @@
3
3
  #include "llama.h"
4
4
 
5
5
  #include <map>
6
+ #include <regex>
6
7
  #include <string>
7
8
  #include <vector>
8
9
 
@@ -105,6 +106,11 @@ struct llama_grammar_parser {
105
106
  void print(FILE * file);
106
107
  };
107
108
 
109
+ struct llama_grammar_trigger_pattern {
110
+ std::string pattern;
111
+ std::regex regex;
112
+ };
113
+
108
114
  struct llama_grammar {
109
115
  // note: allow null vocab for testing (not great)
110
116
  const llama_vocab * vocab;
@@ -116,13 +122,16 @@ struct llama_grammar {
116
122
  llama_partial_utf8 partial_utf8;
117
123
 
118
124
  // lazy grammars wait for trigger words or tokens before constraining the sampling.
119
- // we still ahve trigger_tokens for non-lazy grammars to force printing of special trigger tokens.
125
+ // we still have trigger_tokens for non-lazy grammars to force printing of special trigger tokens.
120
126
  // (useful e.g. for tool_choice=required)
121
127
  bool lazy = false;
122
128
  bool awaiting_trigger = false; // Initialized to true for lazy grammars only
123
129
  std::string trigger_buffer; // Output buffered by lazy grammar. Will be cleared once trigger is found.
124
130
  std::vector<llama_token> trigger_tokens; // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
125
- std::vector<std::string> trigger_words;
131
+ std::vector<llama_grammar_trigger_pattern>
132
+ trigger_patterns; // Regular expressions that trigger a lazy grammar. Must be a full match of the entire generated
133
+ // string, and the grammar will be given the string from the first match group onwards.
134
+
126
135
  };
127
136
 
128
137
  //
@@ -141,8 +150,8 @@ struct llama_grammar * llama_grammar_init_impl(
141
150
  const char * grammar_str,
142
151
  const char * grammar_root,
143
152
  bool lazy,
144
- const char ** trigger_words,
145
- size_t num_trigger_words,
153
+ const char ** trigger_patterns,
154
+ size_t num_trigger_patterns,
146
155
  const llama_token * trigger_tokens,
147
156
  size_t num_trigger_tokens);
148
157
 
@@ -6,13 +6,13 @@
6
6
  #include <vector>
7
7
 
8
8
  #ifdef __GNUC__
9
- #ifdef __MINGW32__
10
- #define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
9
+ # if defined(__MINGW32__) && !defined(__clang__)
10
+ # define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
11
+ # else
12
+ # define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
13
+ # endif
11
14
  #else
12
- #define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
13
- #endif
14
- #else
15
- #define LLAMA_ATTRIBUTE_FORMAT(...)
15
+ # define LLAMA_ATTRIBUTE_FORMAT(...)
16
16
  #endif
17
17
 
18
18
  //
@@ -6,6 +6,7 @@
6
6
 
7
7
  #include <set>
8
8
  #include <vector>
9
+ #include <algorithm>
9
10
 
10
11
  struct llama_kv_cell {
11
12
  llama_pos pos = -1;
@@ -37,7 +38,7 @@ struct llama_kv_cache {
37
38
  bool can_shift = false;
38
39
 
39
40
  // Note: The value of head isn't only used to optimize searching
40
- // for a free KV slot. llama_decode_internal also uses it, so it
41
+ // for a free KV slot. llama_decode_impl also uses it, so it
41
42
  // cannot be freely changed after a slot has been allocated.
42
43
  uint32_t head = 0;
43
44
  uint32_t size = 0;
@@ -8,6 +8,7 @@
8
8
  #include <climits>
9
9
  #include <stdexcept>
10
10
  #include <cerrno>
11
+ #include <algorithm>
11
12
 
12
13
  #ifdef __has_include
13
14
  #if __has_include(<unistd.h>)
@@ -34,6 +35,10 @@
34
35
  #include <io.h>
35
36
  #endif
36
37
 
38
+ #if defined(__APPLE__)
39
+ #include <TargetConditionals.h>
40
+ #endif
41
+
37
42
  // TODO: consider moving to llama-impl.h if needed in more places
38
43
  #if defined(_WIN32)
39
44
  static std::string llama_format_win_err(DWORD err) {
@@ -471,7 +476,11 @@ struct llama_mlock::impl {
471
476
 
472
477
  char* errmsg = std::strerror(errno);
473
478
  bool suggest = (errno == ENOMEM);
474
-
479
+ #if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV)
480
+ // visionOS/tvOS dont't support RLIMIT_MEMLOCK
481
+ // Skip resource limit checks on visionOS/tvOS
482
+ suggest = false;
483
+ #else
475
484
  struct rlimit lock_limit;
476
485
  if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
477
486
  suggest = false;
@@ -479,6 +488,7 @@ struct llama_mlock::impl {
479
488
  if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
480
489
  suggest = false;
481
490
  }
491
+ #endif
482
492
 
483
493
  LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
484
494
  size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
@@ -1,5 +1,6 @@
1
1
  #pragma once
2
2
 
3
+ #include <cstdint>
3
4
  #include <memory>
4
5
  #include <vector>
5
6