@fugood/llama.node 1.4.6 → 1.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +15 -15
  3. package/scripts/llama.cpp.patch +25 -26
  4. package/src/LlamaContext.cpp +2 -2
  5. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  6. package/src/llama.cpp/common/arg.cpp +364 -193
  7. package/src/llama.cpp/common/arg.h +43 -2
  8. package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +36 -18
  9. package/src/llama.cpp/common/chat-parser-xml-toolcall.h +1 -1
  10. package/src/llama.cpp/common/chat-parser.cpp +3 -2
  11. package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
  12. package/src/llama.cpp/common/chat.cpp +272 -0
  13. package/src/llama.cpp/common/common.cpp +130 -67
  14. package/src/llama.cpp/common/common.h +40 -16
  15. package/src/llama.cpp/common/console.cpp +680 -47
  16. package/src/llama.cpp/common/console.h +30 -8
  17. package/src/llama.cpp/common/download.cpp +69 -25
  18. package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
  19. package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
  20. package/src/llama.cpp/common/log.cpp +5 -0
  21. package/src/llama.cpp/common/log.h +1 -0
  22. package/src/llama.cpp/common/peg-parser.cpp +1 -1
  23. package/src/llama.cpp/common/preset.cpp +206 -0
  24. package/src/llama.cpp/common/preset.h +32 -0
  25. package/src/llama.cpp/common/sampling.cpp +91 -92
  26. package/src/llama.cpp/common/sampling.h +11 -6
  27. package/src/llama.cpp/common/speculative.cpp +1 -1
  28. package/src/llama.cpp/ggml/CMakeLists.txt +5 -0
  29. package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
  30. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
  31. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  32. package/src/llama.cpp/ggml/include/ggml.h +7 -8
  33. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +3 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +69 -39
  37. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
  39. package/src/llama.cpp/include/llama.h +18 -1
  40. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  41. package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
  42. package/src/llama.cpp/src/llama-arch.h +9 -2
  43. package/src/llama.cpp/src/llama-batch.cpp +12 -2
  44. package/src/llama.cpp/src/llama-batch.h +4 -2
  45. package/src/llama.cpp/src/llama-context.cpp +99 -29
  46. package/src/llama.cpp/src/llama-context.h +9 -3
  47. package/src/llama.cpp/src/llama-grammar.cpp +233 -33
  48. package/src/llama.cpp/src/llama-grammar.h +20 -1
  49. package/src/llama.cpp/src/llama-graph.cpp +85 -17
  50. package/src/llama.cpp/src/llama-graph.h +17 -4
  51. package/src/llama.cpp/src/llama-hparams.cpp +6 -0
  52. package/src/llama.cpp/src/llama-hparams.h +5 -1
  53. package/src/llama.cpp/src/llama-impl.cpp +4 -0
  54. package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
  55. package/src/llama.cpp/src/llama-kv-cache.h +19 -2
  56. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
  57. package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
  58. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  59. package/src/llama.cpp/src/llama-model.cpp +123 -52
  60. package/src/llama.cpp/src/llama-model.h +1 -0
  61. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  62. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
  63. package/src/llama.cpp/src/llama.cpp +675 -1
  64. package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
  65. package/src/llama.cpp/src/models/{gemma3-iswa.cpp → gemma3.cpp} +30 -5
  66. package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
  67. package/src/llama.cpp/src/models/glm4.cpp +27 -4
  68. package/src/llama.cpp/src/models/models.h +8 -7
  69. package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
  70. package/src/llama.cpp/src/models/qwen2.cpp +12 -3
  71. package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
package/src/llama.cpp/src/llama-grammar.cpp
@@ -181,6 +181,52 @@ static std::pair<uint32_t, const char *> parse_char(const char * src) {
     throw std::runtime_error("unexpected end of input");
 }
 
+static std::pair<uint32_t, const char *> parse_token(const llama_vocab * vocab, const char * src) {
+    const char * pos = src;
+    if (*pos != '<') {
+        throw std::runtime_error(std::string("expecting '<' at ") + pos);
+    }
+    pos++;
+
+    // Parse <[id]>
+    if (*pos == '[') {
+        pos++;
+        const char * int_end = parse_int(pos);
+        uint32_t token_id = std::stoul(std::string(pos, int_end - pos));
+        pos = int_end;
+        if (*pos != ']') {
+            throw std::runtime_error(std::string("expecting ']' at ") + pos);
+        }
+        pos++;
+        if (*pos != '>') {
+            throw std::runtime_error(std::string("expecting '>' at ") + pos);
+        }
+        pos++;
+        return std::make_pair(token_id, pos);
+    }
+
+    if (vocab == nullptr) {
+        throw std::runtime_error(std::string("no vocab to parse token at ") + src);
+    }
+
+    // Parse <token> and tokenize to obtain the token id
+    while (*pos != 0 && *pos != '>') {
+        pos++;
+    }
+    if (*pos != '>') {
+        throw std::runtime_error(std::string("expecting '>' at ") + pos);
+    }
+    pos++;
+
+    llama_token tokens[2];
+    int32_t n_tokens = vocab->tokenize(src, static_cast<int32_t>(pos - src), tokens, 2, false, true);
+    if (n_tokens != 1) {
+        // must tokenize to exactly 1 token
+        throw std::runtime_error("invalid token '" + std::string(src, pos - src) + "'");
+    }
+    return std::make_pair(tokens[0], pos);
+}
+
 static void print_grammar_char(FILE * file, uint32_t c) {
     if (0x20 <= c && c <= 0x7f) {
         fprintf(file, "%c", static_cast<char>(c));
@@ -212,6 +258,8 @@ static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
             case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
             case LLAMA_GRETYPE_CHAR_ALT:       fprintf(file, "CHAR_ALT"); break;
             case LLAMA_GRETYPE_CHAR_ANY:       fprintf(file, "CHAR_ANY"); break;
+            case LLAMA_GRETYPE_TOKEN:          fprintf(file, "TOKEN"); break;
+            case LLAMA_GRETYPE_TOKEN_NOT:      fprintf(file, "TOKEN_NOT"); break;
         }
         switch (elem.type) {
             case LLAMA_GRETYPE_END:
@@ -228,6 +276,17 @@ static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
                 print_grammar_char(file, elem.value);
                 fprintf(file, "\") ");
                 break;
+            case LLAMA_GRETYPE_TOKEN:
+                fprintf(file, "<[");
+                fprintf(file, "%u", elem.value);
+                fprintf(file, "]> ");
+                break;
+            case LLAMA_GRETYPE_TOKEN_NOT:
+                fprintf(file, "!");
+                fprintf(file, "<[");
+                fprintf(file, "%u", elem.value);
+                fprintf(file, "]> ");
+                break;
         }
     }
     fprintf(file, "\n");
@@ -284,6 +343,17 @@ static void print_rule(
             case LLAMA_GRETYPE_CHAR_ANY:
                 fprintf(file, ".");
                 break;
+            case LLAMA_GRETYPE_TOKEN:
+                fprintf(file, "<[");
+                fprintf(file, "%u", elem.value);
+                fprintf(file, "]> ");
+                break;
+            case LLAMA_GRETYPE_TOKEN_NOT:
+                fprintf(file, "!");
+                fprintf(file, "<[");
+                fprintf(file, "%u", elem.value);
+                fprintf(file, "]> ");
+                break;
         }
         if (is_char_element(elem)) {
             switch (rule[i + 1].type) {
@@ -444,6 +514,17 @@ const char * llama_grammar_parser::parse_sequence(
                 }
             }
             pos = parse_space(pos + 1, is_nested);
+        } else if (*pos == '<' || *pos == '!') { // token
+            auto type = LLAMA_GRETYPE_TOKEN;
+            if (*pos == '!') { // token inverse
+                type = LLAMA_GRETYPE_TOKEN_NOT;
+                pos++;
+            }
+            auto token_pair = parse_token(vocab, pos);
+            const char * token_end = token_pair.second;
+            last_sym_start = rule.size();
+            rule.push_back({type, token_pair.first});
+            pos = parse_space(token_end, is_nested);
         } else if (is_word_char(*pos)) { // rule reference
             const char * name_end = parse_name(pos);
             uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
@@ -691,6 +772,21 @@ static bool llama_grammar_match_partial_char(
     return !is_positive_char;
 }
 
+// returns true iff token matches the rule at pos (regular or inverse)
+// asserts that pos is pointing to a token element
+static bool llama_grammar_match_token(
+        const llama_grammar_element * pos,
+        const llama_token token) {
+    GGML_ASSERT(pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT);
+    if (pos->type == LLAMA_GRETYPE_TOKEN) {
+        return pos->value == static_cast<uint32_t>(token);
+    }
+    if (pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+        return pos->value != static_cast<uint32_t>(token);
+    }
+    return false;
+}
+
 // transforms a grammar pushdown stack into N possible stacks, all ending
 // at a character range (terminal element)
 static void llama_grammar_advance_stack(
@@ -738,6 +834,8 @@ static void llama_grammar_advance_stack(
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
         case LLAMA_GRETYPE_CHAR_ANY:
+        case LLAMA_GRETYPE_TOKEN:
+        case LLAMA_GRETYPE_TOKEN_NOT:
             if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
                 // only add the stack if it's not a duplicate of one we already have
                 new_stacks.emplace_back(stack);
@@ -831,26 +929,38 @@ llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar)
     return grammar->stacks;
 }
 
+static void llama_grammar_accept_chr(
+        struct llama_grammar & grammar,
+        const llama_grammar_stack & stack,
+        uint32_t chr,
+        llama_grammar_stacks & new_stacks) {
+    if (stack.empty()) {
+        return;
+    }
+
+    const llama_grammar_element * pos = stack.back();
+
+    // ignore if this turns into a token
+    if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+        return;
+    }
+
+    auto match = llama_grammar_match_char(pos, chr);
+    if (match.first) {
+        llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
+        if (!llama_grammar_is_end_of_sequence(match.second)) {
+            new_stack.push_back(match.second);
+        }
+        llama_grammar_advance_stack(grammar.rules, new_stack, new_stacks);
+    }
+}
+
 void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr) {
     llama_grammar_stacks stacks_new;
     stacks_new.reserve(grammar->stacks.size());
 
     for (const auto & stack : grammar->stacks) {
-        if (stack.empty()) {
-            continue;
-        }
-
-        auto match = llama_grammar_match_char(stack.back(), chr);
-        if (match.first) {
-            const llama_grammar_element * pos = match.second;
-
-            // update top of stack to next element, if any
-            llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
-            if (!llama_grammar_is_end_of_sequence(pos)) {
-                new_stack.push_back(pos);
-            }
-            llama_grammar_advance_stack(grammar->rules, new_stack, stacks_new);
-        }
+        llama_grammar_accept_chr(*grammar, stack, chr, stacks_new);
     }
 
     grammar->stacks = std::move(stacks_new);
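
The refactor above pulls the per-stack character step out of llama_grammar_accept into llama_grammar_accept_chr, with an early return on token elements (a character can never satisfy a token terminal), so the token-aware acceptance path added further down can reuse it per code point. A condensed sketch of the char-level driving loop, mirroring what llama_grammar_accept_str already does internally:

    // feed a UTF-8 piece through the char-level path, one code point at a time
    void accept_piece_chars(struct llama_grammar & grammar, const std::string & piece) {
        const auto decoded = decode_utf8(piece, grammar.partial_utf8);
        const auto & code_points = decoded.first;
        // decode_utf8 appends a terminating 0, so stop one short of end()
        for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
            llama_grammar_accept(&grammar, *it);
        }
        grammar.partial_utf8 = decoded.second;
    }
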
@@ -875,6 +985,22 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
 
     const llama_grammar_element * stack_pos = stack.back();
 
+    // if the top of the stack is a token rule, then we only need to check the token id
+    if (stack_pos->type == LLAMA_GRETYPE_TOKEN || stack_pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+        for (const auto & tok : candidates) {
+            if (*tok.code_points == 0) {
+                // reached the end of a token consumed by char rules, reject iff it ended
+                // in a partial response
+                if (tok.partial_utf8.n_remain != 0) {
+                    rejects.push_back(tok);
+                }
+            } else if (!llama_grammar_match_token(stack_pos, tok.id)) {
+                rejects.push_back(tok);
+            }
+        }
+        return rejects;
+    }
+
     llama_grammar_candidates next_candidates;
     next_candidates.reserve(candidates.size());
 
@@ -887,7 +1013,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
                 rejects.push_back(tok);
             }
         } else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
-            next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
+            next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8, tok.id });
         } else {
             rejects.push_back(tok);
         }
@@ -905,7 +1031,7 @@
 
     auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
     for (const auto & tok : next_rejects) {
-        rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
+        rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8, tok.id });
     }
 
     return rejects;
@@ -972,12 +1098,13 @@ struct llama_grammar * llama_grammar_init_impl(
         vocab,
         std::move(vec_rules),
         std::move(stacks),
-        /* .partial_utf8 = */ {},
-        /* .lazy =*/ false,
-        /* .awaiting_trigger = */ false,
-        /* .trigger_buffer = */ "",
-        /* .trigger_tokens = */ {},
-        /* .trigger_patterns = */ {},
+        /* .partial_utf8 = */             {},
+        /* .lazy = */                     false,
+        /* .awaiting_trigger = */         false,
+        /* .trigger_buffer = */           "",
+        /* .trigger_buffer_positions = */ {},
+        /* .trigger_tokens = */           {},
+        /* .trigger_patterns = */         {},
     };
 }
 
@@ -990,7 +1117,7 @@ struct llama_grammar * llama_grammar_init_impl(
         size_t num_trigger_patterns,
         const llama_token * trigger_tokens,
         size_t num_trigger_tokens) {
-    llama_grammar_parser parser;
+    llama_grammar_parser parser(vocab);
 
     // if there is a grammar, parse it
     // rules will be empty (default) if there are parse errors
@@ -1077,10 +1204,11 @@
         vocab,
         std::move(vec_rules),
         std::move(stacks),
-        /* .partial_utf8 = */ {},
-        /* .lazy = */ lazy,
-        /* .awaiting_trigger = */ lazy,
-        /* .trigger_buffer = */ "",
+        /* .partial_utf8 = */             {},
+        /* .lazy = */                     lazy,
+        /* .awaiting_trigger = */         lazy,
+        /* .trigger_buffer = */           "",
+        /* .trigger_buffer_positions = */ {},
         std::move(vec_trigger_tokens),
         std::move(vec_trigger_patterns),
     };
@@ -1103,6 +1231,7 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
         grammar.lazy,
         grammar.awaiting_trigger,
         grammar.trigger_buffer,
+        grammar.trigger_buffer_positions,
         grammar.trigger_tokens,
         grammar.trigger_patterns,
     };
@@ -1156,7 +1285,7 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
             cur_p->data[i].logit = -INFINITY;
         } else {
             candidates_decoded.push_back(decode_utf8(piece, grammar.partial_utf8));
-            candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
+            candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second, id });
         }
     }
 
@@ -1175,10 +1304,12 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
         if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
             grammar.awaiting_trigger = false;
             grammar.trigger_buffer.clear();
-            llama_grammar_accept_str(grammar, piece);
+            llama_grammar_accept_token(grammar, token, piece);
             LLAMA_LOG_DEBUG("Grammar triggered on token %u (`%s`)", token, piece.c_str());
             return;
         } else {
+            auto position = std::make_pair(grammar.trigger_buffer.size(), grammar.trigger_buffer.size() + piece.size());
+            grammar.trigger_buffer_positions.push_back(std::make_pair(token, position));
             grammar.trigger_buffer += piece;
 
             std::smatch match;
@@ -1196,10 +1327,23 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
                     if (start == std::string::npos) {
                         start = match.position(0);
                     }
+
+                    // replay tokens that overlap with [start, end)
+                    for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) {
+                        auto [tok_start, tok_end] = tok_pos;
+                        if (tok_end <= start) {
+                            continue;
+                        }
+
+                        size_t piece_start = (tok_start < start) ? start : tok_start; // allow for partial token pieces
+                        size_t piece_len = tok_end - piece_start;
+                        auto tok_piece = grammar.trigger_buffer.substr(piece_start, piece_len);
+                        llama_grammar_accept_token(grammar, tok, tok_piece);
+                    }
+
                     auto constrained_str = grammar.trigger_buffer.substr(start);
-                    // std::string constrained_str(match[1].first, grammar.trigger_buffer.end());
                     grammar.trigger_buffer.clear();
-                    llama_grammar_accept_str(grammar, constrained_str);
+                    grammar.trigger_buffer_positions.clear();
                     LLAMA_LOG_DEBUG("Grammar triggered on regex: '%s'\n", constrained_str.c_str());
                     return;
                 }
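
The replay above is the point of trigger_buffer_positions: while a lazy grammar waits for its trigger, each buffered token records its [start, end) byte span inside trigger_buffer, and once a pattern matches at byte offset start, every token whose span reaches past that offset is re-fed through the token-aware path, clipped so a token straddling the boundary only contributes its constrained suffix. A standalone sketch of that clipping arithmetic (buffer contents and token ids are made up):

    // Standalone illustration of the replay clipping (all values made up):
    // trigger_buffer holds "xx<tool>", buffered as token 101 -> "xx" at [0,2)
    // and token 102 -> "<tool>" at [2,8); the trigger pattern matched at byte 2.
    #include <cstdio>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
        std::string buffer = "xx<tool>";
        std::vector<std::pair<int, std::pair<size_t, size_t>>> positions = {
            { 101, { 0, 2 } }, // token 101 produced "xx"
            { 102, { 2, 8 } }, // token 102 produced "<tool>"
        };
        size_t start = 2; // byte offset where the trigger match begins

        for (const auto & [tok, pos] : positions) {
            auto [tok_start, tok_end] = pos;
            if (tok_end <= start) {
                continue; // token lies entirely before the constrained region
            }
            // clip a token straddling the boundary to its constrained suffix
            size_t piece_start = tok_start < start ? start : tok_start;
            std::string piece = buffer.substr(piece_start, tok_end - piece_start);
            std::printf("replay token %d with piece '%s'\n", tok, piece.c_str());
        }
        return 0;
    }

Running this prints only "replay token 102 with piece '<tool>'": token 101 ended before the match and is skipped entirely.
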
@@ -1218,7 +1362,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
         GGML_ABORT("fatal error");
     }
 
-    llama_grammar_accept_str(grammar, piece);
+    llama_grammar_accept_token(grammar, token, piece);
 }
 
 void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string & piece) {
@@ -1235,3 +1379,59 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string
         throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece);
     }
 }
+
+void llama_grammar_accept_token(struct llama_grammar & grammar, llama_token token, const std::string & piece) {
+    // Note terminating 0 in decoded string
+    const auto decoded = decode_utf8(piece, grammar.partial_utf8);
+    const auto & code_points = decoded.first;
+
+    llama_grammar_stacks stacks_new;
+    stacks_new.reserve(grammar.stacks.size());
+
+    for (const auto & stack : grammar.stacks) {
+        if (stack.empty()) {
+            continue;
+        }
+
+        const llama_grammar_element * pos = stack.back();
+
+        if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+            if (llama_grammar_match_token(pos, token)) {
+                llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
+                if (!llama_grammar_is_end_of_sequence(pos + 1)) {
+                    new_stack.push_back(pos + 1);
+                }
+                llama_grammar_advance_stack(grammar.rules, new_stack, stacks_new);
+            }
+        } else {
+            llama_grammar_stacks current_stacks = {stack};
+
+            for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+                llama_grammar_stacks next_stacks;
+
+                for (const auto & cur_stack : current_stacks) {
+                    llama_grammar_accept_chr(grammar, cur_stack, *it, next_stacks);
+                }
+
+                current_stacks = std::move(next_stacks);
+                if (current_stacks.empty()) {
+                    break;
+                }
+            }
+
+            for (auto & surviving_stack : current_stacks) {
+                if (std::find(stacks_new.begin(), stacks_new.end(), surviving_stack) == stacks_new.end()) {
+                    stacks_new.emplace_back(surviving_stack);
+                }
+            }
+        }
+    }
+
+    grammar.stacks = std::move(stacks_new);
+    grammar.partial_utf8 = decoded.second;
+
+    if (grammar.stacks.empty()) {
+        throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece + " (" + std::to_string(token) + ")");
+    }
+}
+
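
To see both paths of llama_grammar_accept_token in one place, here is a hedged usage sketch against the internal API (not the public llama.h surface); the grammar string, token ids, and pieces are illustrative, and with a null vocab only the <[id]> spelling is available:

    // build a grammar whose root is two token terminals in sequence
    llama_grammar * g = llama_grammar_init_impl(
        /* vocab            */ nullptr,                 // <[id]> forms need no vocab
        /* grammar_str      */ "root ::= <[7]> <[8]>",
        /* grammar_root     */ "root",
        /* lazy             */ false,
        /* trigger_patterns */ nullptr, 0,
        /* trigger_tokens   */ nullptr, 0);

    // token path: each call pops one matching LLAMA_GRETYPE_TOKEN element;
    // a non-matching token leaves no viable stack and accept_token throws
    llama_grammar_accept_token(*g, 7, "<piece-of-7>");
    llama_grammar_accept_token(*g, 8, "<piece-of-8>");

    llama_grammar_free_impl(g);

Stacks whose top element is a char-type terminal instead take the else branch, which walks the piece's code points through llama_grammar_accept_chr per stack.
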
package/src/llama.cpp/src/llama-grammar.h
@@ -36,11 +36,17 @@ enum llama_gretype {
 
     // any character (.)
     LLAMA_GRETYPE_CHAR_ANY = 7,
+
+    // terminal element: token (<[token-id]>)
+    LLAMA_GRETYPE_TOKEN = 8,
+
+    // inverse token (!<[token-id]>)
+    LLAMA_GRETYPE_TOKEN_NOT = 9,
 };
 
 typedef struct llama_grammar_element {
     enum llama_gretype type;
-    uint32_t value; // Unicode code point or rule ID
+    uint32_t value; // Unicode code point, rule ID, or token ID
 } llama_grammar_element;
 
 struct llama_partial_utf8 {
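
The new enumerators reuse the existing element layout, so a token terminal is just a type tag plus the token id in the value field. A small illustration (the id is arbitrary):

    llama_grammar_element accept = { LLAMA_GRETYPE_TOKEN,     42 }; // <[42]>
    llama_grammar_element reject = { LLAMA_GRETYPE_TOKEN_NOT, 42 }; // !<[42]>
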
@@ -52,6 +58,7 @@ struct llama_grammar_candidate {
     size_t index;
     const uint32_t * code_points;
     llama_partial_utf8 partial_utf8;
+    llama_token id;
 };
 
 using llama_grammar_rule = std::vector< llama_grammar_element>;
@@ -77,10 +84,13 @@ std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
     const llama_grammar_candidates & candidates);
 
 struct llama_grammar_parser {
+    const llama_vocab * vocab;
     std::map<std::string, uint32_t> symbol_ids;
 
     llama_grammar_rules rules;
 
+    llama_grammar_parser(const struct llama_vocab * vocab = nullptr) : vocab(vocab) {}
+
     llama_grammar_stack c_rules() const;
 
     uint32_t get_symbol_id(const char * src, size_t len);
@@ -112,6 +122,9 @@ struct llama_grammar_trigger_pattern {
 };
 
 struct llama_grammar {
+    // maintain a list of llama_tokens and their positions in the trigger_buffer
+    using token_pos = std::pair<llama_token, std::pair<size_t, size_t>>;
+
     // note: allow null vocab for testing (not great)
     const llama_vocab * vocab;
 
@@ -127,6 +140,7 @@ struct llama_grammar {
     bool lazy = false;
     bool awaiting_trigger = false; // Initialized to true for lazy grammars only
     std::string trigger_buffer; // Output buffered by lazy grammar. Will be cleared once trigger is found.
+    std::vector<token_pos> trigger_buffer_positions; // Tokens buffered by lazy grammar. Used to replay when a trigger is found.
    std::vector<llama_token> trigger_tokens; // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
     std::vector<llama_grammar_trigger_pattern>
         trigger_patterns; // Regular expressions that trigger a lazy grammar. Must be a full match of the entire generated
@@ -171,3 +185,8 @@ void llama_grammar_accept_impl(
 void llama_grammar_accept_str(
         struct llama_grammar & grammar,
         const std::string & piece);
+
+void llama_grammar_accept_token(
+        struct llama_grammar & grammar,
+        llama_token token,
+        const std::string & piece);