@fugood/llama.node 1.4.6 → 1.4.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -168,6 +168,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
  option(GGML_RV_ZFH "ggml: enable riscv zfh" ON)
  option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON)
  option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
+ option(GGML_RV_ZIHINTPAUSE "ggml: enable riscv zihintpause " ON)
  option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
  option(GGML_VXE "ggml: enable vxe" ${GGML_NATIVE})

@@ -469,6 +469,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
  if (GGML_RV_ZICBOP)
  string(APPEND MARCH_STR "_zicbop")
  endif()
+ if (GGML_RV_ZIHINTPAUSE)
+ string(APPEND MARCH_STR "_zihintpause")
+ endif()
  list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
  else()
  # Begin with the lowest baseline
@@ -490,6 +490,15 @@ static inline void ggml_thread_cpu_relax(void) {
  static inline void ggml_thread_cpu_relax(void) {
  _mm_pause();
  }
+ #elif defined(__riscv)
+ static inline void ggml_thread_cpu_relax(void) {
+ #ifdef __riscv_zihintpause
+ __asm__ __volatile__ ("pause");
+ #else
+ /* Encoding of the pause instruction */
+ __asm__ __volatile__ (".4byte 0x100000F");
+ #endif
+ }
  #else
  static inline void ggml_thread_cpu_relax(void) {;}
  #endif
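For context, ggml calls this relax hint from spin-wait loops where worker threads poll shared state. Below is a minimal standalone C++ sketch of that pattern (the cpu_relax name and the polling loop are illustrative, not ggml's actual scheduler code); on RISC-V it assumes the compiler defines __riscv_zihintpause when the extension is enabled, mirroring the #ifdef in the hunk above:

    #include <atomic>
    #include <thread>
    #if defined(__x86_64__) || defined(_M_X64)
    #include <immintrin.h>
    #endif

    // Illustrative stand-in for ggml_thread_cpu_relax(): hint the core that we
    // are busy-waiting so it can throttle down or yield pipeline resources.
    static inline void cpu_relax() {
    #if defined(__x86_64__) || defined(_M_X64)
        _mm_pause();
    #elif defined(__riscv) && defined(__riscv_zihintpause)
        __asm__ __volatile__ ("pause");
    #else
        // no hint available on this target; plain busy-wait
    #endif
    }

    int main() {
        std::atomic<bool> ready{false};
        std::thread producer([&] { ready.store(true, std::memory_order_release); });

        // spin until the producer publishes, relaxing the core between probes
        while (!ready.load(std::memory_order_acquire)) {
            cpu_relax();
        }
        producer.join();
    }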
@@ -67,7 +67,7 @@ add_library(llama
  models/gemma-embedding.cpp
  models/gemma.cpp
  models/gemma2-iswa.cpp
- models/gemma3-iswa.cpp
+ models/gemma3.cpp
  models/gemma3n-iswa.cpp
  models/glm4-moe.cpp
  models/glm4.cpp
@@ -139,6 +139,7 @@ add_library(llama
  set_target_properties(llama PROPERTIES
  VERSION ${LLAMA_INSTALL_VERSION}
  SOVERSION 0
+ MACHO_CURRENT_VERSION 0 # keep macOS linker from seeing oversized version number
  )

  target_include_directories(llama PRIVATE .)
@@ -248,7 +248,10 @@ llama_context::llama_context(

  LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());

- const size_t max_nodes = this->graph_max_nodes();
+ const uint32_t n_seqs = cparams.n_seq_max;
+ const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+
+ const size_t max_nodes = this->graph_max_nodes(n_tokens);

  LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);

@@ -300,9 +303,6 @@ llama_context::llama_context(

  cross.v_embd.clear();

- const uint32_t n_seqs = cparams.n_seq_max;
- const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
  // avoid reserving graphs with zero outputs - assume one output per sequence
  n_outputs = n_seqs;

@@ -1386,9 +1386,9 @@ void llama_context::output_reorder() {
  // graph
  //

- uint32_t llama_context::graph_max_nodes() const {
+ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
  if (model.arch == LLM_ARCH_QWEN3NEXT) {
- return std::max<uint32_t>(8192u, 32u*model.n_tensors());
+ return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
  }
  return std::max<uint32_t>(1024u, 8u*model.n_tensors());
  }
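For a sense of scale, the QWEN3NEXT branch now lets the node budget grow with the micro-batch instead of being pinned at 8192. A small worked example with assumed numbers (the ubatch size and tensor count below are illustrative, not values from this diff):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t n_tokens  = 2048; // assumed min(n_ctx, n_ubatch)
        const uint32_t n_tensors = 1000; // assumed model tensor count

        // old budget: fixed floor of 8192 nodes
        const uint32_t old_nodes = std::max<uint32_t>(8192u, 32u * n_tensors);
        // new budget: floor scales with the ubatch (n_tokens * 40)
        const uint32_t new_nodes = std::max<uint32_t>(n_tokens * 40, 32u * n_tensors);

        std::printf("old = %u, new = %u\n", old_nodes, new_nodes); // old = 32000, new = 81920
    }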
@@ -197,7 +197,7 @@ private:
  //

  public:
- uint32_t graph_max_nodes() const;
+ uint32_t graph_max_nodes(uint32_t n_tokens) const;

  // can reuse the llm_graph_result instance of the context (for example to update a memory module)
  llm_graph_result * get_gf_res_reserve() const;
@@ -181,6 +181,52 @@ static std::pair<uint32_t, const char *> parse_char(const char * src) {
  throw std::runtime_error("unexpected end of input");
  }

+ static std::pair<uint32_t, const char *> parse_token(const llama_vocab * vocab, const char * src) {
+ const char * pos = src;
+ if (*pos != '<') {
+ throw std::runtime_error(std::string("expecting '<' at ") + pos);
+ }
+ pos++;
+
+ // Parse <[id]>
+ if (*pos == '[') {
+ pos++;
+ const char * int_end = parse_int(pos);
+ uint32_t token_id = std::stoul(std::string(pos, int_end - pos));
+ pos = int_end;
+ if (*pos != ']') {
+ throw std::runtime_error(std::string("expecting ']' at ") + pos);
+ }
+ pos++;
+ if (*pos != '>') {
+ throw std::runtime_error(std::string("expecting '>' at ") + pos);
+ }
+ pos++;
+ return std::make_pair(token_id, pos);
+ }
+
+ if (vocab == nullptr) {
+ throw std::runtime_error(std::string("no vocab to parse token at ") + src);
+ }
+
+ // Parse <token> and tokenize to obtain the token id
+ while (*pos != 0 && *pos != '>') {
+ pos++;
+ }
+ if (*pos != '>') {
+ throw std::runtime_error(std::string("expecting '>' at ") + pos);
+ }
+ pos++;
+
+ llama_token tokens[2];
+ int32_t n_tokens = vocab->tokenize(src, static_cast<int32_t>(pos - src), tokens, 2, false, true);
+ if (n_tokens != 1) {
+ // must tokenize to exactly 1 token
+ throw std::runtime_error("invalid token '" + std::string(src, pos - src) + "'");
+ }
+ return std::make_pair(tokens[0], pos);
+ }
+
  static void print_grammar_char(FILE * file, uint32_t c) {
  if (0x20 <= c && c <= 0x7f) {
  fprintf(file, "%c", static_cast<char>(c));
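The new parse_token helper adds two spellings for token terminals to the GBNF grammar language: <[id]> names a token by numeric id, and <text> is tokenized with the vocab and must map to exactly one token; either form may be prefixed with ! to invert the match (handled in parse_sequence further below). A hypothetical grammar using this syntax (the token ids are made-up examples):

    #include <string>

    int main() {
        // Hypothetical GBNF snippet exercising the token terminals:
        //   <[2]>   match exactly the token with id 2
        //   !<[2]>  match any single token except id 2
        //   <text>  would also be accepted if "text" tokenizes to exactly one token
        const std::string grammar = R"(
    root ::= <[2]> body !<[2]>
    body ::= [a-z ]+
    )";
        (void) grammar; // in real use, pass this to the llama.cpp grammar sampler
    }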
@@ -212,6 +258,8 @@ static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
  case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
  case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
  case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break;
+ case LLAMA_GRETYPE_TOKEN: fprintf(file, "TOKEN"); break;
+ case LLAMA_GRETYPE_TOKEN_NOT: fprintf(file, "TOKEN_NOT"); break;
  }
  switch (elem.type) {
  case LLAMA_GRETYPE_END:
@@ -228,6 +276,17 @@ static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
  print_grammar_char(file, elem.value);
  fprintf(file, "\") ");
  break;
+ case LLAMA_GRETYPE_TOKEN:
+ fprintf(file, "<[");
+ fprintf(file, "%u", elem.value);
+ fprintf(file, "]> ");
+ break;
+ case LLAMA_GRETYPE_TOKEN_NOT:
+ fprintf(file, "!");
+ fprintf(file, "<[");
+ fprintf(file, "%u", elem.value);
+ fprintf(file, "]> ");
+ break;
  }
  }
  fprintf(file, "\n");
@@ -284,6 +343,17 @@ static void print_rule(
  case LLAMA_GRETYPE_CHAR_ANY:
  fprintf(file, ".");
  break;
+ case LLAMA_GRETYPE_TOKEN:
+ fprintf(file, "<[");
+ fprintf(file, "%u", elem.value);
+ fprintf(file, "]> ");
+ break;
+ case LLAMA_GRETYPE_TOKEN_NOT:
+ fprintf(file, "!");
+ fprintf(file, "<[");
+ fprintf(file, "%u", elem.value);
+ fprintf(file, "]> ");
+ break;
  }
  if (is_char_element(elem)) {
  switch (rule[i + 1].type) {
@@ -444,6 +514,17 @@ const char * llama_grammar_parser::parse_sequence(
  }
  }
  pos = parse_space(pos + 1, is_nested);
+ } else if (*pos == '<' || *pos == '!') { // token
+ auto type = LLAMA_GRETYPE_TOKEN;
+ if (*pos == '!') { // token inverse
+ type = LLAMA_GRETYPE_TOKEN_NOT;
+ pos++;
+ }
+ auto token_pair = parse_token(vocab, pos);
+ const char * token_end = token_pair.second;
+ last_sym_start = rule.size();
+ rule.push_back({type, token_pair.first});
+ pos = parse_space(token_end, is_nested);
  } else if (is_word_char(*pos)) { // rule reference
  const char * name_end = parse_name(pos);
  uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
@@ -691,6 +772,21 @@ static bool llama_grammar_match_partial_char(
  return !is_positive_char;
  }

+ // returns true iff token matches the rule at pos (regular or inverse)
+ // asserts that pos is pointing to a token element
+ static bool llama_grammar_match_token(
+ const llama_grammar_element * pos,
+ const llama_token token) {
+ GGML_ASSERT(pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT);
+ if (pos->type == LLAMA_GRETYPE_TOKEN) {
+ return pos->value == static_cast<uint32_t>(token);
+ }
+ if (pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+ return pos->value != static_cast<uint32_t>(token);
+ }
+ return false;
+ }
+
  // transforms a grammar pushdown stack into N possible stacks, all ending
  // at a character range (terminal element)
  static void llama_grammar_advance_stack(
@@ -738,6 +834,8 @@ static void llama_grammar_advance_stack(
  case LLAMA_GRETYPE_CHAR:
  case LLAMA_GRETYPE_CHAR_NOT:
  case LLAMA_GRETYPE_CHAR_ANY:
+ case LLAMA_GRETYPE_TOKEN:
+ case LLAMA_GRETYPE_TOKEN_NOT:
  if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
  // only add the stack if it's not a duplicate of one we already have
  new_stacks.emplace_back(stack);
@@ -831,26 +929,38 @@ llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar)
  return grammar->stacks;
  }

+ static void llama_grammar_accept_chr(
+ struct llama_grammar & grammar,
+ const llama_grammar_stack & stack,
+ uint32_t chr,
+ llama_grammar_stacks & new_stacks) {
+ if (stack.empty()) {
+ return;
+ }
+
+ const llama_grammar_element * pos = stack.back();
+
+ // ignore if this turns into a token
+ if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+ return;
+ }
+
+ auto match = llama_grammar_match_char(pos, chr);
+ if (match.first) {
+ llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
+ if (!llama_grammar_is_end_of_sequence(match.second)) {
+ new_stack.push_back(match.second);
+ }
+ llama_grammar_advance_stack(grammar.rules, new_stack, new_stacks);
+ }
+ }
+
  void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr) {
  llama_grammar_stacks stacks_new;
  stacks_new.reserve(grammar->stacks.size());

  for (const auto & stack : grammar->stacks) {
- if (stack.empty()) {
- continue;
- }
-
- auto match = llama_grammar_match_char(stack.back(), chr);
- if (match.first) {
- const llama_grammar_element * pos = match.second;
-
- // update top of stack to next element, if any
- llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
- if (!llama_grammar_is_end_of_sequence(pos)) {
- new_stack.push_back(pos);
- }
- llama_grammar_advance_stack(grammar->rules, new_stack, stacks_new);
- }
+ llama_grammar_accept_chr(*grammar, stack, chr, stacks_new);
  }

  grammar->stacks = std::move(stacks_new);
@@ -875,6 +985,22 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(

  const llama_grammar_element * stack_pos = stack.back();

+ // if the top of the stack is a token rule, then we only need to check the token id
+ if (stack_pos->type == LLAMA_GRETYPE_TOKEN || stack_pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+ for (const auto & tok : candidates) {
+ if (*tok.code_points == 0) {
+ // reached the end of a token consumed by char rules, reject iff it ended
+ // in a partial response
+ if (tok.partial_utf8.n_remain != 0) {
+ rejects.push_back(tok);
+ }
+ } else if (!llama_grammar_match_token(stack_pos, tok.id)) {
+ rejects.push_back(tok);
+ }
+ }
+ return rejects;
+ }
+
  llama_grammar_candidates next_candidates;
  next_candidates.reserve(candidates.size());

@@ -887,7 +1013,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
  rejects.push_back(tok);
  }
  } else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
- next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
+ next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8, tok.id });
  } else {
  rejects.push_back(tok);
  }
@@ -905,7 +1031,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(

  auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
  for (const auto & tok : next_rejects) {
- rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
+ rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8, tok.id });
  }

  return rejects;
@@ -972,12 +1098,13 @@ struct llama_grammar * llama_grammar_init_impl(
  vocab,
  std::move(vec_rules),
  std::move(stacks),
- /* .partial_utf8 = */ {},
- /* .lazy =*/ false,
- /* .awaiting_trigger = */ false,
- /* .trigger_buffer = */ "",
- /* .trigger_tokens = */ {},
- /* .trigger_patterns = */ {},
+ /* .partial_utf8 = */ {},
+ /* .lazy = */ false,
+ /* .awaiting_trigger = */ false,
+ /* .trigger_buffer = */ "",
+ /* .trigger_buffer_positions = */ {},
+ /* .trigger_tokens = */ {},
+ /* .trigger_patterns = */ {},
  };
  }

@@ -990,7 +1117,7 @@ struct llama_grammar * llama_grammar_init_impl(
  size_t num_trigger_patterns,
  const llama_token * trigger_tokens,
  size_t num_trigger_tokens) {
- llama_grammar_parser parser;
+ llama_grammar_parser parser(vocab);

  // if there is a grammar, parse it
  // rules will be empty (default) if there are parse errors
@@ -1077,10 +1204,11 @@ struct llama_grammar * llama_grammar_init_impl(
  vocab,
  std::move(vec_rules),
  std::move(stacks),
- /* .partial_utf8 = */ {},
- /* .lazy = */ lazy,
- /* .awaiting_trigger = */ lazy,
- /* .trigger_buffer = */ "",
+ /* .partial_utf8 = */ {},
+ /* .lazy = */ lazy,
+ /* .awaiting_trigger = */ lazy,
+ /* .trigger_buffer = */ "",
+ /* .trigger_buffer_positions = */ {},
  std::move(vec_trigger_tokens),
  std::move(vec_trigger_patterns),
  };
@@ -1103,6 +1231,7 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
  grammar.lazy,
  grammar.awaiting_trigger,
  grammar.trigger_buffer,
+ grammar.trigger_buffer_positions,
  grammar.trigger_tokens,
  grammar.trigger_patterns,
  };
@@ -1156,7 +1285,7 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
  cur_p->data[i].logit = -INFINITY;
  } else {
  candidates_decoded.push_back(decode_utf8(piece, grammar.partial_utf8));
- candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
+ candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second, id });
  }
  }

@@ -1175,10 +1304,12 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
  if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
  grammar.awaiting_trigger = false;
  grammar.trigger_buffer.clear();
- llama_grammar_accept_str(grammar, piece);
+ llama_grammar_accept_token(grammar, token, piece);
  LLAMA_LOG_DEBUG("Grammar triggered on token %u (`%s`)", token, piece.c_str());
  return;
  } else {
+ auto position = std::make_pair(grammar.trigger_buffer.size(), grammar.trigger_buffer.size() + piece.size());
+ grammar.trigger_buffer_positions.push_back(std::make_pair(token, position));
  grammar.trigger_buffer += piece;

  std::smatch match;
@@ -1196,10 +1327,23 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
  if (start == std::string::npos) {
  start = match.position(0);
  }
+
+ // replay tokens that overlap with [start, end)
+ for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) {
+ auto [tok_start, tok_end] = tok_pos;
+ if (tok_end <= start) {
+ continue;
+ }
+
+ size_t piece_start = (tok_start < start) ? start : tok_start; // allow for partial token pieces
+ size_t piece_len = tok_end - piece_start;
+ auto tok_piece = grammar.trigger_buffer.substr(piece_start, piece_len);
+ llama_grammar_accept_token(grammar, tok, tok_piece);
+ }
+
  auto constrained_str = grammar.trigger_buffer.substr(start);
- // std::string constrained_str(match[1].first, grammar.trigger_buffer.end());
  grammar.trigger_buffer.clear();
- llama_grammar_accept_str(grammar, constrained_str);
+ grammar.trigger_buffer_positions.clear();
  LLAMA_LOG_DEBUG("Grammar triggered on regex: '%s'\n", constrained_str.c_str());
  return;
  }
@@ -1218,7 +1362,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
  GGML_ABORT("fatal error");
  }

- llama_grammar_accept_str(grammar, piece);
+ llama_grammar_accept_token(grammar, token, piece);
  }

  void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string & piece) {
@@ -1235,3 +1379,59 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string
  throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece);
  }
  }
+
+ void llama_grammar_accept_token(struct llama_grammar & grammar, llama_token token, const std::string & piece) {
+ // Note terminating 0 in decoded string
+ const auto decoded = decode_utf8(piece, grammar.partial_utf8);
+ const auto & code_points = decoded.first;
+
+ llama_grammar_stacks stacks_new;
+ stacks_new.reserve(grammar.stacks.size());
+
+ for (const auto & stack : grammar.stacks) {
+ if (stack.empty()) {
+ continue;
+ }
+
+ const llama_grammar_element * pos = stack.back();
+
+ if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+ if (llama_grammar_match_token(pos, token)) {
+ llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
+ if (!llama_grammar_is_end_of_sequence(pos + 1)) {
+ new_stack.push_back(pos + 1);
+ }
+ llama_grammar_advance_stack(grammar.rules, new_stack, stacks_new);
+ }
+ } else {
+ llama_grammar_stacks current_stacks = {stack};
+
+ for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+ llama_grammar_stacks next_stacks;
+
+ for (const auto & cur_stack : current_stacks) {
+ llama_grammar_accept_chr(grammar, cur_stack, *it, next_stacks);
+ }
+
+ current_stacks = std::move(next_stacks);
+ if (current_stacks.empty()) {
+ break;
+ }
+ }
+
+ for (auto & surviving_stack : current_stacks) {
+ if (std::find(stacks_new.begin(), stacks_new.end(), surviving_stack) == stacks_new.end()) {
+ stacks_new.emplace_back(surviving_stack);
+ }
+ }
+ }
+ }
+
+ grammar.stacks = std::move(stacks_new);
+ grammar.partial_utf8 = decoded.second;
+
+ if (grammar.stacks.empty()) {
+ throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece + " (" + std::to_string(token) + ")");
+ }
+ }
+
@@ -36,11 +36,17 @@ enum llama_gretype {

  // any character (.)
  LLAMA_GRETYPE_CHAR_ANY = 7,
+
+ // terminal element: token (<[token-id]>)
+ LLAMA_GRETYPE_TOKEN = 8,
+
+ // inverse token (!<[token-id]>)
+ LLAMA_GRETYPE_TOKEN_NOT = 9,
  };

  typedef struct llama_grammar_element {
  enum llama_gretype type;
- uint32_t value; // Unicode code point or rule ID
+ uint32_t value; // Unicode code point, rule ID, or token ID
  } llama_grammar_element;

  struct llama_partial_utf8 {
@@ -52,6 +58,7 @@ struct llama_grammar_candidate {
  size_t index;
  const uint32_t * code_points;
  llama_partial_utf8 partial_utf8;
+ llama_token id;
  };

  using llama_grammar_rule = std::vector< llama_grammar_element>;
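To make the header changes concrete, here is a hedged sketch of what a compiled rule with the new terminals looks like in memory; the enum values and struct are local mirrors of the declarations above, and the token id 42 is an arbitrary placeholder:

    #include <cstdint>
    #include <vector>

    // local mirrors of the public declarations above, so this sketch compiles standalone
    enum llama_gretype_sketch {
        GRETYPE_END       = 0,
        GRETYPE_TOKEN     = 8, // terminal element: token (<[token-id]>)
        GRETYPE_TOKEN_NOT = 9, // inverse token (!<[token-id]>)
    };

    struct grammar_element_sketch {
        llama_gretype_sketch type;
        uint32_t value; // Unicode code point, rule ID, or token ID
    };

    int main() {
        // rule: match token 42, then any single token that is not 42, then end
        std::vector<grammar_element_sketch> rule = {
            { GRETYPE_TOKEN,     42u },
            { GRETYPE_TOKEN_NOT, 42u },
            { GRETYPE_END,        0u },
        };
        (void) rule;
    }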
@@ -77,10 +84,13 @@ std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
  const llama_grammar_candidates & candidates);

  struct llama_grammar_parser {
+ const llama_vocab * vocab;
  std::map<std::string, uint32_t> symbol_ids;

  llama_grammar_rules rules;

+ llama_grammar_parser(const struct llama_vocab * vocab = nullptr) : vocab(vocab) {}
+
  llama_grammar_stack c_rules() const;

  uint32_t get_symbol_id(const char * src, size_t len);
@@ -112,6 +122,9 @@ struct llama_grammar_trigger_pattern {
  };

  struct llama_grammar {
+ // maintain a list of llama_tokens and their positions in the trigger_buffer
+ using token_pos = std::pair<llama_token, std::pair<size_t, size_t>>;
+
  // note: allow null vocab for testing (not great)
  const llama_vocab * vocab;

@@ -127,6 +140,7 @@ struct llama_grammar {
  bool lazy = false;
  bool awaiting_trigger = false; // Initialized to true for lazy grammars only
  std::string trigger_buffer; // Output buffered by lazy grammar. Will be cleared once trigger is found.
+ std::vector<token_pos> trigger_buffer_positions; // Tokens buffered by lazy grammar. Used to replay when a trigger is found.
  std::vector<llama_token> trigger_tokens; // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
  std::vector<llama_grammar_trigger_pattern>
  trigger_patterns; // Regular expressions that trigger a lazy grammar. Must be a full match of the entire generated
@@ -171,3 +185,8 @@ void llama_grammar_accept_impl(
  void llama_grammar_accept_str(
  struct llama_grammar & grammar,
  const std::string & piece);
+
+ void llama_grammar_accept_token(
+ struct llama_grammar & grammar,
+ llama_token token,
+ const std::string & piece);
@@ -973,7 +973,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(

  // mask out the other groups
  selection_probs = ggml_get_rows(ctx0, selection_groups, expert_groups); // [n_exp_per_group, n_group_used, n_tokens]
- selection_probs = ggml_set_rows(ctx0, ggml_scale_bias(ctx0, selection_groups, 0.0f, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
+ selection_probs = ggml_set_rows(ctx0, ggml_fill(ctx0, selection_groups, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
  selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_expert, n_tokens); // [n_expert, n_tokens]
  cb(selection_probs, "ffn_moe_probs_masked", il);
  }
@@ -1264,18 +1264,25 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_GEMMA3:
  {
- hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.set_swa_pattern(6);
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ if (found_swa && hparams.n_swa > 0) {
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.set_swa_pattern(6);

- hparams.rope_freq_base_train_swa = 10000.0f;
- hparams.rope_freq_scale_train_swa = 1.0f;
+ hparams.rope_freq_base_train_swa = 10000.0f;
+ hparams.rope_freq_scale_train_swa = 1.0f;
+ } else {
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+ }

- ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+ hparams.f_final_logit_softcapping = 0.0f;
+ ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

  switch (hparams.n_layer) {
  case 18: type = LLM_TYPE_270M; break;
  case 26: type = LLM_TYPE_1B; break;
+ case 32: type = LLM_TYPE_8B; break; // Rnj-1
  case 34: type = LLM_TYPE_4B; break;
  case 48: type = LLM_TYPE_12B; break;
  case 62: type = LLM_TYPE_27B; break;
@@ -1599,8 +1606,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);

- switch (hparams.n_layer) {
- case 28: type = LLM_TYPE_20B; break;
+ switch (hparams.n_ff_exp) {
+ case 1408: type = LLM_TYPE_16B; break;
+ case 1792: type = LLM_TYPE_20B; break;
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
@@ -7304,7 +7312,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  } break;
  case LLM_ARCH_GEMMA3:
  {
- llm = std::make_unique<llm_build_gemma3_iswa>(*this, params);
+ if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+ llm = std::make_unique<llm_build_gemma3<true>>(*this, params);
+ } else {
+ llm = std::make_unique<llm_build_gemma3<false>>(*this, params);
+ }
  } break;
  case LLM_ARCH_GEMMA3N:
  {