@fugood/llama.node 1.4.6 → 1.4.8
This diff reflects the published contents of the two package versions as they appear in their public registries, and is provided for informational purposes only.
- package/lib/binding.ts +8 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +25 -26
- package/src/LlamaContext.cpp +2 -2
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +364 -193
- package/src/llama.cpp/common/arg.h +43 -2
- package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +36 -18
- package/src/llama.cpp/common/chat-parser-xml-toolcall.h +1 -1
- package/src/llama.cpp/common/chat-parser.cpp +3 -2
- package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
- package/src/llama.cpp/common/chat.cpp +272 -0
- package/src/llama.cpp/common/common.cpp +130 -67
- package/src/llama.cpp/common/common.h +40 -16
- package/src/llama.cpp/common/console.cpp +680 -47
- package/src/llama.cpp/common/console.h +30 -8
- package/src/llama.cpp/common/download.cpp +69 -25
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
- package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
- package/src/llama.cpp/common/log.cpp +5 -0
- package/src/llama.cpp/common/log.h +1 -0
- package/src/llama.cpp/common/peg-parser.cpp +1 -1
- package/src/llama.cpp/common/preset.cpp +206 -0
- package/src/llama.cpp/common/preset.h +32 -0
- package/src/llama.cpp/common/sampling.cpp +91 -92
- package/src/llama.cpp/common/sampling.h +11 -6
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +7 -8
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +69 -39
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
- package/src/llama.cpp/include/llama.h +18 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
- package/src/llama.cpp/src/llama-arch.h +9 -2
- package/src/llama.cpp/src/llama-batch.cpp +12 -2
- package/src/llama.cpp/src/llama-batch.h +4 -2
- package/src/llama.cpp/src/llama-context.cpp +99 -29
- package/src/llama.cpp/src/llama-context.h +9 -3
- package/src/llama.cpp/src/llama-grammar.cpp +233 -33
- package/src/llama.cpp/src/llama-grammar.h +20 -1
- package/src/llama.cpp/src/llama-graph.cpp +85 -17
- package/src/llama.cpp/src/llama-graph.h +17 -4
- package/src/llama.cpp/src/llama-hparams.cpp +6 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -1
- package/src/llama.cpp/src/llama-impl.cpp +4 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
- package/src/llama.cpp/src/llama-kv-cache.h +19 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +123 -52
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
- package/src/llama.cpp/src/llama.cpp +675 -1
- package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
- package/src/llama.cpp/src/models/{gemma3-iswa.cpp → gemma3.cpp} +30 -5
- package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
- package/src/llama.cpp/src/models/glm4.cpp +27 -4
- package/src/llama.cpp/src/models/models.h +8 -7
- package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
- package/src/llama.cpp/src/models/qwen2.cpp +12 -3
- package/src/llama.cpp/src/models/qwen3next.cpp +81 -266

package/src/llama.cpp/src/llama-grammar.cpp:

```diff
@@ -181,6 +181,52 @@ static std::pair<uint32_t, const char *> parse_char(const char * src) {
     throw std::runtime_error("unexpected end of input");
 }
 
+static std::pair<uint32_t, const char *> parse_token(const llama_vocab * vocab, const char * src) {
+    const char * pos = src;
+    if (*pos != '<') {
+        throw std::runtime_error(std::string("expecting '<' at ") + pos);
+    }
+    pos++;
+
+    // Parse <[id]>
+    if (*pos == '[') {
+        pos++;
+        const char * int_end = parse_int(pos);
+        uint32_t token_id = std::stoul(std::string(pos, int_end - pos));
+        pos = int_end;
+        if (*pos != ']') {
+            throw std::runtime_error(std::string("expecting ']' at ") + pos);
+        }
+        pos++;
+        if (*pos != '>') {
+            throw std::runtime_error(std::string("expecting '>' at ") + pos);
+        }
+        pos++;
+        return std::make_pair(token_id, pos);
+    }
+
+    if (vocab == nullptr) {
+        throw std::runtime_error(std::string("no vocab to parse token at ") + src);
+    }
+
+    // Parse <token> and tokenize to obtain the token id
+    while (*pos != 0 && *pos != '>') {
+        pos++;
+    }
+    if (*pos != '>') {
+        throw std::runtime_error(std::string("expecting '>' at ") + pos);
+    }
+    pos++;
+
+    llama_token tokens[2];
+    int32_t n_tokens = vocab->tokenize(src, static_cast<int32_t>(pos - src), tokens, 2, false, true);
+    if (n_tokens != 1) {
+        // must tokenize to exactly 1 token
+        throw std::runtime_error("invalid token '" + std::string(src, pos - src) + "'");
+    }
+    return std::make_pair(tokens[0], pos);
+}
+
 static void print_grammar_char(FILE * file, uint32_t c) {
     if (0x20 <= c && c <= 0x7f) {
         fprintf(file, "%c", static_cast<char>(c));
```
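The hunk above adds a new GBNF terminal for vocabulary tokens: `<[id]>` references a token by numeric id, while `<token>` must tokenize to exactly one token (useful for special tokens such as `<|im_end|>`); a `!` prefix inverts the match (handled in the `parse_sequence` hunk below). A minimal sketch of how such a grammar could be used through the existing public sampler API; the grammar string, special-token names, and token id are illustrative and depend on the model's vocabulary:

```cpp
#include "llama.h"

// Build a grammar sampler whose root rule uses the new token terminals.
// Assumes <|im_start|> and <|im_end|> exist as single special tokens in
// the model's vocab; <[128000]> is a hypothetical token id.
llama_sampler * make_token_grammar_sampler(const llama_vocab * vocab) {
    const char * grammar = R"(
        root ::= <|im_start|> body <|im_end|>
        body ::= !<[128000]>*
    )";
    // same entry point as for character-level GBNF grammars
    return llama_sampler_init_grammar(vocab, grammar, "root");
}
```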
```diff
@@ -212,6 +258,8 @@ static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
             case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
             case LLAMA_GRETYPE_CHAR_ALT:       fprintf(file, "CHAR_ALT"); break;
             case LLAMA_GRETYPE_CHAR_ANY:       fprintf(file, "CHAR_ANY"); break;
+            case LLAMA_GRETYPE_TOKEN:          fprintf(file, "TOKEN"); break;
+            case LLAMA_GRETYPE_TOKEN_NOT:      fprintf(file, "TOKEN_NOT"); break;
         }
         switch (elem.type) {
             case LLAMA_GRETYPE_END:
@@ -228,6 +276,17 @@ static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
                 print_grammar_char(file, elem.value);
                 fprintf(file, "\") ");
                 break;
+            case LLAMA_GRETYPE_TOKEN:
+                fprintf(file, "<[");
+                fprintf(file, "%u", elem.value);
+                fprintf(file, "]> ");
+                break;
+            case LLAMA_GRETYPE_TOKEN_NOT:
+                fprintf(file, "!");
+                fprintf(file, "<[");
+                fprintf(file, "%u", elem.value);
+                fprintf(file, "]> ");
+                break;
         }
     }
     fprintf(file, "\n");
@@ -284,6 +343,17 @@ static void print_rule(
             case LLAMA_GRETYPE_CHAR_ANY:
                 fprintf(file, ".");
                 break;
+            case LLAMA_GRETYPE_TOKEN:
+                fprintf(file, "<[");
+                fprintf(file, "%u", elem.value);
+                fprintf(file, "]> ");
+                break;
+            case LLAMA_GRETYPE_TOKEN_NOT:
+                fprintf(file, "!");
+                fprintf(file, "<[");
+                fprintf(file, "%u", elem.value);
+                fprintf(file, "]> ");
+                break;
         }
         if (is_char_element(elem)) {
             switch (rule[i + 1].type) {
@@ -444,6 +514,17 @@ const char * llama_grammar_parser::parse_sequence(
                 }
             }
             pos = parse_space(pos + 1, is_nested);
+        } else if (*pos == '<' || *pos == '!') { // token
+            auto type = LLAMA_GRETYPE_TOKEN;
+            if (*pos == '!') { // token inverse
+                type = LLAMA_GRETYPE_TOKEN_NOT;
+                pos++;
+            }
+            auto token_pair = parse_token(vocab, pos);
+            const char * token_end = token_pair.second;
+            last_sym_start = rule.size();
+            rule.push_back({type, token_pair.first});
+            pos = parse_space(token_end, is_nested);
         } else if (is_word_char(*pos)) { // rule reference
             const char * name_end = parse_name(pos);
             uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
```
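After `parse_sequence` runs on a production such as `<[1]> !<[2]>`, the rule vector holds one element per token terminal, with the token id stored in the element's `value` field. A sketch with illustrative ids (`LLAMA_GRETYPE_END` terminates every rule, as before):

```cpp
#include "llama-grammar.h"

// What the parser emits for `root ::= <[1]> !<[2]>` (ids illustrative):
const llama_grammar_rule root_rule = {
    { LLAMA_GRETYPE_TOKEN,     1 }, // match exactly the token with id 1
    { LLAMA_GRETYPE_TOKEN_NOT, 2 }, // match any single token except id 2
    { LLAMA_GRETYPE_END,       0 }, // end of rule definition
};
```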
```diff
@@ -691,6 +772,21 @@ static bool llama_grammar_match_partial_char(
     return !is_positive_char;
 }
 
+// returns true iff token matches the rule at pos (regular or inverse)
+// asserts that pos is pointing to a token element
+static bool llama_grammar_match_token(
+        const llama_grammar_element * pos,
+        const llama_token             token) {
+    GGML_ASSERT(pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT);
+    if (pos->type == LLAMA_GRETYPE_TOKEN) {
+        return pos->value == static_cast<uint32_t>(token);
+    }
+    if (pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+        return pos->value != static_cast<uint32_t>(token);
+    }
+    return false;
+}
+
 // transforms a grammar pushdown stack into N possible stacks, all ending
 // at a character range (terminal element)
 static void llama_grammar_advance_stack(
@@ -738,6 +834,8 @@ static void llama_grammar_advance_stack(
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
         case LLAMA_GRETYPE_CHAR_ANY:
+        case LLAMA_GRETYPE_TOKEN:
+        case LLAMA_GRETYPE_TOKEN_NOT:
             if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
                 // only add the stack if it's not a duplicate of one we already have
                 new_stacks.emplace_back(stack);
@@ -831,26 +929,38 @@ llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar)
     return grammar->stacks;
 }
 
+static void llama_grammar_accept_chr(
+        struct llama_grammar      & grammar,
+        const llama_grammar_stack & stack,
+        uint32_t                    chr,
+        llama_grammar_stacks      & new_stacks) {
+    if (stack.empty()) {
+        return;
+    }
+
+    const llama_grammar_element * pos = stack.back();
+
+    // ignore if this turns into a token
+    if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+        return;
+    }
+
+    auto match = llama_grammar_match_char(pos, chr);
+    if (match.first) {
+        llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
+        if (!llama_grammar_is_end_of_sequence(match.second)) {
+            new_stack.push_back(match.second);
+        }
+        llama_grammar_advance_stack(grammar.rules, new_stack, new_stacks);
+    }
+}
+
 void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr) {
     llama_grammar_stacks stacks_new;
     stacks_new.reserve(grammar->stacks.size());
 
     for (const auto & stack : grammar->stacks) {
-        if (stack.empty()) {
-            continue;
-        }
-
-        auto match = llama_grammar_match_char(stack.back(), chr);
-        if (match.first) {
-            const llama_grammar_element * pos = match.second;
-
-            // update top of stack to next element, if any
-            llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
-            if (!llama_grammar_is_end_of_sequence(pos)) {
-                new_stack.push_back(pos);
-            }
-            llama_grammar_advance_stack(grammar->rules, new_stack, stacks_new);
-        }
+        llama_grammar_accept_chr(*grammar, stack, chr, stacks_new);
     }
 
     grammar->stacks = std::move(stacks_new);
@@ -875,6 +985,22 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
 
     const llama_grammar_element * stack_pos = stack.back();
 
+    // if the top of the stack is a token rule, then we only need to check the token id
+    if (stack_pos->type == LLAMA_GRETYPE_TOKEN || stack_pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+        for (const auto & tok : candidates) {
+            if (*tok.code_points == 0) {
+                // reached the end of a token consumed by char rules, reject iff it ended
+                // in a partial response
+                if (tok.partial_utf8.n_remain != 0) {
+                    rejects.push_back(tok);
+                }
+            } else if (!llama_grammar_match_token(stack_pos, tok.id)) {
+                rejects.push_back(tok);
+            }
+        }
+        return rejects;
+    }
+
     llama_grammar_candidates next_candidates;
     next_candidates.reserve(candidates.size());
 
```
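The new branch above filters candidates purely by token id when the top of the stack is a token terminal; the `*tok.code_points == 0` case covers pieces already consumed by preceding character rules. A standalone sketch of that decision (simplified types, not the llama.cpp structs):

```cpp
#include <cstdint>

// Standalone sketch of the rejection decision made when the stack top is a
// token terminal (mirrors the new branch, with simplified fields).
struct mini_candidate {
    int32_t  id;       // token id, now carried through the recursion
    bool     at_end;   // *code_points == 0: piece consumed by earlier char rules
    uint32_t n_remain; // pending bytes of an incomplete UTF-8 sequence
};

bool rejected_by_token_rule(uint32_t rule_value, bool is_not, const mini_candidate & c) {
    if (c.at_end) {
        // only reject a fully consumed piece if it ended mid-UTF-8-sequence
        return c.n_remain != 0;
    }
    const bool matches = static_cast<uint32_t>(c.id) == rule_value;
    return is_not ? matches : !matches; // reject iff llama_grammar_match_token fails
}
```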
```diff
@@ -887,7 +1013,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
                 rejects.push_back(tok);
             }
         } else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
-            next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
+            next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8, tok.id });
         } else {
             rejects.push_back(tok);
         }
@@ -905,7 +1031,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
 
     auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
     for (const auto & tok : next_rejects) {
-        rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
+        rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8, tok.id });
     }
 
     return rejects;
@@ -972,12 +1098,13 @@ struct llama_grammar * llama_grammar_init_impl(
         vocab,
         std::move(vec_rules),
         std::move(stacks),
-        /* .partial_utf8 = */     {},
-        /* .lazy = */             false,
-        /* .awaiting_trigger = */ false,
-        /* .trigger_buffer = */   "",
-        /* .trigger_tokens = */   {},
-        /* .trigger_patterns = */ {},
+        /* .partial_utf8 = */             {},
+        /* .lazy = */                     false,
+        /* .awaiting_trigger = */         false,
+        /* .trigger_buffer = */           "",
+        /* .trigger_buffer_positions = */ {},
+        /* .trigger_tokens = */           {},
+        /* .trigger_patterns = */         {},
     };
 }
 
@@ -990,7 +1117,7 @@ struct llama_grammar * llama_grammar_init_impl(
         size_t num_trigger_patterns,
         const llama_token * trigger_tokens,
         size_t num_trigger_tokens) {
-    llama_grammar_parser parser;
+    llama_grammar_parser parser(vocab);
 
     // if there is a grammar, parse it
     // rules will be empty (default) if there are parse errors
@@ -1077,10 +1204,11 @@ struct llama_grammar * llama_grammar_init_impl(
         vocab,
         std::move(vec_rules),
         std::move(stacks),
-        /* .partial_utf8 = */     {},
-        /* .lazy = */             lazy,
-        /* .awaiting_trigger = */ lazy,
-        /* .trigger_buffer = */   "",
+        /* .partial_utf8 = */             {},
+        /* .lazy = */                     lazy,
+        /* .awaiting_trigger = */         lazy,
+        /* .trigger_buffer = */           "",
+        /* .trigger_buffer_positions = */ {},
         std::move(vec_trigger_tokens),
         std::move(vec_trigger_patterns),
     };
@@ -1103,6 +1231,7 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
         grammar.lazy,
         grammar.awaiting_trigger,
         grammar.trigger_buffer,
+        grammar.trigger_buffer_positions,
         grammar.trigger_tokens,
         grammar.trigger_patterns,
     };
@@ -1156,7 +1285,7 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
             cur_p->data[i].logit = -INFINITY;
         } else {
             candidates_decoded.push_back(decode_utf8(piece, grammar.partial_utf8));
-            candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
+            candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second, id });
         }
     }
 
@@ -1175,10 +1304,12 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
         if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
             grammar.awaiting_trigger = false;
             grammar.trigger_buffer.clear();
-            llama_grammar_accept_str(grammar, piece);
+            llama_grammar_accept_token(grammar, token, piece);
             LLAMA_LOG_DEBUG("Grammar triggered on token %u (`%s`)", token, piece.c_str());
             return;
         } else {
+            auto position = std::make_pair(grammar.trigger_buffer.size(), grammar.trigger_buffer.size() + piece.size());
+            grammar.trigger_buffer_positions.push_back(std::make_pair(token, position));
             grammar.trigger_buffer += piece;
 
             std::smatch match;
@@ -1196,10 +1327,23 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
             if (start == std::string::npos) {
                 start = match.position(0);
             }
+
+            // replay tokens that overlap with [start, end)
+            for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) {
+                auto [tok_start, tok_end] = tok_pos;
+                if (tok_end <= start) {
+                    continue;
+                }
+
+                size_t piece_start = (tok_start < start) ? start : tok_start; // allow for partial token pieces
+                size_t piece_len = tok_end - piece_start;
+                auto tok_piece = grammar.trigger_buffer.substr(piece_start, piece_len);
+                llama_grammar_accept_token(grammar, tok, tok_piece);
+            }
+
             auto constrained_str = grammar.trigger_buffer.substr(start);
-            // std::string constrained_str(match[1].first, grammar.trigger_buffer.end());
             grammar.trigger_buffer.clear();
-            llama_grammar_accept_str(grammar, constrained_str);
+            grammar.trigger_buffer_positions.clear();
             LLAMA_LOG_DEBUG("Grammar triggered on regex: '%s'\n", constrained_str.c_str());
             return;
         }
```
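`trigger_buffer_positions` records each buffered token's byte span, so when the trigger pattern matches at offset `start`, only the overlapping suffix of each token's piece is replayed through the grammar. A standalone worked example of that arithmetic, with hypothetical token ids and a match starting mid-piece:

```cpp
#include <cstddef>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main() {
    std::string buffer = "okay <tool>";
    // (token id, [begin, end) byte span in buffer) -- hypothetical values
    std::vector<std::pair<int, std::pair<size_t, size_t>>> positions = {
        { 101, { 0, 4 } },  // "okay"
        { 102, { 4, 6 } },  // " <"
        { 103, { 6, 11 } }, // "tool>"
    };
    const size_t start = 5; // trigger regex matched at the '<'

    for (const auto & [tok, tok_pos] : positions) {
        const auto [tok_start, tok_end] = tok_pos;
        if (tok_end <= start) {
            continue; // piece lies entirely before the trigger: skip it
        }
        // clamp to the match start: token 102 is replayed with only "<"
        const size_t piece_start = tok_start < start ? start : tok_start;
        std::cout << "replay token " << tok << " with piece \""
                  << buffer.substr(piece_start, tok_end - piece_start) << "\"\n";
    }
    return 0;
}
```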
```diff
@@ -1218,7 +1362,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
         GGML_ABORT("fatal error");
     }
 
-    llama_grammar_accept_str(grammar, piece);
+    llama_grammar_accept_token(grammar, token, piece);
 }
 
 void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string & piece) {
@@ -1235,3 +1379,59 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string
         throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece);
     }
 }
+
+void llama_grammar_accept_token(struct llama_grammar & grammar, llama_token token, const std::string & piece) {
+    // Note terminating 0 in decoded string
+    const auto   decoded     = decode_utf8(piece, grammar.partial_utf8);
+    const auto & code_points = decoded.first;
+
+    llama_grammar_stacks stacks_new;
+    stacks_new.reserve(grammar.stacks.size());
+
+    for (const auto & stack : grammar.stacks) {
+        if (stack.empty()) {
+            continue;
+        }
+
+        const llama_grammar_element * pos = stack.back();
+
+        if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+            if (llama_grammar_match_token(pos, token)) {
+                llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
+                if (!llama_grammar_is_end_of_sequence(pos + 1)) {
+                    new_stack.push_back(pos + 1);
+                }
+                llama_grammar_advance_stack(grammar.rules, new_stack, stacks_new);
+            }
+        } else {
+            llama_grammar_stacks current_stacks = {stack};
+
+            for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+                llama_grammar_stacks next_stacks;
+
+                for (const auto & cur_stack : current_stacks) {
+                    llama_grammar_accept_chr(grammar, cur_stack, *it, next_stacks);
+                }
+
+                current_stacks = std::move(next_stacks);
+                if (current_stacks.empty()) {
+                    break;
+                }
+            }
+
+            for (auto & surviving_stack : current_stacks) {
+                if (std::find(stacks_new.begin(), stacks_new.end(), surviving_stack) == stacks_new.end()) {
+                    stacks_new.emplace_back(surviving_stack);
+                }
+            }
+        }
+    }
+
+    grammar.stacks = std::move(stacks_new);
+    grammar.partial_utf8 = decoded.second;
+
+    if (grammar.stacks.empty()) {
+        throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece + " (" + std::to_string(token) + ")");
+    }
+}
+
```
package/src/llama.cpp/src/llama-grammar.h:

```diff
@@ -36,11 +36,17 @@ enum llama_gretype {
 
     // any character (.)
     LLAMA_GRETYPE_CHAR_ANY = 7,
+
+    // terminal element: token (<[token-id]>)
+    LLAMA_GRETYPE_TOKEN = 8,
+
+    // inverse token (!<[token-id]>)
+    LLAMA_GRETYPE_TOKEN_NOT = 9,
 };
 
 typedef struct llama_grammar_element {
     enum llama_gretype type;
-    uint32_t value; // Unicode code point or rule ID
+    uint32_t value; // Unicode code point, rule ID, or token ID
 } llama_grammar_element;
 
 struct llama_partial_utf8 {
@@ -52,6 +58,7 @@ struct llama_grammar_candidate {
     size_t           index;
     const uint32_t * code_points;
     llama_partial_utf8 partial_utf8;
+    llama_token      id;
 };
 
 using llama_grammar_rule  = std::vector<      llama_grammar_element>;
@@ -77,10 +84,13 @@ std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
     const llama_grammar_candidates & candidates);
 
 struct llama_grammar_parser {
+    const llama_vocab * vocab;
     std::map<std::string, uint32_t> symbol_ids;
 
     llama_grammar_rules rules;
 
+    llama_grammar_parser(const struct llama_vocab * vocab = nullptr) : vocab(vocab) {}
+
    llama_grammar_stack c_rules() const;
 
     uint32_t get_symbol_id(const char * src, size_t len);
@@ -112,6 +122,9 @@ struct llama_grammar_trigger_pattern {
 };
 
 struct llama_grammar {
+    // maintain a list of llama_tokens and their positions in the trigger_buffer
+    using token_pos = std::pair<llama_token, std::pair<size_t, size_t>>;
+
     // note: allow null vocab for testing (not great)
     const llama_vocab * vocab;
 
@@ -127,6 +140,7 @@ struct llama_grammar {
     bool                     lazy = false;
     bool                     awaiting_trigger = false; // Initialized to true for lazy grammars only
     std::string              trigger_buffer;  // Output buffered by lazy grammar. Will be cleared once trigger is found.
+    std::vector<token_pos>   trigger_buffer_positions; // Tokens buffered by lazy grammar. Used to replay when a trigger is found.
    std::vector<llama_token> trigger_tokens;  // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
     std::vector<llama_grammar_trigger_pattern>
                              trigger_patterns; // Regular expressions that trigger a lazy grammar. Must be a full match of the entire generated
@@ -171,3 +185,8 @@ void llama_grammar_accept_impl(
 void llama_grammar_accept_str(
         struct llama_grammar & grammar,
         const std::string & piece);
+
+void llama_grammar_accept_token(
+        struct llama_grammar & grammar,
+        llama_token token,
+        const std::string & piece);
```
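With the declaration above, internal callers can feed both the sampled token id and its text piece into the grammar. A hedged sketch of a caller migrating from `llama_grammar_accept_str` to the new entry point; it uses the internal `llama-grammar.h` header, and the buffer size and `special` flag are illustrative:

```cpp
#include <string>

#include "llama-grammar.h" // internal header; declares llama_grammar_accept_token

void accept_sampled_token(struct llama_grammar & grammar,
                          const llama_vocab * vocab,
                          llama_token token) {
    char buf[256];
    const int32_t n = llama_token_to_piece(vocab, token, buf, sizeof(buf), 0, /*special=*/true);
    const std::string piece(buf, n > 0 ? n : 0);
    // the id travels with its piece, so <[id]> terminals can match by id
    // while character rules still consume the piece's code points
    llama_grammar_accept_token(grammar, token, piece);
}
```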