@fugood/llama.node 1.4.5 → 1.4.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +49 -0
- package/lib/index.js +13 -0
- package/lib/index.ts +13 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +8 -8
- package/src/LlamaContext.cpp +69 -0
- package/src/LlamaContext.h +3 -0
- package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +36 -18
- package/src/llama.cpp/common/chat-parser-xml-toolcall.h +1 -1
- package/src/llama.cpp/common/chat-parser.cpp +3 -2
- package/src/llama.cpp/common/chat.cpp +132 -0
- package/src/llama.cpp/common/console.cpp +582 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +9 -0
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-context.cpp +6 -6
- package/src/llama.cpp/src/llama-context.h +1 -1
- package/src/llama.cpp/src/llama-grammar.cpp +233 -33
- package/src/llama.cpp/src/llama-grammar.h +20 -1
- package/src/llama.cpp/src/llama-graph.cpp +1 -1
- package/src/llama.cpp/src/llama-model.cpp +20 -8
- package/src/llama.cpp/src/models/{gemma3-iswa.cpp → gemma3.cpp} +30 -5
- package/src/llama.cpp/src/models/models.h +3 -2
package/src/llama.cpp/ggml/CMakeLists.txt

@@ -168,6 +168,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
 option(GGML_RV_ZFH          "ggml: enable riscv zfh"          ON)
 option(GGML_RV_ZVFH         "ggml: enable riscv zvfh"         ON)
 option(GGML_RV_ZICBOP       "ggml: enable riscv zicbop"       ON)
+option(GGML_RV_ZIHINTPAUSE  "ggml: enable riscv zihintpause " ON)
 option(GGML_XTHEADVECTOR    "ggml: enable xtheadvector"       OFF)
 option(GGML_VXE             "ggml: enable vxe"                ${GGML_NATIVE})
 
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt

@@ -469,6 +469,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             if (GGML_RV_ZICBOP)
                 string(APPEND MARCH_STR "_zicbop")
             endif()
+            if (GGML_RV_ZIHINTPAUSE)
+                string(APPEND MARCH_STR "_zihintpause")
+            endif()
             list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
         else()
             # Begin with the lowest baseline
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c

@@ -490,6 +490,15 @@ static inline void ggml_thread_cpu_relax(void) {
 static inline void ggml_thread_cpu_relax(void) {
     _mm_pause();
 }
+#elif defined(__riscv)
+static inline void ggml_thread_cpu_relax(void) {
+#ifdef __riscv_zihintpause
+    __asm__ __volatile__ ("pause");
+#else
+    /* Encoding of the pause instruction */
+    __asm__ __volatile__ (".4byte 0x100000F");
+#endif
+}
 #else
 static inline void ggml_thread_cpu_relax(void) {;}
 #endif
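
Note on the hunk above: `ggml_thread_cpu_relax` is the back-off used inside ggml's spin loops, and this change gives RISC-V a real hint: `pause` when the compiler advertises Zihintpause, otherwise the raw word `0x100000F`, which is the PAUSE encoding (a FENCE variant that cores without the extension execute as a harmless hint). A minimal sketch of the kind of polling loop this serves; `cpu_relax`, `spin_wait`, and `ready` are illustrative names, not package code:

```cpp
#include <atomic>

// Mirrors the RISC-V branch added above (illustration only).
static inline void cpu_relax(void) {
#if defined(__riscv) && defined(__riscv_zihintpause)
    __asm__ __volatile__ ("pause");
#elif defined(__riscv)
    __asm__ __volatile__ (".4byte 0x100000F"); // raw PAUSE encoding
#else
    // other targets would use _mm_pause() or nothing, as in ggml-cpu.c
#endif
}

void spin_wait(const std::atomic<bool> & ready) {
    while (!ready.load(std::memory_order_acquire)) {
        cpu_relax(); // back off instead of hammering the cache line
    }
}
```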

package/src/llama.cpp/src/CMakeLists.txt

@@ -67,7 +67,7 @@ add_library(llama
             models/gemma-embedding.cpp
             models/gemma.cpp
             models/gemma2-iswa.cpp
-            models/gemma3-iswa.cpp
+            models/gemma3.cpp
             models/gemma3n-iswa.cpp
             models/glm4-moe.cpp
             models/glm4.cpp

@@ -139,6 +139,7 @@ add_library(llama
 set_target_properties(llama PROPERTIES
     VERSION ${LLAMA_INSTALL_VERSION}
     SOVERSION 0
+    MACHO_CURRENT_VERSION 0 # keep macOS linker from seeing oversized version number
     )
 
 target_include_directories(llama PRIVATE .)

package/src/llama.cpp/src/llama-context.cpp

@@ -248,7 +248,10 @@ llama_context::llama_context(
 
     LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());
 
-    const size_t max_nodes = this->graph_max_nodes();
+    const uint32_t n_seqs = cparams.n_seq_max;
+    const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+
+    const size_t max_nodes = this->graph_max_nodes(n_tokens);
 
     LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
 

@@ -300,9 +303,6 @@ llama_context::llama_context(
 
     cross.v_embd.clear();
 
-    const uint32_t n_seqs = cparams.n_seq_max;
-    const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
     // avoid reserving graphs with zero outputs - assume one output per sequence
     n_outputs = n_seqs;
 

@@ -1386,9 +1386,9 @@ void llama_context::output_reorder() {
 // graph
 //
 
-uint32_t llama_context::graph_max_nodes() const {
+uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
     if (model.arch == LLM_ARCH_QWEN3NEXT) {
-        return std::max<uint32_t>(
+        return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
     }
     return std::max<uint32_t>(1024u, 8u*model.n_tensors());
 }
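
Together with the constructor hunk above, the graph node budget for Qwen3-Next now scales with the reserved micro-batch (`n_tokens = min(n_ctx, n_ubatch)`) rather than being a pure function of tensor count. A hedged restatement of the new rule, using only the constants visible in this diff:

```cpp
#include <algorithm>
#include <cstdint>

// Sketch of the budget above; `is_qwen3next` stands in for the
// `model.arch == LLM_ARCH_QWEN3NEXT` check.
uint32_t graph_max_nodes_sketch(bool is_qwen3next, uint32_t n_tokens, uint32_t n_tensors) {
    if (is_qwen3next) {
        return std::max<uint32_t>(n_tokens * 40, 32u * n_tensors); // ~40 nodes per token
    }
    return std::max<uint32_t>(1024u, 8u * n_tensors); // unchanged for other archs
}
```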

package/src/llama.cpp/src/llama-context.h

@@ -197,7 +197,7 @@ private:
     //
 
 public:
-    uint32_t graph_max_nodes() const;
+    uint32_t graph_max_nodes(uint32_t n_tokens) const;
 
     // can reuse the llm_graph_result instance of the context (for example to update a memory module)
     llm_graph_result * get_gf_res_reserve() const;

package/src/llama.cpp/src/llama-grammar.cpp

@@ -181,6 +181,52 @@ static std::pair<uint32_t, const char *> parse_char(const char * src) {
     throw std::runtime_error("unexpected end of input");
 }
 
+static std::pair<uint32_t, const char *> parse_token(const llama_vocab * vocab, const char * src) {
+    const char * pos = src;
+    if (*pos != '<') {
+        throw std::runtime_error(std::string("expecting '<' at ") + pos);
+    }
+    pos++;
+
+    // Parse <[id]>
+    if (*pos == '[') {
+        pos++;
+        const char * int_end = parse_int(pos);
+        uint32_t token_id = std::stoul(std::string(pos, int_end - pos));
+        pos = int_end;
+        if (*pos != ']') {
+            throw std::runtime_error(std::string("expecting ']' at ") + pos);
+        }
+        pos++;
+        if (*pos != '>') {
+            throw std::runtime_error(std::string("expecting '>' at ") + pos);
+        }
+        pos++;
+        return std::make_pair(token_id, pos);
+    }
+
+    if (vocab == nullptr) {
+        throw std::runtime_error(std::string("no vocab to parse token at ") + src);
+    }
+
+    // Parse <token> and tokenize to obtain the token id
+    while (*pos != 0 && *pos != '>') {
+        pos++;
+    }
+    if (*pos != '>') {
+        throw std::runtime_error(std::string("expecting '>' at ") + pos);
+    }
+    pos++;
+
+    llama_token tokens[2];
+    int32_t n_tokens = vocab->tokenize(src, static_cast<int32_t>(pos - src), tokens, 2, false, true);
+    if (n_tokens != 1) {
+        // must tokenize to exactly 1 token
+        throw std::runtime_error("invalid token '" + std::string(src, pos - src) + "'");
+    }
+    return std::make_pair(tokens[0], pos);
+}
+
 static void print_grammar_char(FILE * file, uint32_t c) {
     if (0x20 <= c && c <= 0x7f) {
         fprintf(file, "%c", static_cast<char>(c));
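
`parse_token` gives the grammar two terminal spellings: `<[1234]>` takes a literal token id and works even with a null vocab, while `<some-text>` is tokenized and must round-trip to exactly one token. A speculative GBNF fragment using the new terminals; the ids below are placeholders, not real ids from any model:

```cpp
// Illustration only: constrain output to sit between two special tokens.
// <[id]> matches exactly that token; !<[id]> matches any one token except it.
const char * grammar_with_tokens = R"(
root  ::= <[128000]> inner <[128001]>
inner ::= ( !<[128001]> )*
)";
```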

@@ -212,6 +258,8 @@ static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
             case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
             case LLAMA_GRETYPE_CHAR_ALT:       fprintf(file, "CHAR_ALT"); break;
             case LLAMA_GRETYPE_CHAR_ANY:       fprintf(file, "CHAR_ANY"); break;
+            case LLAMA_GRETYPE_TOKEN:          fprintf(file, "TOKEN"); break;
+            case LLAMA_GRETYPE_TOKEN_NOT:      fprintf(file, "TOKEN_NOT"); break;
         }
         switch (elem.type) {
             case LLAMA_GRETYPE_END:

@@ -228,6 +276,17 @@ static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
                 print_grammar_char(file, elem.value);
                 fprintf(file, "\") ");
                 break;
+            case LLAMA_GRETYPE_TOKEN:
+                fprintf(file, "<[");
+                fprintf(file, "%u", elem.value);
+                fprintf(file, "]> ");
+                break;
+            case LLAMA_GRETYPE_TOKEN_NOT:
+                fprintf(file, "!");
+                fprintf(file, "<[");
+                fprintf(file, "%u", elem.value);
+                fprintf(file, "]> ");
+                break;
         }
     }
     fprintf(file, "\n");

@@ -284,6 +343,17 @@ static void print_rule(
         case LLAMA_GRETYPE_CHAR_ANY:
             fprintf(file, ".");
             break;
+        case LLAMA_GRETYPE_TOKEN:
+            fprintf(file, "<[");
+            fprintf(file, "%u", elem.value);
+            fprintf(file, "]> ");
+            break;
+        case LLAMA_GRETYPE_TOKEN_NOT:
+            fprintf(file, "!");
+            fprintf(file, "<[");
+            fprintf(file, "%u", elem.value);
+            fprintf(file, "]> ");
+            break;
     }
     if (is_char_element(elem)) {
         switch (rule[i + 1].type) {

@@ -444,6 +514,17 @@ const char * llama_grammar_parser::parse_sequence(
                 }
             }
             pos = parse_space(pos + 1, is_nested);
+        } else if (*pos == '<' || *pos == '!') { // token
+            auto type = LLAMA_GRETYPE_TOKEN;
+            if (*pos == '!') { // token inverse
+                type = LLAMA_GRETYPE_TOKEN_NOT;
+                pos++;
+            }
+            auto token_pair = parse_token(vocab, pos);
+            const char * token_end = token_pair.second;
+            last_sym_start = rule.size();
+            rule.push_back({type, token_pair.first});
+            pos = parse_space(token_end, is_nested);
        } else if (is_word_char(*pos)) { // rule reference
            const char * name_end = parse_name(pos);
            uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);

@@ -691,6 +772,21 @@ static bool llama_grammar_match_partial_char(
     return !is_positive_char;
 }
 
+// returns true iff token matches the rule at pos (regular or inverse)
+// asserts that pos is pointing to a token element
+static bool llama_grammar_match_token(
+        const llama_grammar_element * pos,
+        const llama_token token) {
+    GGML_ASSERT(pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT);
+    if (pos->type == LLAMA_GRETYPE_TOKEN) {
+        return pos->value == static_cast<uint32_t>(token);
+    }
+    if (pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+        return pos->value != static_cast<uint32_t>(token);
+    }
+    return false;
+}
+
 // transforms a grammar pushdown stack into N possible stacks, all ending
 // at a character range (terminal element)
 static void llama_grammar_advance_stack(

@@ -738,6 +834,8 @@ static void llama_grammar_advance_stack(
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
         case LLAMA_GRETYPE_CHAR_ANY:
+        case LLAMA_GRETYPE_TOKEN:
+        case LLAMA_GRETYPE_TOKEN_NOT:
            if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
                // only add the stack if it's not a duplicate of one we already have
                new_stacks.emplace_back(stack);

@@ -831,26 +929,38 @@ llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar)
     return grammar->stacks;
 }
 
+static void llama_grammar_accept_chr(
+        struct llama_grammar & grammar,
+        const llama_grammar_stack & stack,
+        uint32_t chr,
+        llama_grammar_stacks & new_stacks) {
+    if (stack.empty()) {
+        return;
+    }
+
+    const llama_grammar_element * pos = stack.back();
+
+    // ignore if this turns into a token
+    if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+        return;
+    }
+
+    auto match = llama_grammar_match_char(pos, chr);
+    if (match.first) {
+        llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
+        if (!llama_grammar_is_end_of_sequence(match.second)) {
+            new_stack.push_back(match.second);
+        }
+        llama_grammar_advance_stack(grammar.rules, new_stack, new_stacks);
+    }
+}
+
 void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr) {
     llama_grammar_stacks stacks_new;
     stacks_new.reserve(grammar->stacks.size());
 
     for (const auto & stack : grammar->stacks) {
-        if (stack.empty()) {
-            continue;
-        }
-
-        auto match = llama_grammar_match_char(stack.back(), chr);
-        if (match.first) {
-            const llama_grammar_element * pos = match.second;
-
-            // update top of stack to next element, if any
-            llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
-            if (!llama_grammar_is_end_of_sequence(pos)) {
-                new_stack.push_back(pos);
-            }
-            llama_grammar_advance_stack(grammar->rules, new_stack, stacks_new);
-        }
+        llama_grammar_accept_chr(*grammar, stack, chr, stacks_new);
     }
 
     grammar->stacks = std::move(stacks_new);

@@ -875,6 +985,22 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
 
     const llama_grammar_element * stack_pos = stack.back();
 
+    // if the top of the stack is a token rule, then we only need to check the token id
+    if (stack_pos->type == LLAMA_GRETYPE_TOKEN || stack_pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+        for (const auto & tok : candidates) {
+            if (*tok.code_points == 0) {
+                // reached the end of a token consumed by char rules, reject iff it ended
+                // in a partial response
+                if (tok.partial_utf8.n_remain != 0) {
+                    rejects.push_back(tok);
+                }
+            } else if (!llama_grammar_match_token(stack_pos, tok.id)) {
+                rejects.push_back(tok);
+            }
+        }
+        return rejects;
+    }
+
     llama_grammar_candidates next_candidates;
     next_candidates.reserve(candidates.size());
 

@@ -887,7 +1013,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
                 rejects.push_back(tok);
             }
         } else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
-            next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
+            next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8, tok.id });
         } else {
             rejects.push_back(tok);
         }

@@ -905,7 +1031,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
 
     auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
     for (const auto & tok : next_rejects) {
-        rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
+        rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8, tok.id });
     }
 
     return rejects;

@@ -972,12 +1098,13 @@ struct llama_grammar * llama_grammar_init_impl(
         vocab,
         std::move(vec_rules),
         std::move(stacks),
-        /* .partial_utf8 = */     {},
-        /* .lazy = */             false,
-        /* .awaiting_trigger = */ false,
-        /* .trigger_buffer = */   "",
-        /* .trigger_tokens = */   {},
-        /* .trigger_patterns = */ {},
+        /* .partial_utf8 = */             {},
+        /* .lazy = */                     false,
+        /* .awaiting_trigger = */         false,
+        /* .trigger_buffer = */           "",
+        /* .trigger_buffer_positions = */ {},
+        /* .trigger_tokens = */           {},
+        /* .trigger_patterns = */         {},
     };
 }
 

@@ -990,7 +1117,7 @@ struct llama_grammar * llama_grammar_init_impl(
         size_t num_trigger_patterns,
         const llama_token * trigger_tokens,
         size_t num_trigger_tokens) {
-    llama_grammar_parser parser;
+    llama_grammar_parser parser(vocab);
 
     // if there is a grammar, parse it
     // rules will be empty (default) if there are parse errors
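
Threading the vocab through `llama_grammar_parser` is what lets `parse_token` resolve `<text>` spellings at parse time; the default constructor keeps the old vocab-less behavior (see the header change further down). A short sketch of the two modes; the local names are illustrative:

```cpp
// With a vocab, both <[id]> and <text> terminals parse; without one, only
// <[id]> does (parse_token throws "no vocab to parse token" otherwise).
llama_grammar_parser with_vocab(vocab); // `vocab` assumed in scope
llama_grammar_parser id_only;           // vocab == nullptr
```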

@@ -1077,10 +1204,11 @@ struct llama_grammar * llama_grammar_init_impl(
         vocab,
         std::move(vec_rules),
         std::move(stacks),
-        /* .partial_utf8 = */     {},
-        /* .lazy = */             lazy,
-        /* .awaiting_trigger = */ lazy,
-        /* .trigger_buffer = */   "",
+        /* .partial_utf8 = */             {},
+        /* .lazy = */                     lazy,
+        /* .awaiting_trigger = */         lazy,
+        /* .trigger_buffer = */           "",
+        /* .trigger_buffer_positions = */ {},
         std::move(vec_trigger_tokens),
         std::move(vec_trigger_patterns),
     };

@@ -1103,6 +1231,7 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
         grammar.lazy,
         grammar.awaiting_trigger,
         grammar.trigger_buffer,
+        grammar.trigger_buffer_positions,
         grammar.trigger_tokens,
         grammar.trigger_patterns,
     };

@@ -1156,7 +1285,7 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
             cur_p->data[i].logit = -INFINITY;
         } else {
             candidates_decoded.push_back(decode_utf8(piece, grammar.partial_utf8));
-            candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
+            candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second, id });
         }
     }
 

@@ -1175,10 +1304,12 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
         if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
             grammar.awaiting_trigger = false;
             grammar.trigger_buffer.clear();
-            llama_grammar_accept_str(grammar, piece);
+            llama_grammar_accept_token(grammar, token, piece);
             LLAMA_LOG_DEBUG("Grammar triggered on token %u (`%s`)", token, piece.c_str());
             return;
         } else {
+            auto position = std::make_pair(grammar.trigger_buffer.size(), grammar.trigger_buffer.size() + piece.size());
+            grammar.trigger_buffer_positions.push_back(std::make_pair(token, position));
             grammar.trigger_buffer += piece;
 
             std::smatch match;

@@ -1196,10 +1327,23 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
                 if (start == std::string::npos) {
                     start = match.position(0);
                 }
+
+                // replay tokens that overlap with [start, end)
+                for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) {
+                    auto [tok_start, tok_end] = tok_pos;
+                    if (tok_end <= start) {
+                        continue;
+                    }
+
+                    size_t piece_start = (tok_start < start) ? start : tok_start; // allow for partial token pieces
+                    size_t piece_len = tok_end - piece_start;
+                    auto tok_piece = grammar.trigger_buffer.substr(piece_start, piece_len);
+                    llama_grammar_accept_token(grammar, tok, tok_piece);
+                }
+
                 auto constrained_str = grammar.trigger_buffer.substr(start);
-                // std::string constrained_str(match[1].first, grammar.trigger_buffer.end());
                 grammar.trigger_buffer.clear();
-
+                grammar.trigger_buffer_positions.clear();
                 LLAMA_LOG_DEBUG("Grammar triggered on regex: '%s'\n", constrained_str.c_str());
                 return;
             }
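
The position bookkeeping above handles a subtle case: a trigger pattern can start matching in the middle of a buffered token, so the replay must trim the straddling token to the part at or after `start`. The slicing logic in isolation, as a self-contained sketch (the `replay_pieces` helper is hypothetical, not the package API):

```cpp
#include <string>
#include <utility>
#include <vector>

using token_pos = std::pair<int, std::pair<size_t, size_t>>; // (token id, [begin, end))

// Returns the (token, piece) list that would be replayed into the grammar.
std::vector<std::pair<int, std::string>> replay_pieces(
        const std::string & buffer, size_t start,
        const std::vector<token_pos> & positions) {
    std::vector<std::pair<int, std::string>> out;
    for (const auto & [tok, span] : positions) {
        const auto [tok_begin, tok_end] = span;
        if (tok_end <= start) {
            continue; // token ended before the trigger fired; dropped entirely
        }
        const size_t piece_start = tok_begin < start ? start : tok_begin; // partial piece
        out.push_back({ tok, buffer.substr(piece_start, tok_end - piece_start) });
    }
    return out;
}
```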

@@ -1218,7 +1362,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
         GGML_ABORT("fatal error");
     }
 
-    llama_grammar_accept_str(grammar, piece);
+    llama_grammar_accept_token(grammar, token, piece);
 }
 
 void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string & piece) {

@@ -1235,3 +1379,59 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string
         throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece);
     }
 }
+
+void llama_grammar_accept_token(struct llama_grammar & grammar, llama_token token, const std::string & piece) {
+    // Note terminating 0 in decoded string
+    const auto decoded = decode_utf8(piece, grammar.partial_utf8);
+    const auto & code_points = decoded.first;
+
+    llama_grammar_stacks stacks_new;
+    stacks_new.reserve(grammar.stacks.size());
+
+    for (const auto & stack : grammar.stacks) {
+        if (stack.empty()) {
+            continue;
+        }
+
+        const llama_grammar_element * pos = stack.back();
+
+        if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
+            if (llama_grammar_match_token(pos, token)) {
+                llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
+                if (!llama_grammar_is_end_of_sequence(pos + 1)) {
+                    new_stack.push_back(pos + 1);
+                }
+                llama_grammar_advance_stack(grammar.rules, new_stack, stacks_new);
+            }
+        } else {
+            llama_grammar_stacks current_stacks = {stack};
+
+            for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+                llama_grammar_stacks next_stacks;
+
+                for (const auto & cur_stack : current_stacks) {
+                    llama_grammar_accept_chr(grammar, cur_stack, *it, next_stacks);
+                }
+
+                current_stacks = std::move(next_stacks);
+                if (current_stacks.empty()) {
+                    break;
+                }
+            }
+
+            for (auto & surviving_stack : current_stacks) {
+                if (std::find(stacks_new.begin(), stacks_new.end(), surviving_stack) == stacks_new.end()) {
+                    stacks_new.emplace_back(surviving_stack);
+                }
+            }
+        }
+    }
+
+    grammar.stacks = std::move(stacks_new);
+    grammar.partial_utf8 = decoded.second;
+
+    if (grammar.stacks.empty()) {
+        throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece + " (" + std::to_string(token) + ")");
+    }
+}
+

package/src/llama.cpp/src/llama-grammar.h

@@ -36,11 +36,17 @@ enum llama_gretype {
 
     // any character (.)
     LLAMA_GRETYPE_CHAR_ANY = 7,
+
+    // terminal element: token (<[token-id]>)
+    LLAMA_GRETYPE_TOKEN = 8,
+
+    // inverse token (!<[token-id]>)
+    LLAMA_GRETYPE_TOKEN_NOT = 9,
 };
 
 typedef struct llama_grammar_element {
     enum llama_gretype type;
-    uint32_t value; // Unicode code point or rule ID
+    uint32_t value; // Unicode code point, rule ID, or token ID
 } llama_grammar_element;
 
 struct llama_partial_utf8 {
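
With the two new element types, `value` is overloaded by `type`: a code point for char elements, a rule id for references, and now a token id. A hand-built rule for illustration (assumes this header is included; ids 7 and 9 are arbitrary):

```cpp
#include <vector>

// Matches token 7 followed by any single token other than 9, then ends.
// Real rules come out of the GBNF parser; this is just the in-memory shape.
const std::vector<llama_grammar_element> example_rule = {
    { LLAMA_GRETYPE_TOKEN,     7 },  // <[7]>
    { LLAMA_GRETYPE_TOKEN_NOT, 9 },  // !<[9]>
    { LLAMA_GRETYPE_END,       0 },
};
```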

@@ -52,6 +58,7 @@ struct llama_grammar_candidate {
     size_t index;
     const uint32_t * code_points;
     llama_partial_utf8 partial_utf8;
+    llama_token id;
 };
 
 using llama_grammar_rule = std::vector<llama_grammar_element>;

@@ -77,10 +84,13 @@ std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
     const llama_grammar_candidates & candidates);
 
 struct llama_grammar_parser {
+    const llama_vocab * vocab;
     std::map<std::string, uint32_t> symbol_ids;
 
     llama_grammar_rules rules;
 
+    llama_grammar_parser(const struct llama_vocab * vocab = nullptr) : vocab(vocab) {}
+
     llama_grammar_stack c_rules() const;
 
     uint32_t get_symbol_id(const char * src, size_t len);

@@ -112,6 +122,9 @@ struct llama_grammar_trigger_pattern {
 };
 
 struct llama_grammar {
+    // maintain a list of llama_tokens and their positions in the trigger_buffer
+    using token_pos = std::pair<llama_token, std::pair<size_t, size_t>>;
+
     // note: allow null vocab for testing (not great)
     const llama_vocab * vocab;
 

@@ -127,6 +140,7 @@ struct llama_grammar {
     bool lazy = false;
     bool awaiting_trigger = false; // Initialized to true for lazy grammars only
     std::string trigger_buffer; // Output buffered by lazy grammar. Will be cleared once trigger is found.
+    std::vector<token_pos> trigger_buffer_positions; // Tokens buffered by lazy grammar. Used to replay when a trigger is found.
     std::vector<llama_token> trigger_tokens; // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
     std::vector<llama_grammar_trigger_pattern>
         trigger_patterns; // Regular expressions that trigger a lazy grammar. Must be a full match of the entire generated

@@ -171,3 +185,8 @@ void llama_grammar_accept_impl(
 void llama_grammar_accept_str(
         struct llama_grammar & grammar,
         const std::string & piece);
+
+void llama_grammar_accept_token(
+        struct llama_grammar & grammar,
+        llama_token token,
+        const std::string & piece);

package/src/llama.cpp/src/llama-graph.cpp

@@ -973,7 +973,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
 
         // mask out the other groups
         selection_probs = ggml_get_rows(ctx0, selection_groups, expert_groups); // [n_exp_per_group, n_group_used, n_tokens]
-        selection_probs = ggml_set_rows(ctx0,
+        selection_probs = ggml_set_rows(ctx0, ggml_fill(ctx0, selection_groups, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
         selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_expert, n_tokens); // [n_expert, n_tokens]
         cb(selection_probs, "ffn_moe_probs_masked", il);
     }
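
The rewritten line changes how the scratch tensor for group masking is built, but the intent is the same: every expert outside the selected groups gets `-INFINITY` so the following top-k cannot pick it. The same idea on plain vectors, for illustration only (this is not the ggml API):

```cpp
#include <limits>
#include <vector>

// probs is laid out as [n_groups][n_per_group]; only used_groups keep their
// scores, everything else becomes -inf before the top-k over experts.
std::vector<float> mask_groups(const std::vector<float> & probs,
                               const std::vector<int> & used_groups,
                               int n_per_group) {
    std::vector<float> masked(probs.size(), -std::numeric_limits<float>::infinity());
    for (int g : used_groups) {
        for (int i = 0; i < n_per_group; ++i) {
            masked[g * n_per_group + i] = probs[g * n_per_group + i];
        }
    }
    return masked;
}
```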

package/src/llama.cpp/src/llama-model.cpp

@@ -1264,18 +1264,25 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_GEMMA3:
             {
-                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-                hparams.set_swa_pattern(6);
+                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (found_swa && hparams.n_swa > 0) {
+                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                    hparams.set_swa_pattern(6);
 
-                hparams.rope_freq_base_train_swa  = 10000.0f;
-                hparams.rope_freq_scale_train_swa = 1.0f;
+                    hparams.rope_freq_base_train_swa = 10000.0f;
+                    hparams.rope_freq_scale_train_swa = 1.0f;
+                } else {
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                }
 
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+                hparams.f_final_logit_softcapping = 0.0f;
+                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
                 switch (hparams.n_layer) {
                     case 18: type = LLM_TYPE_270M; break;
                     case 26: type = LLM_TYPE_1B; break;
+                    case 32: type = LLM_TYPE_8B; break; // Rnj-1
                     case 34: type = LLM_TYPE_4B; break;
                     case 48: type = LLM_TYPE_12B; break;
                     case 62: type = LLM_TYPE_27B; break;
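
Gemma3 hparams loading now enables sliding-window attention only when the GGUF actually carries a positive sliding-window key, falling back to full attention otherwise, and final-logit softcapping becomes an optional key; the new 32-layer mapping (commented `Rnj-1`) rides on the same hunk. For context, a hedged sketch of what `set_swa_pattern(6)` conventionally means in llama.cpp (five SWA layers, then one full-attention layer):

```cpp
#include <cstdint>

// Assumption: set_swa_pattern(n) follows the usual "il % n < n - 1" rule,
// i.e. for n = 6, layers 0..4 of each group slide and every 6th is full.
static bool is_swa_layer(uint32_t il, uint32_t pattern = 6) {
    return il % pattern < pattern - 1;
}
```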

@@ -1599,8 +1606,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
                 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
 
-                switch (hparams.
-                    case
+                switch (hparams.n_ff_exp) {
+                    case 1408: type = LLM_TYPE_16B; break;
+                    case 1792: type = LLM_TYPE_20B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;

@@ -7304,7 +7312,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             } break;
        case LLM_ARCH_GEMMA3:
            {
-                llm = std::make_unique<llm_build_gemma3_iswa>(*this, params);
+                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+                    llm = std::make_unique<llm_build_gemma3<true>>(*this, params);
+                } else {
+                    llm = std::make_unique<llm_build_gemma3<false>>(*this, params);
+                }
            } break;
        case LLM_ARCH_GEMMA3N:
            {