@fugood/llama.node 1.4.11 → 1.4.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +31 -31
- package/src/llama.cpp/common/arg.cpp +128 -59
- package/src/llama.cpp/common/arg.h +1 -0
- package/src/llama.cpp/common/chat-parser.cpp +11 -0
- package/src/llama.cpp/common/chat.cpp +36 -7
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +42 -23
- package/src/llama.cpp/common/common.h +11 -1
- package/src/llama.cpp/common/llguidance.cpp +10 -6
- package/src/llama.cpp/common/regex-partial.cpp +13 -13
- package/src/llama.cpp/common/sampling.cpp +58 -14
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
- package/src/llama.cpp/include/llama.h +100 -12
- package/src/llama.cpp/src/CMakeLists.txt +4 -0
- package/src/llama.cpp/src/llama-adapter.cpp +12 -3
- package/src/llama.cpp/src/llama-adapter.h +7 -1
- package/src/llama.cpp/src/llama-arch.cpp +78 -0
- package/src/llama.cpp/src/llama-arch.h +8 -0
- package/src/llama.cpp/src/llama-chat.cpp +11 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +637 -49
- package/src/llama.cpp/src/llama-context.h +43 -1
- package/src/llama.cpp/src/llama-grammar.cpp +40 -13
- package/src/llama.cpp/src/llama-grammar.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +173 -5
- package/src/llama.cpp/src/llama-graph.h +71 -6
- package/src/llama.cpp/src/llama-hparams.cpp +4 -0
- package/src/llama.cpp/src/llama-hparams.h +12 -5
- package/src/llama.cpp/src/llama-kv-cache.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
- package/src/llama.cpp/src/llama-model.cpp +337 -26
- package/src/llama.cpp/src/llama-model.h +13 -2
- package/src/llama.cpp/src/llama-sampling.cpp +1259 -186
- package/src/llama.cpp/src/llama-sampling.h +19 -7
- package/src/llama.cpp/src/llama-vocab.cpp +101 -33
- package/src/llama.cpp/src/llama-vocab.h +2 -0
- package/src/llama.cpp/src/llama.cpp +87 -64
- package/src/llama.cpp/src/models/afmoe.cpp +9 -5
- package/src/llama.cpp/src/models/bert.cpp +4 -2
- package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
- package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
- package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
- package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/gemma3.cpp +3 -4
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
- package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
- package/src/llama.cpp/src/models/llama.cpp +19 -6
- package/src/llama.cpp/src/models/maincoder.cpp +117 -0
- package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
- package/src/llama.cpp/src/models/models.h +18 -0
- package/src/llama.cpp/src/models/modern-bert.cpp +116 -0
- package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/plamo3.cpp +128 -0
- package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
- package/src/llama.cpp/src/unicode.cpp +23 -14
package/src/llama.cpp/src/llama-context.h:

@@ -70,6 +70,18 @@ struct llama_context {
     float * get_embeddings_ith(int32_t i);
     float * get_embeddings_seq(llama_seq_id seq_id);
 
+    llama_token * get_sampled_tokens() const;
+    llama_token   get_sampled_token_ith(int32_t idx);
+
+    float * get_sampled_logits_ith(int32_t idx);
+    size_t  get_sampled_logits_count(int32_t idx);
+
+    float * get_sampled_probs_ith(int32_t idx);
+    size_t  get_sampled_probs_count(int32_t idx);
+
+    const llama_token * get_sampled_candidates_ith(int32_t idx);
+    size_t              get_sampled_candidates_count(int32_t idx);
+
     void attach_threadpool(
             ggml_threadpool_t threadpool,
             ggml_threadpool_t threadpool_batch);
@@ -192,10 +204,13 @@ private:
 
     // Make sure enough space is available for outputs.
     // Returns max number of outputs for which space was reserved.
-    uint32_t output_reserve(int32_t n_outputs);
+    uint32_t output_reserve(int32_t n_outputs, const llama_batch & batch);
 
     void output_reorder();
 
+    // map the output row index `i` to batch index
+    int64_t output_resolve_row(int32_t i) const;
+
     //
     // graph
     //
@@ -213,6 +228,8 @@ public:
     ggml_cgraph * graph_reserve(
             uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);
 
+    bool set_sampler(llama_seq_id seq_id, llama_sampler * sampler);
+
 private:
     llm_graph_params graph_params(
             llm_graph_result * res,
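Note: taken together with the new get_sampled_* getters above, set_sampler() lets a caller attach a llama_sampler to a sequence and read back tokens sampled directly on the backend. A minimal sketch of that flow, assuming direct access to llama_context; the make_greedy_sampler() factory is hypothetical and not part of this diff:

    // sketch only: set_sampler()/get_sampled_token_ith() are the members added above;
    // make_greedy_sampler() is a hypothetical helper
    void decode_with_backend_sampling(llama_context & lctx, const llama_batch & batch) {
        llama_sampler * smpl = make_greedy_sampler();
        lctx.set_sampler(/*seq_id =*/ 0, smpl); // attach sampler to sequence 0

        // ... submit `batch` through the usual decode path ...

        // read back the result for output row 0
        const llama_token tok     = lctx.get_sampled_token_ith(0);
        const size_t      n_probs = lctx.get_sampled_probs_count(0);
        (void) tok; (void) n_probs;
    }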
@@ -252,6 +269,31 @@ private:
     size_t  embd_size = 0; // capacity (of floats) for embeddings
     float * embd = nullptr;
 
+    // TODO: simplify
+    struct sampling_info {
+        std::map<llama_seq_id, llama_sampler *> samplers;
+
+        float * logits = nullptr;
+        size_t  logits_size = 0;
+
+        llama_token * sampled = nullptr;
+        size_t        sampled_size = 0;
+
+        float * probs = nullptr;
+        size_t  probs_size = 0;
+
+        llama_token * candidates = nullptr;
+        size_t        candidates_size = 0;
+
+        std::vector<uint32_t> logits_count;
+        std::vector<uint32_t> probs_count;
+        std::vector<uint32_t> candidates_count;
+
+        std::vector<llama_token> token_ids_full_vocab;
+    };
+
+    sampling_info sampling;
+
     // sequence embeddings output (map of [n_embd] vectors)
     // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
     std::map<llama_seq_id, std::vector<float>> embd_seq;
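Note: the *_count vectors suggest the probs/logits/candidates for each output row are packed back-to-back in the flat buffers. A one-liner showing how a per-row offset could be derived under that assumption (this helper is not in the diff):

    #include <cstdint>
    #include <numeric>
    #include <vector>

    // assumed layout: row `idx` starts right after the entries of rows 0..idx-1
    static size_t probs_offset(const std::vector<uint32_t> & probs_count, int32_t idx) {
        return std::accumulate(probs_count.begin(), probs_count.begin() + idx, size_t(0));
    }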
package/src/llama.cpp/src/llama-grammar.cpp:

@@ -369,6 +369,44 @@ static void print_rule(
     fprintf(file, "\n");
 }
 
+//
+// Regex utilities
+//
+
+size_t llama_grammar_trigger_pattern::find(const std::string & input) const {
+    auto find_start_pos = [](const std::smatch & match) {
+        // get from the first matched capturing group to the end of the string
+        size_t start = std::string::npos;
+        for (auto i = 1u; i < match.size(); i++) {
+            if (match.length(i) > 0) {
+                start = match.position(i);
+                break;
+            }
+        }
+        if (start == std::string::npos) {
+            start = match.position(0);
+        }
+        return start;
+    };
+
+    if (!pattern.empty() && pattern.front() == '^' && pattern.back() == '$') {
+        // match against the entire input
+        std::smatch match;
+        if (std::regex_match(input, match, regex)) {
+            return find_start_pos(match);
+        }
+    }
+
+    // search anywhere
+    std::smatch match;
+    if (std::regex_search(input, match, regex)) {
+        return find_start_pos(match);
+    }
+
+    return std::string::npos;
+}
+
+
 //
 // implementation
 //
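Note: the capturing-group logic in find() is plain std::regex. A self-contained illustration (not from the diff) of how the first non-empty capturing group determines where a grammar trigger starts:

    #include <cstdio>
    #include <regex>
    #include <string>

    int main() {
        // the trigger should fire at the start of group 1, not at the start of the match
        const std::regex re("some text(<tool_call>[\\s\\S]*)");
        const std::string input = "prefix some text<tool_call>{}";

        std::smatch match;
        if (std::regex_search(input, match, re)) {
            const size_t start = match.length(1) > 0 ? (size_t) match.position(1)
                                                     : (size_t) match.position(0);
            std::printf("trigger starts at %zu: %s\n", start, input.substr(start).c_str());
        }
        return 0;
    }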
@@ -1312,21 +1350,10 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
         grammar.trigger_buffer_positions.push_back(std::make_pair(token, position));
         grammar.trigger_buffer += piece;
 
-        std::smatch match;
         for (const auto & trigger_pattern : grammar.trigger_patterns) {
-            if (std::regex_match(grammar.trigger_buffer, match, trigger_pattern.regex)) {
+            auto start = trigger_pattern.find(grammar.trigger_buffer);
+            if (start != std::string::npos) {
                 grammar.awaiting_trigger = false;
-                // get from the first matched capturing group to the end of the string
-                size_t start = std::string::npos;
-                for (auto i = 1u; i < match.size(); i++) {
-                    if (match.length(i) > 0) {
-                        start = match.position(i);
-                        break;
-                    }
-                }
-                if (start == std::string::npos) {
-                    start = match.position(0);
-                }
 
                 // replay tokens that overlap with [start, end)
                 for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) {
package/src/llama.cpp/src/llama-graph.cpp:

@@ -12,6 +12,7 @@
 #include <cassert>
 #include <cmath>
 #include <cstring>
+#include <unordered_set>
 
 void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
     if (ubatch->token) {
@@ -32,7 +33,7 @@ bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) {
     bool res = true;
 
     res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
-    res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[0] == params.ubatch.n_tokens);
+    res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[1] == params.ubatch.n_tokens);
 
     return res;
 }
@@ -62,7 +63,7 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
 bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) {
     bool res = true;
 
-    res &= pos->ne[0] == params.ubatch.n_tokens;
+    res &= pos->ne[0] == params.ubatch.n_tokens*n_pos_per_embd;
 
     return res;
 }
@@ -521,6 +522,43 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
     return res;
 }
 
+void llm_graph_input_sampling::set_input(const llama_ubatch * ubatch) {
+    // set the inputs only for the active samplers in the current ubatch
+    std::unordered_set<llama_seq_id> active_samplers;
+    for (uint32_t i = 0; i < ubatch->n_tokens; i++) {
+        if (ubatch->output[i]) {
+            llama_seq_id seq_id = ubatch->seq_id[i][0];
+            active_samplers.insert(seq_id);
+        }
+    }
+
+    for (auto seq_id : active_samplers) {
+        if (samplers.find(seq_id) == samplers.end()) {
+            continue;
+        }
+
+        auto & sampler = samplers[seq_id];
+
+        if (sampler->iface->backend_set_input) {
+            sampler->iface->backend_set_input(sampler);
+        }
+    }
+}
+
+bool llm_graph_input_sampling::can_reuse(const llm_graph_params & params) {
+    if (samplers.size() != params.samplers.size()) {
+        return false;
+    }
+
+    for (const auto & [seq_id, sampler] : params.samplers) {
+        if (samplers[seq_id] != sampler) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
 //
 // llm_graph_result
 //
@@ -541,6 +579,10 @@ void llm_graph_result::reset() {
     t_logits      = nullptr;
     t_embd        = nullptr;
     t_embd_pooled = nullptr;
+    t_sampled.clear();
+    t_sampled_probs.clear();
+    t_sampled_logits.clear();
+    t_candidates.clear();
 
     params = {};
 
@@ -565,6 +607,38 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
     }
 }
 
+void llm_graph_result::set_outputs() {
+    if (t_logits != nullptr) {
+        ggml_set_output(t_logits);
+    }
+    if (t_embd != nullptr) {
+        ggml_set_output(t_embd);
+    }
+    if (t_embd_pooled != nullptr) {
+        ggml_set_output(t_embd_pooled);
+    }
+    for (auto & [seq_id, t] : t_sampled) {
+        if (t != nullptr) {
+            ggml_set_output(t);
+        }
+    }
+    for (auto & [seq_id, t] : t_sampled_probs) {
+        if (t != nullptr) {
+            ggml_set_output(t);
+        }
+    }
+    for (auto & [seq_id, t] : t_sampled_logits) {
+        if (t != nullptr) {
+            ggml_set_output(t);
+        }
+    }
+    for (auto & [seq_id, t] : t_candidates) {
+        if (t != nullptr) {
+            ggml_set_output(t);
+        }
+    }
+}
+
 bool llm_graph_result::can_reuse(const llm_graph_params & params) {
     if (!this->params.allow_reuse(params)) {
         if (debug > 1) {
@@ -646,6 +720,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     loras    (params.loras),
     mctx     (params.mctx),
     cross    (params.cross),
+    samplers (params.samplers),
     cb_func  (params.cb),
     res      (params.res),
     ctx0     (res->get_ctx()),
@@ -1251,6 +1326,10 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
 
     res->add_input(std::move(inp));
 
+    // make sure the produced embeddings are immediately materialized in the ggml graph
+    // ref: https://github.com/ggml-org/llama.cpp/pull/18599
+    ggml_build_forward_expand(gf, cur);
+
     return cur;
 }
 
@@ -1834,8 +1913,10 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
 
         inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
         ggml_set_input(inp->self_kq_mask);
+        ggml_set_name(inp->self_kq_mask, "self_kq_mask");
 
         inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+        ggml_set_name(inp->self_kq_mask_cnv, "self_kq_mask_cnv");
     }
 
     {
@@ -1848,8 +1929,10 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
 
         inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
         ggml_set_input(inp->self_kq_mask_swa);
+        ggml_set_name(inp->self_kq_mask_swa, "self_kq_mask_swa");
 
         inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
+        ggml_set_name(inp->self_kq_mask_swa_cnv, "self_kq_mask_swa_cnv");
     }
 
     return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
@@ -1988,14 +2071,18 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
 void llm_graph_context::build_dense_out(
         ggml_tensor * dense_2,
         ggml_tensor * dense_3) const {
-    if (!cparams.embeddings || dense_2 == nullptr || dense_3 == nullptr) {
+    if (!cparams.embeddings || !(dense_2 || dense_3)) {
         return;
     }
     ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd;
     GGML_ASSERT(cur != nullptr && "missing t_embd_pooled/t_embd");
 
-    cur = ggml_mul_mat(ctx0, dense_2, cur);
-    cur = ggml_mul_mat(ctx0, dense_3, cur);
+    if (dense_2) {
+        cur = ggml_mul_mat(ctx0, dense_2, cur);
+    }
+    if (dense_3) {
+        cur = ggml_mul_mat(ctx0, dense_3, cur);
+    }
     cb(cur, "result_embd_pooled", -1);
     res->t_embd_pooled = cur;
     ggml_build_forward_expand(gf, cur);
@@ -2086,6 +2173,87 @@ void llm_graph_context::build_pooling(
     ggml_build_forward_expand(gf, cur);
 }
 
+void llm_graph_context::build_sampling() const {
+    if (samplers.empty() || !res->t_logits) {
+        return;
+    }
+
+    auto inp_sampling = std::make_unique<llm_graph_input_sampling>(samplers);
+    res->add_input(std::move(inp_sampling));
+
+    std::map<llama_seq_id, int32_t> seq_to_logit_row;
+    int32_t logit_row_idx = 0;
+
+    for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
+        if (ubatch.output[i]) {
+            llama_seq_id seq_id = ubatch.seq_id[i][0];
+            seq_to_logit_row[seq_id] = logit_row_idx;
+            logit_row_idx++;
+        }
+    }
+
+    // res->t_logits will contain logits for all tokens that want the logits calculated (logits=1 or output=1)
+    GGML_ASSERT(res->t_logits != nullptr && "missing t_logits tensor");
+
+    // add a dummy row of logits
+    // this trick makes the graph static, regardless of which samplers are activated
+    // this is important in order to minimize graph reallocations
+    // TODO: use `ggml_build_forward_select()` when available (https://github.com/ggml-org/llama.cpp/pull/18550)
+    ggml_tensor * logits_t = ggml_pad(ctx0, res->t_logits, 0, 1, 0, 0);
+
+    for (const auto & [seq_id, sampler] : samplers) {
+        const auto it = seq_to_logit_row.find(seq_id);
+
+        // inactive samplers always work on the first row
+        const auto row_idx = seq_to_logit_row.find(seq_id) != seq_to_logit_row.end() ? it->second : 0;
+
+        ggml_tensor * logits_seq = ggml_view_1d(ctx0, logits_t, logits_t->ne[0], row_idx * logits_t->nb[1]);
+        ggml_format_name(logits_seq, "logits_seq_%d", seq_id);
+
+        struct llama_sampler_data data = {
+            /*.logits     =*/ logits_seq,
+            /*.probs      =*/ nullptr,
+            /*.sampled    =*/ nullptr,
+            /*.candidates =*/ nullptr,
+        };
+
+        assert(sampler->iface->backend_apply);
+        sampler->iface->backend_apply(sampler, ctx0, gf, &data);
+
+        if (data.sampled != nullptr) {
+            res->t_sampled[seq_id] = data.sampled;
+            ggml_build_forward_expand(gf, data.sampled);
+        }
+
+        if (data.probs != nullptr) {
+            res->t_sampled_probs[seq_id] = data.probs;
+            ggml_build_forward_expand(gf, data.probs);
+        }
+
+        if (data.logits != nullptr) {
+            res->t_sampled_logits[seq_id] = data.logits;
+            ggml_build_forward_expand(gf, data.logits);
+        }
+
+        if (data.candidates != nullptr) {
+            res->t_candidates[seq_id] = data.candidates;
+            ggml_build_forward_expand(gf, data.candidates);
+        }
+    }
+
+    // TODO: Call llama_sampler_accept_ggml after all samplers have been applied.
+    /*
+    for (const auto & [seq_id, sampler] : samplers) {
+        if (auto it = res->t_sampled.find(seq_id); it != res->t_sampled.end()) {
+            ggml_tensor * selected_token = it->second;
+            if (selected_token != nullptr) {
+                llama_sampler_accept_ggml(sampler, ctx0, gf, selected_token);
+            }
+        }
+    }
+    */
+}
+
 int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
     // TODO move to hparams if a T5 variant appears that uses a different value
     const int64_t max_distance = 128;
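Note: the ggml_pad() call above appends one zero-filled row to the logits so that at least one row always exists (even for a ubatch that produced no outputs), which keeps the graph topology fixed no matter which samplers are active. The trick in isolation, using only public ggml calls (buffer sizes are arbitrary; this sketch is not from the diff):

    #include "ggml.h"

    int main() {
        struct ggml_init_params ip = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(ip);

        // pretend: logits of 3 output rows over an 8-token vocab
        struct ggml_tensor * logits = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 3);

        // append one dummy row -> shape [8, 4]; a view of row 0 is now always valid
        struct ggml_tensor * padded = ggml_pad(ctx, logits, 0, 1, 0, 0);
        struct ggml_tensor * row    = ggml_view_1d(ctx, padded, padded->ne[0], 0*padded->nb[1]);

        (void) row;
        ggml_free(ctx);
        return 0;
    }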
package/src/llama.cpp/src/llama-graph.h:

@@ -10,6 +10,7 @@
 #include <memory>
 #include <set>
 #include <functional>
+#include <map>
 
 struct ggml_cgraph;
 struct ggml_context;
@@ -396,6 +397,18 @@ public:
     const llama_memory_hybrid_context * mctx;
 };
 
+class llm_graph_input_sampling : public llm_graph_input_i {
+public:
+    llm_graph_input_sampling(std::map<llama_seq_id, llama_sampler *> samplers) :
+        samplers(std::move(samplers)) { }
+    virtual ~llm_graph_input_sampling() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+    bool can_reuse(const llm_graph_params & params) override;
+
+    std::map<llama_seq_id, llama_sampler *> samplers;
+};
+
 //
 // llm_graph_result
 //
@@ -429,6 +442,23 @@ struct llm_graph_params {
     const llama_memory_context_i * mctx;
     const llama_cross * cross;
 
+    std::map<llama_seq_id, llama_sampler *> samplers;
+
+    static bool samplers_equal(
+            const std::map<llama_seq_id, llama_sampler *> & lhs,
+            const std::map<llama_seq_id, llama_sampler *> & rhs) {
+        if (lhs.size() != rhs.size()) {
+            return false;
+        }
+        for (const auto & [seq_id, sampler] : lhs) {
+            auto it = rhs.find(seq_id);
+            if (it == rhs.end() || it->second != sampler) {
+                return false;
+            }
+        }
+        return true;
+    }
+
     uint32_t n_outputs;
 
     llm_graph_cb cb;
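Note: std::map already provides element-wise operator==, and the mapped values here are raw pointers compared by identity, so for this use samplers_equal() behaves like a direct map comparison; a compilable check (illustrative, with llama_seq_id simplified to int32_t):

    #include <cassert>
    #include <cstdint>
    #include <map>

    struct llama_sampler; // opaque, as in the public API
    using smap = std::map<int32_t, llama_sampler *>;

    int main() {
        llama_sampler * s = reinterpret_cast<llama_sampler *>(0x1); // dummy pointer for the demo
        smap lhs = {{0, s}};
        smap rhs = {{0, s}};
        assert(lhs == rhs);    // same key, same sampler pointer
        rhs[0] = nullptr;
        assert(!(lhs == rhs)); // pointer mismatch detected
        return 0;
    }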
@@ -468,15 +498,36 @@ struct llm_graph_params {
             return false;
         }
 
+        if (n_outputs != other.n_outputs) {
+            return false;
+        }
+
+        if (!samplers_equal(samplers, other.samplers)) {
+            return false;
+        }
+
+        if (samplers.size() > 0) {
+            if (!ubatch.data || !other.ubatch.data) {
+                return false;
+            }
+
+            // check that the outputs are the same for all samplers
+            for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
+                if (ubatch.output[i]    != other.ubatch.output[i] ||
+                    ubatch.seq_id[i][0] != other.ubatch.seq_id[i][0]) {
+                    return false;
+                }
+            }
+        }
+
         return
             cparams.embeddings  == other.cparams.embeddings  &&
             cparams.causal_attn == other.cparams.causal_attn &&
-            arch  == other.arch  &&
-            gtype == other.gtype &&
-            cvec  == other.cvec  &&
-            loras == other.loras &&
-            cross == other.cross &&
-            n_outputs == other.n_outputs;
+            arch  == other.arch  &&
+            gtype == other.gtype &&
+            cvec  == other.cvec  &&
+            loras == other.loras &&
+            cross == other.cross;
     }
 };
 
@@ -499,6 +550,7 @@ public:
     void reset();
 
     void set_inputs(const llama_ubatch * ubatch);
+    void set_outputs();
 
     // try to update the existing graph result using the new graph parameters in order to reuse it
     // this can only be done if we determine that the resulting graph using the new graph parameters
@@ -517,6 +569,11 @@ public:
     ggml_tensor * t_embd        = nullptr;
     ggml_tensor * t_embd_pooled = nullptr;
 
+    std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
+    std::map<llama_seq_id, ggml_tensor*> t_candidates;
+    std::map<llama_seq_id, ggml_tensor*> t_sampled;
+    std::map<llama_seq_id, ggml_tensor*> t_sampled_probs;
+
     std::vector<llm_graph_input_ptr> inputs;
 
     ggml_context_ptr ctx_compute;
@@ -592,6 +649,8 @@ struct llm_graph_context {
     const llama_memory_context_i * mctx;
     const llama_cross * cross;
 
+    std::map<llama_seq_id, llama_sampler *> samplers;
+
     const llm_graph_cb & cb_func;
 
     llm_graph_result * res;
@@ -832,6 +891,12 @@ struct llm_graph_context {
             ggml_tensor * cls_out,
             ggml_tensor * cls_out_b) const;
 
+    //
+    // sampling (backend sampling)
+    //
+
+    void build_sampling() const;
+
     //
     // dense (out)
     //
package/src/llama.cpp/src/llama-hparams.cpp:

@@ -72,6 +72,10 @@ uint32_t llama_hparams::n_embd_inp() const {
     return n_embd_inp;
 }
 
+uint32_t llama_hparams::get_n_embd_out() const {
+    return n_embd_out > 0 ? n_embd_out : n_embd;
+}
+
 uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
     const uint32_t n_head_kv = this->n_head_kv(il);
 
package/src/llama.cpp/src/llama-hparams.h:

@@ -105,9 +105,9 @@ struct llama_hparams {
 
     float rope_attn_factor = 1.0f;
     float rope_freq_base_train;
-    float rope_freq_base_train_swa;
+    float rope_freq_base_train_swa = 10000.0f;
     float rope_freq_scale_train;
-    float rope_freq_scale_train_swa;
+    float rope_freq_scale_train_swa = 1.0f;
 
     uint32_t n_ctx_orig_yarn;
     float    rope_yarn_log_mul = 0.0f;
@@ -123,10 +123,11 @@ struct llama_hparams {
     llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
     // the size of the sliding window (0 - no SWA)
     uint32_t n_swa = 0;
-    // if swa_layers[il] == true, then layer il is SWA
-    // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
+    // if swa_layers[il] == 1, then layer il is SWA
+    // if swa_layers[il] == 0, then layer il is dense (i.e. non-SWA)
     // by default, all layers are dense
-    std::array<bool, LLAMA_MAX_LAYERS> swa_layers;
+    // note: using uint32_t type for compatibility reason
+    std::array<uint32_t, LLAMA_MAX_LAYERS> swa_layers;
 
     // for State Space Models
     uint32_t ssm_d_conv = 0;
@@ -161,6 +162,9 @@ struct llama_hparams {
     // for Classifiers
     uint32_t n_cls_out = 1;
 
+    // output embedding dimension (0 = use n_embd)
+    uint32_t n_embd_out = 0;
+
     // llama4 smallthinker
     uint32_t n_moe_layer_step = 0;
     uint32_t n_no_rope_layer_step = 4;
@@ -233,6 +237,9 @@ struct llama_hparams {
     // dimension of main + auxiliary input embeddings
     uint32_t n_embd_inp() const;
 
+    // dimension of output embeddings
+    uint32_t get_n_embd_out() const;
+
     // dimension of key embeddings across all k-v heads
     uint32_t n_embd_k_gqa(uint32_t il = 0) const;
 
package/src/llama.cpp/src/llama-kv-cache.h:

@@ -305,7 +305,7 @@ public:
             bool do_shift,
             stream_copy_info sc_info);
 
-    // used to create a batch
+    // used to create a batch processing context from a batch
     llama_kv_cache_context(
             llama_kv_cache * kv,
             slot_info_vec_t sinfos,
package/src/llama.cpp/src/llama-mmap.cpp:

@@ -240,9 +240,10 @@ struct llama_file::impl {
                 throw std::runtime_error("unexpectedly reached end of file");
             }
         } else {
-
-            while (
-
+            size_t bytes_read = 0;
+            while (bytes_read < len) {
+                const size_t to_read = len - bytes_read;
+                ssize_t ret = ::read(fd, reinterpret_cast<char *>(ptr) + bytes_read, to_read);
 
                 if (ret == -1) {
                     if (errno == EINTR) {
@@ -251,10 +252,16 @@ struct llama_file::impl {
                     throw std::runtime_error(format("read error: %s", strerror(errno)));
                 }
                 if (ret == 0) {
+                    // EOF: allow if this read was only pulling alignment padding past file end
+                    off_t pos = lseek(fd, 0, SEEK_CUR);
+                    if (pos != -1 && (size_t) pos == size) {
+                        std::memset(reinterpret_cast<char *>(ptr) + bytes_read, 0, len - bytes_read);
+                        return;
+                    }
                     throw std::runtime_error("unexpectedly reached end of file");
                 }
 
-
+                bytes_read += (size_t) ret;
             }
         }
     }
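Note: the pattern added here (retry on EINTR, zero-fill only when EOF lands exactly at the end of the file) in a self-contained form; a sketch of the idea, not the library code:

    #include <cerrno>
    #include <cstring>
    #include <stdexcept>
    #include <unistd.h>

    // read exactly `len` bytes from `fd`; if EOF is hit exactly at `file_size`,
    // treat the remainder as alignment padding and zero-fill it
    static void read_exact_or_pad(int fd, void * ptr, size_t len, size_t file_size) {
        size_t bytes_read = 0;
        while (bytes_read < len) {
            ssize_t ret = ::read(fd, static_cast<char *>(ptr) + bytes_read, len - bytes_read);
            if (ret == -1) {
                if (errno == EINTR) {
                    continue; // interrupted by a signal: retry
                }
                throw std::runtime_error(strerror(errno));
            }
            if (ret == 0) {
                off_t pos = lseek(fd, 0, SEEK_CUR);
                if (pos != -1 && (size_t) pos == file_size) {
                    std::memset(static_cast<char *>(ptr) + bytes_read, 0, len - bytes_read);
                    return;
                }
                throw std::runtime_error("unexpectedly reached end of file");
            }
            bytes_read += (size_t) ret;
        }
    }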
package/src/llama.cpp/src/llama-model-loader.cpp:

@@ -462,6 +462,29 @@ namespace GGUFMeta {
     return get_key_or_arr(llm_kv(kid), result, n, required);
 }
 
+bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
+    const std::string key = llm_kv(kid);
+
+    const int id = gguf_find_key(meta.get(), key.c_str());
+
+    if (id < 0) {
+        if (required) {
+            throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+        }
+        return false;
+    }
+
+    // throw and error if type is an array
+    if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) {
+        if (required) {
+            throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
+        }
+        return false;
+    }
+
+    return get_key(key, result, required);
+}
+
 // TODO: this is not very clever - figure out something better
 template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
 template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
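Note: the new non-template overload reads an optional-or-required scalar while explicitly rejecting array-typed keys, which the templated array path handles differently. An assumed call site (the key name below is illustrative, not taken from this diff):

    // hypothetical usage inside hparams loading: tolerate a missing key,
    // but fail loudly if the key exists with an array type
    uint32_t n_embd_out = 0;
    ml.get_key_or_arr(LLM_KV_EMBEDDING_LENGTH_OUT, n_embd_out, /*required =*/ false);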