@fugood/llama.node 1.4.11 → 1.4.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/package.json +15 -15
  2. package/scripts/llama.cpp.patch +31 -31
  3. package/src/llama.cpp/common/arg.cpp +128 -59
  4. package/src/llama.cpp/common/arg.h +1 -0
  5. package/src/llama.cpp/common/chat-parser.cpp +11 -0
  6. package/src/llama.cpp/common/chat.cpp +36 -7
  7. package/src/llama.cpp/common/chat.h +1 -0
  8. package/src/llama.cpp/common/common.cpp +42 -23
  9. package/src/llama.cpp/common/common.h +11 -1
  10. package/src/llama.cpp/common/llguidance.cpp +10 -6
  11. package/src/llama.cpp/common/regex-partial.cpp +13 -13
  12. package/src/llama.cpp/common/sampling.cpp +58 -14
  13. package/src/llama.cpp/common/sampling.h +3 -1
  14. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  15. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
  16. package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
  17. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  18. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  19. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
  20. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
  21. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
  23. package/src/llama.cpp/include/llama.h +100 -12
  24. package/src/llama.cpp/src/CMakeLists.txt +4 -0
  25. package/src/llama.cpp/src/llama-adapter.cpp +12 -3
  26. package/src/llama.cpp/src/llama-adapter.h +7 -1
  27. package/src/llama.cpp/src/llama-arch.cpp +78 -0
  28. package/src/llama.cpp/src/llama-arch.h +8 -0
  29. package/src/llama.cpp/src/llama-chat.cpp +11 -0
  30. package/src/llama.cpp/src/llama-chat.h +1 -0
  31. package/src/llama.cpp/src/llama-context.cpp +637 -49
  32. package/src/llama.cpp/src/llama-context.h +43 -1
  33. package/src/llama.cpp/src/llama-grammar.cpp +40 -13
  34. package/src/llama.cpp/src/llama-grammar.h +2 -0
  35. package/src/llama.cpp/src/llama-graph.cpp +173 -5
  36. package/src/llama.cpp/src/llama-graph.h +71 -6
  37. package/src/llama.cpp/src/llama-hparams.cpp +4 -0
  38. package/src/llama.cpp/src/llama-hparams.h +12 -5
  39. package/src/llama.cpp/src/llama-kv-cache.h +1 -1
  40. package/src/llama.cpp/src/llama-mmap.cpp +11 -4
  41. package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
  42. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  43. package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
  44. package/src/llama.cpp/src/llama-model.cpp +337 -26
  45. package/src/llama.cpp/src/llama-model.h +13 -2
  46. package/src/llama.cpp/src/llama-sampling.cpp +1259 -186
  47. package/src/llama.cpp/src/llama-sampling.h +19 -7
  48. package/src/llama.cpp/src/llama-vocab.cpp +101 -33
  49. package/src/llama.cpp/src/llama-vocab.h +2 -0
  50. package/src/llama.cpp/src/llama.cpp +87 -64
  51. package/src/llama.cpp/src/models/afmoe.cpp +9 -5
  52. package/src/llama.cpp/src/models/bert.cpp +4 -2
  53. package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
  54. package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
  55. package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
  56. package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
  57. package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
  58. package/src/llama.cpp/src/models/gemma3.cpp +3 -4
  59. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
  60. package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
  61. package/src/llama.cpp/src/models/llama.cpp +19 -6
  62. package/src/llama.cpp/src/models/maincoder.cpp +117 -0
  63. package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
  64. package/src/llama.cpp/src/models/models.h +18 -0
  65. package/src/llama.cpp/src/models/modern-bert.cpp +116 -0
  66. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
  67. package/src/llama.cpp/src/models/plamo3.cpp +128 -0
  68. package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
  69. package/src/llama.cpp/src/unicode.cpp +23 -14
package/src/llama.cpp/src/llama-context.h

@@ -70,6 +70,18 @@ struct llama_context {
  float * get_embeddings_ith(int32_t i);
  float * get_embeddings_seq(llama_seq_id seq_id);
 
+ llama_token * get_sampled_tokens() const;
+ llama_token get_sampled_token_ith(int32_t idx);
+
+ float * get_sampled_logits_ith(int32_t idx);
+ size_t get_sampled_logits_count(int32_t idx);
+
+ float * get_sampled_probs_ith(int32_t idx);
+ size_t get_sampled_probs_count(int32_t idx);
+
+ const llama_token * get_sampled_candidates_ith(int32_t idx);
+ size_t get_sampled_candidates_count(int32_t idx);
+
  void attach_threadpool(
  ggml_threadpool_t threadpool,
  ggml_threadpool_t threadpool_batch);

@@ -192,10 +204,13 @@ private:
 
  // Make sure enough space is available for outputs.
  // Returns max number of outputs for which space was reserved.
- uint32_t output_reserve(int32_t n_outputs);
+ uint32_t output_reserve(int32_t n_outputs, const llama_batch & batch);
 
  void output_reorder();
 
+ // map the output row index `i` to batch index
+ int64_t output_resolve_row(int32_t i) const;
+
  //
  // graph
  //

@@ -213,6 +228,8 @@ public:
  ggml_cgraph * graph_reserve(
  uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);
 
+ bool set_sampler(llama_seq_id seq_id, llama_sampler * sampler);
+
  private:
  llm_graph_params graph_params(
  llm_graph_result * res,

@@ -252,6 +269,31 @@ private:
  size_t embd_size = 0; // capacity (of floats) for embeddings
  float * embd = nullptr;
 
+ // TODO: simplify
+ struct sampling_info {
+ std::map<llama_seq_id, llama_sampler *> samplers;
+
+ float * logits = nullptr;
+ size_t logits_size = 0;
+
+ llama_token * sampled = nullptr;
+ size_t sampled_size = 0;
+
+ float * probs = nullptr;
+ size_t probs_size = 0;
+
+ llama_token * candidates = nullptr;
+ size_t candidates_size = 0;
+
+ std::vector<uint32_t> logits_count;
+ std::vector<uint32_t> probs_count;
+ std::vector<uint32_t> candidates_count;
+
+ std::vector<llama_token> token_ids_full_vocab;
+ };
+
+ sampling_info sampling;
+
  // sequence embeddings output (map of [n_embd] vectors)
  // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
  std::map<llama_seq_id, std::vector<float>> embd_seq;
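The new getters above expose one row of sampled data per output index, backed by the flat buffers and per-row counts kept in sampling_info. As a rough, self-contained illustration of one plausible reading of that layout (hypothetical names and simplified types, not the actual llama.cpp accessors), a flat buffer sliced by per-row counts can be read like this:

    // Hypothetical stand-in for the sampling_info bookkeeping: one flat float
    // buffer holding all output rows back to back, plus a per-row element count
    // used to slice it (the role played by probs/probs_count in the diff).
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct flat_rows {
        std::vector<float>    data;   // all rows, concatenated
        std::vector<uint32_t> count;  // number of elements in each row

        // pointer to the start of row `idx`, or nullptr if out of range
        const float * row(std::size_t idx) const {
            if (idx >= count.size()) {
                return nullptr;
            }
            std::size_t off = 0;
            for (std::size_t i = 0; i < idx; ++i) {
                off += count[i];
            }
            return data.data() + off;
        }

        std::size_t row_count(std::size_t idx) const {
            return idx < count.size() ? count[idx] : 0;
        }
    };

    int main() {
        flat_rows probs;
        probs.data  = { 0.7f, 0.2f, 0.1f,   0.9f, 0.1f };
        probs.count = { 3, 2 };

        const float * r1 = probs.row(1);
        std::printf("row 1 has %zu probs, first = %.1f\n", probs.row_count(1), r1[0]);
        return 0;
    }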
package/src/llama.cpp/src/llama-grammar.cpp

@@ -369,6 +369,44 @@ static void print_rule(
  fprintf(file, "\n");
  }
 
+ //
+ // Regex utilities
+ //
+
+ size_t llama_grammar_trigger_pattern::find(const std::string & input) const {
+ auto find_start_pos = [](const std::smatch & match) {
+ // get from the first matched capturing group to the end of the string
+ size_t start = std::string::npos;
+ for (auto i = 1u; i < match.size(); i++) {
+ if (match.length(i) > 0) {
+ start = match.position(i);
+ break;
+ }
+ }
+ if (start == std::string::npos) {
+ start = match.position(0);
+ }
+ return start;
+ };
+
+ if (!pattern.empty() && pattern.front() == '^' && pattern.back() == '$') {
+ // match against the entire input
+ std::smatch match;
+ if (std::regex_match(input, match, regex)) {
+ return find_start_pos(match);
+ }
+ }
+
+ // search anywhere
+ std::smatch match;
+ if (std::regex_search(input, match, regex)) {
+ return find_start_pos(match);
+ }
+
+ return std::string::npos;
+ }
+
+
  //
  // implementation
  //

@@ -1312,21 +1350,10 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
  grammar.trigger_buffer_positions.push_back(std::make_pair(token, position));
  grammar.trigger_buffer += piece;
 
- std::smatch match;
  for (const auto & trigger_pattern : grammar.trigger_patterns) {
- if (std::regex_match(grammar.trigger_buffer, match, trigger_pattern.regex)) {
+ auto start = trigger_pattern.find(grammar.trigger_buffer);
+ if (start != std::string::npos) {
  grammar.awaiting_trigger = false;
- // get from the first matched capturing group to the end of the string
- size_t start = std::string::npos;
- for (auto i = 1u; i < match.size(); i++) {
- if (match.length(i) > 0) {
- start = match.position(i);
- break;
- }
- }
- if (start == std::string::npos) {
- start = match.position(0);
- }
 
  // replay tokens that overlap with [start, end)
  for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) {
package/src/llama.cpp/src/llama-grammar.h

@@ -119,6 +119,8 @@ struct llama_grammar_parser {
  struct llama_grammar_trigger_pattern {
  std::string pattern;
  std::regex regex;
+
+ size_t find(const std::string & input) const;
  };
 
  struct llama_grammar {
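The new llama_grammar_trigger_pattern::find() tries an anchored whole-input match first when the pattern is of the form "^...$", otherwise searches anywhere, and returns the offset of the first non-empty capturing group (falling back to the whole match) so the trigger replay in llama_grammar_accept_impl can start from there. A small self-contained demo of that lookup behaviour (a standalone sketch, not the llama.cpp symbol):

    #include <cstddef>
    #include <cstdio>
    #include <regex>
    #include <string>

    // Sketch of the trigger lookup: anchored patterns are matched against the
    // entire input first, everything else is searched anywhere; the reported
    // offset is the start of the first non-empty capturing group, or of the
    // whole match if no group participated.
    static std::size_t trigger_find(const std::string & pattern, const std::string & input) {
        const std::regex re(pattern);

        auto start_pos = [](const std::smatch & m) -> std::size_t {
            for (std::size_t i = 1; i < m.size(); ++i) {
                if (m.length(i) > 0) {
                    return (std::size_t) m.position(i);
                }
            }
            return (std::size_t) m.position(0);
        };

        std::smatch m;
        if (!pattern.empty() && pattern.front() == '^' && pattern.back() == '$') {
            if (std::regex_match(input, m, re)) {
                return start_pos(m);
            }
        }
        if (std::regex_search(input, m, re)) {
            return start_pos(m);
        }
        return std::string::npos;
    }

    int main() {
        const std::string text = "some text<tool_call>{\"name\":\"f\"}";

        // non-anchored pattern: searched anywhere; the offset of the capturing
        // group (start of "<tool_call>") is returned, not 0
        const std::size_t pos = trigger_find("(<tool_call>)[\\s\\S]*", text);
        if (pos == std::string::npos) {
            std::printf("no trigger\n");
        } else {
            std::printf("trigger at offset %zu\n", pos); // prints 9
        }
        return 0;
    }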
package/src/llama.cpp/src/llama-graph.cpp

@@ -12,6 +12,7 @@
  #include <cassert>
  #include <cmath>
  #include <cstring>
+ #include <unordered_set>
 
  void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
  if (ubatch->token) {

@@ -32,7 +33,7 @@ bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) {
  bool res = true;
 
  res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
- res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[0] == params.ubatch.n_tokens);
+ res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[1] == params.ubatch.n_tokens);
 
  return res;
  }

@@ -62,7 +63,7 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
  bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) {
  bool res = true;
 
- res &= pos->ne[0] == params.ubatch.n_tokens;
+ res &= pos->ne[0] == params.ubatch.n_tokens*n_pos_per_embd;
 
  return res;
  }

@@ -521,6 +522,43 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
  return res;
  }
 
+ void llm_graph_input_sampling::set_input(const llama_ubatch * ubatch) {
+ // set the inputs only for the active samplers in the current ubatch
+ std::unordered_set<llama_seq_id> active_samplers;
+ for (uint32_t i = 0; i < ubatch->n_tokens; i++) {
+ if (ubatch->output[i]) {
+ llama_seq_id seq_id = ubatch->seq_id[i][0];
+ active_samplers.insert(seq_id);
+ }
+ }
+
+ for (auto seq_id : active_samplers) {
+ if (samplers.find(seq_id) == samplers.end()) {
+ continue;
+ }
+
+ auto & sampler = samplers[seq_id];
+
+ if (sampler->iface->backend_set_input) {
+ sampler->iface->backend_set_input(sampler);
+ }
+ }
+ }
+
+ bool llm_graph_input_sampling::can_reuse(const llm_graph_params & params) {
+ if (samplers.size() != params.samplers.size()) {
+ return false;
+ }
+
+ for (const auto & [seq_id, sampler] : params.samplers) {
+ if (samplers[seq_id] != sampler) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+
  //
  // llm_graph_result
  //

@@ -541,6 +579,10 @@ void llm_graph_result::reset() {
  t_logits = nullptr;
  t_embd = nullptr;
  t_embd_pooled = nullptr;
+ t_sampled.clear();
+ t_sampled_probs.clear();
+ t_sampled_logits.clear();
+ t_candidates.clear();
 
  params = {};
 

@@ -565,6 +607,38 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
  }
  }
 
+ void llm_graph_result::set_outputs() {
+ if (t_logits != nullptr) {
+ ggml_set_output(t_logits);
+ }
+ if (t_embd != nullptr) {
+ ggml_set_output(t_embd);
+ }
+ if (t_embd_pooled != nullptr) {
+ ggml_set_output(t_embd_pooled);
+ }
+ for (auto & [seq_id, t] : t_sampled) {
+ if (t != nullptr) {
+ ggml_set_output(t);
+ }
+ }
+ for (auto & [seq_id, t] : t_sampled_probs) {
+ if (t != nullptr) {
+ ggml_set_output(t);
+ }
+ }
+ for (auto & [seq_id, t] : t_sampled_logits) {
+ if (t != nullptr) {
+ ggml_set_output(t);
+ }
+ }
+ for (auto & [seq_id, t] : t_candidates) {
+ if (t != nullptr) {
+ ggml_set_output(t);
+ }
+ }
+ }
+
  bool llm_graph_result::can_reuse(const llm_graph_params & params) {
  if (!this->params.allow_reuse(params)) {
  if (debug > 1) {

@@ -646,6 +720,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
  loras (params.loras),
  mctx (params.mctx),
  cross (params.cross),
+ samplers (params.samplers),
  cb_func (params.cb),
  res (params.res),
  ctx0 (res->get_ctx()),

@@ -1251,6 +1326,10 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
 
  res->add_input(std::move(inp));
 
+ // make sure the produced embeddings are immediately materialized in the ggml graph
+ // ref: https://github.com/ggml-org/llama.cpp/pull/18599
+ ggml_build_forward_expand(gf, cur);
+
  return cur;
  }
 

@@ -1834,8 +1913,10 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
 
  inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
  ggml_set_input(inp->self_kq_mask);
+ ggml_set_name(inp->self_kq_mask, "self_kq_mask");
 
  inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+ ggml_set_name(inp->self_kq_mask_cnv, "self_kq_mask_cnv");
  }
 
  {

@@ -1848,8 +1929,10 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
 
  inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
  ggml_set_input(inp->self_kq_mask_swa);
+ ggml_set_name(inp->self_kq_mask_swa, "self_kq_mask_swa");
 
  inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
+ ggml_set_name(inp->self_kq_mask_swa_cnv, "self_kq_mask_swa_cnv");
  }
 
  return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));

@@ -1988,14 +2071,18 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
  void llm_graph_context::build_dense_out(
  ggml_tensor * dense_2,
  ggml_tensor * dense_3) const {
- if (!cparams.embeddings || dense_2 == nullptr || dense_3 == nullptr) {
+ if (!cparams.embeddings || !(dense_2 || dense_3)) {
  return;
  }
  ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd;
  GGML_ASSERT(cur != nullptr && "missing t_embd_pooled/t_embd");
 
- cur = ggml_mul_mat(ctx0, dense_2, cur);
- cur = ggml_mul_mat(ctx0, dense_3, cur);
+ if (dense_2) {
+ cur = ggml_mul_mat(ctx0, dense_2, cur);
+ }
+ if (dense_3) {
+ cur = ggml_mul_mat(ctx0, dense_3, cur);
+ }
  cb(cur, "result_embd_pooled", -1);
  res->t_embd_pooled = cur;
  ggml_build_forward_expand(gf, cur);

@@ -2086,6 +2173,87 @@ void llm_graph_context::build_pooling(
  ggml_build_forward_expand(gf, cur);
  }
 
+ void llm_graph_context::build_sampling() const {
+ if (samplers.empty() || !res->t_logits) {
+ return;
+ }
+
+ auto inp_sampling = std::make_unique<llm_graph_input_sampling>(samplers);
+ res->add_input(std::move(inp_sampling));
+
+ std::map<llama_seq_id, int32_t> seq_to_logit_row;
+ int32_t logit_row_idx = 0;
+
+ for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
+ if (ubatch.output[i]) {
+ llama_seq_id seq_id = ubatch.seq_id[i][0];
+ seq_to_logit_row[seq_id] = logit_row_idx;
+ logit_row_idx++;
+ }
+ }
+
+ // res->t_logits will contain logits for all tokens that want the logits calculated (logits=1 or output=1)
+ GGML_ASSERT(res->t_logits != nullptr && "missing t_logits tensor");
+
+ // add a dummy row of logits
+ // this trick makes the graph static, regardless of which samplers are activated
+ // this is important in order to minimize graph reallocations
+ // TODO: use `ggml_build_forward_select()` when available (https://github.com/ggml-org/llama.cpp/pull/18550)
+ ggml_tensor * logits_t = ggml_pad(ctx0, res->t_logits, 0, 1, 0, 0);
+
+ for (const auto & [seq_id, sampler] : samplers) {
+ const auto it = seq_to_logit_row.find(seq_id);
+
+ // inactive samplers always work on the first row
+ const auto row_idx = seq_to_logit_row.find(seq_id) != seq_to_logit_row.end() ? it->second : 0;
+
+ ggml_tensor * logits_seq = ggml_view_1d(ctx0, logits_t, logits_t->ne[0], row_idx * logits_t->nb[1]);
+ ggml_format_name(logits_seq, "logits_seq_%d", seq_id);
+
+ struct llama_sampler_data data = {
+ /*.logits =*/ logits_seq,
+ /*.probs =*/ nullptr,
+ /*.sampled =*/ nullptr,
+ /*.candidates =*/ nullptr,
+ };
+
+ assert(sampler->iface->backend_apply);
+ sampler->iface->backend_apply(sampler, ctx0, gf, &data);
+
+ if (data.sampled != nullptr) {
+ res->t_sampled[seq_id] = data.sampled;
+ ggml_build_forward_expand(gf, data.sampled);
+ }
+
+ if (data.probs != nullptr) {
+ res->t_sampled_probs[seq_id] = data.probs;
+ ggml_build_forward_expand(gf, data.probs);
+ }
+
+ if (data.logits != nullptr) {
+ res->t_sampled_logits[seq_id] = data.logits;
+ ggml_build_forward_expand(gf, data.logits);
+ }
+
+ if (data.candidates != nullptr) {
+ res->t_candidates[seq_id] = data.candidates;
+ ggml_build_forward_expand(gf, data.candidates);
+ }
+ }
+
+ // TODO: Call llama_sampler_accept_ggml after all samplers have been applied.
+ /*
+ for (const auto & [seq_id, sampler] : samplers) {
+ if (auto it = res->t_sampled.find(seq_id); it != res->t_sampled.end()) {
+ ggml_tensor * selected_token = it->second;
+ if (selected_token != nullptr) {
+ llama_sampler_accept_ggml(sampler, ctx0, gf, selected_token);
+ }
+ }
+ }
+ */
+ }
+
  int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
  // TODO move to hparams if a T5 variant appears that uses a different value
  const int64_t max_distance = 128;
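build_sampling() first maps each sequence that produced an output token in the current ubatch to its row in t_logits; a registered sampler whose sequence has no output in this ubatch is pointed at row 0 of the padded logits tensor, which keeps the graph topology identical regardless of which samplers are active. A compact standalone sketch of that row-resolution step (plain containers instead of the actual llama.cpp structures):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <vector>

    int main() {
        // per-token data of a tiny ubatch: owning sequence of each token and
        // whether the token's logits are requested as output
        const std::vector<int32_t> seq_id = { 0, 0, 1, 2 };
        const std::vector<int8_t>  output = { 0, 1, 1, 0 };

        // map each sequence with an output token to its row in the logits tensor
        std::map<int32_t, int32_t> seq_to_logit_row;
        int32_t row = 0;
        for (std::size_t i = 0; i < seq_id.size(); ++i) {
            if (output[i]) {
                seq_to_logit_row[seq_id[i]] = row++;
            }
        }

        // samplers are registered per sequence; one whose sequence produced no
        // output this ubatch falls back to row 0, so the graph stays static
        const std::vector<int32_t> sampler_seqs = { 0, 1, 2 };
        for (int32_t s : sampler_seqs) {
            const auto it = seq_to_logit_row.find(s);
            const int32_t r = it != seq_to_logit_row.end() ? it->second : 0;
            std::printf("sampler for seq %d reads logits row %d\n", s, r);
        }
        return 0;
    }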
package/src/llama.cpp/src/llama-graph.h

@@ -10,6 +10,7 @@
  #include <memory>
  #include <set>
  #include <functional>
+ #include <map>
 
  struct ggml_cgraph;
  struct ggml_context;

@@ -396,6 +397,18 @@ public:
  const llama_memory_hybrid_context * mctx;
  };
 
+ class llm_graph_input_sampling : public llm_graph_input_i {
+ public:
+ llm_graph_input_sampling(std::map<llama_seq_id, llama_sampler *> samplers) :
+ samplers(std::move(samplers)) { }
+ virtual ~llm_graph_input_sampling() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+ bool can_reuse(const llm_graph_params & params) override;
+
+ std::map<llama_seq_id, llama_sampler *> samplers;
+ };
+
  //
  // llm_graph_result
  //

@@ -429,6 +442,23 @@ struct llm_graph_params {
  const llama_memory_context_i * mctx;
  const llama_cross * cross;
 
+ std::map<llama_seq_id, llama_sampler *> samplers;
+
+ static bool samplers_equal(
+ const std::map<llama_seq_id, llama_sampler *> & lhs,
+ const std::map<llama_seq_id, llama_sampler *> & rhs) {
+ if (lhs.size() != rhs.size()) {
+ return false;
+ }
+ for (const auto & [seq_id, sampler] : lhs) {
+ auto it = rhs.find(seq_id);
+ if (it == rhs.end() || it->second != sampler) {
+ return false;
+ }
+ }
+ return true;
+ }
+
  uint32_t n_outputs;
 
  llm_graph_cb cb;

@@ -468,15 +498,36 @@ struct llm_graph_params {
  return false;
  }
 
+ if (n_outputs != other.n_outputs) {
+ return false;
+ }
+
+ if (!samplers_equal(samplers, other.samplers)) {
+ return false;
+ }
+
+ if (samplers.size() > 0) {
+ if (!ubatch.data || !other.ubatch.data) {
+ return false;
+ }
+
+ // check that the outputs are the same for all samplers
+ for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
+ if (ubatch.output[i] != other.ubatch.output[i] ||
+ ubatch.seq_id[i][0] != other.ubatch.seq_id[i][0]) {
+ return false;
+ }
+ }
+ }
+
  return
  cparams.embeddings == other.cparams.embeddings &&
  cparams.causal_attn == other.cparams.causal_attn &&
- arch == other.arch &&
- gtype == other.gtype &&
- cvec == other.cvec &&
- loras == other.loras &&
- cross == other.cross &&
- n_outputs == other.n_outputs;
+ arch == other.arch &&
+ gtype == other.gtype &&
+ cvec == other.cvec &&
+ loras == other.loras &&
+ cross == other.cross;
  }
  };
 

@@ -499,6 +550,7 @@ public:
  void reset();
 
  void set_inputs(const llama_ubatch * ubatch);
+ void set_outputs();
 
  // try to update the existing graph result using the new graph parameters in order to reuse it
  // this can only be done if we determine that the resulting graph using the new graph parameters

@@ -517,6 +569,11 @@ public:
  ggml_tensor * t_embd = nullptr;
  ggml_tensor * t_embd_pooled = nullptr;
 
+ std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
+ std::map<llama_seq_id, ggml_tensor*> t_candidates;
+ std::map<llama_seq_id, ggml_tensor*> t_sampled;
+ std::map<llama_seq_id, ggml_tensor*> t_sampled_probs;
+
  std::vector<llm_graph_input_ptr> inputs;
 
  ggml_context_ptr ctx_compute;

@@ -592,6 +649,8 @@ struct llm_graph_context {
  const llama_memory_context_i * mctx;
  const llama_cross * cross;
 
+ std::map<llama_seq_id, llama_sampler *> samplers;
+
  const llm_graph_cb & cb_func;
 
  llm_graph_result * res;

@@ -832,6 +891,12 @@ struct llm_graph_context {
  ggml_tensor * cls_out,
  ggml_tensor * cls_out_b) const;
 
+ //
+ // sampling (backend sampling)
+ //
+
+ void build_sampling() const;
+
  //
  // dense (out)
  //
package/src/llama.cpp/src/llama-hparams.cpp

@@ -72,6 +72,10 @@ uint32_t llama_hparams::n_embd_inp() const {
  return n_embd_inp;
  }
 
+ uint32_t llama_hparams::get_n_embd_out() const {
+ return n_embd_out > 0 ? n_embd_out : n_embd;
+ }
+
  uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
  const uint32_t n_head_kv = this->n_head_kv(il);
 
package/src/llama.cpp/src/llama-hparams.h

@@ -105,9 +105,9 @@ struct llama_hparams {
 
  float rope_attn_factor = 1.0f;
  float rope_freq_base_train;
- float rope_freq_base_train_swa;
+ float rope_freq_base_train_swa = 10000.0f;
  float rope_freq_scale_train;
- float rope_freq_scale_train_swa;
+ float rope_freq_scale_train_swa = 1.0f;
 
  uint32_t n_ctx_orig_yarn;
  float rope_yarn_log_mul = 0.0f;

@@ -123,10 +123,11 @@
  llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
  // the size of the sliding window (0 - no SWA)
  uint32_t n_swa = 0;
- // if swa_layers[il] == true, then layer il is SWA
- // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
+ // if swa_layers[il] == 1, then layer il is SWA
+ // if swa_layers[il] == 0, then layer il is dense (i.e. non-SWA)
  // by default, all layers are dense
- std::array<bool, LLAMA_MAX_LAYERS> swa_layers;
+ // note: using uint32_t type for compatibility reason
+ std::array<uint32_t, LLAMA_MAX_LAYERS> swa_layers;
 
  // for State Space Models
  uint32_t ssm_d_conv = 0;

@@ -161,6 +162,9 @@
  // for Classifiers
  uint32_t n_cls_out = 1;
 
+ // output embedding dimension (0 = use n_embd)
+ uint32_t n_embd_out = 0;
+
  // llama4 smallthinker
  uint32_t n_moe_layer_step = 0;
  uint32_t n_no_rope_layer_step = 4;

@@ -233,6 +237,9 @@
  // dimension of main + auxiliary input embeddings
  uint32_t n_embd_inp() const;
 
+ // dimension of output embeddings
+ uint32_t get_n_embd_out() const;
+
  // dimension of key embeddings across all k-v heads
  uint32_t n_embd_k_gqa(uint32_t il = 0) const;
 
package/src/llama.cpp/src/llama-kv-cache.h

@@ -305,7 +305,7 @@
  bool do_shift,
  stream_copy_info sc_info);
 
- // used to create a batch procesing context from a batch
+ // used to create a batch processing context from a batch
  llama_kv_cache_context(
  llama_kv_cache * kv,
  slot_info_vec_t sinfos,
package/src/llama.cpp/src/llama-mmap.cpp

@@ -240,9 +240,10 @@ struct llama_file::impl {
  throw std::runtime_error("unexpectedly reached end of file");
  }
  } else {
- bool successful = false;
- while (!successful) {
- off_t ret = read(fd, ptr, len);
+ size_t bytes_read = 0;
+ while (bytes_read < len) {
+ const size_t to_read = len - bytes_read;
+ ssize_t ret = ::read(fd, reinterpret_cast<char *>(ptr) + bytes_read, to_read);
 
  if (ret == -1) {
  if (errno == EINTR) {

@@ -251,10 +252,16 @@ struct llama_file::impl {
  throw std::runtime_error(format("read error: %s", strerror(errno)));
  }
  if (ret == 0) {
+ // EOF: allow if this read was only pulling alignment padding past file end
+ off_t pos = lseek(fd, 0, SEEK_CUR);
+ if (pos != -1 && (size_t) pos == size) {
+ std::memset(reinterpret_cast<char *>(ptr) + bytes_read, 0, len - bytes_read);
+ return;
+ }
  throw std::runtime_error("unexpectedly reached end of file");
  }
 
- successful = true;
+ bytes_read += (size_t) ret;
  }
  }
  }
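The rewritten loop replaces a single read() call, which may legally return fewer bytes than requested, with an accumulating loop that retries on EINTR and treats EOF as acceptable only when the remaining bytes are alignment padding past the end of the file. A self-contained sketch of the same short-read/EINTR pattern (read_exact is an illustrative helper without the padding special case, not the llama.cpp function):

    #include <cerrno>
    #include <cstddef>
    #include <cstdio>
    #include <cstring>
    #include <stdexcept>
    #include <string>

    #include <fcntl.h>
    #include <unistd.h>

    // Read exactly `len` bytes from `fd`, accumulating partial reads and
    // retrying when the call is interrupted by a signal (EINTR).
    static void read_exact(int fd, void * ptr, std::size_t len) {
        std::size_t bytes_read = 0;
        while (bytes_read < len) {
            const ssize_t ret = ::read(fd, static_cast<char *>(ptr) + bytes_read, len - bytes_read);
            if (ret == -1) {
                if (errno == EINTR) {
                    continue; // interrupted, just retry
                }
                throw std::runtime_error(std::string("read error: ") + std::strerror(errno));
            }
            if (ret == 0) {
                throw std::runtime_error("unexpectedly reached end of file");
            }
            bytes_read += (std::size_t) ret;
        }
    }

    int main(int argc, char ** argv) {
        if (argc < 2) {
            std::fprintf(stderr, "usage: %s <file>\n", argv[0]);
            return 1;
        }
        const int fd = ::open(argv[1], O_RDONLY);
        if (fd == -1) {
            std::perror("open");
            return 1;
        }
        char buf[16];
        try {
            read_exact(fd, buf, sizeof(buf));
            std::printf("read %zu bytes\n", sizeof(buf));
        } catch (const std::exception & e) {
            std::fprintf(stderr, "%s\n", e.what());
        }
        ::close(fd);
        return 0;
    }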
package/src/llama.cpp/src/llama-model-loader.cpp

@@ -462,6 +462,29 @@ namespace GGUFMeta {
  return get_key_or_arr(llm_kv(kid), result, n, required);
  }
 
+ bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
+ const std::string key = llm_kv(kid);
+
+ const int id = gguf_find_key(meta.get(), key.c_str());
+
+ if (id < 0) {
+ if (required) {
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+ }
+ return false;
+ }
+
+ // throw and error if type is an array
+ if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) {
+ if (required) {
+ throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
+ }
+ return false;
+ }
+
+ return get_key(key, result, required);
+ }
+
  // TODO: this is not very clever - figure out something better
  template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
  template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
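The new scalar overload of get_key_or_arr() rejects array-valued keys up front and otherwise defers to get_key(). A hedged standalone sketch of the same check written directly against the public gguf API (the helper name and the key used in main are illustrative, and the header path may differ between ggml versions):

    #include <cstdint>
    #include <cstdio>
    #include <stdexcept>
    #include <string>

    #include "gguf.h" // public gguf API bundled with ggml (older trees expose it via ggml.h)

    // Fetch `key` as a scalar uint32, failing (or returning false) if the key is
    // missing, stored as an array, or not a UINT32 scalar. The real loader also
    // converts between compatible integer widths; this sketch does not.
    static bool get_scalar_u32(const gguf_context * ctx, const std::string & key, uint32_t & out, bool required) {
        const int64_t id = gguf_find_key(ctx, key.c_str());
        if (id < 0) {
            if (required) {
                throw std::runtime_error("key not found in model: " + key);
            }
            return false;
        }
        const gguf_type type = gguf_get_kv_type(ctx, id);
        if (type == GGUF_TYPE_ARRAY) {
            if (required) {
                throw std::runtime_error("expected scalar, found array for key: " + key);
            }
            return false;
        }
        if (type != GGUF_TYPE_UINT32) {
            return false;
        }
        out = gguf_get_val_u32(ctx, id);
        return true;
    }

    int main(int argc, char ** argv) {
        if (argc < 2) {
            return 1;
        }
        gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ nullptr };
        gguf_context * ctx = gguf_init_from_file(argv[1], params);
        if (ctx == nullptr) {
            return 1;
        }
        uint32_t n_embd = 0;
        if (get_scalar_u32(ctx, "llama.embedding_length", n_embd, /*required =*/ false)) {
            std::printf("llama.embedding_length = %u\n", n_embd);
        }
        gguf_free(ctx);
        return 0;
    }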