@fugood/llama.node 1.4.6 → 1.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +15 -15
  3. package/scripts/llama.cpp.patch +25 -26
  4. package/src/LlamaContext.cpp +2 -2
  5. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  6. package/src/llama.cpp/common/arg.cpp +364 -193
  7. package/src/llama.cpp/common/arg.h +43 -2
  8. package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +36 -18
  9. package/src/llama.cpp/common/chat-parser-xml-toolcall.h +1 -1
  10. package/src/llama.cpp/common/chat-parser.cpp +3 -2
  11. package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
  12. package/src/llama.cpp/common/chat.cpp +272 -0
  13. package/src/llama.cpp/common/common.cpp +130 -67
  14. package/src/llama.cpp/common/common.h +40 -16
  15. package/src/llama.cpp/common/console.cpp +680 -47
  16. package/src/llama.cpp/common/console.h +30 -8
  17. package/src/llama.cpp/common/download.cpp +69 -25
  18. package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
  19. package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
  20. package/src/llama.cpp/common/log.cpp +5 -0
  21. package/src/llama.cpp/common/log.h +1 -0
  22. package/src/llama.cpp/common/peg-parser.cpp +1 -1
  23. package/src/llama.cpp/common/preset.cpp +206 -0
  24. package/src/llama.cpp/common/preset.h +32 -0
  25. package/src/llama.cpp/common/sampling.cpp +91 -92
  26. package/src/llama.cpp/common/sampling.h +11 -6
  27. package/src/llama.cpp/common/speculative.cpp +1 -1
  28. package/src/llama.cpp/ggml/CMakeLists.txt +5 -0
  29. package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
  30. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
  31. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  32. package/src/llama.cpp/ggml/include/ggml.h +7 -8
  33. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +3 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +69 -39
  37. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
  39. package/src/llama.cpp/include/llama.h +18 -1
  40. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  41. package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
  42. package/src/llama.cpp/src/llama-arch.h +9 -2
  43. package/src/llama.cpp/src/llama-batch.cpp +12 -2
  44. package/src/llama.cpp/src/llama-batch.h +4 -2
  45. package/src/llama.cpp/src/llama-context.cpp +99 -29
  46. package/src/llama.cpp/src/llama-context.h +9 -3
  47. package/src/llama.cpp/src/llama-grammar.cpp +233 -33
  48. package/src/llama.cpp/src/llama-grammar.h +20 -1
  49. package/src/llama.cpp/src/llama-graph.cpp +85 -17
  50. package/src/llama.cpp/src/llama-graph.h +17 -4
  51. package/src/llama.cpp/src/llama-hparams.cpp +6 -0
  52. package/src/llama.cpp/src/llama-hparams.h +5 -1
  53. package/src/llama.cpp/src/llama-impl.cpp +4 -0
  54. package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
  55. package/src/llama.cpp/src/llama-kv-cache.h +19 -2
  56. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
  57. package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
  58. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  59. package/src/llama.cpp/src/llama-model.cpp +123 -52
  60. package/src/llama.cpp/src/llama-model.h +1 -0
  61. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  62. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
  63. package/src/llama.cpp/src/llama.cpp +675 -1
  64. package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
  65. package/src/llama.cpp/src/models/{gemma3-iswa.cpp → gemma3.cpp} +30 -5
  66. package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
  67. package/src/llama.cpp/src/models/glm4.cpp +27 -4
  68. package/src/llama.cpp/src/models/models.h +8 -7
  69. package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
  70. package/src/llama.cpp/src/models/qwen2.cpp +12 -3
  71. package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
package/src/llama.cpp/common/sampling.cpp
@@ -104,9 +104,10 @@ struct ring_buffer {
 struct common_sampler {
     common_params_sampling params;
 
-    struct llama_sampler * grmr;
     struct llama_sampler * chain;
 
+    bool grammar;
+
     ring_buffer<llama_token> prev;
 
     std::vector<llama_token_data> cur;
@@ -116,7 +117,6 @@ struct common_sampler {
     void reset() {
         prev.clear();
 
-        llama_sampler_reset(grmr);
         llama_sampler_reset(chain);
     }
 
@@ -167,10 +167,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 
     lparams.no_perf = params.no_perf;
 
-    struct llama_sampler * grmr;
+    llama_sampler * chain = llama_sampler_chain_init(lparams);
+
+    bool grammar = false;
+    std::vector<llama_sampler *> samplers;
+
     if (params.grammar.compare(0, 11, "%llguidance") == 0) {
 #ifdef LLAMA_USE_LLGUIDANCE
-        grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
+        samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()));
+        grammar = true;
 #else
         GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
@@ -217,30 +222,23 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
             trigger_patterns_c.push_back(regex.c_str());
         }
 
-        grmr = params.grammar_lazy
-             ? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
-                   trigger_patterns_c.data(), trigger_patterns_c.size(),
-                   trigger_tokens.data(), trigger_tokens.size())
-             : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
-        if (!grmr) {
-            return nullptr;
+        if (!params.grammar.empty()) {
+            if (params.grammar_lazy) {
+                samplers.push_back(
+                    llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                        trigger_patterns_c.data(), trigger_patterns_c.size(),
+                        trigger_tokens.data(), trigger_tokens.size()));
+            } else {
+                samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"));
+            }
+
+            grammar = true;
         }
     }
 
-    auto * result = new common_sampler {
-        /* .params = */ params,
-        /* .grmr   = */ grmr,
-        /* .chain  = */ llama_sampler_chain_init(lparams),
-        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
-        /* .cur    = */ {},
-        /* .cur_p  = */ {},
-    };
-
-    llama_sampler_chain_add(result->chain,
-        llama_sampler_init_logit_bias(
-            llama_vocab_n_tokens(vocab),
-            params.logit_bias.size(),
-            params.logit_bias.data()));
+    if (params.has_logit_bias()) {
+        samplers.push_back(llama_sampler_init_logit_bias(llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data()));
+    }
 
     if (params.mirostat == 0) {
         for (const auto & cnstr : params.samplers) {
@@ -253,58 +251,70 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                             c_breakers.push_back(str.c_str());
                         }
 
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                        samplers.push_back(llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                     }
                     break;
                 case COMMON_SAMPLER_TYPE_TOP_K:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
+                    samplers.push_back(llama_sampler_init_top_k (params.top_k));
                     break;
                 case COMMON_SAMPLER_TYPE_TOP_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
+                    samplers.push_back(llama_sampler_init_top_p (params.top_p, params.min_keep));
                     break;
                 case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
+                    samplers.push_back(llama_sampler_init_top_n_sigma(params.top_n_sigma));
                     break;
                 case COMMON_SAMPLER_TYPE_MIN_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
+                    samplers.push_back(llama_sampler_init_min_p (params.min_p, params.min_keep));
                     break;
                 case COMMON_SAMPLER_TYPE_XTC:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                    samplers.push_back(llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
                     break;
                 case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
+                    samplers.push_back(llama_sampler_init_typical (params.typ_p, params.min_keep));
                    break;
                 case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                    samplers.push_back(llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                     break;
                 case COMMON_SAMPLER_TYPE_INFILL:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
+                    samplers.push_back(llama_sampler_init_infill (vocab));
                     break;
                 case COMMON_SAMPLER_TYPE_PENALTIES:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                    samplers.push_back(llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
                     break;
                 default:
                     GGML_ASSERT(false && "unknown sampler type");
             }
         }
-        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
+
+        samplers.push_back(llama_sampler_init_dist(params.seed));
     } else if (params.mirostat == 1) {
-        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
+        samplers.push_back(llama_sampler_init_temp(params.temp));
+        samplers.push_back(llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
    } else if (params.mirostat == 2) {
-        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
+        samplers.push_back(llama_sampler_init_temp(params.temp));
+        samplers.push_back(llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
    } else {
        GGML_ASSERT(false && "unknown mirostat version");
    }
 
+    for (auto * smpl : samplers) {
+        llama_sampler_chain_add(chain, smpl);
+    }
+
+    auto * result = new common_sampler {
+        /* .params  = */ params,
+        /* .chain   = */ chain,
+        /* .grammar = */ grammar,
+        /* .prev    = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
+        /* .cur     = */ {},
+        /* .cur_p   = */ {},
+    };
+
     return result;
 }
 
 void common_sampler_free(struct common_sampler * gsmpl) {
     if (gsmpl) {
-        llama_sampler_free(gsmpl->grmr);
-
         llama_sampler_free(gsmpl->chain);
 
         delete gsmpl;
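Note: a minimal usage sketch (not part of the package) of the reworked constructor. The grammar constraint is now simply the first sampler in the single chain, so callers only manage the common_sampler handle; the GBNF string below is illustrative.

    #include "common.h"
    #include "sampling.h"

    // build a sampler whose output is constrained by a (toy) GBNF grammar
    static common_sampler * make_constrained_sampler(const llama_model * model) {
        common_params_sampling sparams;                   // library defaults for temp, top-k, top-p, ...
        sparams.grammar = "root ::= \"yes\" | \"no\"";    // illustrative grammar string
        // grammar, logit bias and the regular samplers all end up in one llama_sampler chain
        return common_sampler_init(model, sparams);
    }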
@@ -314,11 +324,24 @@ void common_sampler_free(struct common_sampler * gsmpl) {
 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
     const auto tm = gsmpl->tm();
 
-    if (accept_grammar) {
-        llama_sampler_accept(gsmpl->grmr, token);
-    }
+    if (gsmpl->grammar) {
+        const int n_smpl = llama_sampler_chain_n(gsmpl->chain);
 
-    llama_sampler_accept(gsmpl->chain, token);
+        for (int i = 0; i < n_smpl; i++) {
+            auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
+
+            // the grammar sampler is always the first one
+            if (i == 0) {
+                if (accept_grammar) {
+                    llama_sampler_accept(smpl, token);
+                }
+            } else {
+                llama_sampler_accept(smpl, token);
+            }
+        }
+    } else {
+        llama_sampler_accept(gsmpl->chain, token);
+    }
 
     gsmpl->prev.push_back(token);
 }
@@ -329,12 +352,12 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
 
 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
     return new common_sampler {
-        /* .params = */ gsmpl->params,
-        /* .grmr   = */ llama_sampler_clone(gsmpl->grmr),
-        /* .chain  = */ llama_sampler_clone(gsmpl->chain),
-        /* .prev   = */ gsmpl->prev,
-        /* .cur    = */ gsmpl->cur,
-        /* .cur_p  = */ gsmpl->cur_p,
+        /* .params  = */ gsmpl->params,
+        /* .chain   = */ llama_sampler_clone(gsmpl->chain),
+        /* .grammar = */ gsmpl->grammar,
+        /* .prev    = */ gsmpl->prev,
+        /* .cur     = */ gsmpl->cur,
+        /* .cur_p   = */ gsmpl->cur_p,
     };
 }
 
@@ -383,58 +406,33 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
     }
 }
 
-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
+    return gsmpl->chain;
+}
+
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
     llama_synchronize(ctx);
 
     // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
     const auto tm = gsmpl->tm();
 
-    gsmpl->set_logits(ctx, idx);
+    llama_token id = LLAMA_TOKEN_NULL;
 
-    auto & grmr  = gsmpl->grmr;
     auto & chain = gsmpl->chain;
     auto & cur_p = gsmpl->cur_p; // initialized by set_logits
 
-    if (grammar_first) {
-        llama_sampler_apply(grmr, &cur_p);
-    }
+    gsmpl->set_logits(ctx, idx);
 
     llama_sampler_apply(chain, &cur_p);
 
     GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
 
-    const llama_token id = cur_p.data[cur_p.selected].id;
-
-    if (grammar_first) {
-        return id;
-    }
-
-    // check if it the sampled token fits the grammar
-    {
-        llama_token_data single_token_data = { id, 1.0f, 0.0f };
-        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
-
-        llama_sampler_apply(grmr, &single_token_data_array);
-
-        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
-        if (is_valid) {
-            return id;
-        }
-    }
-
-    // resampling:
-    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
-    gsmpl->set_logits(ctx, idx);
-
-    llama_sampler_apply(grmr, &cur_p);
-    llama_sampler_apply(chain, &cur_p);
-
-    GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
+    id = cur_p.data[cur_p.selected].id;
 
-    return cur_p.data[cur_p.selected].id;
+    return id;
 }
 
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft) {
     GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
 
     std::vector<llama_token> result;
@@ -442,7 +440,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
 
     size_t i = 0;
     for (; i < draft.size(); i++) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
 
         common_sampler_accept(gsmpl, id, true);
 
@@ -454,7 +452,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
     }
 
     if (i == draft.size()) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
 
         common_sampler_accept(gsmpl, id, true);
 
@@ -464,13 +462,13 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
     return result;
 }
 
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) {
     std::vector<int> idxs(draft.size() + 1);
     for (size_t i = 0; i < idxs.size(); ++i) {
         idxs[i] = i;
     }
 
-    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
+    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
 }
 
 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
@@ -515,7 +513,8 @@ std::string common_sampler_print(const struct common_sampler * gsmpl) {
 
     for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
         const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
-        result += std::string("-> ") + llama_sampler_name(smpl) + " ";
+        result += std::string("-> ");
+        result += std::string(llama_sampler_name(smpl)) + " ";
     }
 
     return result;
package/src/llama.cpp/common/sampling.h
@@ -48,6 +48,8 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
 // arguments can be nullptr to skip printing
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
 
+struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
+
 // extended sampling implementation:
 //
 // - set logits
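A short hedged sketch of what the new accessor enables: reaching the underlying llama_sampler chain, for example to enumerate its members. It assumes an already initialized common_sampler.

    #include <cstdio>
    #include "sampling.h"

    static void dump_chain(const common_sampler * gsmpl) {
        llama_sampler * chain = common_sampler_get(gsmpl);
        for (int i = 0; i < llama_sampler_chain_n(chain); i++) {
            // print each sampler's name (e.g. "logit-bias", "top-k", "dist")
            printf("%d: %s\n", i, llama_sampler_name(llama_sampler_chain_get(chain, i)));
        }
    }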
@@ -55,10 +57,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
 // - check if the token fits the grammar (if any)
 // - if not: resample by first applying the grammar constraints and then sampling again (slower path)
 //
-// if grammar_first is true, the grammar is applied before the samplers (slower)
-// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
-//
-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);
 
 // generalized version of common_sampler_sample
 //
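With the grammar_first flag gone, the common generation loop reduces to sample-then-accept. A hedged sketch follows (the helper name and n_predict bound are illustrative; it assumes the prompt has already been decoded):

    static void generate_n(common_sampler * gsmpl, llama_context * ctx, const llama_vocab * vocab, int n_predict) {
        for (int i = 0; i < n_predict; i++) {
            const llama_token id = common_sampler_sample(gsmpl, ctx, /* idx = */ -1); // -1: logits of the last output
            common_sampler_accept(gsmpl, id, /* accept_grammar = */ true);

            if (llama_vocab_is_eog(vocab, id)) {
                break; // end-of-generation token
            }
            // feed `id` back through llama_decode() before the next iteration
        }
    }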
@@ -76,10 +75,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
 //
 // returns at least 1 token, up to idxs.size()
 //
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);
 
 // assume idxs == [ 0, 1, 2, ..., draft.size() ]
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);
 
 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
 
@@ -107,3 +106,9 @@ std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std:
 
 llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
     const char * grammar_kind, const char * grammar_data);
+
+struct common_sampler_deleter {
+    void operator()(common_sampler * s) { common_sampler_free(s); }
+};
+
+typedef std::unique_ptr<common_sampler, common_sampler_deleter> common_sampler_ptr;
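A hedged sketch of the new RAII helper: wrapping the raw handle in common_sampler_ptr makes common_sampler_free run automatically when the owner goes out of scope.

    #include <memory>
    #include "sampling.h"

    static void with_scoped_sampler(const llama_model * model, const common_params_sampling & sparams) {
        common_sampler_ptr smpl(common_sampler_init(model, sparams));
        if (!smpl) {
            return; // initialization failed
        }
        // use smpl.get() with common_sampler_sample / common_sampler_accept here
    }   // common_sampler_free(smpl.get()) runs here via common_sampler_deleter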
package/src/llama.cpp/common/speculative.cpp
@@ -315,7 +315,7 @@ llama_tokens common_speculative_gen_draft(
     for (int i = 0; i < params.n_draft; ++i) {
         common_batch_clear(batch);
 
-        common_sampler_sample(smpl, ctx_dft, 0, true);
+        common_sampler_sample(smpl, ctx_dft, 0);
 
         const auto * cur_p = common_sampler_get_candidates(smpl, true);
 
package/src/llama.cpp/ggml/CMakeLists.txt
@@ -54,6 +54,10 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
     # TODO
 else()
     set(GGML_STANDALONE OFF)
+
+    if (NOT CMAKE_RUNTIME_OUTPUT_DIRECTORY)
+        set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+    endif()
 endif()
 
 if (EMSCRIPTEN)
@@ -168,6 +172,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
 option(GGML_RV_ZFH "ggml: enable riscv zfh" ON)
 option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON)
 option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
+option(GGML_RV_ZIHINTPAUSE "ggml: enable riscv zihintpause" ON)
 option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
 option(GGML_VXE "ggml: enable vxe" ${GGML_NATIVE})
 
package/src/llama.cpp/ggml/include/ggml-alloc.h
@@ -53,7 +53,14 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
 // call with a worst-case graph to avoid buffer reallocations
 // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
 // returns false if the buffer allocation failed
+// ggml_gallocr_resrve_n_size writes the buffer sizes per galloc buffer that would be allocated by ggml_gallocr_reserve_n to sizes
 GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API void ggml_gallocr_reserve_n_size(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids,
+    size_t * sizes);
 GGML_API bool ggml_gallocr_reserve_n(
     ggml_gallocr_t galloc,
     struct ggml_cgraph * graph,
@@ -68,6 +75,8 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i
 
 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
+// ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft
+GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
 
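A hedged sketch of the size-query pattern the new helper allows: ask how large the tensor buffer would be before committing to the allocation.

    #include <cstdio>
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    static ggml_backend_buffer_t alloc_with_report(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
        const size_t needed = ggml_backend_alloc_ctx_tensors_from_buft_size(ctx, buft);
        fprintf(stderr, "tensors in this context need %zu bytes\n", needed);

        // the actual allocation path is unchanged
        return ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
    }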
package/src/llama.cpp/ggml/include/ggml-backend.h
@@ -307,6 +307,7 @@ extern "C" {
     GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
 
     // Initialize backend buffers from a measure graph
+    GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes);
     GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
 
     GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
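A hedged sketch of how the new scheduler entry point could be used to measure compute-buffer requirements without reserving; that `sizes` is indexed per scheduler backend is an assumption here.

    #include <vector>
    #include "ggml-backend.h"

    static std::vector<size_t> measure_compute_buffers(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
        std::vector<size_t> sizes(ggml_backend_sched_get_n_backends(sched), 0);
        ggml_backend_sched_reserve_size(sched, measure_graph, sizes.data());
        return sizes; // sizes[i]: compute buffer size the i-th backend would need (assumed indexing)
    }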
package/src/llama.cpp/ggml/include/ggml-cpu.h
@@ -99,6 +99,7 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_sme (void);
     // other
     GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
+    GGML_BACKEND_API int ggml_cpu_get_rvv_vlen (void); // risc-v vector length in bytes
     GGML_BACKEND_API int ggml_cpu_has_vsx (void);
     GGML_BACKEND_API int ggml_cpu_has_vxe (void);
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
package/src/llama.cpp/ggml/include/ggml.h
@@ -2305,13 +2305,11 @@ extern "C" {
             float stop,
             float step);
 
-#define GGML_KQ_MASK_PAD 1
-
-    // q:    [n_embd_k, n_batch,     n_head,    ne3 ]
-    // k:    [n_embd_k, n_kv,        n_head_kv, ne3 ]
-    // v:    [n_embd_v, n_kv,        n_head_kv, ne3 ] !! not transposed !!
-    // mask: [n_kv,     n_batch_pad, ne32,      ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
-    // res:  [n_embd_v, n_head,      n_batch,   ne3 ] !! permuted !!
+    // q:    [n_embd_k, n_batch,   n_head,    ne3 ]
+    // k:    [n_embd_k, n_kv,      n_head_kv, ne3 ]
+    // v:    [n_embd_v, n_kv,      n_head_kv, ne3 ] !! not transposed !!
+    // mask: [n_kv,     n_batch,   ne32,      ne33]
+    // res:  [n_embd_v, n_head,    n_batch,   ne3 ] !! permuted !!
     //
     // broadcast:
     //   n_head % n_head_kv == 0
@@ -2617,7 +2615,8 @@ extern "C" {
 
     // Set callback for all future logging events.
     // If this is not called, or NULL is supplied, everything is output on stderr.
-    GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
+    GGML_API void ggml_log_get(ggml_log_callback * log_callback, void ** user_data);
+    GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
 
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
 
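A hedged sketch of the save/restore pattern the new getter makes possible; quiet_logger and noisy_fn are hypothetical.

    #include "ggml.h"

    static void quiet_logger(enum ggml_log_level /*level*/, const char * /*text*/, void * /*user_data*/) {
        // hypothetical callback: drop all log output
    }

    static void run_silently(void (*noisy_fn)(void)) {
        ggml_log_callback prev_cb = nullptr;
        void *            prev_ud = nullptr;
        ggml_log_get(&prev_cb, &prev_ud);     // save the current callback

        ggml_log_set(quiet_logger, nullptr);  // temporarily silence ggml logging
        noisy_fn();
        ggml_log_set(prev_cb, prev_ud);       // restore the previous callback
    }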
package/src/llama.cpp/ggml/src/CMakeLists.txt
@@ -386,6 +386,9 @@ if (GGML_CPU_ALL_VARIANTS)
         ggml_add_cpu_backend_variant(android_armv8.2_1 DOTPROD)
         ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
         ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
+        ggml_add_cpu_backend_variant(android_armv9.0_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE2)
+        ggml_add_cpu_backend_variant(android_armv9.2_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SME)
+        ggml_add_cpu_backend_variant(android_armv9.2_2 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SME)
     elseif (APPLE)
         ggml_add_cpu_backend_variant(apple_m1 DOTPROD)
         ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8)
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
@@ -469,6 +469,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             if (GGML_RV_ZICBOP)
                 string(APPEND MARCH_STR "_zicbop")
             endif()
+            if (GGML_RV_ZIHINTPAUSE)
+                string(APPEND MARCH_STR "_zihintpause")
+            endif()
             list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
         else()
             # Begin with the lowest baseline
package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp
@@ -24,6 +24,7 @@
 
 #define UNUSED GGML_UNUSED
 
+#if defined(__aarch64__) && defined(__ARM_NEON) && (defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_FEATURE_DOTPROD))
 static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
                                              int16x8_t * out_mins,
                                              int8_t * out_scales) {
@@ -46,6 +47,7 @@ static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
     scales_u32[1] = (sm[2] & kmask2) | (((sm[0] >> 6) & kmask3) << 4);
     memcpy(out_scales, scales_u32, 8);
 }
+#endif
 
 void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(QK8_0 == 32);