@fugood/llama.node 1.4.8 → 1.4.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/lib/binding.ts +43 -0
  2. package/lib/parallel.js +26 -0
  3. package/lib/parallel.ts +33 -0
  4. package/package.json +15 -15
  5. package/scripts/llama.cpp.patch +12 -14
  6. package/src/LlamaCompletionWorker.cpp +3 -1
  7. package/src/LlamaCompletionWorker.h +2 -0
  8. package/src/LlamaContext.cpp +16 -1
  9. package/src/LlamaContext.h +3 -0
  10. package/src/llama.cpp/common/CMakeLists.txt +4 -4
  11. package/src/llama.cpp/common/arg.cpp +159 -42
  12. package/src/llama.cpp/common/arg.h +10 -1
  13. package/src/llama.cpp/common/common.cpp +1 -1
  14. package/src/llama.cpp/common/common.h +6 -2
  15. package/src/llama.cpp/common/preset.cpp +197 -5
  16. package/src/llama.cpp/common/preset.h +45 -3
  17. package/src/llama.cpp/common/sampling.cpp +51 -37
  18. package/src/llama.cpp/common/sampling.h +6 -3
  19. package/src/llama.cpp/common/speculative.cpp +1 -1
  20. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +283 -0
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
  24. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +51 -6
  25. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +286 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
  28. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
  29. package/src/llama.cpp/src/llama-arch.cpp +1 -1
  30. package/src/llama.cpp/src/llama-mmap.cpp +123 -28
  31. package/src/llama.cpp/src/llama-mmap.h +5 -1
  32. package/src/llama.cpp/src/llama-model-loader.cpp +56 -13
  33. package/src/llama.cpp/src/llama-model.cpp +7 -5
  34. package/src/llama.cpp/src/llama-sampling.cpp +16 -0
  35. package/src/llama.cpp/src/llama.cpp +22 -32

package/src/llama.cpp/common/preset.cpp +197 -5

@@ -2,6 +2,7 @@
 #include "preset.h"
 #include "peg-parser.h"
 #include "log.h"
+#include "download.h"
 
 #include <fstream>
 #include <sstream>
@@ -15,11 +16,22 @@ static std::string rm_leading_dashes(const std::string & str) {
     return str.substr(pos);
 }
 
-std::vector<std::string> common_preset::to_args() const {
+std::vector<std::string> common_preset::to_args(const std::string & bin_path) const {
     std::vector<std::string> args;
 
+    if (!bin_path.empty()) {
+        args.push_back(bin_path);
+    }
+
     for (const auto & [opt, value] : options) {
-        args.push_back(opt.args.back()); // use the last arg as the main arg
+        if (opt.is_preset_only) {
+            continue; // skip preset-only options (they are not CLI args)
+        }
+
+        // use the last arg as the main arg (i.e. --long-form)
+        args.push_back(opt.args.back());
+
+        // handle value(s)
         if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
             // flag option, no value
             if (common_arg_utils::is_falsey(value)) {
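
A note on the reworked to_args() above: a non-empty bin_path becomes the first element of the returned vector (so the result can be used directly as an argv), and options marked is_preset_only never turn into CLI flags. A rough sketch of the expected output, using a hypothetical preset that only sets a model path:

// sketch only: assuming the preset maps the model argument (long form --model) to "/models/demo.gguf"
//   preset.to_args()               -> { "--model", "/models/demo.gguf" }
//   preset.to_args("llama-server") -> { "llama-server", "--model", "/models/demo.gguf" }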

@@ -63,6 +75,52 @@ std::string common_preset::to_ini() const {
     return ss.str();
 }
 
+void common_preset::set_option(const common_preset_context & ctx, const std::string & env, const std::string & value) {
+    // try if option exists, update it
+    for (auto & [opt, val] : options) {
+        if (opt.env && env == opt.env) {
+            val = value;
+            return;
+        }
+    }
+    // if option does not exist, we need to add it
+    if (ctx.key_to_opt.find(env) == ctx.key_to_opt.end()) {
+        throw std::runtime_error(string_format(
+            "%s: option with env '%s' not found in ctx_params",
+            __func__, env.c_str()
+        ));
+    }
+    options[ctx.key_to_opt.at(env)] = value;
+}
+
+void common_preset::unset_option(const std::string & env) {
+    for (auto it = options.begin(); it != options.end(); ) {
+        const common_arg & opt = it->first;
+        if (opt.env && env == opt.env) {
+            it = options.erase(it);
+            return;
+        } else {
+            ++it;
+        }
+    }
+}
+
+bool common_preset::get_option(const std::string & env, std::string & value) const {
+    for (const auto & [opt, val] : options) {
+        if (opt.env && env == opt.env) {
+            value = val;
+            return true;
+        }
+    }
+    return false;
+}
+
+void common_preset::merge(const common_preset & other) {
+    for (const auto & [opt, val] : other.options) {
+        options[opt] = val; // overwrite existing options
+    }
+}
+
 static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_file(const std::string & path) {
     std::map<std::string, std::map<std::string, std::string>> parsed;
 
@@ -172,9 +230,14 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke
     return value;
 }
 
-common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) {
+common_preset_context::common_preset_context(llama_example ex)
+        : ctx_params(common_params_parser_init(default_params, ex)) {
+    common_params_add_preset_options(ctx_params.options);
+    key_to_opt = get_map_key_opt(ctx_params);
+}
+
+common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
     common_presets out;
-    auto key_to_opt = get_map_key_opt(ctx_params);
     auto ini_data = parse_ini_from_file(path);
 
     for (auto section : ini_data) {
@@ -188,7 +251,7 @@ common_presets common_presets_load(const std::string & path, common_params_conte
         for (const auto & [key, value] : section.second) {
             LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
             if (key_to_opt.find(key) != key_to_opt.end()) {
-                auto & opt = key_to_opt[key];
+                const auto & opt = key_to_opt.at(key);
                 if (is_bool_arg(opt)) {
                     preset.options[opt] = parse_bool_arg(opt, key, value);
                 } else {
@@ -199,8 +262,137 @@ common_presets common_presets_load(const std::string & path, common_params_conte
                 // TODO: maybe warn about unknown key?
             }
         }
+
+        if (preset.name == "*") {
+            // handle global preset
+            global = preset;
+        } else {
+            out[preset.name] = preset;
+        }
+    }
+
+    return out;
+}
+
+common_presets common_preset_context::load_from_cache() const {
+    common_presets out;
+
+    auto cached_models = common_list_cached_models();
+    for (const auto & model : cached_models) {
+        common_preset preset;
+        preset.name = model.to_string();
+        preset.set_option(*this, "LLAMA_ARG_HF_REPO", model.to_string());
         out[preset.name] = preset;
     }
 
     return out;
 }
+
+struct local_model {
+    std::string name;
+    std::string path;
+    std::string path_mmproj;
+};
+
+common_presets common_preset_context::load_from_models_dir(const std::string & models_dir) const {
+    if (!std::filesystem::exists(models_dir) || !std::filesystem::is_directory(models_dir)) {
+        throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", models_dir.c_str()));
+    }
+
+    std::vector<local_model> models;
+    auto scan_subdir = [&models](const std::string & subdir_path, const std::string & name) {
+        auto files = fs_list(subdir_path, false);
+        common_file_info model_file;
+        common_file_info first_shard_file;
+        common_file_info mmproj_file;
+        for (const auto & file : files) {
+            if (string_ends_with(file.name, ".gguf")) {
+                if (file.name.find("mmproj") != std::string::npos) {
+                    mmproj_file = file;
+                } else if (file.name.find("-00001-of-") != std::string::npos) {
+                    first_shard_file = file;
+                } else {
+                    model_file = file;
+                }
+            }
+        }
+        // single file model
+        local_model model{
+            /* name */        name,
+            /* path */        first_shard_file.path.empty() ? model_file.path : first_shard_file.path,
+            /* path_mmproj */ mmproj_file.path // can be empty
+        };
+        if (!model.path.empty()) {
+            models.push_back(model);
+        }
+    };
+
+    auto files = fs_list(models_dir, true);
+    for (const auto & file : files) {
+        if (file.is_dir) {
+            scan_subdir(file.path, file.name);
+        } else if (string_ends_with(file.name, ".gguf")) {
+            // single file model
+            std::string name = file.name;
+            string_replace_all(name, ".gguf", "");
+            local_model model{
+                /* name */        name,
+                /* path */        file.path,
+                /* path_mmproj */ ""
+            };
+            models.push_back(model);
+        }
+    }
+
+    // convert local models to presets
+    common_presets out;
+    for (const auto & model : models) {
+        common_preset preset;
+        preset.name = model.name;
+        preset.set_option(*this, "LLAMA_ARG_MODEL", model.path);
+        if (!model.path_mmproj.empty()) {
+            preset.set_option(*this, "LLAMA_ARG_MMPROJ", model.path_mmproj);
+        }
+        out[preset.name] = preset;
+    }
+
+    return out;
+}
+
+common_preset common_preset_context::load_from_args(int argc, char ** argv) const {
+    common_preset preset;
+    preset.name = COMMON_PRESET_DEFAULT_NAME;
+
+    bool ok = common_params_to_map(argc, argv, ctx_params.ex, preset.options);
+    if (!ok) {
+        throw std::runtime_error("failed to parse CLI arguments into preset");
+    }
+
+    return preset;
+}
+
+common_presets common_preset_context::cascade(const common_presets & base, const common_presets & added) const {
+    common_presets out = base; // copy
+    for (const auto & [name, preset_added] : added) {
+        if (out.find(name) != out.end()) {
+            // if exists, merge
+            common_preset & target = out[name];
+            target.merge(preset_added);
+        } else {
+            // otherwise, add directly
+            out[name] = preset_added;
+        }
+    }
+    return out;
+}
+
+common_presets common_preset_context::cascade(const common_preset & base, const common_presets & presets) const {
+    common_presets out;
+    for (const auto & [name, preset] : presets) {
+        common_preset tmp = base; // copy
+        tmp.name = name;
+        tmp.merge(preset);
+        out[name] = std::move(tmp);
+    }
+    return out;
+}
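
The merge() and cascade() helpers added above layer presets much like CSS rules: merge() copies options from another preset and overwrites on conflict, the two-map cascade() merges same-named presets (the added side wins) and carries unique ones over untouched, and the single-base cascade() stamps one base preset, typically the global "*" section, under every preset name before merging. A rough sketch with hypothetical option values:

// sketch only (hypothetical option values):
//   base:  { "qwen": { --ctx-size 4096 } }
//   added: { "qwen": { --ctx-size 8192 }, "phi": { --temp 0.7 } }
//   cascade(base, added) -> { "qwen": { --ctx-size 8192 }, "phi": { --temp 0.7 } }
//
//   global "*" preset: { --no-webui }
//   cascade(global, presets) -> every preset starts from --no-webui, then applies its own options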

package/src/llama.cpp/common/preset.h +45 -3

@@ -13,20 +13,62 @@
 
 constexpr const char * COMMON_PRESET_DEFAULT_NAME = "default";
 
+struct common_preset_context;
+
 struct common_preset {
     std::string name;
-    // TODO: support repeated args in the future
+
+    // options are stored as common_arg to string mapping, representing CLI arg and its value
     std::map<common_arg, std::string> options;
 
     // convert preset to CLI argument list
-    std::vector<std::string> to_args() const;
+    std::vector<std::string> to_args(const std::string & bin_path = "") const;
 
     // convert preset to INI format string
     std::string to_ini() const;
 
     // TODO: maybe implement to_env() if needed
+
+    // modify preset options where argument is identified by its env variable
+    void set_option(const common_preset_context & ctx, const std::string & env, const std::string & value);
+
+    // unset option by its env variable
+    void unset_option(const std::string & env);
+
+    // get option value by its env variable, return false if not found
+    bool get_option(const std::string & env, std::string & value) const;
+
+    // merge another preset into this one, overwriting existing options
+    void merge(const common_preset & other);
 };
 
 // interface for multiple presets in one file
 using common_presets = std::map<std::string, common_preset>;
-common_presets common_presets_load(const std::string & path, common_params_context & ctx_params);
+
+// context for loading and editing presets
+struct common_preset_context {
+    common_params default_params; // unused for now
+    common_params_context ctx_params;
+    std::map<std::string, common_arg> key_to_opt;
+    common_preset_context(llama_example ex);
+
+    // load presets from INI file
+    common_presets load_from_ini(const std::string & path, common_preset & global) const;
+
+    // generate presets from cached models
+    common_presets load_from_cache() const;
+
+    // generate presets from local models directory
+    // for the directory structure, see "Using multiple models" in server/README.md
+    common_presets load_from_models_dir(const std::string & models_dir) const;
+
+    // generate one preset from CLI arguments
+    common_preset load_from_args(int argc, char ** argv) const;
+
+    // cascade multiple presets if exist on both: base < added
+    // if preset does not exist in base, it will be added without modification
+    common_presets cascade(const common_presets & base, const common_presets & added) const;
+
+    // apply presets over a base preset (same idea as CSS cascading)
+    common_presets cascade(const common_preset & base, const common_presets & presets) const;
+};
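
Taken together, the header above replaces the old common_presets_load() free function with a small pipeline: construct a common_preset_context for an example, gather presets from one or more sources, cascade them over the global "*" preset, and turn the chosen preset back into argv-style arguments. A minimal usage sketch, assuming the LLAMA_EXAMPLE_SERVER enum value, the file path, and the binary name (all illustrative, not taken from this diff):

#include <stdexcept>
#include <string>
#include <vector>

#include "preset.h"

// sketch only: resolve one named preset from an INI file into CLI arguments
static std::vector<std::string> resolve_preset_args(const std::string & ini_path, const std::string & name) {
    common_preset_context ctx(LLAMA_EXAMPLE_SERVER); // assumed example value

    common_preset global; // receives the "*" section, if any
    common_presets presets = ctx.load_from_ini(ini_path, global);

    // apply the global section under every named preset (CSS-like cascade)
    presets = ctx.cascade(global, presets);

    const auto it = presets.find(name);
    if (it == presets.end()) {
        throw std::runtime_error("unknown preset: " + name);
    }

    // hypothetical binary name; becomes the first element of the returned argv
    return it->second.to_args("llama-server");
}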

package/src/llama.cpp/common/sampling.cpp +51 -37

@@ -104,10 +104,9 @@ struct ring_buffer {
 struct common_sampler {
     common_params_sampling params;
 
+    struct llama_sampler * grmr;
     struct llama_sampler * chain;
 
-    bool grammar;
-
     ring_buffer<llama_token> prev;
 
     std::vector<llama_token_data> cur;
@@ -167,15 +166,14 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 
     lparams.no_perf = params.no_perf;
 
+    llama_sampler * grmr = nullptr;
     llama_sampler * chain = llama_sampler_chain_init(lparams);
 
-    bool grammar = false;
     std::vector<llama_sampler *> samplers;
 
     if (params.grammar.compare(0, 11, "%llguidance") == 0) {
 #ifdef LLAMA_USE_LLGUIDANCE
-        samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()));
-        grammar = true;
+        grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
 #else
         GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
@@ -224,15 +222,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 
         if (!params.grammar.empty()) {
             if (params.grammar_lazy) {
-                samplers.push_back(
-                    llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
-                        trigger_patterns_c.data(), trigger_patterns_c.size(),
-                        trigger_tokens.data(), trigger_tokens.size()));
+                grmr = llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                    trigger_patterns_c.data(), trigger_patterns_c.size(),
+                    trigger_tokens.data(), trigger_tokens.size());
             } else {
-                samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"));
+                grmr = llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
             }
-
-            grammar = true;
         }
     }
 
@@ -303,8 +298,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 
     auto * result = new common_sampler {
        /* .params = */ params,
+       /* .grmr = */ grmr,
        /* .chain = */ chain,
-       /* .grammar = */ grammar,
        /* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
        /* .cur = */ {},
        /* .cur_p = */ {},
@@ -315,6 +310,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 
 void common_sampler_free(struct common_sampler * gsmpl) {
     if (gsmpl) {
+        llama_sampler_free(gsmpl->grmr);
         llama_sampler_free(gsmpl->chain);
 
         delete gsmpl;
@@ -324,25 +320,12 @@ void common_sampler_free(struct common_sampler * gsmpl) {
 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
     const auto tm = gsmpl->tm();
 
-    if (gsmpl->grammar) {
-        const int n_smpl = llama_sampler_chain_n(gsmpl->chain);
-
-        for (int i = 0; i < n_smpl; i++) {
-            auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
-
-            // the grammar sampler is always the first one
-            if (i == 0) {
-                if (accept_grammar) {
-                    llama_sampler_accept(smpl, token);
-                }
-            } else {
-                llama_sampler_accept(smpl, token);
-            }
-        }
-    } else {
-        llama_sampler_accept(gsmpl->chain, token);
+    if (gsmpl->grmr && accept_grammar) {
+        llama_sampler_accept(gsmpl->grmr, token);
     }
 
+    llama_sampler_accept(gsmpl->chain, token);
+
     gsmpl->prev.push_back(token);
 }
 
@@ -353,8 +336,8 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
     return new common_sampler {
        /* .params = */ gsmpl->params,
+       /* .grmr = */ llama_sampler_clone(gsmpl->grmr),
        /* .chain = */ llama_sampler_clone(gsmpl->chain),
-       /* .grammar = */ gsmpl->grammar,
        /* .prev = */ gsmpl->prev,
        /* .cur = */ gsmpl->cur,
        /* .cur_p = */ gsmpl->cur_p,
@@ -410,7 +393,7 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
     return gsmpl->chain;
 }
 
-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
     llama_synchronize(ctx);
 
     // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
@@ -418,11 +401,42 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
 
     llama_token id = LLAMA_TOKEN_NULL;
 
+    auto & grmr = gsmpl->grmr;
     auto & chain = gsmpl->chain;
     auto & cur_p = gsmpl->cur_p; // initialized by set_logits
 
     gsmpl->set_logits(ctx, idx);
 
+    if (grammar_first) {
+        llama_sampler_apply(grmr, &cur_p);
+    }
+
+    llama_sampler_apply(chain, &cur_p);
+
+    id = cur_p.data[cur_p.selected].id;
+
+    if (grammar_first) {
+        return id;
+    }
+
+    // check if it the sampled token fits the grammar (grammar-based rejection sampling)
+    {
+        llama_token_data single_token_data = { id, 1.0f, 0.0f };
+        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
+
+        llama_sampler_apply(grmr, &single_token_data_array);
+
+        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
+        if (is_valid) {
+            return id;
+        }
+    }
+
+    // resampling:
+    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
+    gsmpl->set_logits(ctx, idx);
+
+    llama_sampler_apply(grmr, &cur_p);
     llama_sampler_apply(chain, &cur_p);
 
     GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
@@ -432,7 +446,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
     return id;
 }
 
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
     GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
 
     std::vector<llama_token> result;
@@ -440,7 +454,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
 
     size_t i = 0;
     for (; i < draft.size(); i++) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
 
         common_sampler_accept(gsmpl, id, true);
 
@@ -452,7 +466,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
     }
 
     if (i == draft.size()) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
 
         common_sampler_accept(gsmpl, id, true);
 
@@ -462,13 +476,13 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
     return result;
 }
 
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
     std::vector<int> idxs(draft.size() + 1);
     for (size_t i = 0; i < idxs.size(); ++i) {
         idxs[i] = i;
     }
 
-    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
+    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
 }
 
 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
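
In the sampling rework above, the grammar sampler is now held next to the chain (grmr) instead of being inserted as the chain's first element, and the new grammar_first flag selects between two strategies: the default path samples from the chain and only then checks the chosen token against the grammar (resampling through the grammar if it was rejected), while grammar_first = true masks the candidate set with the grammar before the chain runs. A hedged usage sketch of the new parameters (the surrounding sampler, context and index are assumed to exist):

#include "sampling.h"

// sketch only: sample a token whose candidate set is pre-filtered by the grammar
static llama_token sample_constrained(common_sampler * smpl, llama_context * lctx, int idx) {
    // grammar_first = true: apply the grammar before the sampling chain (slower, but every
    // remaining candidate already satisfies the grammar)
    const llama_token id = common_sampler_sample(smpl, lctx, idx, /* grammar_first */ true);

    // accept_grammar = true: advance the dedicated grammar sampler as well as the chain
    common_sampler_accept(smpl, id, /* accept_grammar */ true);

    return id;
}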

package/src/llama.cpp/common/sampling.h +6 -3

@@ -57,7 +57,10 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
 // - check if the token fits the grammar (if any)
 // - if not: resample by first applying the grammar constraints and then sampling again (slower path)
 //
-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);
+// if grammar_first is true, the grammar is applied before the samplers (slower)
+// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
+//
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
 
 // generalized version of common_sampler_sample
 //
@@ -75,10 +78,10 @@ llama_token common_sampler_sample(struct llama_co
 //
 // returns at least 1 token, up to idxs.size()
 //
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
 
 // assume idxs == [ 0, 1, 2, ..., draft.size() ]
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
 
 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
 

package/src/llama.cpp/common/speculative.cpp +1 -1

@@ -315,7 +315,7 @@ llama_tokens common_speculative_gen_draft(
     for (int i = 0; i < params.n_draft; ++i) {
         common_batch_clear(batch);
 
-        common_sampler_sample(smpl, ctx_dft, 0);
+        common_sampler_sample(smpl, ctx_dft, 0, true);
 
         const auto * cur_p = common_sampler_get_candidates(smpl, true);
 

package/src/llama.cpp/ggml/CMakeLists.txt +1 -0

@@ -254,6 +254,7 @@ set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
                                         "gmml: OpenCL API version to target")
 
 option(GGML_HEXAGON "ggml: enable Hexagon backend" OFF)
+set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml: quantize group size (32, 64, or 128)")
 
 # toolchain for vulkan-shaders-gen
 set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")

package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0

@@ -458,6 +458,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             if (GGML_RV_ZFH)
                 string(APPEND MARCH_STR "_zfh")
             endif()
+
             if (GGML_XTHEADVECTOR)
                 string(APPEND MARCH_STR "_xtheadvector")
             elseif (GGML_RVV)
@@ -465,6 +466,9 @@
                 if (GGML_RV_ZVFH)
                     string(APPEND MARCH_STR "_zvfh")
                 endif()
+                if (GGML_RV_ZVFBFWMA)
+                    string(APPEND MARCH_STR "_zvfbfwma")
+                endif()
             endif()
             if (GGML_RV_ZICBOP)
                 string(APPEND MARCH_STR "_zicbop")