@fugood/llama.node 1.4.8 → 1.4.10
- package/lib/binding.ts +43 -0
- package/lib/parallel.js +26 -0
- package/lib/parallel.ts +33 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +12 -14
- package/src/LlamaCompletionWorker.cpp +3 -1
- package/src/LlamaCompletionWorker.h +2 -0
- package/src/LlamaContext.cpp +16 -1
- package/src/LlamaContext.h +3 -0
- package/src/llama.cpp/common/CMakeLists.txt +4 -4
- package/src/llama.cpp/common/arg.cpp +159 -42
- package/src/llama.cpp/common/arg.h +10 -1
- package/src/llama.cpp/common/common.cpp +1 -1
- package/src/llama.cpp/common/common.h +6 -2
- package/src/llama.cpp/common/preset.cpp +197 -5
- package/src/llama.cpp/common/preset.h +45 -3
- package/src/llama.cpp/common/sampling.cpp +51 -37
- package/src/llama.cpp/common/sampling.h +6 -3
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +283 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +51 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +286 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
- package/src/llama.cpp/src/llama-arch.cpp +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +123 -28
- package/src/llama.cpp/src/llama-mmap.h +5 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +56 -13
- package/src/llama.cpp/src/llama-model.cpp +7 -5
- package/src/llama.cpp/src/llama-sampling.cpp +16 -0
- package/src/llama.cpp/src/llama.cpp +22 -32
package/src/llama.cpp/common/preset.cpp

@@ -2,6 +2,7 @@
 #include "preset.h"
 #include "peg-parser.h"
 #include "log.h"
+#include "download.h"
 
 #include <fstream>
 #include <sstream>

@@ -15,11 +16,22 @@ static std::string rm_leading_dashes(const std::string & str) {
     return str.substr(pos);
 }
 
-std::vector<std::string> common_preset::to_args() const {
+std::vector<std::string> common_preset::to_args(const std::string & bin_path) const {
     std::vector<std::string> args;
 
+    if (!bin_path.empty()) {
+        args.push_back(bin_path);
+    }
+
     for (const auto & [opt, value] : options) {
-
+        if (opt.is_preset_only) {
+            continue; // skip preset-only options (they are not CLI args)
+        }
+
+        // use the last arg as the main arg (i.e. --long-form)
+        args.push_back(opt.args.back());
+
+        // handle value(s)
         if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
             // flag option, no value
             if (common_arg_utils::is_falsey(value)) {
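For context, a usage sketch of the new to_args signature (not part of the diff; the binary path is hypothetical): when bin_path is non-empty it becomes the first element of the returned vector, so the result can be handed to an exec-style process launcher as a complete argv.

    #include "preset.h"

    #include <string>
    #include <vector>

    // Sketch: turn a preset into a ready-to-exec command line.
    std::vector<std::string> build_cmd(const common_preset & preset) {
        // args[0] is the binary; preset-only options are skipped and each
        // remaining option contributes its long form followed by its value.
        return preset.to_args("/usr/local/bin/llama-server"); // hypothetical path
    }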
package/src/llama.cpp/common/preset.cpp (continued)

@@ -63,6 +75,52 @@ std::string common_preset::to_ini() const {
     return ss.str();
 }
 
+void common_preset::set_option(const common_preset_context & ctx, const std::string & env, const std::string & value) {
+    // try if option exists, update it
+    for (auto & [opt, val] : options) {
+        if (opt.env && env == opt.env) {
+            val = value;
+            return;
+        }
+    }
+    // if option does not exist, we need to add it
+    if (ctx.key_to_opt.find(env) == ctx.key_to_opt.end()) {
+        throw std::runtime_error(string_format(
+            "%s: option with env '%s' not found in ctx_params",
+            __func__, env.c_str()
+        ));
+    }
+    options[ctx.key_to_opt.at(env)] = value;
+}
+
+void common_preset::unset_option(const std::string & env) {
+    for (auto it = options.begin(); it != options.end(); ) {
+        const common_arg & opt = it->first;
+        if (opt.env && env == opt.env) {
+            it = options.erase(it);
+            return;
+        } else {
+            ++it;
+        }
+    }
+}
+
+bool common_preset::get_option(const std::string & env, std::string & value) const {
+    for (const auto & [opt, val] : options) {
+        if (opt.env && env == opt.env) {
+            value = val;
+            return true;
+        }
+    }
+    return false;
+}
+
+void common_preset::merge(const common_preset & other) {
+    for (const auto & [opt, val] : other.options) {
+        options[opt] = val; // overwrite existing options
+    }
+}
+
 static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_file(const std::string & path) {
     std::map<std::string, std::map<std::string, std::string>> parsed;
 
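A sketch of how the new env-keyed accessors compose (not part of the diff; the LLAMA_ARG_* keys are env names used by options elsewhere in this diff, and the repo string is made up):

    #include "preset.h"

    #include <string>

    // Sketch: edit a preset through the accessors added above.
    void retarget(const common_preset_context & ctx, common_preset & preset) {
        // add or update an option, keyed by the underlying arg's env variable
        preset.set_option(ctx, "LLAMA_ARG_HF_REPO", "org/model-GGUF");

        // remove a conflicting option if present
        preset.unset_option("LLAMA_ARG_MODEL");

        std::string repo;
        if (preset.get_option("LLAMA_ARG_HF_REPO", repo)) {
            // repo now holds "org/model-GGUF"
        }
    }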
package/src/llama.cpp/common/preset.cpp (continued)

@@ -172,9 +230,14 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke
     return value;
 }
 
-common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) {
+common_preset_context::common_preset_context(llama_example ex)
+        : ctx_params(common_params_parser_init(default_params, ex)) {
+    common_params_add_preset_options(ctx_params.options);
+    key_to_opt = get_map_key_opt(ctx_params);
+}
+
+common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
     common_presets out;
-    auto key_to_opt = get_map_key_opt(ctx_params);
     auto ini_data = parse_ini_from_file(path);
 
     for (auto section : ini_data) {

@@ -188,7 +251,7 @@ common_presets common_presets_load(const std::string & path, common_params_conte
         for (const auto & [key, value] : section.second) {
             LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
             if (key_to_opt.find(key) != key_to_opt.end()) {
-                auto & opt = key_to_opt[key];
+                const auto & opt = key_to_opt.at(key);
                 if (is_bool_arg(opt)) {
                     preset.options[opt] = parse_bool_arg(opt, key, value);
                 } else {

@@ -199,8 +262,137 @@ common_presets common_presets_load(const std::string & path, common_params_conte
                 // TODO: maybe warn about unknown key?
             }
         }
+
+        if (preset.name == "*") {
+            // handle global preset
+            global = preset;
+        } else {
+            out[preset.name] = preset;
+        }
+    }
+
+    return out;
+}
+
+common_presets common_preset_context::load_from_cache() const {
+    common_presets out;
+
+    auto cached_models = common_list_cached_models();
+    for (const auto & model : cached_models) {
+        common_preset preset;
+        preset.name = model.to_string();
+        preset.set_option(*this, "LLAMA_ARG_HF_REPO", model.to_string());
         out[preset.name] = preset;
     }
 
     return out;
 }
+
+struct local_model {
+    std::string name;
+    std::string path;
+    std::string path_mmproj;
+};
+
+common_presets common_preset_context::load_from_models_dir(const std::string & models_dir) const {
+    if (!std::filesystem::exists(models_dir) || !std::filesystem::is_directory(models_dir)) {
+        throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", models_dir.c_str()));
+    }
+
+    std::vector<local_model> models;
+    auto scan_subdir = [&models](const std::string & subdir_path, const std::string & name) {
+        auto files = fs_list(subdir_path, false);
+        common_file_info model_file;
+        common_file_info first_shard_file;
+        common_file_info mmproj_file;
+        for (const auto & file : files) {
+            if (string_ends_with(file.name, ".gguf")) {
+                if (file.name.find("mmproj") != std::string::npos) {
+                    mmproj_file = file;
+                } else if (file.name.find("-00001-of-") != std::string::npos) {
+                    first_shard_file = file;
+                } else {
+                    model_file = file;
+                }
+            }
+        }
+        // single file model
+        local_model model{
+            /* name */ name,
+            /* path */ first_shard_file.path.empty() ? model_file.path : first_shard_file.path,
+            /* path_mmproj */ mmproj_file.path // can be empty
+        };
+        if (!model.path.empty()) {
+            models.push_back(model);
+        }
+    };
+
+    auto files = fs_list(models_dir, true);
+    for (const auto & file : files) {
+        if (file.is_dir) {
+            scan_subdir(file.path, file.name);
+        } else if (string_ends_with(file.name, ".gguf")) {
+            // single file model
+            std::string name = file.name;
+            string_replace_all(name, ".gguf", "");
+            local_model model{
+                /* name */ name,
+                /* path */ file.path,
+                /* path_mmproj */ ""
+            };
+            models.push_back(model);
+        }
+    }
+
+    // convert local models to presets
+    common_presets out;
+    for (const auto & model : models) {
+        common_preset preset;
+        preset.name = model.name;
+        preset.set_option(*this, "LLAMA_ARG_MODEL", model.path);
+        if (!model.path_mmproj.empty()) {
+            preset.set_option(*this, "LLAMA_ARG_MMPROJ", model.path_mmproj);
+        }
+        out[preset.name] = preset;
+    }
+
+    return out;
+}
+
+common_preset common_preset_context::load_from_args(int argc, char ** argv) const {
+    common_preset preset;
+    preset.name = COMMON_PRESET_DEFAULT_NAME;
+
+    bool ok = common_params_to_map(argc, argv, ctx_params.ex, preset.options);
+    if (!ok) {
+        throw std::runtime_error("failed to parse CLI arguments into preset");
+    }
+
+    return preset;
+}
+
+common_presets common_preset_context::cascade(const common_presets & base, const common_presets & added) const {
+    common_presets out = base; // copy
+    for (const auto & [name, preset_added] : added) {
+        if (out.find(name) != out.end()) {
+            // if exists, merge
+            common_preset & target = out[name];
+            target.merge(preset_added);
+        } else {
+            // otherwise, add directly
+            out[name] = preset_added;
+        }
+    }
+    return out;
+}
+
+common_presets common_preset_context::cascade(const common_preset & base, const common_presets & presets) const {
+    common_presets out;
+    for (const auto & [name, preset] : presets) {
+        common_preset tmp = base; // copy
+        tmp.name = name;
+        tmp.merge(preset);
+        out[name] = std::move(tmp);
+    }
+    return out;
+}
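A sketch of the INI loading flow these additions enable (not part of the diff; the file contents and path are hypothetical, while the "*" section name follows the preset.name == "*" check above): a [*] section acts as a global base that every named preset cascades over.

    #include "preset.h"

    #include <string>

    // Example INI (hypothetical):
    //
    //   [*]
    //   ; global base applied to every preset
    //   ctx-size = 8192
    //
    //   [my-model]
    //   model = /models/my-model.gguf
    //
    common_presets load_all(const common_preset_context & ctx, const std::string & path) {
        common_preset global;
        common_presets named = ctx.load_from_ini(path, global);

        // apply the global section under each named preset (base < added)
        return ctx.cascade(global, named);
    }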
package/src/llama.cpp/common/preset.h

@@ -13,20 +13,62 @@
 
 constexpr const char * COMMON_PRESET_DEFAULT_NAME = "default";
 
+struct common_preset_context;
+
 struct common_preset {
     std::string name;
-
+
+    // options are stored as common_arg to string mapping, representing CLI arg and its value
     std::map<common_arg, std::string> options;
 
     // convert preset to CLI argument list
-    std::vector<std::string> to_args() const;
+    std::vector<std::string> to_args(const std::string & bin_path = "") const;
 
     // convert preset to INI format string
     std::string to_ini() const;
 
     // TODO: maybe implement to_env() if needed
+
+    // modify preset options where argument is identified by its env variable
+    void set_option(const common_preset_context & ctx, const std::string & env, const std::string & value);
+
+    // unset option by its env variable
+    void unset_option(const std::string & env);
+
+    // get option value by its env variable, return false if not found
+    bool get_option(const std::string & env, std::string & value) const;
+
+    // merge another preset into this one, overwriting existing options
+    void merge(const common_preset & other);
 };
 
 // interface for multiple presets in one file
 using common_presets = std::map<std::string, common_preset>;
-
+
+// context for loading and editing presets
+struct common_preset_context {
+    common_params default_params; // unused for now
+    common_params_context ctx_params;
+    std::map<std::string, common_arg> key_to_opt;
+    common_preset_context(llama_example ex);
+
+    // load presets from INI file
+    common_presets load_from_ini(const std::string & path, common_preset & global) const;
+
+    // generate presets from cached models
+    common_presets load_from_cache() const;
+
+    // generate presets from local models directory
+    // for the directory structure, see "Using multiple models" in server/README.md
+    common_presets load_from_models_dir(const std::string & models_dir) const;
+
+    // generate one preset from CLI arguments
+    common_preset load_from_args(int argc, char ** argv) const;
+
+    // cascade multiple presets if exist on both: base < added
+    // if preset does not exist in base, it will be added without modification
+    common_presets cascade(const common_presets & base, const common_presets & added) const;
+
+    // apply presets over a base preset (same idea as CSS cascading)
+    common_presets cascade(const common_preset & base, const common_presets & presets) const;
+};
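A wiring sketch for the new context type (not part of the diff; LLAMA_EXAMPLE_SERVER is the llama_example value used by server tooling, and the models directory is made up): presets from several sources are merged with the cascade helpers, later sources winning.

    #include "preset.h"

    #include <string>

    // Sketch: gather presets from the model cache and a local directory.
    common_presets gather(const std::string & models_dir) {
        common_preset_context ctx(LLAMA_EXAMPLE_SERVER);

        common_presets from_cache = ctx.load_from_cache();
        common_presets from_dir   = ctx.load_from_models_dir(models_dir);

        // entries present in both are merged; from_dir's options overwrite
        return ctx.cascade(from_cache, from_dir);
    }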
package/src/llama.cpp/common/sampling.cpp

@@ -104,10 +104,9 @@ struct ring_buffer {
 struct common_sampler {
     common_params_sampling params;
 
+    struct llama_sampler * grmr;
     struct llama_sampler * chain;
 
-    bool grammar;
-
     ring_buffer<llama_token> prev;
 
     std::vector<llama_token_data> cur;

@@ -167,15 +166,14 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 
     lparams.no_perf = params.no_perf;
 
+    llama_sampler * grmr = nullptr;
     llama_sampler * chain = llama_sampler_chain_init(lparams);
 
-    bool grammar = false;
     std::vector<llama_sampler *> samplers;
 
     if (params.grammar.compare(0, 11, "%llguidance") == 0) {
 #ifdef LLAMA_USE_LLGUIDANCE
-        samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()));
-        grammar = true;
+        grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
 #else
         GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE

@@ -224,15 +222,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 
         if (!params.grammar.empty()) {
             if (params.grammar_lazy) {
-                samplers.push_back(
-                    llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
-                        trigger_patterns_c.data(), trigger_patterns_c.size(),
-                        trigger_tokens.data(), trigger_tokens.size()));
+                grmr = llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                        trigger_patterns_c.data(), trigger_patterns_c.size(),
+                        trigger_tokens.data(), trigger_tokens.size());
             } else {
-                samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"));
+                grmr = llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
             }
-
-            grammar = true;
         }
     }
 

@@ -303,8 +298,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 
     auto * result = new common_sampler {
         /* .params = */ params,
+        /* .grmr = */ grmr,
         /* .chain = */ chain,
-        /* .grammar = */ grammar,
         /* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
         /* .cur = */ {},
        /* .cur_p = */ {},

@@ -315,6 +310,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 
 void common_sampler_free(struct common_sampler * gsmpl) {
     if (gsmpl) {
+        llama_sampler_free(gsmpl->grmr);
         llama_sampler_free(gsmpl->chain);
 
         delete gsmpl;

@@ -324,25 +320,12 @@ void common_sampler_free(struct common_sampler * gsmpl) {
 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
     const auto tm = gsmpl->tm();
 
-    if (gsmpl->grammar) {
-        const int n_smpl = llama_sampler_chain_n(gsmpl->chain);
-
-        for (int i = 0; i < n_smpl; i++) {
-            auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
-
-            // the grammar sampler is always the first one
-            if (i == 0) {
-                if (accept_grammar) {
-                    llama_sampler_accept(smpl, token);
-                }
-            } else {
-                llama_sampler_accept(smpl, token);
-            }
-        }
-    } else {
-        llama_sampler_accept(gsmpl->chain, token);
+    if (gsmpl->grmr && accept_grammar) {
+        llama_sampler_accept(gsmpl->grmr, token);
     }
 
+    llama_sampler_accept(gsmpl->chain, token);
+
     gsmpl->prev.push_back(token);
 }
 

@@ -353,8 +336,8 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
     return new common_sampler {
         /* .params = */ gsmpl->params,
+        /* .grmr = */ llama_sampler_clone(gsmpl->grmr),
         /* .chain = */ llama_sampler_clone(gsmpl->chain),
-        /* .grammar = */ gsmpl->grammar,
         /* .prev = */ gsmpl->prev,
         /* .cur = */ gsmpl->cur,
         /* .cur_p = */ gsmpl->cur_p,

@@ -410,7 +393,7 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
     return gsmpl->chain;
 }
 
-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
     llama_synchronize(ctx);
 
     // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations

@@ -418,11 +401,42 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
 
     llama_token id = LLAMA_TOKEN_NULL;
 
+    auto & grmr = gsmpl->grmr;
     auto & chain = gsmpl->chain;
     auto & cur_p = gsmpl->cur_p; // initialized by set_logits
 
     gsmpl->set_logits(ctx, idx);
 
+    if (grammar_first) {
+        llama_sampler_apply(grmr, &cur_p);
+    }
+
+    llama_sampler_apply(chain, &cur_p);
+
+    id = cur_p.data[cur_p.selected].id;
+
+    if (grammar_first) {
+        return id;
+    }
+
+    // check if it the sampled token fits the grammar (grammar-based rejection sampling)
+    {
+        llama_token_data single_token_data = { id, 1.0f, 0.0f };
+        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
+
+        llama_sampler_apply(grmr, &single_token_data_array);
+
+        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
+        if (is_valid) {
+            return id;
+        }
+    }
+
+    // resampling:
+    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
+    gsmpl->set_logits(ctx, idx);
+
+    llama_sampler_apply(grmr, &cur_p);
     llama_sampler_apply(chain, &cur_p);
 
     GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
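The restored sampling flow is two-phase rejection sampling: sample from the chain alone (fast path), keep the token if the separate grammar sampler accepts it, and only rerun with the grammar applied up front when it does not. A caller-side sketch (not part of the diff; sampler and context setup omitted):

    #include "sampling.h"

    // Sketch: one decode step against the logits at position idx.
    llama_token step(common_sampler * gsmpl, llama_context * lctx, int idx) {
        // grammar_first = false keeps the fast rejection-sampling path;
        // pass true when every candidate must already fit the grammar.
        const llama_token id = common_sampler_sample(gsmpl, lctx, idx, false);
        common_sampler_accept(gsmpl, id, /* accept_grammar */ true);
        return id;
    }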
package/src/llama.cpp/common/sampling.cpp (continued)

@@ -432,7 +446,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
     return id;
 }
 
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
     GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
 
     std::vector<llama_token> result;

@@ -440,7 +454,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
 
     size_t i = 0;
     for (; i < draft.size(); i++) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
 
         common_sampler_accept(gsmpl, id, true);
 

@@ -452,7 +466,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
     }
 
     if (i == draft.size()) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
 
         common_sampler_accept(gsmpl, id, true);
 

@@ -462,13 +476,13 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
     return result;
 }
 
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
     std::vector<int> idxs(draft.size() + 1);
     for (size_t i = 0; i < idxs.size(); ++i) {
         idxs[i] = i;
     }
 
-    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
+    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
 }
 
 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
package/src/llama.cpp/common/sampling.h

@@ -57,7 +57,10 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
 // - check if the token fits the grammar (if any)
 // - if not: resample by first applying the grammar constraints and then sampling again (slower path)
 //
-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);
+// if grammar_first is true, the grammar is applied before the samplers (slower)
+// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
+//
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
 
 // generalized version of common_sampler_sample
 //

@@ -75,10 +78,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
 //
 // returns at least 1 token, up to idxs.size()
 //
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
 
 // assume idxs == [ 0, 1, 2, ..., draft.size() ]
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
 
 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
 
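A sketch of the batched draft-verification overload (not part of the diff; the draft tokens are assumed to come from a draft model). Note that the speculative.cpp change below passes grammar_first = true on the draft side, so every drafted token already satisfies the grammar before the target model sees it:

    #include "sampling.h"

    // Sketch: verify a draft against the target sampler; returns the accepted
    // prefix plus exactly one token sampled past the last accepted position.
    std::vector<llama_token> verify(common_sampler * gsmpl,
                                    llama_context * ctx_tgt,
                                    const llama_tokens & draft) {
        // idxs defaults to [0, 1, ..., draft.size()]
        return common_sampler_sample_and_accept_n(gsmpl, ctx_tgt, draft, false);
    }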
package/src/llama.cpp/common/speculative.cpp

@@ -315,7 +315,7 @@ llama_tokens common_speculative_gen_draft(
     for (int i = 0; i < params.n_draft; ++i) {
         common_batch_clear(batch);
 
-        common_sampler_sample(smpl, ctx_dft, 0);
+        common_sampler_sample(smpl, ctx_dft, 0, true);
 
         const auto * cur_p = common_sampler_get_candidates(smpl, true);
 
package/src/llama.cpp/ggml/CMakeLists.txt

@@ -254,6 +254,7 @@ set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
     "gmml: OpenCL API version to target")
 
 option(GGML_HEXAGON "ggml: enable Hexagon backend" OFF)
+set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml: quantize group size (32, 64, or 128)")
 
 # toolchain for vulkan-shaders-gen
 set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt

@@ -458,6 +458,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         if (GGML_RV_ZFH)
             string(APPEND MARCH_STR "_zfh")
         endif()
+
         if (GGML_XTHEADVECTOR)
             string(APPEND MARCH_STR "_xtheadvector")
         elseif (GGML_RVV)

@@ -465,6 +466,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             if (GGML_RV_ZVFH)
                 string(APPEND MARCH_STR "_zvfh")
             endif()
+            if (GGML_RV_ZVFBFWMA)
+                string(APPEND MARCH_STR "_zvfbfwma")
+            endif()
         endif()
         if (GGML_RV_ZICBOP)
             string(APPEND MARCH_STR "_zicbop")