@fugood/llama.node 1.4.7 → 1.4.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +15 -15
  3. package/scripts/llama.cpp.patch +23 -24
  4. package/src/LlamaContext.cpp +4 -2
  5. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  6. package/src/llama.cpp/common/arg.cpp +470 -223
  7. package/src/llama.cpp/common/arg.h +43 -2
  8. package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
  9. package/src/llama.cpp/common/chat.cpp +140 -0
  10. package/src/llama.cpp/common/common.cpp +130 -67
  11. package/src/llama.cpp/common/common.h +44 -17
  12. package/src/llama.cpp/common/console.cpp +98 -18
  13. package/src/llama.cpp/common/console.h +30 -8
  14. package/src/llama.cpp/common/download.cpp +69 -25
  15. package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
  16. package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
  17. package/src/llama.cpp/common/log.cpp +5 -0
  18. package/src/llama.cpp/common/log.h +1 -0
  19. package/src/llama.cpp/common/peg-parser.cpp +1 -1
  20. package/src/llama.cpp/common/preset.cpp +206 -0
  21. package/src/llama.cpp/common/preset.h +32 -0
  22. package/src/llama.cpp/common/sampling.cpp +67 -54
  23. package/src/llama.cpp/common/sampling.h +8 -0
  24. package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
  25. package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
  26. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
  27. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  28. package/src/llama.cpp/ggml/include/ggml.h +7 -8
  29. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  30. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
  31. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +285 -0
  32. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -45
  34. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +288 -1
  36. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
  39. package/src/llama.cpp/include/llama.h +18 -1
  40. package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
  41. package/src/llama.cpp/src/llama-arch.h +9 -2
  42. package/src/llama.cpp/src/llama-batch.cpp +12 -2
  43. package/src/llama.cpp/src/llama-batch.h +4 -2
  44. package/src/llama.cpp/src/llama-context.cpp +93 -23
  45. package/src/llama.cpp/src/llama-context.h +8 -2
  46. package/src/llama.cpp/src/llama-graph.cpp +84 -16
  47. package/src/llama.cpp/src/llama-graph.h +17 -4
  48. package/src/llama.cpp/src/llama-hparams.cpp +6 -0
  49. package/src/llama.cpp/src/llama-hparams.h +5 -1
  50. package/src/llama.cpp/src/llama-impl.cpp +4 -0
  51. package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
  52. package/src/llama.cpp/src/llama-kv-cache.h +19 -2
  53. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
  54. package/src/llama.cpp/src/llama-mmap.cpp +123 -28
  55. package/src/llama.cpp/src/llama-mmap.h +5 -1
  56. package/src/llama.cpp/src/llama-model-loader.cpp +58 -13
  57. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  58. package/src/llama.cpp/src/llama-model.cpp +110 -49
  59. package/src/llama.cpp/src/llama-model.h +1 -0
  60. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  61. package/src/llama.cpp/src/llama-sampling.cpp +16 -0
  62. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
  63. package/src/llama.cpp/src/llama.cpp +665 -1
  64. package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
  65. package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
  66. package/src/llama.cpp/src/models/glm4.cpp +27 -4
  67. package/src/llama.cpp/src/models/models.h +5 -5
  68. package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
  69. package/src/llama.cpp/src/models/qwen2.cpp +12 -3
  70. package/src/llama.cpp/src/models/qwen3next.cpp +81 -266

package/src/llama.cpp/common/json-schema-to-grammar.h
@@ -3,11 +3,31 @@
 #include <nlohmann/json_fwd.hpp>
 
 #include <functional>
+#include <memory>
 #include <string>
 
 std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
                                    bool force_gbnf = false);
 
+class common_schema_converter;
+
+// Probes a JSON schema to extract information about its structure and type constraints.
+class common_schema_info {
+    std::unique_ptr<common_schema_converter> impl_;
+
+  public:
+    common_schema_info();
+    ~common_schema_info();
+
+    common_schema_info(const common_schema_info &) = delete;
+    common_schema_info & operator=(const common_schema_info &) = delete;
+    common_schema_info(common_schema_info &&) noexcept;
+    common_schema_info & operator=(common_schema_info &&) noexcept;
+
+    void resolve_refs(nlohmann::ordered_json & schema);
+    bool resolves_to_string(const nlohmann::ordered_json & schema);
+};
+
 struct common_grammar_builder {
     std::function<std::string(const std::string &, const std::string &)> add_rule;
     std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;
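
The header only declares the probe, so here is a rough usage sketch, assuming the semantics implied by the method names (the wrapper function below is illustrative, not part of the diff):

    #include "json-schema-to-grammar.h"
    #include <nlohmann/json.hpp>

    // true when every branch of the schema can only produce a JSON string
    static bool schema_is_string_only(nlohmann::ordered_json schema) {
        common_schema_info info;
        info.resolve_refs(schema);              // inline $ref targets in place
        return info.resolves_to_string(schema); // probe the resolved schema
    }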

package/src/llama.cpp/common/log.cpp
@@ -420,6 +420,11 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps) {
     log->set_timestamps(timestamps);
 }
 
+void common_log_flush(struct common_log * log) {
+    log->pause();
+    log->resume();
+}
+
 static int common_get_verbosity(enum ggml_log_level level) {
     switch (level) {
         case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG;
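
common_log_flush drains the logger by cycling pause() and resume(), which evidently relies on pause() waiting for the worker thread to finish writing queued messages. A typical call site, assuming the existing common_log_main() accessor for the global logger:

    // make sure queued log lines reach the terminal before writing raw output to stdout
    common_log_flush(common_log_main());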

package/src/llama.cpp/common/log.h
@@ -84,6 +84,7 @@ void common_log_set_file  (struct common_log * log, const char * file); // n
 void common_log_set_colors    (struct common_log * log, log_colors colors);  // not thread-safe
 void common_log_set_prefix    (struct common_log * log, bool prefix);        // whether to output prefix to each log
 void common_log_set_timestamps(struct common_log * log, bool timestamps);    // whether to output timestamps in the prefix
+void common_log_flush         (struct common_log * log);                     // flush all pending log messages
 
 // helper macros for logging
 // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold

package/src/llama.cpp/common/peg-parser.cpp
@@ -425,7 +425,7 @@ struct parser_executor {
 
     if (result.need_more_input()) {
         // Propagate - need to know what child would match before negating
-        return result;
+        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos);
     }
 
     // Child failed, so negation succeeds

package/src/llama.cpp/common/preset.cpp
@@ -0,0 +1,206 @@
+#include "arg.h"
+#include "preset.h"
+#include "peg-parser.h"
+#include "log.h"
+
+#include <fstream>
+#include <sstream>
+#include <filesystem>
+
+static std::string rm_leading_dashes(const std::string & str) {
+    size_t pos = 0;
+    while (pos < str.size() && str[pos] == '-') {
+        ++pos;
+    }
+    return str.substr(pos);
+}
+
+std::vector<std::string> common_preset::to_args() const {
+    std::vector<std::string> args;
+
+    for (const auto & [opt, value] : options) {
+        args.push_back(opt.args.back()); // use the last arg as the main arg
+        if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
+            // flag option, no value
+            if (common_arg_utils::is_falsey(value)) {
+                // use negative arg if available
+                if (!opt.args_neg.empty()) {
+                    args.back() = opt.args_neg.back();
+                } else {
+                    // otherwise, skip the flag
+                    // TODO: maybe throw an error instead?
+                    args.pop_back();
+                }
+            }
+        }
+        if (opt.value_hint != nullptr) {
+            // single value
+            args.push_back(value);
+        }
+        if (opt.value_hint != nullptr && opt.value_hint_2 != nullptr) {
+            throw std::runtime_error(string_format(
+                "common_preset::to_args(): option '%s' has two values, which is not supported yet",
+                opt.args.back()
+            ));
+        }
+    }
+
+    return args;
+}
+
+std::string common_preset::to_ini() const {
+    std::ostringstream ss;
+
+    ss << "[" << name << "]\n";
+    for (const auto & [opt, value] : options) {
+        auto espaced_value = value;
+        string_replace_all(espaced_value, "\n", "\\\n");
+        ss << rm_leading_dashes(opt.args.back()) << " = ";
+        ss << espaced_value << "\n";
+    }
+    ss << "\n";
+
+    return ss.str();
+}
+
+static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_file(const std::string & path) {
+    std::map<std::string, std::map<std::string, std::string>> parsed;
+
+    if (!std::filesystem::exists(path)) {
+        throw std::runtime_error("preset file does not exist: " + path);
+    }
+
+    std::ifstream file(path);
+    if (!file.good()) {
+        throw std::runtime_error("failed to open server preset file: " + path);
+    }
+
+    std::string contents((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+
+    static const auto parser = build_peg_parser([](auto & p) {
+        // newline ::= "\r\n" / "\n" / "\r"
+        auto newline = p.rule("newline", p.literal("\r\n") | p.literal("\n") | p.literal("\r"));
+
+        // ws ::= [ \t]*
+        auto ws = p.rule("ws", p.chars("[ \t]", 0, -1));
+
+        // comment ::= [;#] (!newline .)*
+        auto comment = p.rule("comment", p.chars("[;#]", 1, 1) + p.zero_or_more(p.negate(newline) + p.any()));
+
+        // eol ::= ws comment? (newline / EOF)
+        auto eol = p.rule("eol", ws + p.optional(comment) + (newline | p.end()));
+
+        // ident ::= [a-zA-Z_] [a-zA-Z0-9_.-]*
+        auto ident = p.rule("ident", p.chars("[a-zA-Z_]", 1, 1) + p.chars("[a-zA-Z0-9_.-]", 0, -1));
+
+        // value ::= (!eol-start .)*
+        auto eol_start = p.rule("eol-start", ws + (p.chars("[;#]", 1, 1) | newline | p.end()));
+        auto value = p.rule("value", p.zero_or_more(p.negate(eol_start) + p.any()));
+
+        // header-line ::= "[" ws ident ws "]" eol
+        auto header_line = p.rule("header-line", "[" + ws + p.tag("section-name", p.chars("[^]]")) + ws + "]" + eol);
+
+        // kv-line ::= ident ws "=" ws value eol
+        auto kv_line = p.rule("kv-line", p.tag("key", ident) + ws + "=" + ws + p.tag("value", value) + eol);
+
+        // comment-line ::= ws comment (newline / EOF)
+        auto comment_line = p.rule("comment-line", ws + comment + (newline | p.end()));
+
+        // blank-line ::= ws (newline / EOF)
+        auto blank_line = p.rule("blank-line", ws + (newline | p.end()));
+
+        // line ::= header-line / kv-line / comment-line / blank-line
+        auto line = p.rule("line", header_line | kv_line | comment_line | blank_line);
+
+        // ini ::= line* EOF
+        auto ini = p.rule("ini", p.zero_or_more(line) + p.end());
+
+        return ini;
+    });
+
+    common_peg_parse_context ctx(contents);
+    const auto result = parser.parse(ctx);
+    if (!result.success()) {
+        throw std::runtime_error("failed to parse server config file: " + path);
+    }
+
+    std::string current_section = COMMON_PRESET_DEFAULT_NAME;
+    std::string current_key;
+
+    ctx.ast.visit(result, [&](const auto & node) {
+        if (node.tag == "section-name") {
+            const std::string section = std::string(node.text);
+            current_section = section;
+            parsed[current_section] = {};
+        } else if (node.tag == "key") {
+            const std::string key = std::string(node.text);
+            current_key = key;
+        } else if (node.tag == "value" && !current_key.empty() && !current_section.empty()) {
+            parsed[current_section][current_key] = std::string(node.text);
+            current_key.clear();
+        }
+    });
+
+    return parsed;
+}
+
+static std::map<std::string, common_arg> get_map_key_opt(common_params_context & ctx_params) {
+    std::map<std::string, common_arg> mapping;
+    for (const auto & opt : ctx_params.options) {
+        for (const auto & env : opt.get_env()) {
+            mapping[env] = opt;
+        }
+        for (const auto & arg : opt.get_args()) {
+            mapping[rm_leading_dashes(arg)] = opt;
+        }
+    }
+    return mapping;
+}
+
+static bool is_bool_arg(const common_arg & arg) {
+    return !arg.args_neg.empty();
+}
+
+static std::string parse_bool_arg(const common_arg & arg, const std::string & key, const std::string & value) {
+    // if this is a negated arg, we need to reverse the value
+    for (const auto & neg_arg : arg.args_neg) {
+        if (rm_leading_dashes(neg_arg) == key) {
+            return common_arg_utils::is_truthy(value) ? "false" : "true";
+        }
+    }
+    // otherwise, not negated
+    return value;
+}
+
+common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) {
+    common_presets out;
+    auto key_to_opt = get_map_key_opt(ctx_params);
+    auto ini_data = parse_ini_from_file(path);
+
+    for (auto section : ini_data) {
+        common_preset preset;
+        if (section.first.empty()) {
+            preset.name = COMMON_PRESET_DEFAULT_NAME;
+        } else {
+            preset.name = section.first;
+        }
+        LOG_DBG("loading preset: %s\n", preset.name.c_str());
+        for (const auto & [key, value] : section.second) {
+            LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
+            if (key_to_opt.find(key) != key_to_opt.end()) {
+                auto & opt = key_to_opt[key];
+                if (is_bool_arg(opt)) {
+                    preset.options[opt] = parse_bool_arg(opt, key, value);
+                } else {
+                    preset.options[opt] = value;
+                }
+                LOG_DBG("accepted option: %s = %s\n", key.c_str(), preset.options[opt].c_str());
+            } else {
+                // TODO: maybe warn about unknown key?
+            }
+        }
+        out[preset.name] = preset;
+    }
+
+    return out;
+}
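
For reference, the PEG grammar above accepts a conventional INI dialect: ';' or '#' comments (full-line or trailing), [section] headers, and key = value lines, where keys are matched against CLI argument names with the leading dashes stripped, or against environment variable names, via get_map_key_opt. A small illustrative preset file (the option names are examples, not checked against the real flag list):

    ; keys before any [section] land in the "default" preset
    ctx-size = 4096

    [creative]            # one common_preset per section
    temp    = 1.2
    no-mmap = true        ; boolean flags take truthy/falsey values; negated arg names are inverted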

package/src/llama.cpp/common/preset.h
@@ -0,0 +1,32 @@
+#pragma once
+
+#include "common.h"
+#include "arg.h"
+
+#include <string>
+#include <vector>
+#include <map>
+
+//
+// INI preset parser and writer
+//
+
+constexpr const char * COMMON_PRESET_DEFAULT_NAME = "default";
+
+struct common_preset {
+    std::string name;
+    // TODO: support repeated args in the future
+    std::map<common_arg, std::string> options;
+
+    // convert preset to CLI argument list
+    std::vector<std::string> to_args() const;
+
+    // convert preset to INI format string
+    std::string to_ini() const;
+
+    // TODO: maybe implement to_env() if needed
+};
+
+// interface for multiple presets in one file
+using common_presets = std::map<std::string, common_preset>;
+common_presets common_presets_load(const std::string & path, common_params_context & ctx_params);
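
A minimal load-and-expand sketch, assuming a common_params_context already built by the argument parser (the file name is illustrative):

    common_presets presets = common_presets_load("llama-presets.ini", ctx_params);
    for (const auto & [name, preset] : presets) {
        const std::vector<std::string> args = preset.to_args(); // back to CLI-style flags
        LOG_DBG("preset '%s' expands to %zu args\n", name.c_str(), args.size());
    }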

package/src/llama.cpp/common/sampling.cpp
@@ -116,7 +116,6 @@ struct common_sampler {
     void reset() {
         prev.clear();
 
-        llama_sampler_reset(grmr);
         llama_sampler_reset(chain);
     }
 
@@ -167,7 +166,11 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 
     lparams.no_perf = params.no_perf;
 
-    struct llama_sampler * grmr;
+    llama_sampler * grmr  = nullptr;
+    llama_sampler * chain = llama_sampler_chain_init(lparams);
+
+    std::vector<llama_sampler *> samplers;
+
     if (params.grammar.compare(0, 11, "%llguidance") == 0) {
 #ifdef LLAMA_USE_LLGUIDANCE
         grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
@@ -217,30 +220,20 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
             trigger_patterns_c.push_back(regex.c_str());
         }
 
-        grmr = params.grammar_lazy
-             ? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
-                   trigger_patterns_c.data(), trigger_patterns_c.size(),
-                   trigger_tokens.data(), trigger_tokens.size())
-             : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
-        if (!grmr) {
-            return nullptr;
+        if (!params.grammar.empty()) {
+            if (params.grammar_lazy) {
+                grmr = llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                    trigger_patterns_c.data(), trigger_patterns_c.size(),
+                    trigger_tokens.data(), trigger_tokens.size());
+            } else {
+                grmr = llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
+            }
         }
     }
 
-    auto * result = new common_sampler {
-        /* .params = */ params,
-        /* .grmr   = */ grmr,
-        /* .chain  = */ llama_sampler_chain_init(lparams),
-        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
-        /* .cur    = */ {},
-        /* .cur_p  = */ {},
-    };
-
-    llama_sampler_chain_add(result->chain,
-            llama_sampler_init_logit_bias(
-                llama_vocab_n_tokens(vocab),
-                params.logit_bias.size(),
-                params.logit_bias.data()));
+    if (params.has_logit_bias()) {
+        samplers.push_back(llama_sampler_init_logit_bias(llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data()));
+    }
 
     if (params.mirostat == 0) {
         for (const auto & cnstr : params.samplers) {
@@ -253,58 +246,71 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                         c_breakers.push_back(str.c_str());
                     }
 
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                    samplers.push_back(llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                 }
                 break;
             case COMMON_SAMPLER_TYPE_TOP_K:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_top_k       (params.top_k));
+                samplers.push_back(llama_sampler_init_top_k       (params.top_k));
                 break;
            case COMMON_SAMPLER_TYPE_TOP_P:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_top_p       (params.top_p, params.min_keep));
+                samplers.push_back(llama_sampler_init_top_p       (params.top_p, params.min_keep));
                 break;
            case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
+                samplers.push_back(llama_sampler_init_top_n_sigma(params.top_n_sigma));
                 break;
            case COMMON_SAMPLER_TYPE_MIN_P:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_min_p       (params.min_p, params.min_keep));
+                samplers.push_back(llama_sampler_init_min_p       (params.min_p, params.min_keep));
                 break;
            case COMMON_SAMPLER_TYPE_XTC:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_xtc         (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                samplers.push_back(llama_sampler_init_xtc         (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
                 break;
            case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_typical     (params.typ_p, params.min_keep));
+                samplers.push_back(llama_sampler_init_typical     (params.typ_p, params.min_keep));
                 break;
            case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext    (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                samplers.push_back(llama_sampler_init_temp_ext    (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                 break;
            case COMMON_SAMPLER_TYPE_INFILL:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_infill      (vocab));
+                samplers.push_back(llama_sampler_init_infill      (vocab));
                 break;
            case COMMON_SAMPLER_TYPE_PENALTIES:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_penalties   (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                samplers.push_back(llama_sampler_init_penalties   (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
                 break;
            default:
                 GGML_ASSERT(false && "unknown sampler type");
             }
         }
-        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
+
+        samplers.push_back(llama_sampler_init_dist(params.seed));
     } else if (params.mirostat == 1) {
-        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
+        samplers.push_back(llama_sampler_init_temp(params.temp));
+        samplers.push_back(llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
     } else if (params.mirostat == 2) {
-        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
+        samplers.push_back(llama_sampler_init_temp(params.temp));
+        samplers.push_back(llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
     } else {
         GGML_ASSERT(false && "unknown mirostat version");
     }
 
+    for (auto * smpl : samplers) {
+        llama_sampler_chain_add(chain, smpl);
+    }
+
+    auto * result = new common_sampler {
+        /* .params = */ params,
+        /* .grmr   = */ grmr,
+        /* .chain  = */ chain,
+        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
+        /* .cur    = */ {},
+        /* .cur_p  = */ {},
+    };
+
     return result;
 }
 
 void common_sampler_free(struct common_sampler * gsmpl) {
     if (gsmpl) {
         llama_sampler_free(gsmpl->grmr);
-
         llama_sampler_free(gsmpl->chain);
 
         delete gsmpl;
@@ -314,7 +320,7 @@ void common_sampler_free(struct common_sampler * gsmpl) {
 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
     const auto tm = gsmpl->tm();
 
-    if (accept_grammar) {
+    if (gsmpl->grmr && accept_grammar) {
         llama_sampler_accept(gsmpl->grmr, token);
     }
 
@@ -329,12 +335,12 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
 
 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
     return new common_sampler {
-        /* .params = */ gsmpl->params,
-        /* .grmr = */ llama_sampler_clone(gsmpl->grmr),
-        /* .chain = */ llama_sampler_clone(gsmpl->chain),
-        /* .prev = */ gsmpl->prev,
-        /* .cur = */ gsmpl->cur,
-        /* .cur_p = */ gsmpl->cur_p,
+        /* .params = */ gsmpl->params,
+        /* .grmr   = */ llama_sampler_clone(gsmpl->grmr),
+        /* .chain  = */ llama_sampler_clone(gsmpl->chain),
+        /* .prev   = */ gsmpl->prev,
+        /* .cur    = */ gsmpl->cur,
+        /* .cur_p  = */ gsmpl->cur_p,
     };
 }
 
@@ -383,33 +389,37 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
     }
 }
 
+struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
+    return gsmpl->chain;
+}
+
 llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
     llama_synchronize(ctx);
 
     // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
     const auto tm = gsmpl->tm();
 
-    gsmpl->set_logits(ctx, idx);
+    llama_token id = LLAMA_TOKEN_NULL;
 
     auto & grmr  = gsmpl->grmr;
     auto & chain = gsmpl->chain;
     auto & cur_p = gsmpl->cur_p; // initialized by set_logits
 
+    gsmpl->set_logits(ctx, idx);
+
     if (grammar_first) {
         llama_sampler_apply(grmr, &cur_p);
     }
 
     llama_sampler_apply(chain, &cur_p);
 
-    GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
-
-    const llama_token id = cur_p.data[cur_p.selected].id;
+    id = cur_p.data[cur_p.selected].id;
 
     if (grammar_first) {
         return id;
    }
 
-    // check if it the sampled token fits the grammar
+    // check if it the sampled token fits the grammar (grammar-based rejection sampling)
     {
         llama_token_data single_token_data = { id, 1.0f, 0.0f };
         llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
@@ -429,9 +439,11 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
     llama_sampler_apply(grmr,  &cur_p);
     llama_sampler_apply(chain, &cur_p);
 
-    GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
+    GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
+
+    id = cur_p.data[cur_p.selected].id;
 
-    return cur_p.data[cur_p.selected].id;
+    return id;
 }
 
 std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
@@ -515,7 +527,8 @@ std::string common_sampler_print(const struct common_sampler * gsmpl) {
 
     for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
         const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
-        result += std::string("-> ") + llama_sampler_name(smpl) + " ";
+        result += std::string("-> ");
+        result += std::string(llama_sampler_name(smpl)) + " ";
     }
 
     return result;

package/src/llama.cpp/common/sampling.h
@@ -48,6 +48,8 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
 // arguments can be nullptr to skip printing
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
 
+struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
+
 // extended sampling implementation:
 //
 // - set logits
@@ -107,3 +109,9 @@ std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std:
 
 llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
     const char * grammar_kind, const char * grammar_data);
+
+struct common_sampler_deleter {
+    void operator()(common_sampler * s) { common_sampler_free(s); }
+};
+
+typedef std::unique_ptr<common_sampler, common_sampler_deleter> common_sampler_ptr;
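
The new common_sampler_ptr alias makes cleanup automatic. A brief RAII sketch using entry points from this header, where params.sampling stands for a common_params_sampling instance (model/context setup elided):

    common_sampler_ptr smpl(common_sampler_init(model, params.sampling));
    if (smpl) {
        const llama_token id = common_sampler_sample(smpl.get(), ctx, /* idx */ -1, /* grammar_first */ false);
        common_sampler_accept(smpl.get(), id, /* accept_grammar */ true);
    } // common_sampler_free() runs via common_sampler_deleter on scope exit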

package/src/llama.cpp/ggml/CMakeLists.txt
@@ -54,6 +54,10 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
     # TODO
 else()
     set(GGML_STANDALONE OFF)
+
+    if (NOT CMAKE_RUNTIME_OUTPUT_DIRECTORY)
+        set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+    endif()
 endif()
 
 if (EMSCRIPTEN)

package/src/llama.cpp/ggml/include/ggml-alloc.h
@@ -53,7 +53,14 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
 // call with a worst-case graph to avoid buffer reallocations
 // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
 // returns false if the buffer allocation failed
+// ggml_gallocr_resrve_n_size writes the buffer sizes per galloc buffer that would be allocated by ggml_gallocr_reserve_n to sizes
 GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API void ggml_gallocr_reserve_n_size(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids,
+    size_t * sizes);
 GGML_API bool ggml_gallocr_reserve_n(
     ggml_gallocr_t galloc,
     struct ggml_cgraph * graph,
@@ -68,6 +75,8 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i
 
 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
+// ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft
+GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
 
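The _size variant allows a dry run before committing memory; a sketch where ctx, buft, and budget are placeholders:

    // ask how large the buffer would be, without allocating it
    const size_t need = ggml_backend_alloc_ctx_tensors_from_buft_size(ctx, buft);
    if (need <= budget) {
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
        // ... use buf, then release it with ggml_backend_buffer_free(buf)
    }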

package/src/llama.cpp/ggml/include/ggml-backend.h
@@ -307,6 +307,7 @@ extern "C" {
     GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
 
     // Initialize backend buffers from a measure graph
+    GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes);
     GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
 
     GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
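
ggml_backend_sched_reserve_size presumably writes one size per scheduler backend, mirroring ggml_gallocr_reserve_n_size above; a C++ caller sketch under that assumption:

    const int n_backends = ggml_backend_sched_get_n_backends(sched);
    std::vector<size_t> sizes(n_backends);
    ggml_backend_sched_reserve_size(sched, measure_graph, sizes.data()); // measure only, no allocation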

package/src/llama.cpp/ggml/include/ggml-cpu.h
@@ -99,6 +99,7 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_sme        (void);
     // other
     GGML_BACKEND_API int ggml_cpu_has_riscv_v    (void);
+    GGML_BACKEND_API int ggml_cpu_get_rvv_vlen   (void); // risc-v vector length in bytes
     GGML_BACKEND_API int ggml_cpu_has_vsx        (void);
     GGML_BACKEND_API int ggml_cpu_has_vxe        (void);
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd  (void);
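
A small probe, assuming the value is only meaningful when RVV support is reported:

    if (ggml_cpu_has_riscv_v()) {
        printf("RVV vector length: %d bytes\n", ggml_cpu_get_rvv_vlen());
    }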

package/src/llama.cpp/ggml/include/ggml.h
@@ -2305,13 +2305,11 @@
             float                 stop,
             float                 step);
 
-#define GGML_KQ_MASK_PAD 1
-
-    // q:    [n_embd_k, n_batch,     n_head,    ne3 ]
-    // k:    [n_embd_k, n_kv,        n_head_kv, ne3 ]
-    // v:    [n_embd_v, n_kv,        n_head_kv, ne3 ] !! not transposed !!
-    // mask: [n_kv,     n_batch_pad, ne32,      ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
-    // res:  [n_embd_v, n_head,      n_batch,   ne3 ] !! permuted !!
+    // q:    [n_embd_k, n_batch,     n_head,    ne3 ]
+    // k:    [n_embd_k, n_kv,        n_head_kv, ne3 ]
+    // v:    [n_embd_v, n_kv,        n_head_kv, ne3 ] !! not transposed !!
+    // mask: [n_kv,     n_batch,     ne32,      ne33]
+    // res:  [n_embd_v, n_head,      n_batch,   ne3 ] !! permuted !!
     //
     // broadcast:
     //   n_head % n_head_kv == 0
@@ -2617,7 +2615,8 @@ extern "C" {
 
     // Set callback for all future logging events.
     // If this is not called, or NULL is supplied, everything is output on stderr.
-    GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
+    GGML_API void ggml_log_get(ggml_log_callback * log_callback, void ** user_data);
+    GGML_API void ggml_log_set(ggml_log_callback   log_callback, void *  user_data);
 
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
 
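ggml_log_get enables save-and-restore of the logging sink around a noisy section; a sketch (quiet_cb is illustrative):

    static void quiet_cb(enum ggml_log_level level, const char * text, void * user_data) {
        (void) level; (void) text; (void) user_data; // swallow all output
    }

    ggml_log_callback prev_cb = NULL;
    void *            prev_ud = NULL;
    ggml_log_get(&prev_cb, &prev_ud); // remember the current sink
    ggml_log_set(quiet_cb, NULL);     // silence ggml temporarily
    // ... noisy work ...
    ggml_log_set(prev_cb, prev_ud);   // restore the previous sink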

package/src/llama.cpp/ggml/src/CMakeLists.txt
@@ -386,6 +386,9 @@ if (GGML_CPU_ALL_VARIANTS)
         ggml_add_cpu_backend_variant(android_armv8.2_1 DOTPROD)
         ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
         ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
+        ggml_add_cpu_backend_variant(android_armv9.0_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE2)
+        ggml_add_cpu_backend_variant(android_armv9.2_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SME)
+        ggml_add_cpu_backend_variant(android_armv9.2_2 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SME)
     elseif (APPLE)
         ggml_add_cpu_backend_variant(apple_m1 DOTPROD)
         ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8)