@fugood/llama.node 1.4.7 → 1.4.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +23 -24
- package/src/LlamaContext.cpp +4 -2
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +470 -223
- package/src/llama.cpp/common/arg.h +43 -2
- package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
- package/src/llama.cpp/common/chat.cpp +140 -0
- package/src/llama.cpp/common/common.cpp +130 -67
- package/src/llama.cpp/common/common.h +44 -17
- package/src/llama.cpp/common/console.cpp +98 -18
- package/src/llama.cpp/common/console.h +30 -8
- package/src/llama.cpp/common/download.cpp +69 -25
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
- package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
- package/src/llama.cpp/common/log.cpp +5 -0
- package/src/llama.cpp/common/log.h +1 -0
- package/src/llama.cpp/common/peg-parser.cpp +1 -1
- package/src/llama.cpp/common/preset.cpp +206 -0
- package/src/llama.cpp/common/preset.h +32 -0
- package/src/llama.cpp/common/sampling.cpp +67 -54
- package/src/llama.cpp/common/sampling.h +8 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +7 -8
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +285 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -45
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +288 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
- package/src/llama.cpp/include/llama.h +18 -1
- package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
- package/src/llama.cpp/src/llama-arch.h +9 -2
- package/src/llama.cpp/src/llama-batch.cpp +12 -2
- package/src/llama.cpp/src/llama-batch.h +4 -2
- package/src/llama.cpp/src/llama-context.cpp +93 -23
- package/src/llama.cpp/src/llama-context.h +8 -2
- package/src/llama.cpp/src/llama-graph.cpp +84 -16
- package/src/llama.cpp/src/llama-graph.h +17 -4
- package/src/llama.cpp/src/llama-hparams.cpp +6 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -1
- package/src/llama.cpp/src/llama-impl.cpp +4 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
- package/src/llama.cpp/src/llama-kv-cache.h +19 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +123 -28
- package/src/llama.cpp/src/llama-mmap.h +5 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +58 -13
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +110 -49
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +16 -0
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
- package/src/llama.cpp/src/llama.cpp +665 -1
- package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
- package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
- package/src/llama.cpp/src/models/glm4.cpp +27 -4
- package/src/llama.cpp/src/models/models.h +5 -5
- package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
- package/src/llama.cpp/src/models/qwen2.cpp +12 -3
- package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
|
@@ -3,11 +3,31 @@
|
|
|
3
3
|
#include <nlohmann/json_fwd.hpp>
|
|
4
4
|
|
|
5
5
|
#include <functional>
|
|
6
|
+
#include <memory>
|
|
6
7
|
#include <string>
|
|
7
8
|
|
|
8
9
|
std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
|
|
9
10
|
bool force_gbnf = false);
|
|
10
11
|
|
|
12
|
+
class common_schema_converter;
|
|
13
|
+
|
|
14
|
+
// Probes a JSON schema to extract information about its structure and type constraints.
|
|
15
|
+
class common_schema_info {
|
|
16
|
+
std::unique_ptr<common_schema_converter> impl_;
|
|
17
|
+
|
|
18
|
+
public:
|
|
19
|
+
common_schema_info();
|
|
20
|
+
~common_schema_info();
|
|
21
|
+
|
|
22
|
+
common_schema_info(const common_schema_info &) = delete;
|
|
23
|
+
common_schema_info & operator=(const common_schema_info &) = delete;
|
|
24
|
+
common_schema_info(common_schema_info &&) noexcept;
|
|
25
|
+
common_schema_info & operator=(common_schema_info &&) noexcept;
|
|
26
|
+
|
|
27
|
+
void resolve_refs(nlohmann::ordered_json & schema);
|
|
28
|
+
bool resolves_to_string(const nlohmann::ordered_json & schema);
|
|
29
|
+
};
|
|
30
|
+
|
|
11
31
|
struct common_grammar_builder {
|
|
12
32
|
std::function<std::string(const std::string &, const std::string &)> add_rule;
|
|
13
33
|
std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;
|
|
@@ -420,6 +420,11 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps) {
|
|
|
420
420
|
log->set_timestamps(timestamps);
|
|
421
421
|
}
|
|
422
422
|
|
|
423
|
+
void common_log_flush(struct common_log * log) {
|
|
424
|
+
log->pause();
|
|
425
|
+
log->resume();
|
|
426
|
+
}
|
|
427
|
+
|
|
423
428
|
static int common_get_verbosity(enum ggml_log_level level) {
|
|
424
429
|
switch (level) {
|
|
425
430
|
case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG;
|
|
@@ -84,6 +84,7 @@ void common_log_set_file (struct common_log * log, const char * file); // n
|
|
|
84
84
|
void common_log_set_colors (struct common_log * log, log_colors colors); // not thread-safe
|
|
85
85
|
void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log
|
|
86
86
|
void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix
|
|
87
|
+
void common_log_flush (struct common_log * log); // flush all pending log messages
|
|
87
88
|
|
|
88
89
|
// helper macros for logging
|
|
89
90
|
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
|
|
@@ -425,7 +425,7 @@ struct parser_executor {
|
|
|
425
425
|
|
|
426
426
|
if (result.need_more_input()) {
|
|
427
427
|
// Propagate - need to know what child would match before negating
|
|
428
|
-
return
|
|
428
|
+
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos);
|
|
429
429
|
}
|
|
430
430
|
|
|
431
431
|
// Child failed, so negation succeeds
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
#include "arg.h"
|
|
2
|
+
#include "preset.h"
|
|
3
|
+
#include "peg-parser.h"
|
|
4
|
+
#include "log.h"
|
|
5
|
+
|
|
6
|
+
#include <fstream>
|
|
7
|
+
#include <sstream>
|
|
8
|
+
#include <filesystem>
|
|
9
|
+
|
|
10
|
+
static std::string rm_leading_dashes(const std::string & str) {
|
|
11
|
+
size_t pos = 0;
|
|
12
|
+
while (pos < str.size() && str[pos] == '-') {
|
|
13
|
+
++pos;
|
|
14
|
+
}
|
|
15
|
+
return str.substr(pos);
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
std::vector<std::string> common_preset::to_args() const {
|
|
19
|
+
std::vector<std::string> args;
|
|
20
|
+
|
|
21
|
+
for (const auto & [opt, value] : options) {
|
|
22
|
+
args.push_back(opt.args.back()); // use the last arg as the main arg
|
|
23
|
+
if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
|
|
24
|
+
// flag option, no value
|
|
25
|
+
if (common_arg_utils::is_falsey(value)) {
|
|
26
|
+
// use negative arg if available
|
|
27
|
+
if (!opt.args_neg.empty()) {
|
|
28
|
+
args.back() = opt.args_neg.back();
|
|
29
|
+
} else {
|
|
30
|
+
// otherwise, skip the flag
|
|
31
|
+
// TODO: maybe throw an error instead?
|
|
32
|
+
args.pop_back();
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
if (opt.value_hint != nullptr) {
|
|
37
|
+
// single value
|
|
38
|
+
args.push_back(value);
|
|
39
|
+
}
|
|
40
|
+
if (opt.value_hint != nullptr && opt.value_hint_2 != nullptr) {
|
|
41
|
+
throw std::runtime_error(string_format(
|
|
42
|
+
"common_preset::to_args(): option '%s' has two values, which is not supported yet",
|
|
43
|
+
opt.args.back()
|
|
44
|
+
));
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
return args;
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
std::string common_preset::to_ini() const {
|
|
52
|
+
std::ostringstream ss;
|
|
53
|
+
|
|
54
|
+
ss << "[" << name << "]\n";
|
|
55
|
+
for (const auto & [opt, value] : options) {
|
|
56
|
+
auto espaced_value = value;
|
|
57
|
+
string_replace_all(espaced_value, "\n", "\\\n");
|
|
58
|
+
ss << rm_leading_dashes(opt.args.back()) << " = ";
|
|
59
|
+
ss << espaced_value << "\n";
|
|
60
|
+
}
|
|
61
|
+
ss << "\n";
|
|
62
|
+
|
|
63
|
+
return ss.str();
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_file(const std::string & path) {
|
|
67
|
+
std::map<std::string, std::map<std::string, std::string>> parsed;
|
|
68
|
+
|
|
69
|
+
if (!std::filesystem::exists(path)) {
|
|
70
|
+
throw std::runtime_error("preset file does not exist: " + path);
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
std::ifstream file(path);
|
|
74
|
+
if (!file.good()) {
|
|
75
|
+
throw std::runtime_error("failed to open server preset file: " + path);
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
std::string contents((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
|
|
79
|
+
|
|
80
|
+
static const auto parser = build_peg_parser([](auto & p) {
|
|
81
|
+
// newline ::= "\r\n" / "\n" / "\r"
|
|
82
|
+
auto newline = p.rule("newline", p.literal("\r\n") | p.literal("\n") | p.literal("\r"));
|
|
83
|
+
|
|
84
|
+
// ws ::= [ \t]*
|
|
85
|
+
auto ws = p.rule("ws", p.chars("[ \t]", 0, -1));
|
|
86
|
+
|
|
87
|
+
// comment ::= [;#] (!newline .)*
|
|
88
|
+
auto comment = p.rule("comment", p.chars("[;#]", 1, 1) + p.zero_or_more(p.negate(newline) + p.any()));
|
|
89
|
+
|
|
90
|
+
// eol ::= ws comment? (newline / EOF)
|
|
91
|
+
auto eol = p.rule("eol", ws + p.optional(comment) + (newline | p.end()));
|
|
92
|
+
|
|
93
|
+
// ident ::= [a-zA-Z_] [a-zA-Z0-9_.-]*
|
|
94
|
+
auto ident = p.rule("ident", p.chars("[a-zA-Z_]", 1, 1) + p.chars("[a-zA-Z0-9_.-]", 0, -1));
|
|
95
|
+
|
|
96
|
+
// value ::= (!eol-start .)*
|
|
97
|
+
auto eol_start = p.rule("eol-start", ws + (p.chars("[;#]", 1, 1) | newline | p.end()));
|
|
98
|
+
auto value = p.rule("value", p.zero_or_more(p.negate(eol_start) + p.any()));
|
|
99
|
+
|
|
100
|
+
// header-line ::= "[" ws section-name ws "]" eol   (section-name: characters other than "]")
|
|
101
|
+
auto header_line = p.rule("header-line", "[" + ws + p.tag("section-name", p.chars("[^]]")) + ws + "]" + eol);
|
|
102
|
+
|
|
103
|
+
// kv-line ::= ident ws "=" ws value eol
|
|
104
|
+
auto kv_line = p.rule("kv-line", p.tag("key", ident) + ws + "=" + ws + p.tag("value", value) + eol);
|
|
105
|
+
|
|
106
|
+
// comment-line ::= ws comment (newline / EOF)
|
|
107
|
+
auto comment_line = p.rule("comment-line", ws + comment + (newline | p.end()));
|
|
108
|
+
|
|
109
|
+
// blank-line ::= ws (newline / EOF)
|
|
110
|
+
auto blank_line = p.rule("blank-line", ws + (newline | p.end()));
|
|
111
|
+
|
|
112
|
+
// line ::= header-line / kv-line / comment-line / blank-line
|
|
113
|
+
auto line = p.rule("line", header_line | kv_line | comment_line | blank_line);
|
|
114
|
+
|
|
115
|
+
// ini ::= line* EOF
|
|
116
|
+
auto ini = p.rule("ini", p.zero_or_more(line) + p.end());
|
|
117
|
+
|
|
118
|
+
return ini;
|
|
119
|
+
});
|
|
120
|
+
|
|
121
|
+
common_peg_parse_context ctx(contents);
|
|
122
|
+
const auto result = parser.parse(ctx);
|
|
123
|
+
if (!result.success()) {
|
|
124
|
+
throw std::runtime_error("failed to parse server config file: " + path);
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
std::string current_section = COMMON_PRESET_DEFAULT_NAME;
|
|
128
|
+
std::string current_key;
|
|
129
|
+
|
|
130
|
+
ctx.ast.visit(result, [&](const auto & node) {
|
|
131
|
+
if (node.tag == "section-name") {
|
|
132
|
+
const std::string section = std::string(node.text);
|
|
133
|
+
current_section = section;
|
|
134
|
+
parsed[current_section] = {};
|
|
135
|
+
} else if (node.tag == "key") {
|
|
136
|
+
const std::string key = std::string(node.text);
|
|
137
|
+
current_key = key;
|
|
138
|
+
} else if (node.tag == "value" && !current_key.empty() && !current_section.empty()) {
|
|
139
|
+
parsed[current_section][current_key] = std::string(node.text);
|
|
140
|
+
current_key.clear();
|
|
141
|
+
}
|
|
142
|
+
});
|
|
143
|
+
|
|
144
|
+
return parsed;
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
static std::map<std::string, common_arg> get_map_key_opt(common_params_context & ctx_params) {
|
|
148
|
+
std::map<std::string, common_arg> mapping;
|
|
149
|
+
for (const auto & opt : ctx_params.options) {
|
|
150
|
+
for (const auto & env : opt.get_env()) {
|
|
151
|
+
mapping[env] = opt;
|
|
152
|
+
}
|
|
153
|
+
for (const auto & arg : opt.get_args()) {
|
|
154
|
+
mapping[rm_leading_dashes(arg)] = opt;
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
return mapping;
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
static bool is_bool_arg(const common_arg & arg) {
|
|
161
|
+
return !arg.args_neg.empty();
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
static std::string parse_bool_arg(const common_arg & arg, const std::string & key, const std::string & value) {
|
|
165
|
+
// if this is a negated arg, we need to reverse the value
|
|
166
|
+
for (const auto & neg_arg : arg.args_neg) {
|
|
167
|
+
if (rm_leading_dashes(neg_arg) == key) {
|
|
168
|
+
return common_arg_utils::is_truthy(value) ? "false" : "true";
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
// otherwise, not negated
|
|
172
|
+
return value;
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) {
|
|
176
|
+
common_presets out;
|
|
177
|
+
auto key_to_opt = get_map_key_opt(ctx_params);
|
|
178
|
+
auto ini_data = parse_ini_from_file(path);
|
|
179
|
+
|
|
180
|
+
for (auto section : ini_data) {
|
|
181
|
+
common_preset preset;
|
|
182
|
+
if (section.first.empty()) {
|
|
183
|
+
preset.name = COMMON_PRESET_DEFAULT_NAME;
|
|
184
|
+
} else {
|
|
185
|
+
preset.name = section.first;
|
|
186
|
+
}
|
|
187
|
+
LOG_DBG("loading preset: %s\n", preset.name.c_str());
|
|
188
|
+
for (const auto & [key, value] : section.second) {
|
|
189
|
+
LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
|
|
190
|
+
if (key_to_opt.find(key) != key_to_opt.end()) {
|
|
191
|
+
auto & opt = key_to_opt[key];
|
|
192
|
+
if (is_bool_arg(opt)) {
|
|
193
|
+
preset.options[opt] = parse_bool_arg(opt, key, value);
|
|
194
|
+
} else {
|
|
195
|
+
preset.options[opt] = value;
|
|
196
|
+
}
|
|
197
|
+
LOG_DBG("accepted option: %s = %s\n", key.c_str(), preset.options[opt].c_str());
|
|
198
|
+
} else {
|
|
199
|
+
// TODO: maybe warn about unknown key?
|
|
200
|
+
}
|
|
201
|
+
}
|
|
202
|
+
out[preset.name] = preset;
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
return out;
|
|
206
|
+
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include "common.h"
|
|
4
|
+
#include "arg.h"
|
|
5
|
+
|
|
6
|
+
#include <string>
|
|
7
|
+
#include <vector>
|
|
8
|
+
#include <map>
|
|
9
|
+
|
|
10
|
+
//
|
|
11
|
+
// INI preset parser and writer
|
|
12
|
+
//
|
|
13
|
+
|
|
14
|
+
constexpr const char * COMMON_PRESET_DEFAULT_NAME = "default";
|
|
15
|
+
|
|
16
|
+
struct common_preset {
|
|
17
|
+
std::string name;
|
|
18
|
+
// TODO: support repeated args in the future
|
|
19
|
+
std::map<common_arg, std::string> options;
|
|
20
|
+
|
|
21
|
+
// convert preset to CLI argument list
|
|
22
|
+
std::vector<std::string> to_args() const;
|
|
23
|
+
|
|
24
|
+
// convert preset to INI format string
|
|
25
|
+
std::string to_ini() const;
|
|
26
|
+
|
|
27
|
+
// TODO: maybe implement to_env() if needed
|
|
28
|
+
};
|
|
29
|
+
|
|
30
|
+
// interface for multiple presets in one file
|
|
31
|
+
using common_presets = std::map<std::string, common_preset>;
|
|
32
|
+
common_presets common_presets_load(const std::string & path, common_params_context & ctx_params);
|
|
@@ -116,7 +116,6 @@ struct common_sampler {
|
|
|
116
116
|
void reset() {
|
|
117
117
|
prev.clear();
|
|
118
118
|
|
|
119
|
-
llama_sampler_reset(grmr);
|
|
120
119
|
llama_sampler_reset(chain);
|
|
121
120
|
}
|
|
122
121
|
|
|
@@ -167,7 +166,11 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
|
|
167
166
|
|
|
168
167
|
lparams.no_perf = params.no_perf;
|
|
169
168
|
|
|
170
|
-
|
|
169
|
+
llama_sampler * grmr = nullptr;
|
|
170
|
+
llama_sampler * chain = llama_sampler_chain_init(lparams);
|
|
171
|
+
|
|
172
|
+
std::vector<llama_sampler *> samplers;
|
|
173
|
+
|
|
171
174
|
if (params.grammar.compare(0, 11, "%llguidance") == 0) {
|
|
172
175
|
#ifdef LLAMA_USE_LLGUIDANCE
|
|
173
176
|
grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
|
|
@@ -217,30 +220,20 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
|
|
217
220
|
trigger_patterns_c.push_back(regex.c_str());
|
|
218
221
|
}
|
|
219
222
|
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
223
|
+
if (!params.grammar.empty()) {
|
|
224
|
+
if (params.grammar_lazy) {
|
|
225
|
+
grmr = llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
|
|
226
|
+
trigger_patterns_c.data(), trigger_patterns_c.size(),
|
|
227
|
+
trigger_tokens.data(), trigger_tokens.size());
|
|
228
|
+
} else {
|
|
229
|
+
grmr = llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
|
|
230
|
+
}
|
|
227
231
|
}
|
|
228
232
|
}
|
|
229
233
|
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
/* .chain = */ llama_sampler_chain_init(lparams),
|
|
234
|
-
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
|
|
235
|
-
/* .cur = */ {},
|
|
236
|
-
/* .cur_p = */ {},
|
|
237
|
-
};
|
|
238
|
-
|
|
239
|
-
llama_sampler_chain_add(result->chain,
|
|
240
|
-
llama_sampler_init_logit_bias(
|
|
241
|
-
llama_vocab_n_tokens(vocab),
|
|
242
|
-
params.logit_bias.size(),
|
|
243
|
-
params.logit_bias.data()));
|
|
234
|
+
if (params.has_logit_bias()) {
|
|
235
|
+
samplers.push_back(llama_sampler_init_logit_bias(llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data()));
|
|
236
|
+
}
|
|
244
237
|
|
|
245
238
|
if (params.mirostat == 0) {
|
|
246
239
|
for (const auto & cnstr : params.samplers) {
|
|
@@ -253,58 +246,71 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
|
|
253
246
|
c_breakers.push_back(str.c_str());
|
|
254
247
|
}
|
|
255
248
|
|
|
256
|
-
|
|
249
|
+
samplers.push_back(llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
|
257
250
|
}
|
|
258
251
|
break;
|
|
259
252
|
case COMMON_SAMPLER_TYPE_TOP_K:
|
|
260
|
-
|
|
253
|
+
samplers.push_back(llama_sampler_init_top_k (params.top_k));
|
|
261
254
|
break;
|
|
262
255
|
case COMMON_SAMPLER_TYPE_TOP_P:
|
|
263
|
-
|
|
256
|
+
samplers.push_back(llama_sampler_init_top_p (params.top_p, params.min_keep));
|
|
264
257
|
break;
|
|
265
258
|
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
|
|
266
|
-
|
|
259
|
+
samplers.push_back(llama_sampler_init_top_n_sigma(params.top_n_sigma));
|
|
267
260
|
break;
|
|
268
261
|
case COMMON_SAMPLER_TYPE_MIN_P:
|
|
269
|
-
|
|
262
|
+
samplers.push_back(llama_sampler_init_min_p (params.min_p, params.min_keep));
|
|
270
263
|
break;
|
|
271
264
|
case COMMON_SAMPLER_TYPE_XTC:
|
|
272
|
-
|
|
265
|
+
samplers.push_back(llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
|
273
266
|
break;
|
|
274
267
|
case COMMON_SAMPLER_TYPE_TYPICAL_P:
|
|
275
|
-
|
|
268
|
+
samplers.push_back(llama_sampler_init_typical (params.typ_p, params.min_keep));
|
|
276
269
|
break;
|
|
277
270
|
case COMMON_SAMPLER_TYPE_TEMPERATURE:
|
|
278
|
-
|
|
271
|
+
samplers.push_back(llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
|
279
272
|
break;
|
|
280
273
|
case COMMON_SAMPLER_TYPE_INFILL:
|
|
281
|
-
|
|
274
|
+
samplers.push_back(llama_sampler_init_infill (vocab));
|
|
282
275
|
break;
|
|
283
276
|
case COMMON_SAMPLER_TYPE_PENALTIES:
|
|
284
|
-
|
|
277
|
+
samplers.push_back(llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
|
|
285
278
|
break;
|
|
286
279
|
default:
|
|
287
280
|
GGML_ASSERT(false && "unknown sampler type");
|
|
288
281
|
}
|
|
289
282
|
}
|
|
290
|
-
|
|
283
|
+
|
|
284
|
+
samplers.push_back(llama_sampler_init_dist(params.seed));
|
|
291
285
|
} else if (params.mirostat == 1) {
|
|
292
|
-
|
|
293
|
-
|
|
286
|
+
samplers.push_back(llama_sampler_init_temp(params.temp));
|
|
287
|
+
samplers.push_back(llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
|
|
294
288
|
} else if (params.mirostat == 2) {
|
|
295
|
-
|
|
296
|
-
|
|
289
|
+
samplers.push_back(llama_sampler_init_temp(params.temp));
|
|
290
|
+
samplers.push_back(llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
|
|
297
291
|
} else {
|
|
298
292
|
GGML_ASSERT(false && "unknown mirostat version");
|
|
299
293
|
}
|
|
300
294
|
|
|
295
|
+
for (auto * smpl : samplers) {
|
|
296
|
+
llama_sampler_chain_add(chain, smpl);
|
|
297
|
+
}
|
|
298
|
+
|
|
299
|
+
auto * result = new common_sampler {
|
|
300
|
+
/* .params = */ params,
|
|
301
|
+
/* .grmr = */ grmr,
|
|
302
|
+
/* .chain = */ chain,
|
|
303
|
+
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
|
|
304
|
+
/* .cur = */ {},
|
|
305
|
+
/* .cur_p = */ {},
|
|
306
|
+
};
|
|
307
|
+
|
|
301
308
|
return result;
|
|
302
309
|
}
|
|
303
310
|
|
|
304
311
|
void common_sampler_free(struct common_sampler * gsmpl) {
|
|
305
312
|
if (gsmpl) {
|
|
306
313
|
llama_sampler_free(gsmpl->grmr);
|
|
307
|
-
|
|
308
314
|
llama_sampler_free(gsmpl->chain);
|
|
309
315
|
|
|
310
316
|
delete gsmpl;
|
|
@@ -314,7 +320,7 @@ void common_sampler_free(struct common_sampler * gsmpl) {
|
|
|
314
320
|
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
|
|
315
321
|
const auto tm = gsmpl->tm();
|
|
316
322
|
|
|
317
|
-
if (accept_grammar) {
|
|
323
|
+
if (gsmpl->grmr && accept_grammar) {
|
|
318
324
|
llama_sampler_accept(gsmpl->grmr, token);
|
|
319
325
|
}
|
|
320
326
|
|
|
@@ -329,12 +335,12 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
|
|
|
329
335
|
|
|
330
336
|
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
|
|
331
337
|
return new common_sampler {
|
|
332
|
-
/* .params
|
|
333
|
-
/* .grmr
|
|
334
|
-
/* .chain
|
|
335
|
-
/* .prev
|
|
336
|
-
/* .cur
|
|
337
|
-
/* .cur_p
|
|
338
|
+
/* .params = */ gsmpl->params,
|
|
339
|
+
/* .grmr = */ llama_sampler_clone(gsmpl->grmr),
|
|
340
|
+
/* .chain = */ llama_sampler_clone(gsmpl->chain),
|
|
341
|
+
/* .prev = */ gsmpl->prev,
|
|
342
|
+
/* .cur = */ gsmpl->cur,
|
|
343
|
+
/* .cur_p = */ gsmpl->cur_p,
|
|
338
344
|
};
|
|
339
345
|
}
|
|
340
346
|
|
|
@@ -383,33 +389,37 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
|
|
|
383
389
|
}
|
|
384
390
|
}
|
|
385
391
|
|
|
392
|
+
struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
|
|
393
|
+
return gsmpl->chain;
|
|
394
|
+
}
|
|
395
|
+
|
|
386
396
|
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
|
|
387
397
|
llama_synchronize(ctx);
|
|
388
398
|
|
|
389
399
|
// start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
|
|
390
400
|
const auto tm = gsmpl->tm();
|
|
391
401
|
|
|
392
|
-
|
|
402
|
+
llama_token id = LLAMA_TOKEN_NULL;
|
|
393
403
|
|
|
394
404
|
auto & grmr = gsmpl->grmr;
|
|
395
405
|
auto & chain = gsmpl->chain;
|
|
396
406
|
auto & cur_p = gsmpl->cur_p; // initialized by set_logits
|
|
397
407
|
|
|
408
|
+
gsmpl->set_logits(ctx, idx);
|
|
409
|
+
|
|
398
410
|
if (grammar_first) {
|
|
399
411
|
llama_sampler_apply(grmr, &cur_p);
|
|
400
412
|
}
|
|
401
413
|
|
|
402
414
|
llama_sampler_apply(chain, &cur_p);
|
|
403
415
|
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
const llama_token id = cur_p.data[cur_p.selected].id;
|
|
416
|
+
id = cur_p.data[cur_p.selected].id;
|
|
407
417
|
|
|
408
418
|
if (grammar_first) {
|
|
409
419
|
return id;
|
|
410
420
|
}
|
|
411
421
|
|
|
412
|
-
// check if it the sampled token fits the grammar
|
|
422
|
+
// check if the sampled token fits the grammar (grammar-based rejection sampling)
|
|
413
423
|
{
|
|
414
424
|
llama_token_data single_token_data = { id, 1.0f, 0.0f };
|
|
415
425
|
llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
|
|
@@ -429,9 +439,11 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
|
|
429
439
|
llama_sampler_apply(grmr, &cur_p);
|
|
430
440
|
llama_sampler_apply(chain, &cur_p);
|
|
431
441
|
|
|
432
|
-
GGML_ASSERT(cur_p.selected != -1 && "no selected token during
|
|
442
|
+
GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
|
|
443
|
+
|
|
444
|
+
id = cur_p.data[cur_p.selected].id;
|
|
433
445
|
|
|
434
|
-
return
|
|
446
|
+
return id;
|
|
435
447
|
}
|
|
436
448
|
|
|
437
449
|
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
|
|
@@ -515,7 +527,8 @@ std::string common_sampler_print(const struct common_sampler * gsmpl) {
|
|
|
515
527
|
|
|
516
528
|
for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
|
|
517
529
|
const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
|
|
518
|
-
result += std::string("-> ")
|
|
530
|
+
result += std::string("-> ");
|
|
531
|
+
result += std::string(llama_sampler_name(smpl)) + " ";
|
|
519
532
|
}
|
|
520
533
|
|
|
521
534
|
return result;
|
|
@@ -48,6 +48,8 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
|
|
|
48
48
|
// arguments can be nullptr to skip printing
|
|
49
49
|
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
|
|
50
50
|
|
|
51
|
+
struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
|
|
52
|
+
|
|
51
53
|
// extended sampling implementation:
|
|
52
54
|
//
|
|
53
55
|
// - set logits
|
|
@@ -107,3 +109,9 @@ std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std:
|
|
|
107
109
|
|
|
108
110
|
llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
|
|
109
111
|
const char * grammar_kind, const char * grammar_data);
|
|
112
|
+
|
|
113
|
+
struct common_sampler_deleter {
|
|
114
|
+
void operator()(common_sampler * s) { common_sampler_free(s); }
|
|
115
|
+
};
|
|
116
|
+
|
|
117
|
+
typedef std::unique_ptr<common_sampler, common_sampler_deleter> common_sampler_ptr;
|
|
@@ -54,6 +54,10 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
|
|
|
54
54
|
# TODO
|
|
55
55
|
else()
|
|
56
56
|
set(GGML_STANDALONE OFF)
|
|
57
|
+
|
|
58
|
+
if (NOT CMAKE_RUNTIME_OUTPUT_DIRECTORY)
|
|
59
|
+
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
|
60
|
+
endif()
|
|
57
61
|
endif()
|
|
58
62
|
|
|
59
63
|
if (EMSCRIPTEN)
|
|
@@ -53,7 +53,14 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
|
|
|
53
53
|
// call with a worst-case graph to avoid buffer reallocations
|
|
54
54
|
// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
|
|
55
55
|
// returns false if the buffer allocation failed
|
|
56
|
+
// ggml_gallocr_reserve_n_size writes the buffer sizes per galloc buffer that would be allocated by ggml_gallocr_reserve_n to sizes
|
|
56
57
|
GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
|
|
58
|
+
GGML_API void ggml_gallocr_reserve_n_size(
|
|
59
|
+
ggml_gallocr_t galloc,
|
|
60
|
+
struct ggml_cgraph * graph,
|
|
61
|
+
const int * node_buffer_ids,
|
|
62
|
+
const int * leaf_buffer_ids,
|
|
63
|
+
size_t * sizes);
|
|
57
64
|
GGML_API bool ggml_gallocr_reserve_n(
|
|
58
65
|
ggml_gallocr_t galloc,
|
|
59
66
|
struct ggml_cgraph * graph,
|
|
@@ -68,6 +75,8 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i
|
|
|
68
75
|
|
|
69
76
|
// Utils
|
|
70
77
|
// Create a buffer and allocate all the tensors in a ggml_context
|
|
78
|
+
// ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft
|
|
79
|
+
GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
|
|
71
80
|
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
|
|
72
81
|
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
|
|
73
82
|
|
|
@@ -307,6 +307,7 @@ extern "C" {
|
|
|
307
307
|
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
|
|
308
308
|
|
|
309
309
|
// Initialize backend buffers from a measure graph
|
|
310
|
+
GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes);
|
|
310
311
|
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
|
|
311
312
|
|
|
312
313
|
GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
|
|
@@ -99,6 +99,7 @@ extern "C" {
|
|
|
99
99
|
GGML_BACKEND_API int ggml_cpu_has_sme (void);
|
|
100
100
|
// other
|
|
101
101
|
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
|
|
102
|
+
GGML_BACKEND_API int ggml_cpu_get_rvv_vlen (void); // risc-v vector length in bytes
|
|
102
103
|
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
|
|
103
104
|
GGML_BACKEND_API int ggml_cpu_has_vxe (void);
|
|
104
105
|
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
|
|
@@ -2305,13 +2305,11 @@ extern "C" {
|
|
|
2305
2305
|
float stop,
|
|
2306
2306
|
float step);
|
|
2307
2307
|
|
|
2308
|
-
|
|
2309
|
-
|
|
2310
|
-
//
|
|
2311
|
-
//
|
|
2312
|
-
//
|
|
2313
|
-
// mask: [n_kv, n_batch_pad, ne32, ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
|
|
2314
|
-
// res: [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
|
|
2308
|
+
// q: [n_embd_k, n_batch, n_head, ne3 ]
|
|
2309
|
+
// k: [n_embd_k, n_kv, n_head_kv, ne3 ]
|
|
2310
|
+
// v: [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !!
|
|
2311
|
+
// mask: [n_kv, n_batch, ne32, ne33]
|
|
2312
|
+
// res: [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
|
|
2315
2313
|
//
|
|
2316
2314
|
// broadcast:
|
|
2317
2315
|
// n_head % n_head_kv == 0
|
|
@@ -2617,7 +2615,8 @@ extern "C" {
|
|
|
2617
2615
|
|
|
2618
2616
|
// Set callback for all future logging events.
|
|
2619
2617
|
// If this is not called, or NULL is supplied, everything is output on stderr.
|
|
2620
|
-
GGML_API void
|
|
2618
|
+
GGML_API void ggml_log_get(ggml_log_callback * log_callback, void ** user_data);
|
|
2619
|
+
GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
|
|
2621
2620
|
|
|
2622
2621
|
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
|
2623
2622
|
|
|
@@ -386,6 +386,9 @@ if (GGML_CPU_ALL_VARIANTS)
|
|
|
386
386
|
ggml_add_cpu_backend_variant(android_armv8.2_1 DOTPROD)
|
|
387
387
|
ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
|
|
388
388
|
ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
|
|
389
|
+
ggml_add_cpu_backend_variant(android_armv9.0_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE2)
|
|
390
|
+
ggml_add_cpu_backend_variant(android_armv9.2_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SME)
|
|
391
|
+
ggml_add_cpu_backend_variant(android_armv9.2_2 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SME)
|
|
389
392
|
elseif (APPLE)
|
|
390
393
|
ggml_add_cpu_backend_variant(apple_m1 DOTPROD)
|
|
391
394
|
ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8)
|