@fugood/llama.node 0.3.11 → 0.3.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -0
- package/lib/index.js +26 -20
- package/lib/index.ts +32 -28
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +14 -0
- package/src/LlamaContext.cpp +13 -4
- package/src/llama.cpp/.github/workflows/build.yml +35 -3
- package/src/llama.cpp/.github/workflows/docker.yml +2 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
- package/src/llama.cpp/common/CMakeLists.txt +20 -3
- package/src/llama.cpp/common/arg.cpp +180 -3
- package/src/llama.cpp/common/chat-template.hpp +21 -7
- package/src/llama.cpp/common/chat.cpp +220 -101
- package/src/llama.cpp/common/chat.hpp +3 -0
- package/src/llama.cpp/common/common.h +15 -7
- package/src/llama.cpp/common/llguidance.cpp +3 -3
- package/src/llama.cpp/common/log.cpp +1 -0
- package/src/llama.cpp/common/log.h +2 -1
- package/src/llama.cpp/common/minja.hpp +24 -9
- package/src/llama.cpp/common/sampling.cpp +52 -46
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +2 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
- package/src/llama.cpp/examples/run/run.cpp +5 -12
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/server/httplib.h +381 -292
- package/src/llama.cpp/examples/server/server.cpp +58 -47
- package/src/llama.cpp/examples/server/utils.hpp +7 -5
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +852 -268
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +200 -107
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +2 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +26 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +6 -7
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +812 -569
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +25 -1
- package/src/llama.cpp/ggml/src/ggml.c +1 -1
- package/src/llama.cpp/include/llama.h +14 -10
- package/src/llama.cpp/src/llama-grammar.cpp +1 -1
- package/src/llama.cpp/src/llama-grammar.h +1 -1
- package/src/llama.cpp/src/llama-impl.h +6 -6
- package/src/llama.cpp/src/llama-kv-cache.h +1 -1
- package/src/llama.cpp/src/llama-mmap.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +131 -57
- package/src/llama.cpp/src/llama.cpp +7 -5
- package/src/llama.cpp/src/unicode.cpp +9 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +5 -5
- package/src/llama.cpp/tests/test-chat.cpp +237 -69
- package/src/llama.cpp/tests/test-gguf.cpp +4 -4
- package/src/llama.cpp/tests/test-sampling.cpp +15 -0
package/src/llama.cpp/common/common.h

@@ -140,6 +140,7 @@ struct common_params_sampling {
     int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
     int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
     int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float top_n_sigma = -1.00f;// -1.0 = disabled
     float mirostat_tau = 5.00f; // target entropy
     float mirostat_eta = 0.10f; // learning rate
     bool ignore_eos = false;
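The new `top_n_sigma` parameter enables top-nσ sampling: candidates whose logit falls more than n standard deviations below the maximum logit are masked out before sampling. A standalone sketch of that rule, for illustration only (this is the math the parameter names, not llama.cpp's implementation):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Mask every logit further than n*sigma below the maximum logit.
// n < 0 means disabled, matching the parameter's -1.0 default above.
static void top_n_sigma_mask(std::vector<float> & logits, float n) {
    if (n < 0.0f || logits.empty()) return;

    const float max_l = *std::max_element(logits.begin(), logits.end());

    float mean = 0.0f;
    for (float l : logits) mean += l;
    mean /= logits.size();

    float var = 0.0f;
    for (float l : logits) var += (l - mean) * (l - mean);
    const float sigma = std::sqrt(var / logits.size());

    const float cutoff = max_l - n * sigma;
    for (float & l : logits) {
        if (l < cutoff) l = -INFINITY; // excluded from sampling
    }
}

int main() {
    std::vector<float> logits = {2.0f, 1.5f, 0.0f, -3.0f, -8.0f};
    top_n_sigma_mask(logits, 1.0f);
    for (float l : logits) printf("%g\n", l);
}
```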
@@ -202,6 +203,11 @@ struct common_params_vocoder {
     bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
 };

+enum common_reasoning_format {
+    COMMON_REASONING_FORMAT_NONE,
+    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
+};
+
 struct common_params {
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 4096; // context size
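`COMMON_REASONING_FORMAT_DEEPSEEK` makes the chat layer pull a model's thinking segment out of the visible reply and return it as `message.reasoning_content` (the parsing itself lands in `common/chat.cpp`). A rough sketch of the split, assuming DeepSeek-style `<think>...</think>` markers:

```cpp
#include <cstdio>
#include <string>

// Split "<think>...</think>rest" into reasoning and visible content.
// Illustrative only; the real logic in common/chat.cpp is template-aware.
static void split_reasoning(const std::string & output,
                            std::string & reasoning_content,
                            std::string & content) {
    const std::string open = "<think>", close = "</think>";
    const size_t b = output.find(open);
    const size_t e = output.find(close);
    if (b == std::string::npos || e == std::string::npos || e < b) {
        content = output; // no thinking block: everything is visible content
        return;
    }
    reasoning_content = output.substr(b + open.size(), e - b - open.size());
    content           = output.substr(e + close.size());
}

int main() {
    std::string reasoning, content;
    split_reasoning("<think>check units first</think>42 km", reasoning, content);
    printf("reasoning: %s\ncontent: %s\n", reasoning.c_str(), content.c_str());
}
```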
@@ -292,6 +298,7 @@ struct common_params {
     bool kl_divergence = false; // compute KL divergence

     bool usage = false; // print usage
+    bool completion = false; // print source-able completion script
     bool use_color = false; // use color to distinguish generations and inputs
     bool special = false; // enable special token output
     bool interactive = false; // interactive mode
@@ -346,6 +353,7 @@ struct common_params {
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;

     std::vector<std::string> api_keys;

@@ -424,13 +432,13 @@ bool set_process_priority(enum ggml_sched_priority prio);
 //

 #ifdef __GNUC__
-#
-#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
-#else
-#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
-#endif
+#    if defined(__MINGW32__) && !defined(__clang__)
+#        define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#    else
+#        define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#    endif
 #else
-#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+#    define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
 #endif

 LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
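The rewritten block above now selects the `gnu_printf` archetype only for MinGW GCC; clang on Windows validates against standard `printf`, so it takes the generic branch. A self-contained illustration of what the attribute buys — `ATTR_FMT` and `log_line` are stand-ins, not names from the codebase:

```cpp
#include <cstdarg>
#include <cstdio>

#ifdef __GNUC__
#    define ATTR_FMT(...) __attribute__((format(printf, __VA_ARGS__)))
#else
#    define ATTR_FMT(...)
#endif

// Argument 1 is the format string; the checked varargs start at argument 2,
// matching the (1, 2) used with LLAMA_COMMON_ATTRIBUTE_FORMAT above.
ATTR_FMT(1, 2)
static void log_line(const char * fmt, ...);

static void log_line(const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    vfprintf(stderr, fmt, args);
    va_end(args);
}

int main() {
    log_line("loaded %d tensors\n", 291);
    // log_line("%s\n", 42); // the attribute turns this mismatch into a warning
}
```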
@@ -623,7 +631,7 @@ struct common_chat_msg {
     std::string role;
     std::string content;
     std::vector<common_tool_call> tool_calls;
-    std::string
+    std::string reasoning_content = "";
 };

 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
package/src/llama.cpp/common/llguidance.cpp

@@ -254,10 +254,10 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
        };
    }

-    return
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_llg_i,
-        /* .ctx = */ ctx
-
+        /* .ctx = */ ctx
+    );
 }

 #else
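The llguidance sampler is now constructed through `llama_sampler_init()` rather than by aggregate-initializing a `llama_sampler` directly (this release also reworks `include/llama.h`, per the file list). A hedged sketch of a minimal custom sampler built the same way, assuming the `llama_sampler_i` interface and `llama_sampler_init()` entry point from `llama.h`:

```cpp
#include "llama.h"

// A do-nothing sampler wired up through llama_sampler_init(), mirroring
// the llguidance change above. Sketch only.
static const char * noop_name(const struct llama_sampler * /*smpl*/) { return "noop"; }
static void noop_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * /*cur_p*/) {
    // a real sampler would reweight or mask cur_p->data here
}

static struct llama_sampler_i noop_iface = {
    /* .name   = */ noop_name,
    /* .accept = */ nullptr,
    /* .apply  = */ noop_apply,
    /* .reset  = */ nullptr,
    /* .clone  = */ nullptr,
    /* .free   = */ nullptr,
};

struct llama_sampler * noop_sampler_init() {
    return llama_sampler_init(
        /* .iface = */ &noop_iface,
        /* .ctx   = */ nullptr
    );
}
```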
package/src/llama.cpp/common/log.h

@@ -2,6 +2,7 @@

 #include "ggml.h" // for ggml_log_level

+#define LOG_CLR_TO_EOL "\033[K\r"
 #define LOG_COL_DEFAULT "\033[0m"
 #define LOG_COL_BOLD "\033[1m"
 #define LOG_COL_RED "\033[31m"
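`\033[K` is the ANSI erase-to-end-of-line sequence and the trailing `\r` returns the cursor to column 0, so the next write redraws a status line in place without leftover characters. A small demo of the pattern the `run.cpp` hunks further down switch to:

```cpp
#include <chrono>
#include <cstdio>
#include <thread>

#define LOG_CLR_TO_EOL "\033[K\r"

int main() {
    for (int pct = 0; pct <= 100; pct += 20) {
        // carriage return, erase the rest of the line, redraw the status
        fprintf(stderr, "\r" LOG_CLR_TO_EOL "downloading... %3d%%", pct);
        std::this_thread::sleep_for(std::chrono::milliseconds(200));
    }
    fprintf(stderr, "\n");
}
```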
@@ -14,7 +15,7 @@

 #ifndef __GNUC__
 #    define LOG_ATTRIBUTE_FORMAT(...)
-#elif defined(__MINGW32__)
+#elif defined(__MINGW32__) && !defined(__clang__)
 #    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
 #else
 #    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
package/src/llama.cpp/common/minja.hpp

@@ -1385,6 +1385,13 @@ static std::string strip(const std::string & s) {
     return s.substr(start, end - start + 1);
 }

+static std::string capitalize(const std::string & s) {
+    if (s.empty()) return s;
+    auto result = s;
+    result[0] = std::toupper(result[0]);
+    return result;
+}
+
 static std::string html_escape(const std::string & s) {
     std::string result;
     result.reserve(s.size());
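The helper mirrors Python's `str.capitalize()` for chat templates that call it on strings. One caveat when reusing the pattern elsewhere: passing a plain `char` to `std::toupper` is undefined for negative values, so a defensive standalone variant routes through `unsigned char`:

```cpp
#include <cctype>
#include <cstdio>
#include <string>

// Same behavior as the minja helper above, with the unsigned-char cast
// std::toupper needs for bytes outside the 0..127 range.
static std::string capitalize(const std::string & s) {
    if (s.empty()) return s;
    std::string result = s;
    result[0] = (char) std::toupper((unsigned char) result[0]);
    return result;
}

int main() {
    printf("%s\n", capitalize("user").c_str()); // prints "User"
}
```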
@@ -1462,6 +1469,9 @@ public:
         if (method->get_name() == "strip") {
             vargs.expectArgs("strip method", {0, 0}, {0, 0});
             return Value(strip(str));
+        } else if (method->get_name() == "capitalize") {
+            vargs.expectArgs("capitalize method", {0, 0}, {0, 0});
+            return Value(capitalize(str));
         } else if (method->get_name() == "endswith") {
             vargs.expectArgs("endswith method", {1, 1}, {0, 0});
             auto suffix = vargs.args[0].get<std::string>();
@@ -1792,7 +1802,7 @@ private:
         auto left = parseStringConcat();
         if (!left) throw std::runtime_error("Expected left side of 'logical compare' expression");

-        static std::regex compare_tok(R"(==|!=|<=?|>=?|in\b|is\b|not
+        static std::regex compare_tok(R"(==|!=|<=?|>=?|in\b|is\b|not\s+in\b)");
         static std::regex not_tok(R"(not\b)");
         std::string op_str;
         while (!(op_str = consumeToken(compare_tok)).empty()) {
@@ -2171,7 +2181,7 @@ private:
     using TemplateTokenIterator = TemplateTokenVector::const_iterator;

     std::vector<std::string> parseVarNames() {
-        static std::regex varnames_regex(R"(((?:\w+)(
+        static std::regex varnames_regex(R"(((?:\w+)(?:\s*,\s*(?:\w+))*)\s*)");

         std::vector<std::string> group;
         if ((group = consumeTokenGroups(varnames_regex)).empty()) throw std::runtime_error("Expected variable names");
@@ -2194,13 +2204,13 @@ private:
     }

     TemplateTokenVector tokenize() {
-        static std::regex comment_tok(R"(\{#([-~]?)([\s\S
+        static std::regex comment_tok(R"(\{#([-~]?)([\s\S]*?)([-~]?)#\})");
         static std::regex expr_open_regex(R"(\{\{([-~])?)");
-        static std::regex block_open_regex(R"(^\{%([-~])
+        static std::regex block_open_regex(R"(^\{%([-~])?\s*)");
         static std::regex block_keyword_tok(R"((if|else|elif|endif|for|endfor|generation|endgeneration|set|endset|block|endblock|macro|endmacro|filter|endfilter|break|continue)\b)");
         static std::regex non_text_open_regex(R"(\{\{|\{%|\{#)");
-        static std::regex expr_close_regex(R"(
-        static std::regex block_close_regex(R"(
+        static std::regex expr_close_regex(R"(\s*([-~])?\}\})");
+        static std::regex block_close_regex(R"(\s*([-~])?%\})");

         TemplateTokenVector tokens;
         std::vector<std::string> group;
@@ -2284,7 +2294,7 @@ private:
             auto post_space = parseBlockClose();
             tokens.push_back(std::make_unique<EndGenerationTemplateToken>(location, pre_space, post_space));
         } else if (keyword == "set") {
-            static std::regex namespaced_var_regex(R"((\w+)
+            static std::regex namespaced_var_regex(R"((\w+)\s*\.\s*(\w+))");

             std::string ns;
             std::vector<std::string> var_names;
@@ -2336,6 +2346,11 @@ private:
                 throw std::runtime_error("Unexpected block: " + keyword);
             }
         } else if (std::regex_search(it, end, match, non_text_open_regex)) {
+            if (!match.position()) {
+                if (match[0] != "{#")
+                    throw std::runtime_error("Internal error: Expected a comment");
+                throw std::runtime_error("Missing end of comment tag");
+            }
             auto text_end = it + match.position();
             text = std::string(it, text_end);
             it = text_end;
@@ -2400,7 +2415,7 @@ private:

         auto text = text_token->text;
         if (post_space == SpaceHandling::Strip) {
-            static std::regex trailing_space_regex(R"(
+            static std::regex trailing_space_regex(R"(\s+$)");
             text = std::regex_replace(text, trailing_space_regex, "");
         } else if (options.lstrip_blocks && it != end) {
             auto i = text.size();
@@ -2410,7 +2425,7 @@ private:
             }
         }
         if (pre_space == SpaceHandling::Strip) {
-            static std::regex leading_space_regex(R"(
+            static std::regex leading_space_regex(R"(^\s+)");
             text = std::regex_replace(text, leading_space_regex, "");
         } else if (options.trim_blocks && (it - 1) != begin && !dynamic_cast<ExpressionTemplateToken*>((*(it - 2)).get())) {
             if (text.length() > 0 && text[0] == '\n') {
package/src/llama.cpp/common/sampling.cpp

@@ -134,11 +134,11 @@ std::string common_params_sampling::print() const {
     snprintf(result, sizeof(result),
             "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
             "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
-            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
             "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
             penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
             dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
-            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
+            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
             mirostat, mirostat_eta, mirostat_tau);

     return std::string(result);
@@ -151,12 +151,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

     lparams.no_perf = params.no_perf;

-    std::vector<const char *> trigger_words;
-    trigger_words.reserve(params.grammar_trigger_words.size());
-    for (const auto & str : params.grammar_trigger_words) {
-        trigger_words.push_back(str.word.c_str());
-    }
-
     struct llama_sampler * grmr;
     if (params.grammar.compare(0, 11, "%llguidance") == 0) {
 #ifdef LLAMA_USE_LLGUIDANCE
@@ -165,6 +159,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
         GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
     } else {
+        std::vector<const char *> trigger_words;
+        trigger_words.reserve(params.grammar_trigger_words.size());
+        for (const auto & str : params.grammar_trigger_words) {
+            trigger_words.push_back(str.word.c_str());
+        }
+
         grmr = params.grammar_lazy
             ? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
                     trigger_words.data(), trigger_words.size(),
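For context: the lazy-grammar path leaves generation unconstrained until one of the trigger words appears in the output, which is how tool-call grammars avoid constraining ordinary prose. A sketch of a caller, following the call shape visible in the context lines above; the trailing trigger-token arguments are an assumption about the rest of the signature:

```cpp
#include "llama.h"

// Constrain output with a grammar that only activates after "<tool_call>"
// is generated; free text flows normally before that. Sketch only.
static struct llama_sampler * make_lazy_tool_grammar(const struct llama_vocab * vocab,
                                                     const char * grammar_str) {
    const char * trigger_words[] = { "<tool_call>" };
    return llama_sampler_init_grammar_lazy(vocab, grammar_str, "root",
                                           trigger_words, 1,
                                           /* trigger_tokens = */ nullptr, 0);
}
```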
@@ -188,45 +188,51 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
             params.logit_bias.data()));

     if (params.mirostat == 0) {
-
-
-
-
-
-
-
-
+        if (params.top_n_sigma >= 0) {
+            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_temp (params.temp));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
+        } else {
+            for (const auto & cnstr : params.samplers) {
+                switch (cnstr) {
+                    case COMMON_SAMPLER_TYPE_DRY:
+                        {
+                            std::vector<const char *> c_breakers;
+                            c_breakers.reserve(params.dry_sequence_breakers.size());
+                            for (const auto & str : params.dry_sequence_breakers) {
+                                c_breakers.push_back(str.c_str());
+                            }
+
+                            llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                         }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                default:
-                    GGML_ASSERT(false && "unknown sampler type");
+                        break;
+                    case COMMON_SAMPLER_TYPE_TOP_K:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TOP_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_MIN_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_XTC:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TYPICAL_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TEMPERATURE:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                        break;
+                    case COMMON_SAMPLER_TYPE_INFILL:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
+                        break;
+                    case COMMON_SAMPLER_TYPE_PENALTIES:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                        break;
+                    default:
+                        GGML_ASSERT(false && "unknown sampler type");
+                }
             }
         }
         llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
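So with `top_n_sigma >= 0`, the configured sampler list is bypassed in favor of a fixed top-k → temperature → top-nσ pipeline, and the final `dist` picker is appended as before. A sketch of building that chain directly against the public sampler API in `llama.h`:

```cpp
#include "llama.h"

// The fixed chain used when top_n_sigma is enabled:
// top-k -> temperature -> top-n-sigma -> dist (the final token picker).
static struct llama_sampler * make_top_n_sigma_chain(int32_t top_k, float temp,
                                                     float top_n_sigma, uint32_t seed) {
    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(top_k));
    llama_sampler_chain_add(chain, llama_sampler_init_temp(temp));
    llama_sampler_chain_add(chain, llama_sampler_init_top_n_sigma(top_n_sigma));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(seed));
    return chain;
}
```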
package/src/llama.cpp/common/speculative.h

@@ -9,7 +9,7 @@ struct common_speculative_params {
     int n_draft = 16; // max drafted tokens
     int n_reuse = 256;

-    float p_min = 0.9f; // min
+    float p_min = 0.9f; // min probability required to accept a token in the draft
 };

 struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
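`p_min` acts as the draft model's confidence floor: drafting stops as soon as the draft's probability for its next token dips below it, handing control back to the target model. An illustrative reduction of that rule (a simplified stand-in for the loop in `common/speculative.cpp`):

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

// One draft step: the token the draft model proposes and its probability.
struct draft_step { int token; float prob; };

// Keep proposing tokens while the draft model is confident; stop as soon
// as its probability falls below p_min or the draft budget is exhausted.
static std::vector<int> collect_draft(const std::vector<draft_step> & steps,
                                      float p_min, size_t n_draft) {
    std::vector<int> draft;
    for (const auto & s : steps) {
        if (draft.size() >= n_draft || s.prob < p_min) break;
        draft.push_back(s.token);
    }
    return draft;
}

int main() {
    // confidence collapses at the third step, so only two tokens are drafted
    const std::vector<draft_step> steps = {{101, 0.97f}, {205, 0.94f}, {17, 0.42f}, {88, 0.99f}};
    printf("drafted %zu tokens\n", collect_draft(steps, 0.9f, 16).size());
}
```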
package/src/llama.cpp/docs/build.md

@@ -3,7 +3,7 @@
 **To get the Code:**

 ```bash
-git clone https://github.com/
+git clone https://github.com/ggml-org/llama.cpp
 cd llama.cpp
 ```

@@ -46,7 +46,7 @@ cmake --build build --config Release
 ```

 - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
-  - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/
+  - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
     - Tab Workload: Desktop-development with C++
     - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
   - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
package/src/llama.cpp/examples/imatrix/imatrix.cpp

@@ -3,6 +3,7 @@
 #include "log.h"
 #include "llama.h"

+#include <chrono>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
@@ -99,7 +100,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     const float * data = is_host ? (const float *) src1->data : m_src1_data.data();

     // this has been adapted to the new format of storing merged experts in a single 3d tensor
-    // ref: https://github.com/
+    // ref: https://github.com/ggml-org/llama.cpp/pull/6387
     if (t->op == GGML_OP_MUL_MAT_ID) {
         // ids -> [n_experts_used, n_tokens]
         // src1 -> [cols, n_expert_used, n_tokens]
package/src/llama.cpp/examples/llama-bench/llama-bench.cpp

@@ -876,8 +876,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
 struct test {
     static const std::string build_commit;
     static const int build_number;
-
-
+    const std::string cpu_info;
+    const std::string gpu_info;
     std::string model_filename;
     std::string model_type;
     uint64_t model_size;
@@ -903,7 +903,10 @@ struct test {
     std::string test_time;
     std::vector<uint64_t> samples_ns;

-    test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx)
+    test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) :
+        cpu_info(get_cpu_info()),
+        gpu_info(get_gpu_info()) {
+
         model_filename = inst.model;
         char buf[128];
         llama_model_desc(lmodel, buf, sizeof(buf));
@@ -1058,8 +1061,6 @@ struct test {

 const std::string test::build_commit = LLAMA_COMMIT;
 const int test::build_number = LLAMA_BUILD_NUMBER;
-const std::string test::cpu_info = get_cpu_info();
-const std::string test::gpu_info = get_gpu_info();

 struct printer {
     virtual ~printer() {}
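Turning `cpu_info`/`gpu_info` from static members into constructor-initialized instance members defers the `get_cpu_info()`/`get_gpu_info()` probes from static-initialization time (before `main()`, even when never used) to the moment a `test` is actually constructed. A minimal illustration of the difference, with a stand-in probe:

```cpp
#include <cstdio>
#include <string>

// Stand-in for the real probe, which may be expensive or touch GPU backends.
static std::string get_cpu_info() { puts("probing CPU"); return "cpu"; }

struct eager {
    static const std::string cpu_info; // initialized before main(), even if unused
};
const std::string eager::cpu_info = get_cpu_info();

struct lazy {
    const std::string cpu_info;        // initialized only when a lazy is constructed
    lazy() : cpu_info(get_cpu_info()) {}
};

int main() {
    puts("main started"); // "probing CPU" already printed once (eager static init)
    lazy t;               // the second "probing CPU" happens here, on demand
    printf("%s\n", t.cpu_info.c_str());
}
```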
package/src/llama.cpp/examples/run/run.cpp

@@ -346,7 +346,7 @@ class HttpClient {
         if (!output_file.empty()) {
             output_file_partial = output_file + ".partial";
             if (!out.open(output_file_partial, "ab")) {
-                printe("Failed to open file\n");
+                printe("Failed to open file for writing\n");

                 return 1;
             }
@@ -535,8 +535,7 @@ class HttpClient {

     static void print_progress(const std::string & progress_prefix, const std::string & progress_bar,
                                const std::string & progress_suffix) {
-        printe("\r
-            progress_suffix.c_str());
+        printe("\r" LOG_CLR_TO_EOL "%s%s| %s", progress_prefix.c_str(), progress_bar.c_str(), progress_suffix.c_str());
     }
     // Function to write data to a file
     static size_t write_data(void * ptr, size_t size, size_t nmemb, void * stream) {
@@ -797,16 +796,13 @@ class LlamaData {
     llama_model_ptr initialize_model(Opt & opt) {
         ggml_backend_load_all();
         resolve_model(opt.model_);
-        printe(
-            "\r%*s"
-            "\rLoading model",
-            get_terminal_width(), " ");
+        printe("\r" LOG_CLR_TO_EOL "Loading model");
         llama_model_ptr model(llama_model_load_from_file(opt.model_.c_str(), opt.model_params));
         if (!model) {
             printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
         }

-        printe("\r
+        printe("\r" LOG_CLR_TO_EOL);
         return model;
     }

@@ -969,10 +965,7 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
 static int read_user_input(std::string & user_input) {
     static const char * prompt_prefix = "> ";
 #ifdef WIN32
-    printf(
-        "\r%*s"
-        "\r" LOG_COL_DEFAULT "%s",
-        get_terminal_width(), " ", prompt_prefix);
+    printf("\r" LOG_CLR_TO_EOL LOG_COL_DEFAULT "%s", prompt_prefix);

     std::getline(std::cin, user_input);
     if (std::cin.eof()) {
package/src/llama.cpp/examples/server/CMakeLists.txt

@@ -5,7 +5,7 @@ option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})

 if (MINGW)
-    # fix: https://github.com/
+    # fix: https://github.com/ggml-org/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006
     add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
 endif()