@fugood/llama.node 0.3.12 → 0.3.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +2 -1
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +14 -0
- package/src/LlamaContext.cpp +110 -79
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +95 -13
- package/src/llama.cpp/.github/workflows/docker.yml +2 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +23 -6
- package/src/llama.cpp/common/arg.cpp +292 -14
- package/src/llama.cpp/common/chat.cpp +1128 -315
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +27 -171
- package/src/llama.cpp/common/common.h +41 -73
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/llguidance.cpp +3 -3
- package/src/llama.cpp/common/log.cpp +1 -0
- package/src/llama.cpp/common/log.h +2 -1
- package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +93 -49
- package/src/llama.cpp/common/speculative.cpp +6 -5
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +47 -9
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
- package/src/llama.cpp/examples/main/main.cpp +73 -28
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +115 -79
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/server/httplib.h +381 -292
- package/src/llama.cpp/examples/server/server.cpp +134 -128
- package/src/llama.cpp/examples/server/utils.hpp +95 -106
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +251 -142
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +6 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
- package/src/llama.cpp/ggml/src/ggml.c +9 -4
- package/src/llama.cpp/include/llama.h +32 -14
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +21 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +183 -183
- package/src/llama.cpp/src/llama-grammar.h +13 -4
- package/src/llama.cpp/src/llama-impl.h +6 -6
- package/src/llama.cpp/src/llama-kv-cache.h +2 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-mmap.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +70 -6
- package/src/llama.cpp/src/llama-sampling.cpp +174 -67
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +154 -5
- package/src/llama.cpp/src/unicode.cpp +9 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +691 -325
- package/src/llama.cpp/tests/test-gguf.cpp +4 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/tests/test-sampling.cpp +15 -0
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -52
package/src/llama.cpp/common/minja/chat-template.hpp

@@ -249,16 +249,30 @@ class chat_template {
             inputs.add_generation_prompt = false;
             full = apply(inputs);
         }
-
-        if (
-
-
+        auto eos_pos_last = full.rfind(eos_token_);
+        if (eos_pos_last == prefix.size() - eos_token_.size() ||
+            (full[full.size() - 1] == '\n' && (eos_pos_last == full.size() - eos_token_.size() - 1))) {
+            full = full.substr(0, eos_pos_last);
+        }
+        size_t common_prefix_length = 0;
+        for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
+            if (prefix[i] != full[i]) {
+                break;
             }
+            if (prefix[i] == '<') {
+                // DeepSeek R1's template (as of 20250209) adds a trailing <think> if add_generation_prompt,
+                // but it removes thinking tags for past messages.
+                // The prefix and full strings diverge at <think> vs. <|tool▁calls▁begin|>, we avoid consuming the leading <.
+                continue;
+            }
+            common_prefix_length = i + 1;
         }
-
+        auto example = full.substr(common_prefix_length);
+        if (example.find("tool_name") == std::string::npos && example.find("some_value") == std::string::npos) {
             fprintf(stderr, "Failed to infer a tool call example (possible template bug)\n");
+        } else {
+            tool_call_example_ = example;
         }
-        tool_call_example_ = full.substr(prefix.size());
     }
 } catch (const std::exception & e) {
     fprintf(stderr, "Failed to generate tool call example: %s\n", e.what());

@@ -363,7 +377,7 @@ class chat_template {
     if (polyfill_tools) {
         adjusted_messages = add_system(inputs.messages,
             "You can call any of the following tools to satisfy the user's requests: " + minja::Value(inputs.tools).dump(2, /* to_json= */ true) +
-            (!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_));
+            (!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_ + "\n\n"));
     } else {
         adjusted_messages = inputs.messages;
     }
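The first hunk above changes how the tool-call example is inferred: rather than relying on the old strict-prefix assumption (`full.substr(prefix.size())`), it trims a trailing EOS token and then scans for the longest common prefix of the two renderings, refusing to let a leading `<` count as common so template tags such as DeepSeek R1's `<think>` are not partially consumed. A minimal standalone sketch of that prefix scan (the strings below are made up for illustration, not taken from any real template):

```cpp
#include <cstdio>
#include <string>

// Sketch of the prefix scan added above: find where the prompt rendered
// without a tool call ("prefix") and with one ("full") diverge, without
// consuming the '<' that opens a template tag.
static size_t common_prefix_length(const std::string & prefix, const std::string & full) {
    size_t len = 0;
    for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
        if (prefix[i] != full[i]) {
            break;
        }
        if (prefix[i] == '<') {
            continue; // keep the tag-opening '<' out of the common prefix
        }
        len = i + 1;
    }
    return len;
}

int main() {
    // Hypothetical rendered strings; they diverge at "<think>" vs. "<|tool_call|>".
    const std::string prefix = "Assistant:\n<think>";
    const std::string full   = "Assistant:\n<|tool_call|>{\"name\": \"tool_name\"}";
    const size_t n = common_prefix_length(prefix, full);
    printf("inferred example: %s\n", full.substr(n).c_str()); // starts at "<|tool_call|>..."
}
```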
package/src/llama.cpp/common/minja/minja.hpp

@@ -1378,13 +1378,34 @@ struct ArgumentsExpression {
     }
 };
 
-static std::string strip(const std::string & s) {
-    auto
+static std::string strip(const std::string & s, const std::string & chars = "", bool left = true, bool right = true) {
+    auto charset = chars.empty() ? " \t\n\r" : chars;
+    auto start = left ? s.find_first_not_of(charset) : 0;
     if (start == std::string::npos) return "";
-    auto end = s.find_last_not_of(
+    auto end = right ? s.find_last_not_of(charset) : s.size() - 1;
     return s.substr(start, end - start + 1);
 }
 
+static std::vector<std::string> split(const std::string & s, const std::string & sep) {
+    std::vector<std::string> result;
+    size_t start = 0;
+    size_t end = s.find(sep);
+    while (end != std::string::npos) {
+        result.push_back(s.substr(start, end - start));
+        start = end + sep.length();
+        end = s.find(sep, start);
+    }
+    result.push_back(s.substr(start));
+    return result;
+}
+
+static std::string capitalize(const std::string & s) {
+    if (s.empty()) return s;
+    auto result = s;
+    result[0] = std::toupper(result[0]);
+    return result;
+}
+
 static std::string html_escape(const std::string & s) {
     std::string result;
     result.reserve(s.size());

@@ -1460,8 +1481,29 @@ public:
         } else if (obj.is_string()) {
             auto str = obj.get<std::string>();
             if (method->get_name() == "strip") {
-                vargs.expectArgs("strip method", {0,
-
+                vargs.expectArgs("strip method", {0, 1}, {0, 0});
+                auto chars = vargs.args.empty() ? "" : vargs.args[0].get<std::string>();
+                return Value(strip(str, chars));
+            } else if (method->get_name() == "lstrip") {
+                vargs.expectArgs("lstrip method", {0, 1}, {0, 0});
+                auto chars = vargs.args.empty() ? "" : vargs.args[0].get<std::string>();
+                return Value(strip(str, chars, /* left= */ true, /* right= */ false));
+            } else if (method->get_name() == "rstrip") {
+                vargs.expectArgs("rstrip method", {0, 1}, {0, 0});
+                auto chars = vargs.args.empty() ? "" : vargs.args[0].get<std::string>();
+                return Value(strip(str, chars, /* left= */ false, /* right= */ true));
+            } else if (method->get_name() == "split") {
+                vargs.expectArgs("split method", {1, 1}, {0, 0});
+                auto sep = vargs.args[0].get<std::string>();
+                auto parts = split(str, sep);
+                Value result = Value::array();
+                for (const auto& part : parts) {
+                    result.push_back(Value(part));
+                }
+                return result;
+            } else if (method->get_name() == "capitalize") {
+                vargs.expectArgs("capitalize method", {0, 0}, {0, 0});
+                return Value(capitalize(str));
             } else if (method->get_name() == "endswith") {
                 vargs.expectArgs("endswith method", {1, 1}, {0, 0});
                 auto suffix = vargs.args[0].get<std::string>();
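The two minja.hpp hunks above give the template engine Python-style string methods: `strip`/`lstrip`/`rstrip` with an optional character set, `split`, and `capitalize`. A self-contained sketch that lifts the same helper logic out of the hunk, useful for checking edge cases outside the engine:

```cpp
#include <cctype>
#include <cstdio>
#include <string>
#include <vector>

// Same logic as the helpers added to minja.hpp above, as a free-standing file.
static std::string strip(const std::string & s, const std::string & chars = "", bool left = true, bool right = true) {
    auto charset = chars.empty() ? " \t\n\r" : chars;
    auto start = left ? s.find_first_not_of(charset) : 0;
    if (start == std::string::npos) return "";
    auto end = right ? s.find_last_not_of(charset) : s.size() - 1;
    return s.substr(start, end - start + 1);
}

static std::vector<std::string> split(const std::string & s, const std::string & sep) {
    std::vector<std::string> result;
    size_t start = 0;
    size_t end = s.find(sep);
    while (end != std::string::npos) {
        result.push_back(s.substr(start, end - start));
        start = end + sep.length();
        end = s.find(sep, start);
    }
    result.push_back(s.substr(start));
    return result;
}

static std::string capitalize(const std::string & s) {
    if (s.empty()) return s;
    auto result = s;
    result[0] = std::toupper(result[0]);
    return result;
}

int main() {
    printf("[%s]\n", strip("  hello \n").c_str());                                       // [hello]
    printf("[%s]\n", strip("xxhixx", "x", /* left= */ false, /* right= */ true).c_str()); // [xxhi]
    for (const auto & part : split("a,b,,c", ",")) {
        printf("<%s>\n", part.c_str());                                                   // <a> <b> <> <c>
    }
    printf("%s\n", capitalize("weather").c_str());                                        // Weather
}
```

Note that, like Python's `str.split` with an explicit separator, this `split` keeps empty fields between adjacent separators.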
package/src/llama.cpp/common/minja/minja.hpp

@@ -1792,7 +1834,7 @@ private:
     auto left = parseStringConcat();
     if (!left) throw std::runtime_error("Expected left side of 'logical compare' expression");
 
-    static std::regex compare_tok(R"(==|!=|<=?|>=?|in\b|is\b|not
+    static std::regex compare_tok(R"(==|!=|<=?|>=?|in\b|is\b|not\s+in\b)");
     static std::regex not_tok(R"(not\b)");
     std::string op_str;
     while (!(op_str = consumeToken(compare_tok)).empty()) {

@@ -2171,7 +2213,7 @@ private:
     using TemplateTokenIterator = TemplateTokenVector::const_iterator;
 
     std::vector<std::string> parseVarNames() {
-        static std::regex varnames_regex(R"(((?:\w+)(
+        static std::regex varnames_regex(R"(((?:\w+)(?:\s*,\s*(?:\w+))*)\s*)");
 
         std::vector<std::string> group;
         if ((group = consumeTokenGroups(varnames_regex)).empty()) throw std::runtime_error("Expected variable names");

@@ -2194,13 +2236,13 @@ private:
     }
 
     TemplateTokenVector tokenize() {
-        static std::regex comment_tok(R"(\{#([-~]?)([\s\S
+        static std::regex comment_tok(R"(\{#([-~]?)([\s\S]*?)([-~]?)#\})");
         static std::regex expr_open_regex(R"(\{\{([-~])?)");
-        static std::regex block_open_regex(R"(^\{%([-~])
+        static std::regex block_open_regex(R"(^\{%([-~])?\s*)");
         static std::regex block_keyword_tok(R"((if|else|elif|endif|for|endfor|generation|endgeneration|set|endset|block|endblock|macro|endmacro|filter|endfilter|break|continue)\b)");
         static std::regex non_text_open_regex(R"(\{\{|\{%|\{#)");
-        static std::regex expr_close_regex(R"(
-        static std::regex block_close_regex(R"(
+        static std::regex expr_close_regex(R"(\s*([-~])?\}\})");
+        static std::regex block_close_regex(R"(\s*([-~])?%\})");
 
         TemplateTokenVector tokens;
         std::vector<std::string> group;

@@ -2284,7 +2326,7 @@ private:
                 auto post_space = parseBlockClose();
                 tokens.push_back(std::make_unique<EndGenerationTemplateToken>(location, pre_space, post_space));
             } else if (keyword == "set") {
-                static std::regex namespaced_var_regex(R"((\w+)
+                static std::regex namespaced_var_regex(R"((\w+)\s*\.\s*(\w+))");
 
                 std::string ns;
                 std::vector<std::string> var_names;

@@ -2336,6 +2378,11 @@ private:
                 throw std::runtime_error("Unexpected block: " + keyword);
             }
         } else if (std::regex_search(it, end, match, non_text_open_regex)) {
+            if (!match.position()) {
+                if (match[0] != "{#")
+                    throw std::runtime_error("Internal error: Expected a comment");
+                throw std::runtime_error("Missing end of comment tag");
+            }
             auto text_end = it + match.position();
             text = std::string(it, text_end);
             it = text_end;

@@ -2400,7 +2447,7 @@ private:
 
         auto text = text_token->text;
         if (post_space == SpaceHandling::Strip) {
-            static std::regex trailing_space_regex(R"(
+            static std::regex trailing_space_regex(R"(\s+$)");
             text = std::regex_replace(text, trailing_space_regex, "");
         } else if (options.lstrip_blocks && it != end) {
            auto i = text.size();

@@ -2410,7 +2457,7 @@ private:
             }
         }
         if (pre_space == SpaceHandling::Strip) {
-            static std::regex leading_space_regex(R"(
+            static std::regex leading_space_regex(R"(^\s+)");
             text = std::regex_replace(text, leading_space_regex, "");
         } else if (options.trim_blocks && (it - 1) != begin && !dynamic_cast<ExpressionTemplateToken*>((*(it - 2)).get())) {
             if (text.length() > 0 && text[0] == '\n') {
package/src/llama.cpp/common/sampling.cpp

@@ -4,6 +4,7 @@
 
 #include <cmath>
 #include <unordered_map>
+#include <algorithm>
 
 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h

@@ -134,11 +135,11 @@ std::string common_params_sampling::print() const {
     snprintf(result, sizeof(result),
             "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
             "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
-            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
             "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
             penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
             dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
-            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
+            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
             mirostat, mirostat_eta, mirostat_tau);
 
     return std::string(result);

@@ -151,12 +152,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 
     lparams.no_perf = params.no_perf;
 
-    std::vector<const char *> trigger_words;
-    trigger_words.reserve(params.grammar_trigger_words.size());
-    for (const auto & str : params.grammar_trigger_words) {
-        trigger_words.push_back(str.word.c_str());
-    }
-
     struct llama_sampler * grmr;
     if (params.grammar.compare(0, 11, "%llguidance") == 0) {
 #ifdef LLAMA_USE_LLGUIDANCE

@@ -165,10 +160,53 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
         GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
     } else {
+        std::vector<std::string> patterns_at_start;
+        std::vector<std::string> patterns_anywhere;
+        std::vector<llama_token> trigger_tokens;
+        for (const auto & trigger : params.grammar_triggers) {
+            switch (trigger.type) {
+                case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
+                {
+                    const auto & word = trigger.value;
+                    patterns_anywhere.push_back(regex_escape(word));
+                    break;
+                }
+                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
+                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START:
+                {
+                    const auto & pattern = trigger.value;
+                    (trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? patterns_at_start : patterns_anywhere).push_back(pattern);
+                    break;
+                }
+                case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
+                {
+                    const auto token = trigger.token;
+                    trigger_tokens.push_back(token);
+                    break;
+                }
+                default:
+                    GGML_ASSERT(false && "unknown trigger type");
+            }
+        }
+
+        std::vector<std::string> trigger_patterns;
+        if (!patterns_at_start.empty()) {
+            trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*");
+        }
+        if (!patterns_anywhere.empty()) {
+            trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
+        }
+
+        std::vector<const char *> trigger_patterns_c;
+        trigger_patterns_c.reserve(trigger_patterns.size());
+        for (const auto & regex : trigger_patterns) {
+            trigger_patterns_c.push_back(regex.c_str());
+        }
+
         grmr = params.grammar_lazy
-             ?
-
-
+             ? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                                                        trigger_patterns_c.data(), trigger_patterns_c.size(),
+                                                        trigger_tokens.data(), trigger_tokens.size())
              : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
     }
 
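In the new trigger handling above, word triggers are regex-escaped and folded, together with plain pattern triggers, into a single "match anywhere" regex, while `PATTERN_START` triggers are anchored to the very start of the generated text; both are then handed to `llama_sampler_init_grammar_lazy_patterns`. A small standalone sketch of how those two wrapper regexes behave (the trigger string is hypothetical):

```cpp
#include <cstdio>
#include <regex>
#include <string>

// Sketch of the lazy-grammar trigger patterns built above: the "anywhere" form
// fires as soon as the trigger appears, the "at start" form only at the beginning.
int main() {
    // Hypothetical, already regex-escaped trigger word.
    const std::string anywhere = "^[\\s\\S]*?(<tool_call>)[\\s\\S]*";
    const std::string at_start = "^(<tool_call>)[\\s\\S]*";

    const std::string out = "Let me call a tool.\n<tool_call>{\"name\": \"tool_name\"}";

    printf("anywhere matches: %d\n", (int) std::regex_match(out, std::regex(anywhere))); // 1
    printf("at start matches: %d\n", (int) std::regex_match(out, std::regex(at_start))); // 0
}
```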
package/src/llama.cpp/common/sampling.cpp

@@ -188,45 +226,51 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                 params.logit_bias.data()));
 
     if (params.mirostat == 0) {
-
-
-
-
-
-
-
-
+        if (params.top_n_sigma >= 0) {
+            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_temp     (params.temp));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
+        } else {
+            for (const auto & cnstr : params.samplers) {
+                switch (cnstr) {
+                    case COMMON_SAMPLER_TYPE_DRY:
+                        {
+                            std::vector<const char *> c_breakers;
+                            c_breakers.reserve(params.dry_sequence_breakers.size());
+                            for (const auto & str : params.dry_sequence_breakers) {
+                                c_breakers.push_back(str.c_str());
+                            }
+
+                            llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                         }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                default:
-                    GGML_ASSERT(false && "unknown sampler type");
+                        break;
+                    case COMMON_SAMPLER_TYPE_TOP_K:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TOP_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_MIN_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_XTC:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TYPICAL_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TEMPERATURE:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                        break;
+                    case COMMON_SAMPLER_TYPE_INFILL:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (vocab));
+                        break;
+                    case COMMON_SAMPLER_TYPE_PENALTIES:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                        break;
+                    default:
+                        GGML_ASSERT(false && "unknown sampler type");
+                }
             }
         }
         llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
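The hunk above adds a `top_n_sigma` path to `common_sampler_init`: when `params.top_n_sigma >= 0`, the configured sampler list is bypassed and the chain is simply top-k → temperature → top-n-sigma, still followed by the final `dist` sampler. A minimal sketch of building that chain directly against the llama.h C API (parameter values are illustrative, not defaults taken from the source):

```cpp
#include "llama.h"

// Minimal sketch of the chain common_sampler_init() now builds when
// top_n_sigma >= 0; the caller frees it with llama_sampler_free().
static llama_sampler * make_top_n_sigma_chain() {
    llama_sampler_chain_params cparams = llama_sampler_chain_default_params();
    llama_sampler * chain = llama_sampler_chain_init(cparams);

    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));          // params.top_k
    llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));         // params.temp
    llama_sampler_chain_add(chain, llama_sampler_init_top_n_sigma(1.0f));  // params.top_n_sigma
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

    return chain;
}
```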
package/src/llama.cpp/common/speculative.cpp

@@ -5,6 +5,7 @@
 #include "sampling.h"
 
 #include <cstring>
+#include <algorithm>
 
 #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5

@@ -252,11 +253,6 @@ llama_tokens common_speculative_gen_draft(
         // add drafted token for each sequence
         const llama_token id = cur_p->data[0].id;
 
-        // only collect very high-confidence draft tokens
-        if (cur_p->data[0].p < params.p_min) {
-            break;
-        }
-
         common_sampler_accept(smpl, id, true);
 
         result.push_back(id);

@@ -265,6 +261,11 @@ llama_tokens common_speculative_gen_draft(
             break;
         }
 
+        // only collect very high-confidence draft tokens
+        if (cur_p->data[0].p < params.p_min) {
+            break;
+        }
+
         common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
 
         // evaluate the drafted tokens on the draft model
package/src/llama.cpp/common/speculative.h

@@ -9,7 +9,7 @@ struct common_speculative_params {
     int n_draft = 16; // max drafted tokens
     int n_reuse = 256;
 
-    float p_min = 0.
+    float p_min = 0.75f; // min probability required to accept a token in the draft
 };
 
 struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
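Taken together, the speculative changes move the `p_min` confidence check so it now runs after the drafted token has been accepted and pushed into the result (so the below-threshold token is still returned before drafting stops), and `speculative.h` documents the default as `0.75f`. A trivial hedged sketch of setting these fields (the include path assumes the llama.cpp `common` sources are on the include path):

```cpp
#include "speculative.h"  // llama.cpp common/ header; adjust the path to your build setup

// Illustrative only: 0.75f is already the new default, shown here explicitly.
static common_speculative_params make_draft_params() {
    common_speculative_params sparams;
    sparams.n_draft = 16;    // max drafted tokens
    sparams.n_reuse = 256;
    sparams.p_min   = 0.75f; // drafting stops once the top draft token's probability falls below this
    return sparams;
}
```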
package/src/llama.cpp/docs/build.md

@@ -3,7 +3,7 @@
 **To get the Code:**
 
 ```bash
-git clone https://github.com/
+git clone https://github.com/ggml-org/llama.cpp
 cd llama.cpp
 ```
 

@@ -46,7 +46,7 @@ cmake --build build --config Release
 ```
 
 - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
-  - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/
+  - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
     - Tab Workload: Desktop-development with C++
     - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
   - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test

@@ -197,20 +197,52 @@ The following compilation options are also available to tweak performance:
 
 ## MUSA
 
-This provides GPU acceleration using
+This provides GPU acceleration using a Moore Threads GPU. Make sure to have the [MUSA SDK](https://developer.mthreads.com/musa/musa-sdk) installed.
 
-
+#### Download directly from Moore Threads
 
-
-
+You may find the official downloads here: [Moore Threads developer site](https://developer.mthreads.com/sdk/download/musa).
+
+### Compilation
+
+```bash
+cmake -B build -DGGML_MUSA=ON
+cmake --build build --config Release
+```
+
+#### Override Compute Capability Specifications
+
+By default, all supported compute capabilities are enabled. To customize this behavior, you can specify the `MUSA_ARCHITECTURES` option in the CMake command:
+
+```bash
+cmake -B build -DGGML_MUSA=ON -DMUSA_ARCHITECTURES="21"
+```
+
+This configuration enables only compute capability `2.1` (MTT S80) during compilation, which can help reduce compilation time.
+
+#### Compilation options
+
+Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
+
+- For static builds, add `-DBUILD_SHARED_LIBS=OFF` and `-DCMAKE_POSITION_INDEPENDENT_CODE=ON`:
+  ```
+  cmake -B build -DGGML_MUSA=ON \
+    -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
   cmake --build build --config Release
   ```
 
-
+### Runtime MUSA environmental variables
 
-
+You may set the [musa environmental variables](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) at runtime.
 
-
+```bash
+# Use `MUSA_VISIBLE_DEVICES` to hide the first compute device.
+MUSA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf
+```
+
+### Unified Memory
+
+The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
 
 ## HIP
 

@@ -227,6 +259,12 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
 However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
 
+To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.
+
+The rocWMMA library is included by default when installing the ROCm SDK using the `rocm` meta package provided by AMD. Alternatively, if you are not using the meta package, you can install the library using the `rocwmma-dev` or `rocwmma-devel` package, depending on your system's package manager.
+
+As an alternative, you can manually install the library by cloning it from the official [GitHub repository](https://github.com/ROCm/rocWMMA), checkout the corresponding version tag (e.g. `rocm-6.2.4`) and set `-DCMAKE_CXX_FLAGS="-I<path/to/rocwmma>/library/include/"` in CMake. This also works under Windows despite not officially supported by AMD.
+
 Note that if you get the following error:
 ```
 clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp

@@ -394,6 +394,8 @@ static int prepare_entries(common_params & params, train_context & ctx_train) {
 int main(int argc, char ** argv) {
     common_params params;
 
+    params.out_file = "control_vector.gguf";
+
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
         return 1;
     }

@@ -498,7 +500,7 @@ int main(int argc, char ** argv) {
     }
 
     // write output vectors to gguf
-    export_gguf(ctx_train.v_final, params.
+    export_gguf(ctx_train.v_final, params.out_file, model_hint);
 
     llama_backend_free();
 
package/src/llama.cpp/examples/export-lora/export-lora.cpp

@@ -413,20 +413,22 @@ static void print_usage(int, char ** argv) {
 int main(int argc, char ** argv) {
     common_params params;
 
+    params.out_file = "ggml-lora-merged-f16.gguf";
+
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
         return 1;
     }
 
     g_verbose = (params.verbosity > 1);
     try {
-        lora_merge_ctx ctx(params.model, params.lora_adapters, params.
+        lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
         ctx.run_merge();
     } catch (const std::exception & err) {
         fprintf(stderr, "%s\n", err.what());
         exit(EXIT_FAILURE);
     }
 
-    printf("done, output file is %s\n", params.
+    printf("done, output file is %s\n", params.out_file.c_str());
 
     return 0;
 }
package/src/llama.cpp/examples/imatrix/imatrix.cpp

@@ -3,6 +3,7 @@
 #include "log.h"
 #include "llama.h"
 
+#include <chrono>
 #include <cmath>
 #include <cstdio>
 #include <cstring>

@@ -99,7 +100,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
 
     // this has been adapted to the new format of storing merged experts in a single 3d tensor
-    // ref: https://github.com/
+    // ref: https://github.com/ggml-org/llama.cpp/pull/6387
     if (t->op == GGML_OP_MUL_MAT_ID) {
         // ids -> [n_experts_used, n_tokens]
         // src1 -> [cols, n_expert_used, n_tokens]

@@ -205,9 +206,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 
 void IMatrixCollector::save_imatrix(int ncall) const {
     auto fname = m_params.out_file;
-    if (fname.empty()) {
-        fname = "imatrix.dat";
-    }
 
     if (ncall > 0) {
         fname += ".at_";

@@ -582,6 +580,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
 int main(int argc, char ** argv) {
     common_params params;
 
+    params.out_file = "imatrix.dat" ;
+
     params.n_ctx = 512;
     params.logits_all = true;
     params.escape = false;
package/src/llama.cpp/examples/llama-bench/llama-bench.cpp

@@ -876,8 +876,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
 struct test {
     static const std::string build_commit;
     static const int build_number;
-
-
+    const std::string cpu_info;
+    const std::string gpu_info;
     std::string model_filename;
     std::string model_type;
     uint64_t model_size;

@@ -903,7 +903,10 @@ struct test {
     std::string test_time;
     std::vector<uint64_t> samples_ns;
 
-    test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx)
+    test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) :
+        cpu_info(get_cpu_info()),
+        gpu_info(get_gpu_info()) {
+
         model_filename = inst.model;
         char buf[128];
         llama_model_desc(lmodel, buf, sizeof(buf));

@@ -1058,8 +1061,6 @@ struct test {
 
 const std::string test::build_commit = LLAMA_COMMIT;
 const int test::build_number = LLAMA_BUILD_NUMBER;
-const std::string test::cpu_info = get_cpu_info();
-const std::string test::gpu_info = get_gpu_info();
 
 struct printer {
     virtual ~printer() {}