@fugood/llama.node 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +9 -0
- package/README.md +1 -1
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +2 -1
- package/patches/llama.patch +22 -0
- package/src/TokenizeWorker.cpp +1 -1
- package/src/llama.cpp/CMakeLists.txt +14 -12
- package/src/llama.cpp/common/common.cpp +19 -5
- package/src/llama.cpp/common/common.h +2 -0
- package/src/llama.cpp/common/grammar-parser.cpp +9 -0
- package/src/llama.cpp/common/sampling.cpp +3 -3
- package/src/llama.cpp/common/sampling.h +1 -1
- package/src/llama.cpp/examples/CMakeLists.txt +3 -0
- package/src/llama.cpp/examples/embedding/embedding.cpp +10 -2
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +56 -7
- package/src/llama.cpp/examples/llama.android/{app/src/main/cpp → llama}/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +49 -0
- package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
- package/src/llama.cpp/examples/llava/llava-cli.cpp +26 -6
- package/src/llama.cpp/examples/main/main.cpp +5 -1
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +70 -0
- package/src/llama.cpp/examples/server/server.cpp +12 -16
- package/src/llama.cpp/examples/server/utils.hpp +1 -1
- package/src/llama.cpp/ggml-backend.c +2 -2
- package/src/llama.cpp/ggml-kompute.cpp +9 -3
- package/src/llama.cpp/ggml-quants.c +6 -0
- package/src/llama.cpp/ggml-rpc.cpp +1023 -0
- package/src/llama.cpp/ggml-rpc.h +24 -0
- package/src/llama.cpp/ggml-sycl.cpp +20 -143
- package/src/llama.cpp/ggml-vulkan.cpp +4 -2
- package/src/llama.cpp/ggml.c +116 -271
- package/src/llama.cpp/ggml.h +12 -15
- package/src/llama.cpp/llama.cpp +451 -265
- package/src/llama.cpp/llama.h +3 -0
- package/src/llama.cpp/requirements.txt +0 -1
- package/src/llama.cpp/tests/CMakeLists.txt +1 -1
- package/src/llama.cpp/tests/test-backend-ops.cpp +16 -19
- package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
- package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
package/src/llama.cpp/llama.h
CHANGED
@@ -242,6 +242,9 @@ extern "C" {
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;
 
+        // comma separated list of RPC servers to use for offloading
+        const char * rpc_servers;
+
         // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
         // If the provided progress_callback returns true, model loading continues.
         // If it returns false, model loading is immediately aborted.
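Note: rpc_servers is the consumer-facing hook for the RPC backend added in this release (ggml-rpc.cpp/ggml-rpc.h and the examples/rpc server). A minimal sketch of how a caller might set it is below; the endpoint addresses, model path, and layer count are placeholder assumptions, not part of the diff.

#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params params = llama_model_default_params();
    // Hypothetical endpoints; each should be running the rpc-server example.
    params.rpc_servers  = "192.168.1.10:50052,192.168.1.11:50052";
    params.n_gpu_layers = 99; // offload as many layers as possible to the RPC devices

    llama_model * model = llama_load_model_from_file("model.gguf", params);
    if (model == NULL) {
        return 1;
    }

    llama_free_model(model);
    llama_backend_free();
    return 0;
}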
package/src/llama.cpp/requirements.txt
CHANGED
@@ -9,5 +9,4 @@
 -r ./requirements/requirements-convert-hf-to-gguf.txt
 -r ./requirements/requirements-convert-hf-to-gguf-update.txt
 -r ./requirements/requirements-convert-llama-ggml-to-gguf.txt
--r ./requirements/requirements-convert-lora-to-ggml.txt
 -r ./requirements/requirements-convert-persimmon-to-gguf.txt
package/src/llama.cpp/tests/CMakeLists.txt
CHANGED
@@ -92,7 +92,7 @@ target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
 install(TARGETS test-tokenizer-1-bpe RUNTIME)
 
 # TODO: disabled due to slowness
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
package/src/llama.cpp/tests/test-backend-ops.cpp
CHANGED
@@ -2,6 +2,7 @@
 #include <ggml-alloc.h>
 #include <ggml-backend.h>
 #include <ggml-backend-impl.h>
+
 #include <algorithm>
 #include <array>
 #include <cfloat>
@@ -1111,11 +1112,7 @@ struct test_soft_max : public test_case {
         if (this->mask) {
             mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne[0], ne[1]);
         }
-        ggml_tensor * pos = nullptr;
-        if (max_bias > 0.0f) {
-            pos = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ne[0]);
-        }
-        ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, pos, scale, max_bias);
+        ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, scale, max_bias);
         return out;
     }
 };
@@ -1490,23 +1487,25 @@ struct test_flash_attn_ext : public test_case {
     const int64_t kv; // kv size
     const int64_t nb; // batch size
 
+    const float max_bias; // ALiBi
+
     std::string vars() override {
-        return VARS_TO_STR4(hs, nh, kv, nb);
+        return VARS_TO_STR5(hs, nh, kv, nb, max_bias);
     }
 
     double max_nmse_err() override {
         return 5e-4;
     }
 
-    test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8)
-        : hs(hs), nh(nh), kv(kv), nb(nb) {}
+    test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, float max_bias = 0.0f)
+        : hs(hs), nh(nh), kv(kv), nb(nb), max_bias(max_bias) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1);
         ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
         ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
         ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1);
-        ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f/sqrtf(hs));
+        ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f/sqrtf(hs), max_bias);
         return out;
     }
 };
@@ -1611,7 +1610,7 @@ public:
 
         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
 
-        kq = ggml_soft_max_ext(ctx, kq, kq_mask,
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, 0.0f);
 
         // split cached v into n_head heads
         struct ggml_tensor * v =
@@ -2128,6 +2127,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
 #endif
     for (bool mask : {false, true}) {
         for (float max_bias : {0.0f, 8.0f}) {
+            if (!mask && max_bias > 0.0f) continue;
             for (float scale : {1.0f, 0.1f}) {
                 for (int64_t ne0 : {16, 1024}) {
                     for (int64_t ne1 : {16, 1024}) {
@@ -2141,7 +2141,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
 
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f));
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 8.0f));
    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));
 
     for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
@@ -2175,15 +2174,13 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_timestep_embedding());
     test_cases.emplace_back(new test_leaky_relu());
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    for (int hs : { 64, 128, }) { // other head sizes not implemented
-#else
     for (int hs : { 64, 80, 128, 256, }) {
-#endif
-        for (int nh : { 32, }) {
-            for (int kv : { 512, 1024, }) {
-                for (int nb : { 1, 2, 4, 8, }) {
-                    test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb));
+        for (float max_bias : {0.0f, 8.0f}) {
+            for (int nh : { 32, }) {
+                for (int kv : { 512, 1024, }) {
+                    for (int nb : { 1, 2, 4, 8, }) {
+                        test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, max_bias));
+                    }
                 }
             }
         }
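Note: the test changes above track two ggml API updates in this version: ggml_soft_max_ext() no longer takes a separate ALiBi pos tensor (the bias is derived from max_bias inside the op), and ggml_flash_attn_ext() gains a max_bias argument. A minimal CPU-only sketch of the new soft_max call follows; the tensor shapes, fill values, scale, and thread count are illustrative assumptions.

#include "ggml.h"
#include <math.h>

int main() {
    // small scratch context; 16 MiB is an arbitrary illustrative size
    struct ggml_init_params ip = { 16u*1024*1024, NULL, false };
    struct ggml_context * ctx = ggml_init(ip);

    // attention scores and additive mask: [n_kv = 64, n_tokens = 8]
    struct ggml_tensor * kq   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 8);
    struct ggml_tensor * mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 8);
    ggml_set_f32(kq,   1.0f);
    ggml_set_f32(mask, 0.0f);

    // new signature: scale and max_bias only, no pos tensor
    struct ggml_tensor * probs = ggml_soft_max_ext(ctx, kq, mask, 1.0f/sqrtf(128.0f), 8.0f);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, probs);
    ggml_graph_compute_with_ctx(ctx, gf, 4); // 4 threads, arbitrary

    ggml_free(ctx);
    return 0;
}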
package/src/llama.cpp/tests/test-grammar-integration.cpp
CHANGED
@@ -28,6 +28,19 @@ static llama_grammar* build_grammar(const std::string & grammar_str) {
     return grammar;
 }
 
+static bool test_build_grammar_fails(const std::string & grammar_str) {
+    fprintf(stderr, "⚫ Testing failure for grammar: %s\n", grammar_str.c_str());
+    bool grammar_fails = false;
+    try {
+        build_grammar(grammar_str);
+        fprintf(stderr, "  ❌ Expected build failure, but succeeded\n");
+    } catch (const std::exception & err) {
+        grammar_fails = true;
+        fprintf(stdout, "  ✅︎\n");
+    }
+    return grammar_fails;
+}
+
 static bool match_string(const std::string & input, llama_grammar* grammar) {
     auto decoded = decode_utf8(input, {});
 
@@ -320,6 +333,38 @@ number ::= [0-9]+)""";
     fprintf(stderr, "  ✅︎ Passed\n");
 }
 
+static void test_failure_left_recursion() {
+    fprintf(stderr, "⚫ Testing left recursion detection:\n");
+
+    // Test simple left recursion detection
+    const std::string simple_str = R"""(root ::= "a" | root "a")""";
+    assert(test_build_grammar_fails(simple_str));
+
+    // Test more complicated left recursion detection
+    const std::string medium_str = R"""(
+root ::= asdf
+asdf ::= "a" | asdf "a"
+)""";
+    assert(test_build_grammar_fails(medium_str));
+
+    // Test even more complicated left recursion detection
+    const std::string hard_str = R"""(
+root ::= asdf
+asdf ::= "a" | foo "b"
+foo ::= "c" | asdf "d" | "e")""";
+    assert(test_build_grammar_fails(hard_str));
+
+    // Test yet even more complicated left recursion detection
+    const std::string hardest_str = R"""(
+root ::= asdf
+asdf ::= "a" | foo "b"
+foo ::= "c" | empty asdf "d" | "e"
+empty ::= "blah" | )""";
+    assert(test_build_grammar_fails(hardest_str));
+
+    fprintf(stderr, "  ✅︎ Passed\n");
+}
+
 int main() {
     fprintf(stdout, "Running grammar integration tests...\n");
     test_simple_grammar();
@@ -327,6 +372,7 @@ int main() {
     test_quantifiers();
     test_failure_missing_root();
     test_failure_missing_reference();
+    test_failure_left_recursion();
     fprintf(stdout, "All tests passed.\n");
     return 0;
 }
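Note: the new test_failure_left_recursion cases assert that grammar construction now rejects left-recursive GBNF rules instead of looping. As an illustrative workaround (not part of the diff), a left-recursive rule can usually be rewritten right-recursively or with a repetition quantifier, which accepts the same language:

#include <string>

// Rejected: root refers to itself in the leftmost position.
const std::string left_recursive  = R"""(root ::= "a" | root "a")""";

// Accepted equivalents: one or more "a".
const std::string right_recursive = R"""(root ::= "a" | "a" root)""";
const std::string with_quantifier = R"""(root ::= "a"+)""";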
package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp
CHANGED
@@ -13,15 +13,27 @@
 #include <vector>
 
 int main(int argc, char **argv) {
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+    if (argc < 2 || argc > 3) {
+        fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
         return 1;
     }
 
     const std::string fname = argv[1];
+    bool ignore_merges = false;
+    if (argc == 3) {
+        if (std::strcmp(argv[2], "--ignore-merges") != 0) {
+            fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
+            return 1;
+        }
+        ignore_merges = true;
+    }
 
     fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
 
+    if (ignore_merges) {
+        fprintf(stderr, "%s : ignoring merges for tokens inside vocab\n", __func__);
+    }
+
     llama_model * model;
     llama_context * ctx;
 
@@ -65,7 +77,19 @@ int main(int argc, char **argv) {
         std::string str = llama_detokenize_bpe(ctx, std::vector<int>(1, i));
         try {
             auto cps = unicode_cpts_from_utf8(str);
-            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
+            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
+            if (ignore_merges && tokens.size() > 1) {
+                fprintf(stderr,
+                        "%s : error: token %d detokenizes to '%s'(%zu) but "
+                        "tokenization of this to multiple tokens: [",
+                        __func__, i, str.c_str(), str.length());
+                fprintf(stderr, "%d", tokens[0]);
+                for (size_t i = 1; i < tokens.size(); i++) {
+                    fprintf(stderr, ", %d", tokens[i]);
+                }
+                fprintf(stderr, "]\n");
+                return 2;
+            }
             std::string check = llama_detokenize_bpe(ctx, tokens);
             if (check != str) {
                 fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",