@fugood/llama.node 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. package/CMakeLists.txt +9 -0
  2. package/README.md +1 -1
  3. package/bin/darwin/arm64/default.metallib +0 -0
  4. package/bin/darwin/arm64/llama-node.node +0 -0
  5. package/bin/darwin/x64/default.metallib +0 -0
  6. package/bin/darwin/x64/llama-node.node +0 -0
  7. package/bin/linux/arm64/llama-node.node +0 -0
  8. package/bin/linux/x64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  10. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  11. package/bin/win32/arm64/llama-node.node +0 -0
  12. package/bin/win32/arm64/node.lib +0 -0
  13. package/bin/win32/x64/llama-node.node +0 -0
  14. package/bin/win32/x64/node.lib +0 -0
  15. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/arm64/node.lib +0 -0
  17. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  18. package/bin/win32-vulkan/x64/node.lib +0 -0
  19. package/lib/binding.ts +1 -1
  20. package/package.json +2 -1
  21. package/patches/llama.patch +22 -0
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/llama.cpp/CMakeLists.txt +14 -12
  24. package/src/llama.cpp/common/common.cpp +19 -5
  25. package/src/llama.cpp/common/common.h +2 -0
  26. package/src/llama.cpp/common/grammar-parser.cpp +9 -0
  27. package/src/llama.cpp/common/sampling.cpp +3 -3
  28. package/src/llama.cpp/common/sampling.h +1 -1
  29. package/src/llama.cpp/examples/CMakeLists.txt +3 -0
  30. package/src/llama.cpp/examples/embedding/embedding.cpp +10 -2
  31. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +56 -7
  32. package/src/llama.cpp/examples/llama.android/{app/src/main/cpp → llama}/CMakeLists.txt +1 -1
  33. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +49 -0
  34. package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
  35. package/src/llama.cpp/examples/llava/llava-cli.cpp +26 -6
  36. package/src/llama.cpp/examples/main/main.cpp +5 -1
  37. package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
  38. package/src/llama.cpp/examples/rpc/rpc-server.cpp +70 -0
  39. package/src/llama.cpp/examples/server/server.cpp +12 -16
  40. package/src/llama.cpp/examples/server/utils.hpp +1 -1
  41. package/src/llama.cpp/ggml-backend.c +2 -2
  42. package/src/llama.cpp/ggml-kompute.cpp +9 -3
  43. package/src/llama.cpp/ggml-quants.c +6 -0
  44. package/src/llama.cpp/ggml-rpc.cpp +1023 -0
  45. package/src/llama.cpp/ggml-rpc.h +24 -0
  46. package/src/llama.cpp/ggml-sycl.cpp +20 -143
  47. package/src/llama.cpp/ggml-vulkan.cpp +4 -2
  48. package/src/llama.cpp/ggml.c +116 -271
  49. package/src/llama.cpp/ggml.h +12 -15
  50. package/src/llama.cpp/llama.cpp +451 -265
  51. package/src/llama.cpp/llama.h +3 -0
  52. package/src/llama.cpp/requirements.txt +0 -1
  53. package/src/llama.cpp/tests/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/tests/test-backend-ops.cpp +16 -19
  55. package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
  56. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
  57. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
package/src/llama.cpp/llama.h
@@ -242,6 +242,9 @@ extern "C" {
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;
 
+        // comma separated list of RPC servers to use for offloading
+        const char * rpc_servers;
+
         // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
         // If the provided progress_callback returns true, model loading continues.
         // If it returns false, model loading is immediately aborted.
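
The new rpc_servers field on llama_model_params is the one public API addition in this release; it pairs with the new ggml-rpc backend and the examples/rpc server added elsewhere in this diff. A minimal sketch of how a caller might use it; the server addresses and model path below are placeholders, and the build is assumed to have the RPC backend enabled:

    #include <cstdio>
    #include "llama.h"

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 99;                                       // offload as many layers as possible
        mparams.rpc_servers  = "192.168.1.10:50052,192.168.1.11:50052";  // placeholder addresses

        llama_model * model = llama_load_model_from_file("model.gguf", mparams);  // placeholder path
        if (model == nullptr) {
            fprintf(stderr, "failed to load model\n");
            llama_backend_free();
            return 1;
        }

        llama_free_model(model);
        llama_backend_free();
        return 0;
    }

Each address in the comma separated list would point at an instance of the rpc-server example added under package/src/llama.cpp/examples/rpc.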
package/src/llama.cpp/requirements.txt
@@ -9,5 +9,4 @@
 -r ./requirements/requirements-convert-hf-to-gguf.txt
 -r ./requirements/requirements-convert-hf-to-gguf-update.txt
 -r ./requirements/requirements-convert-llama-ggml-to-gguf.txt
--r ./requirements/requirements-convert-lora-to-ggml.txt
 -r ./requirements/requirements-convert-persimmon-to-gguf.txt
package/src/llama.cpp/tests/CMakeLists.txt
@@ -92,7 +92,7 @@ target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
 install(TARGETS test-tokenizer-1-bpe RUNTIME)
 
 # TODO: disabled due to slowness
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
package/src/llama.cpp/tests/test-backend-ops.cpp
@@ -2,6 +2,7 @@
 #include <ggml-alloc.h>
 #include <ggml-backend.h>
 #include <ggml-backend-impl.h>
+
 #include <algorithm>
 #include <array>
 #include <cfloat>
@@ -1111,11 +1112,7 @@ struct test_soft_max : public test_case {
         if (this->mask) {
             mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne[0], ne[1]);
         }
-        ggml_tensor * pos = nullptr;
-        if (max_bias > 0.0f) {
-            pos = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ne[0]);
-        }
-        ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, pos, scale, max_bias);
+        ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, scale, max_bias);
         return out;
     }
 };
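
As the test_soft_max hunk above shows, ggml_soft_max_ext drops its separate pos tensor: the ALiBi positional bias is now derived from max_bias together with the mask inside the operator. A minimal sketch of a caller on the updated five-argument signature (tensor shapes and values are illustrative only):

    #include <cmath>
    #include "ggml.h"

    int main() {
        ggml_init_params ip = { /*mem_size*/ 16u*1024*1024, /*mem_buffer*/ nullptr, /*no_alloc*/ false };
        ggml_context * ctx = ggml_init(ip);

        // attention scores [n_kv, n_tokens] and a matching mask
        ggml_tensor * scores = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
        ggml_tensor * mask   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
        ggml_set_f32(scores, 0.1f);
        ggml_set_f32(mask,   0.0f);

        const float scale    = 1.0f / std::sqrt(128.0f);  // 1/sqrt(head_dim)
        const float max_bias = 8.0f;                      // > 0.0f enables the ALiBi bias

        // updated signature: no pos tensor, the bias comes from max_bias + mask
        ggml_tensor * probs = ggml_soft_max_ext(ctx, scores, mask, scale, max_bias);

        ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, probs);
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads*/ 1);

        ggml_free(ctx);
        return 0;
    }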
@@ -1490,23 +1487,25 @@ struct test_flash_attn_ext : public test_case {
     const int64_t kv; // kv size
     const int64_t nb; // batch size
 
+    const float max_bias; // ALiBi
+
     std::string vars() override {
-        return VARS_TO_STR4(hs, nh, kv, nb);
+        return VARS_TO_STR5(hs, nh, kv, nb, max_bias);
     }
 
     double max_nmse_err() override {
         return 5e-4;
     }
 
-    test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8)
-        : hs(hs), nh(nh), kv(kv), nb(nb) {}
+    test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, float max_bias = 0.0f)
+        : hs(hs), nh(nh), kv(kv), nb(nb), max_bias(max_bias) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1);
         ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
         ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
         ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1);
-        ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f/sqrtf(hs));
+        ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f/sqrtf(hs), max_bias);
         return out;
     }
 };
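
test_flash_attn_ext now carries a max_bias parameter so flash attention is exercised both with and without ALiBi, matching the extra argument on ggml_flash_attn_ext. As a reminder of what max_bias encodes, the sketch below computes the usual per-head ALiBi slope schedule (two geometric ladders, m0 and m1); it is meant as an illustration of the scheme, not a copy of ggml's internal code:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Illustrative ALiBi slopes: the first n_head_log2 heads follow a
    // 2^(-max_bias/n_head_log2) geometric ladder, the remaining heads an
    // interleaved finer ladder; max_bias == 0 disables the bias entirely.
    static std::vector<float> alibi_slopes(uint32_t n_head, float max_bias) {
        std::vector<float> slopes(n_head, 1.0f);
        if (max_bias <= 0.0f) {
            return slopes;
        }
        const uint32_t n_head_log2 = 1u << (uint32_t) std::floor(std::log2((float) n_head));
        const float m0 = std::pow(2.0f, -max_bias / n_head_log2);
        const float m1 = std::pow(2.0f, -0.5f * max_bias / n_head_log2);
        for (uint32_t h = 0; h < n_head; ++h) {
            slopes[h] = h < n_head_log2 ? std::pow(m0, (float)(h + 1))
                                        : std::pow(m1, (float)(2*(h - n_head_log2) + 1));
        }
        return slopes;
    }

    int main() {
        for (float s : alibi_slopes(/*n_head*/ 8, /*max_bias*/ 8.0f)) {
            printf("%.6f\n", s);  // 0.5, 0.25, ... for the 8-head, max_bias = 8 case
        }
        return 0;
    }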
@@ -1611,7 +1610,7 @@ public:
 
         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
 
-        kq = ggml_soft_max_ext(ctx, kq, kq_mask, nullptr, kq_scale, 0.0f);
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, 0.0f);
 
         // split cached v into n_head heads
         struct ggml_tensor * v =
@@ -2128,6 +2127,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
 #endif
     for (bool mask : {false, true}) {
         for (float max_bias : {0.0f, 8.0f}) {
+            if (!mask && max_bias > 0.0f) continue;
             for (float scale : {1.0f, 0.1f}) {
                 for (int64_t ne0 : {16, 1024}) {
                     for (int64_t ne1 : {16, 1024}) {
@@ -2141,7 +2141,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
 
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f));
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 8.0f));
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));
 
     for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
@@ -2175,15 +2174,13 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_timestep_embedding());
     test_cases.emplace_back(new test_leaky_relu());
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    for (int hs : { 64, 128, }) { // other head sizes not implemented
-#else
     for (int hs : { 64, 80, 128, 256, }) {
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-        for (int nh : { 32, }) {
-            for (int kv : { 512, 1024, }) {
-                for (int nb : { 1, 2, 4, 8, }) {
-                    test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb));
+        for (float max_bias : {0.0f, 8.0f}) {
+            for (int nh : { 32, }) {
+                for (int kv : { 512, 1024, }) {
+                    for (int nb : { 1, 2, 4, 8, }) {
+                        test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, max_bias));
+                    }
                 }
             }
         }
package/src/llama.cpp/tests/test-grammar-integration.cpp
@@ -28,6 +28,19 @@ static llama_grammar* build_grammar(const std::string & grammar_str) {
     return grammar;
 }
 
+static bool test_build_grammar_fails(const std::string & grammar_str) {
+    fprintf(stderr, "⚫ Testing failure for grammar: %s\n", grammar_str.c_str());
+    bool grammar_fails = false;
+    try {
+        build_grammar(grammar_str);
+        fprintf(stderr, "  ❌ Expected build failure, but succeeded\n");
+    } catch (const std::exception & err) {
+        grammar_fails = true;
+        fprintf(stdout, "  ✅︎\n");
+    }
+    return grammar_fails;
+}
+
 static bool match_string(const std::string & input, llama_grammar* grammar) {
     auto decoded = decode_utf8(input, {});
 
@@ -320,6 +333,38 @@ number ::= [0-9]+)""";
     fprintf(stderr, "  ✅︎ Passed\n");
 }
 
+static void test_failure_left_recursion() {
+    fprintf(stderr, "⚫ Testing left recursion detection:\n");
+
+    // Test simple left recursion detection
+    const std::string simple_str = R"""(root ::= "a" | root "a")""";
+    assert(test_build_grammar_fails(simple_str));
+
+    // Test more complicated left recursion detection
+    const std::string medium_str = R"""(
+    root ::= asdf
+    asdf ::= "a" | asdf "a"
+    )""";
+    assert(test_build_grammar_fails(medium_str));
+
+    // Test even more complicated left recursion detection
+    const std::string hard_str = R"""(
+    root ::= asdf
+    asdf ::= "a" | foo "b"
+    foo ::= "c" | asdf "d" | "e")""";
+    assert(test_build_grammar_fails(hard_str));
+
+    // Test yet even more complicated left recursion detection
+    const std::string hardest_str = R"""(
+    root ::= asdf
+    asdf ::= "a" | foo "b"
+    foo ::= "c" | empty asdf "d" | "e"
+    empty ::= "blah" | )""";
+    assert(test_build_grammar_fails(hardest_str));
+
+    fprintf(stderr, "  ✅︎ Passed\n");
+}
+
 int main() {
     fprintf(stdout, "Running grammar integration tests...\n");
     test_simple_grammar();
@@ -327,6 +372,7 @@ int main() {
     test_quantifiers();
     test_failure_missing_root();
    test_failure_missing_reference();
+    test_failure_left_recursion();
     fprintf(stdout, "All tests passed.\n");
     return 0;
 }
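
The new test_failure_left_recursion cases assert that a grammar whose rule refers back to itself on the left, directly or through other rules, is rejected when the grammar is built. Such rules can normally be rewritten without left recursion; a hypothetical rewrite of the simplest rejected case, in the same raw-string style the tests use:

    #include <cstdio>
    #include <string>

    int main() {
        // Hypothetical rewrites of the rejected rule  root ::= "a" | root "a"
        // (the language "one or more a") expressed without left recursion:
        const std::string right_recursive = R"""(root ::= "a" | "a" root)""";
        const std::string repetition      = R"""(root ::= "a"+)""";
        printf("%s\n%s\n", right_recursive.c_str(), repetition.c_str());
        return 0;
    }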
package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp
@@ -13,15 +13,27 @@
 #include <vector>
 
 int main(int argc, char **argv) {
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+    if (argc < 2 || argc > 3) {
+        fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
         return 1;
     }
 
     const std::string fname = argv[1];
+    bool ignore_merges = false;
+    if (argc == 3) {
+        if (std::strcmp(argv[2], "--ignore-merges") != 0) {
+            fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
+            return 1;
+        }
+        ignore_merges = true;
+    }
 
     fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
 
+    if (ignore_merges) {
+        fprintf(stderr, "%s : ignoring merges for tokens inside vocab\n", __func__);
+    }
+
     llama_model * model;
     llama_context * ctx;
 
@@ -65,7 +77,19 @@ int main(int argc, char **argv) {
         std::string str = llama_detokenize_bpe(ctx, std::vector<int>(1, i));
         try {
             auto cps = unicode_cpts_from_utf8(str);
-            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
+            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
+            if (ignore_merges && tokens.size() > 1) {
+                fprintf(stderr,
+                    "%s : error: token %d detokenizes to '%s'(%zu) but "
+                    "tokenization of this to multiple tokens: [",
+                    __func__, i, str.c_str(), str.length());
+                fprintf(stderr, "%d", tokens[0]);
+                for (size_t i = 1; i < tokens.size(); i++) {
+                    fprintf(stderr, ", %d", tokens[i]);
+                }
+                fprintf(stderr, "]\n");
+                return 2;
+            }
             std::string check = llama_detokenize_bpe(ctx, tokens);
             if (check != str) {
                 fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt
@@ -1,2 +0,0 @@
--r ./requirements-convert.txt
-torch~=2.1.1