@fugood/llama.node 0.3.9 → 0.3.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.js +2 -2
  18. package/lib/binding.ts +47 -8
  19. package/lib/index.js +21 -1
  20. package/lib/index.ts +31 -1
  21. package/package.json +12 -3
  22. package/src/LlamaCompletionWorker.cpp +33 -6
  23. package/src/LlamaCompletionWorker.h +3 -1
  24. package/src/LlamaContext.cpp +336 -28
  25. package/src/LlamaContext.h +2 -0
  26. package/src/common.hpp +19 -2
  27. package/src/llama.cpp/.github/workflows/build.yml +289 -107
  28. package/src/llama.cpp/.github/workflows/close-issue.yml +1 -1
  29. package/src/llama.cpp/.github/workflows/docker.yml +2 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +25 -2
  31. package/src/llama.cpp/CMakeLists.txt +10 -19
  32. package/src/llama.cpp/cmake/build-info.cmake +1 -1
  33. package/src/llama.cpp/common/CMakeLists.txt +32 -0
  34. package/src/llama.cpp/common/arg.cpp +66 -16
  35. package/src/llama.cpp/common/chat-template.hpp +515 -0
  36. package/src/llama.cpp/common/chat.cpp +966 -0
  37. package/src/llama.cpp/common/chat.hpp +52 -0
  38. package/src/llama.cpp/common/common.cpp +159 -36
  39. package/src/llama.cpp/common/common.h +56 -14
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +46 -66
  41. package/src/llama.cpp/common/json-schema-to-grammar.h +15 -1
  42. package/src/llama.cpp/common/llguidance.cpp +270 -0
  43. package/src/llama.cpp/common/log.cpp +1 -10
  44. package/src/llama.cpp/common/log.h +10 -0
  45. package/src/llama.cpp/common/minja.hpp +2868 -0
  46. package/src/llama.cpp/common/sampling.cpp +22 -1
  47. package/src/llama.cpp/common/sampling.h +3 -0
  48. package/src/llama.cpp/docs/build.md +54 -9
  49. package/src/llama.cpp/examples/export-lora/export-lora.cpp +12 -2
  50. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +1 -1
  51. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  52. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +59 -0
  53. package/src/llama.cpp/examples/llava/clip.cpp +133 -14
  54. package/src/llama.cpp/examples/llava/clip.h +2 -0
  55. package/src/llama.cpp/examples/llava/llava.cpp +22 -8
  56. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +9 -1
  57. package/src/llama.cpp/examples/main/main.cpp +26 -25
  58. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +136 -137
  59. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +18 -4
  60. package/src/llama.cpp/examples/run/run.cpp +224 -69
  61. package/src/llama.cpp/examples/server/server.cpp +252 -81
  62. package/src/llama.cpp/examples/server/utils.hpp +73 -21
  63. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +6 -4
  64. package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt +11 -0
  65. package/src/llama.cpp/ggml/CMakeLists.txt +78 -1
  66. package/src/llama.cpp/ggml/include/ggml.h +1 -1
  67. package/src/llama.cpp/ggml/src/CMakeLists.txt +21 -4
  68. package/src/llama.cpp/ggml/src/ggml-alloc.c +1 -13
  69. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +91 -78
  70. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +7 -7
  71. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -1
  72. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
  73. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +46 -0
  74. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +16 -1
  75. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +28 -8
  77. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +5 -7
  78. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +33 -23
  79. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +1 -5
  80. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +323 -121
  81. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
  82. package/src/llama.cpp/ggml/src/ggml.c +23 -13
  83. package/src/llama.cpp/include/llama.h +14 -1
  84. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +112 -0
  85. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +46 -0
  86. package/src/llama.cpp/src/CMakeLists.txt +1 -1
  87. package/src/llama.cpp/src/llama-arch.cpp +7 -2
  88. package/src/llama.cpp/src/llama-arch.h +3 -1
  89. package/src/llama.cpp/src/llama-chat.cpp +11 -2
  90. package/src/llama.cpp/src/llama-chat.h +1 -0
  91. package/src/llama.cpp/src/llama-grammar.cpp +86 -6
  92. package/src/llama.cpp/src/llama-grammar.h +22 -1
  93. package/src/llama.cpp/src/llama-mmap.cpp +1 -0
  94. package/src/llama.cpp/src/llama-model-loader.cpp +1 -1
  95. package/src/llama.cpp/src/llama-model.cpp +76 -6
  96. package/src/llama.cpp/src/llama-sampling.cpp +47 -4
  97. package/src/llama.cpp/src/llama-vocab.cpp +10 -4
  98. package/src/llama.cpp/src/llama.cpp +181 -123
  99. package/src/llama.cpp/tests/CMakeLists.txt +4 -0
  100. package/src/llama.cpp/tests/test-backend-ops.cpp +158 -57
  101. package/src/llama.cpp/tests/test-chat-template.cpp +154 -31
  102. package/src/llama.cpp/tests/test-chat.cpp +607 -0
  103. package/src/llama.cpp/tests/test-grammar-integration.cpp +2 -2
  104. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +1140 -0
  105. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -1
  106. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +0 -32
@@ -17,13 +17,13 @@
  #include <cstring>
  #include <cstdlib>
  #include <cassert>
+ #include <algorithm>
  #include <sys/stat.h>
  #include <sys/types.h>

  #ifdef _WIN32
  #include <windows.h>
  #include <direct.h> // For _mkdir on Windows
- #include <algorithm> // For std::replace on w64devkit
  #else
  #include <unistd.h>
  #include <sys/wait.h>
@@ -55,6 +55,12 @@ const std::vector<std::string> type_names = {
  "q4_k",
  "q5_k",
  "q6_k",
+ "iq2_xxs",
+ "iq2_xs",
+ "iq2_s",
+ "iq3_xxs",
+ "iq3_s",
+ "iq4_xs",
  "iq4_nl"
  };

@@ -316,8 +322,11 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool
  // For aligned matmul loads
  std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16") ? load_vec : "2";

- string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat, coopmat2, f16acc);
- string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
+ // don't generate f32 variants for coopmat2
+ if (!coopmat2) {
+ string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat, coopmat2, f16acc);
+ string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
+ }

  if (tname != "f16" && tname != "f32") {
  string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat, coopmat2, f16acc);
@@ -499,6 +508,7 @@ void write_output_files() {
  fprintf(hdr, "#include <cstdint>\n\n");
  fprintf(src, "#include \"%s\"\n\n", basename(target_hpp).c_str());

+ std::sort(shader_fnames.begin(), shader_fnames.end());
  for (const auto& pair : shader_fnames) {
  const std::string& name = pair.first;
  #ifdef _WIN32
@@ -128,6 +128,10 @@ static void ggml_print_backtrace_symbols(void) {
  #endif

  static void ggml_print_backtrace(void) {
+ const char * GGML_NO_BACKTRACE = getenv("GGML_NO_BACKTRACE");
+ if (GGML_NO_BACKTRACE) {
+ return;
+ }
  char attach[32];
  snprintf(attach, sizeof(attach), "attach %d", getpid());
  int pid = fork();
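
The hunk above (ggml.c) adds an escape hatch: when the GGML_NO_BACKTRACE environment variable is set, ggml_print_backtrace() returns immediately instead of spawning a debugger to print a crash backtrace. A minimal sketch of how a host process might use it, assuming a POSIX environment (setenv is not available as-is on MSVC):

    // Hedged sketch, not part of the diff: disable ggml's crash backtrace
    // before any ggml/llama.cpp code has a chance to abort.
    #include <cstdlib>

    int main() {
        setenv("GGML_NO_BACKTRACE", "1", /*overwrite=*/1); // any value disables the backtrace
        // ... load the model and run inference as usual ...
        return 0;
    }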
@@ -5339,7 +5343,7 @@ static void ggml_compute_backward(
  } break;
  case GGML_OP_MUL: {
  if (src0_needs_grads) {
- ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, src1, grad));
+ ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, src1));
  }
  if (src1_needs_grads) {
  struct ggml_tensor * tmp = ggml_mul(ctx, src0, grad);
@@ -5431,21 +5435,25 @@ static void ggml_compute_backward(
  // src1.shape [n,p,qq,rr]

  if (src0_needs_grads) {
- struct ggml_tensor * s1_tg =
+ GGML_ASSERT(grad->ne[2] == src1->ne[2]);
+ GGML_ASSERT(grad->ne[3] == src1->ne[3]);
+ struct ggml_tensor * tmp =
  ggml_out_prod(ctx, // [n,m,qq,rr]
  src1, // [n,p,qq,rr]
  grad); // [m,p,qq,rr]
- const int64_t qq = s1_tg->ne[2];
- const int64_t rr = s1_tg->ne[3];
- const int64_t q1 = src0->ne[2];
- const int64_t r1 = src0->ne[3];
- const bool ne2_broadcasted = qq > q1;
- const bool ne3_broadcasted = rr > r1;
- if (ne2_broadcasted || ne3_broadcasted) {
- // sum broadcast repetitions of s1_tg into shape of src0
- s1_tg = ggml_repeat_back(ctx, s1_tg, src0);
+ if (!ggml_are_same_shape(tmp, src0)) {
+ GGML_ASSERT(tmp->ne[0] == src0->ne[0]);
+ GGML_ASSERT(tmp->ne[1] == src0->ne[1]);
+ GGML_ASSERT(tmp->ne[3] == 1);
+
+ const int64_t nr2 = tmp->ne[2] / src0->ne[2];
+ const size_t nb2 = tmp->nb[2] * nr2;
+ const size_t nb3 = tmp->nb[2];
+
+ tmp = ggml_view_4d(ctx, tmp, src0->ne[0], src0->ne[1], src0->ne[2], nr2, tmp->nb[1], nb2, nb3, 0);
+ tmp = ggml_repeat_back(ctx, tmp, src0);
  }
- ggml_add_or_set(ctx, cgraph, isrc0, s1_tg /*= [n,m,q1,r1]*/);
+ ggml_add_or_set(ctx, cgraph, isrc0, tmp);
  }
  if (src1_needs_grads) {
  ggml_add_or_set(ctx, cgraph, isrc1,
@@ -5514,7 +5522,9 @@ static void ggml_compute_backward(
  if (src0_needs_grads) {
  GGML_ASSERT(!cgraph->grads[isrc0] || ggml_is_contiguous(cgraph->grads[isrc0]));
  GGML_ASSERT(ggml_is_contiguous(grad));
- ggml_add_or_set(ctx, cgraph, isrc0, grad);
+ GGML_ASSERT(ggml_nelements(tensor) == ggml_nelements(src0));
+ ggml_add_or_set(ctx, cgraph, isrc0,
+ ggml_are_same_shape(tensor, src0) ? grad : ggml_reshape(ctx, grad, src0));
  }
  } break;
  case GGML_OP_RESHAPE: {
@@ -510,7 +510,8 @@ extern "C" {
  LLAMA_API uint64_t llama_model_size(const struct llama_model * model);

  // Get the default chat template. Returns nullptr if not available
- LLAMA_API const char * llama_model_chat_template(const struct llama_model * model);
+ // If name is NULL, returns the default chat template
+ LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name);

  // Returns the total number of parameters in the model
  LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
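
This include/llama.h change lets callers look up named chat templates stored in the GGUF metadata, in addition to the default one. A minimal usage sketch, assuming a loaded model; the "tool_use" name is illustrative and may not exist in a given GGUF:

    // Hedged sketch, not part of the diff.
    #include "llama.h"
    #include <cstdio>

    static void print_chat_templates(const llama_model * model) {
        const char * tmpl_default = llama_model_chat_template(model, /*name=*/nullptr); // default template
        const char * tmpl_named   = llama_model_chat_template(model, "tool_use");       // named template, may be absent
        printf("default : %s\n", tmpl_default ? tmpl_default : "(none)");
        printf("tool_use: %s\n", tmpl_named   ? tmpl_named   : "(none)");
    }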
@@ -1198,6 +1199,18 @@ extern "C" {
  const char * grammar_str,
  const char * grammar_root);

+ /// @details Lazy grammar sampler, introduced in https://github.com/ggerganov/llama.cpp/pull/9639
+ /// @param trigger_words A list of words that will trigger the grammar sampler. This may be updated to a loose regex syntax (w/ ^) in a near future.
+ /// @param trigger_tokens A list of tokens that will trigger the grammar sampler.
+ LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
+ const struct llama_vocab * vocab,
+ const char * grammar_str,
+ const char * grammar_root,
+ const char ** trigger_words,
+ size_t num_trigger_words,
+ const llama_token * trigger_tokens,
+ size_t num_trigger_tokens);
+
  /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
  LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
  int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
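
The new llama_sampler_init_grammar_lazy entry point builds a grammar sampler that stays dormant until one of the given trigger words or trigger tokens appears in the output. A minimal call sketch, assuming a valid llama_vocab; the GBNF string and the "<tool_call>" trigger word are illustrative only:

    // Hedged sketch, not part of the diff.
    #include "llama.h"

    static llama_sampler * make_lazy_grammar_sampler(const llama_vocab * vocab) {
        const char * grammar = "root ::= \"yes\" | \"no\""; // toy GBNF grammar
        const char * trigger_words[] = { "<tool_call>" };    // constrain output only after this appears
        return llama_sampler_init_grammar_lazy(
            vocab, grammar, "root",
            trigger_words, /*num_trigger_words=*/1,
            /*trigger_tokens=*/nullptr, /*num_trigger_tokens=*/0);
    }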
@@ -0,0 +1,112 @@
+ ied 4 ½ months
+ __ggml_vocab_test__
+ Führer
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+
+ __ggml_vocab_test__
+
+
+
+ __ggml_vocab_test__
+
+
+
+
+ __ggml_vocab_test__
+
+
+ __ggml_vocab_test__
+ Hello world
+ __ggml_vocab_test__
+ Hello world
+ __ggml_vocab_test__
+ Hello World
+ __ggml_vocab_test__
+ Hello World
+ __ggml_vocab_test__
+ Hello World!
+ __ggml_vocab_test__
+ Hello, world!
+ __ggml_vocab_test__
+ Hello, world!
+ __ggml_vocab_test__
+ this is 🦙.cpp
+ __ggml_vocab_test__
+ w048 7tuijk dsdfhu
+ __ggml_vocab_test__
+ нещо на Български
+ __ggml_vocab_test__
+ កាន់តែពិសេសអាចខលចេញ
+ __ggml_vocab_test__
+ 🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ Hello
+ __ggml_vocab_test__
+ (
+ __ggml_vocab_test__
+
+ =
+ __ggml_vocab_test__
+ ' era
+ __ggml_vocab_test__
+ Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
+ __ggml_vocab_test__
+ !!!!!!
+ __ggml_vocab_test__
+ 3
+ __ggml_vocab_test__
+ 33
+ __ggml_vocab_test__
+ 333
+ __ggml_vocab_test__
+ 3333
+ __ggml_vocab_test__
+ 33333
+ __ggml_vocab_test__
+ 333333
+ __ggml_vocab_test__
+ 3333333
+ __ggml_vocab_test__
+ 33333333
+ __ggml_vocab_test__
+ 333333333
+ __ggml_vocab_test__
+ Cửa Việt
+ __ggml_vocab_test__
+ discards
+ __ggml_vocab_test__
+
+
+
+
+
+
+
+
+
+
+
+ 🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+ __ggml_vocab_test__
@@ -0,0 +1,46 @@
+ 1122 220 19 220 26062 3951
+ 37 50753 261
+
+ 220
+ 256
+ 262
+ 197
+ 198
+ 271
+ 1406
+ 1572
+ 9707 1879
+ 21927 1879
+ 9707 4337
+ 21927 4337
+ 21927 4337 0
+ 9707 11 1879 0
+ 21927 11 1879 0
+ 419 374 11162 99 247 13 10821
+ 86 15 19 23 220 22 83 1963 41808 11472 2940 16739
+ 78762 14144 1456 13073 63471 33594 3038 133178 79012
+ 146394 97529 241 44258 233 146568 44258 224 147603 20879 115 146280 44258 223 146280 147272 97529 227 147805 148301 147270 44258 223 146848
+ 145836 320 8252 8 26525 114 378 235 149921 30543 320 35673 99066 97534 8 25521 227 320 3243 42365 429 702 1181 1828 3950 8
+ 9707
+ 21927
+ 220 21927
+ 256 21927
+ 262 21927
+ 262 21927 198 262 21927
+ 320
+ 198 284
+ 6 11385
+ 9707 11 379 64848 0 2585 525 498 26525 223 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216
+ 17085 2928
+ 18
+ 18 18
+ 18 18 18
+ 18 18 18 18
+ 18 18 18 18 18
+ 18 18 18 18 18 18
+ 18 18 18 18 18 18 18
+ 18 18 18 18 18 18 18 18
+ 18 18 18 18 18 18 18 18 18
+ 34 90063 128324
+ 2560 2347
+ 198 4710 14731 65497 7847 1572 2303 78672 10947 145836 320 8252 8 26525 114 378 235 149921 30543 320 35673 99066 97534 8 25521 227 11162 99 247 149955 220 18 220 18 18 220 18 18 18 220 18 18 18 18 220 18 18 18 18 18 220 18 18 18 18 18 18 220 18 18 18 18 18 18 18 220 18 18 18 18 18 18 18 18 220 18 13 18 220 18 496 18 220 18 1112 18 220 146394 97529 241 44258 233 146568 44258 224 147603 20879 115 146280 44258 223 146280 147272 97529 227 144534 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216 55460 53237 18658 14144 1456 13073 63471 33594 3038 133178 79012 3355 4605 4605 13874 13874 73594 3014 3014 28149 17085 2928 26610 7646 358 3003 1012 364 83 813 566 594 1052 11 364 787 498 2704 30 364 44 537 2704 358 3278 1281 432 11 364 35 498 1075 1045 15243 30 1205 6 42612 264 63866 43
@@ -29,7 +29,7 @@ add_library(llama
  unicode-data.cpp
  )

- target_include_directories(llama PUBLIC . ../include)
+ target_include_directories(llama PUBLIC . ../include ../common)
  target_compile_features (llama PUBLIC cxx_std_17) # don't bump

  target_link_libraries(llama PUBLIC ggml)
@@ -179,6 +179,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
  { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
  { LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
+ { LLM_KV_TOKENIZER_CHAT_TEMPLATE_N, "tokenizer.chat_template.%s" },
  { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
  { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
  { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
@@ -1023,6 +1024,9 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_OUTPUT, "output" },
  { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
@@ -1443,10 +1447,11 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
  {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
  };

- LLM_KV::LLM_KV(llm_arch arch) : arch(arch) {}
+ LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}

  std::string LLM_KV::operator()(llm_kv kv) const {
- return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
+ return suffix ? ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch), suffix)
+ : ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
  }

  std::string LLM_TN_IMPL::str() const {
@@ -177,6 +177,7 @@ enum llm_kv {
  LLM_KV_TOKENIZER_HF_JSON,
  LLM_KV_TOKENIZER_RWKV,
  LLM_KV_TOKENIZER_CHAT_TEMPLATE,
+ LLM_KV_TOKENIZER_CHAT_TEMPLATE_N,
  LLM_KV_TOKENIZER_FIM_PRE_ID,
  LLM_KV_TOKENIZER_FIM_SUF_ID,
  LLM_KV_TOKENIZER_FIM_MID_ID,
@@ -335,9 +336,10 @@ enum llm_tensor_layer {
  };

  struct LLM_KV {
- LLM_KV(llm_arch arch);
+ LLM_KV(llm_arch arch, const char * suffix = nullptr);

  llm_arch arch;
+ const char * suffix;

  std::string operator()(llm_kv kv) const;
  };
@@ -51,6 +51,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
  { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
  { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
  { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
+ { "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
  { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
  { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
  { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
@@ -115,7 +116,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
  } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
  return LLM_CHAT_TEMPLATE_PHI_3;
  } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
- return LLM_CHAT_TEMPLATE_FALCON_3;
+ return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
  } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
  return LLM_CHAT_TEMPLATE_ZEPHYR;
  } else if (tmpl_contains("bos_token + message['role']")) {
@@ -152,7 +153,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
  return LLM_CHAT_TEMPLATE_MINICPM;
  } else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
  return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
- } else if (tmpl_contains(LU8("'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'"))) {
+ } else if (tmpl_contains(LU8("<|Assistant|>")) && tmpl_contains(LU8("<|User|>")) && tmpl_contains(LU8("<|end▁of▁sentence|>"))) {
  return LLM_CHAT_TEMPLATE_DEEPSEEK_3;
  } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
  // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
@@ -440,6 +441,14 @@ int32_t llm_chat_apply_template(
  if (add_ass) {
  ss << "<|assistant|>";
  }
+ } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<|" << role << "|>" << "\n" << message->content;
+ }
+ if (add_ass) {
+ ss << "<|assistant|>";
+ }
  } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
  // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
  for (auto message : chat) {
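
For reference, the GLM-Edge branch added above renders each message as "<|role|>", a newline, and the content, then appends "<|assistant|>" when a reply should be generated. A standalone illustration of that formatting, using plain std::string pairs instead of llama_chat_message (no llama.cpp API involved):

    // Hedged sketch mirroring the GLM-Edge formatting in the hunk above.
    #include <string>
    #include <utility>
    #include <vector>

    static std::string format_glmedge(const std::vector<std::pair<std::string, std::string>> & chat, bool add_ass) {
        std::string ss;
        for (const auto & m : chat) {
            ss += "<|" + m.first + "|>\n" + m.second; // <|role|>\n<content>
        }
        if (add_ass) {
            ss += "<|assistant|>";                    // open the assistant turn
        }
        return ss;
    }
    // format_glmedge({{"user", "Hello"}}, true) yields "<|user|>\nHello<|assistant|>"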
@@ -31,6 +31,7 @@ enum llm_chat_template {
  LLM_CHAT_TEMPLATE_LLAMA_3,
  LLM_CHAT_TEMPLATE_CHATGML_3,
  LLM_CHAT_TEMPLATE_CHATGML_4,
+ LLM_CHAT_TEMPLATE_GLMEDGE,
  LLM_CHAT_TEMPLATE_MINICPM,
  LLM_CHAT_TEMPLATE_EXAONE_3,
  LLM_CHAT_TEMPLATE_RWKV_WORLD,
@@ -560,7 +560,7 @@ bool llama_grammar_parser::parse(const char * src) {
  }
  }
  } catch (const std::exception & err) {
- fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
+ fprintf(stderr, "%s: error parsing grammar: %s\n\n%s\n", __func__, err.what(), src);
  rules.clear();
  return false;
  }
@@ -960,10 +960,28 @@ struct llama_grammar * llama_grammar_init_impl(
  // Important: vec_rules has to be moved here, not copied, because stacks contains
  // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
  // then the pointers would be invalidated when the local vec_rules goes out of scope.
- return new llama_grammar { vocab, std::move(vec_rules), std::move(stacks), {}, };
+ return new llama_grammar {
+ vocab,
+ std::move(vec_rules),
+ std::move(stacks),
+ /* .partial_utf8 = */ {},
+ /* .lazy =*/ false,
+ /* .awaiting_trigger = */ false,
+ /* .trigger_buffer = */ "",
+ /* .trigger_tokens = */ {},
+ /* .trigger_words = */ {},
+ };
  }

- struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root) {
+ struct llama_grammar * llama_grammar_init_impl(
+ const struct llama_vocab * vocab,
+ const char * grammar_str,
+ const char * grammar_root,
+ bool lazy,
+ const char ** trigger_words,
+ size_t num_trigger_words,
+ const llama_token * trigger_tokens,
+ size_t num_trigger_tokens) {
  llama_grammar_parser parser;

  // if there is a grammar, parse it
@@ -1035,10 +1053,31 @@ struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab,
  }
  } while (true);

+ std::vector<llama_token> vec_trigger_tokens;
+ std::vector<std::string> vec_trigger_words;
+ for (size_t i = 0; i < num_trigger_tokens; i++) {
+ GGML_ASSERT(trigger_tokens != nullptr);
+ vec_trigger_tokens.push_back(trigger_tokens[i]);
+ }
+ for (size_t i = 0; i < num_trigger_words; i++) {
+ GGML_ASSERT(trigger_words != nullptr);
+ vec_trigger_words.push_back(trigger_words[i]);
+ }
+
  // Important: vec_rules has to be moved here, not copied, because stacks contains
  // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
  // then the pointers would be invalidated when the local vec_rules goes out of scope.
- return new llama_grammar { vocab, std::move(vec_rules), std::move(stacks), {}, };
+ return new llama_grammar {
+ vocab,
+ std::move(vec_rules),
+ std::move(stacks),
+ /* .partial_utf8 = */ {},
+ /* .lazy = */ lazy,
+ /* .awaiting_trigger = */ lazy,
+ /* .trigger_buffer = */ "",
+ std::move(vec_trigger_tokens),
+ std::move(vec_trigger_words),
+ };
  }

  void llama_grammar_free_impl(struct llama_grammar * grammar) {
@@ -1055,6 +1094,11 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
  grammar.rules,
  grammar.stacks,
  grammar.partial_utf8,
+ grammar.lazy,
+ grammar.awaiting_trigger,
+ grammar.trigger_buffer,
+ grammar.trigger_tokens,
+ grammar.trigger_words,
  };

  // redirect elements in stacks to point to new rules
@@ -1076,6 +1120,10 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
  void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * cur_p) {
  GGML_ASSERT(grammar.vocab != nullptr);

+ if (grammar.awaiting_trigger) {
+ return;
+ }
+
  bool allow_eog = false;
  for (const auto & stack : grammar.stacks) {
  if (stack.empty()) {
@@ -1115,6 +1163,34 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
  void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
  GGML_ASSERT(grammar.vocab != nullptr);

+ const auto & piece = grammar.vocab->token_to_piece(token);
+
+ if (grammar.awaiting_trigger) {
+ if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
+ grammar.awaiting_trigger = false;
+ grammar.trigger_buffer.clear();
+ llama_grammar_accept_str(grammar, piece);
+ LLAMA_LOG_DEBUG("Grammar triggered on token %u (`%s`)", token, piece.c_str());
+ return;
+ } else {
+ // TODO: consider a smarter incremental substring search algorithm (store last position to search from).
+ grammar.trigger_buffer += piece;
+ for (const auto & word : grammar.trigger_words) {
+ auto pos = grammar.trigger_buffer.find(word);
+ if (pos != std::string::npos) {
+ grammar.awaiting_trigger = false;
+ auto constrained_str = grammar.trigger_buffer.substr(pos);
+ grammar.trigger_buffer.clear();
+ llama_grammar_accept_str(grammar, constrained_str);
+ LLAMA_LOG_DEBUG("Grammar triggered on word `%s`", word.c_str());
+ return;
+ }
+ }
+ LLAMA_LOG_DEBUG("Grammar still awaiting trigger after token %d (`%s`) (buffer: `%s`)\n", token, piece.c_str(), grammar.trigger_buffer.c_str());
+ return;
+ }
+ }
+
  if (grammar.vocab->is_eog(token)) {
  for (const auto & stack : grammar.stacks) {
  if (stack.empty()) {
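
The accept path above is the heart of the lazy-grammar mechanism: while awaiting a trigger, each token's piece is appended to trigger_buffer, and once a trigger word is found the grammar is fed everything from that word onward. A simplified standalone sketch of the buffering and search step, using plain std::string pieces rather than llama.cpp types (illustrative only):

    // Hedged sketch of the trigger-word search used by lazy grammars.
    #include <string>
    #include <vector>

    struct lazy_state {
        bool        awaiting_trigger = true;
        std::string trigger_buffer;
    };

    // Returns the text to feed to the grammar, or an empty string while still waiting.
    static std::string on_piece(lazy_state & st, const std::string & piece,
                                const std::vector<std::string> & trigger_words) {
        if (!st.awaiting_trigger) {
            return piece;                   // grammar already active: pass the piece through
        }
        st.trigger_buffer += piece;         // buffer output until a trigger word shows up
        for (const auto & word : trigger_words) {
            auto pos = st.trigger_buffer.find(word);
            if (pos != std::string::npos) {
                st.awaiting_trigger = false;
                std::string constrained = st.trigger_buffer.substr(pos); // from the trigger word onward
                st.trigger_buffer.clear();
                return constrained;
            }
        }
        return "";                          // still waiting for a trigger
    }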
@@ -1124,8 +1200,10 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
  GGML_ABORT("fatal error");
  }

- const std::string & piece = grammar.vocab->token_to_piece(token);
+ llama_grammar_accept_str(grammar, piece);
+ }

+ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string & piece) {
  // Note terminating 0 in decoded string
  const auto decoded = decode_utf8(piece, grammar.partial_utf8);
  const auto & code_points = decoded.first;
@@ -1135,5 +1213,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
  }

  grammar.partial_utf8 = decoded.second;
- GGML_ASSERT(!grammar.stacks.empty());
+ if (grammar.stacks.empty()) {
+ throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece);
+ }
  }
@@ -114,6 +114,15 @@ struct llama_grammar {

  // buffer for partially generated UTF-8 sequence from accepted tokens
  llama_partial_utf8 partial_utf8;
+
+ // lazy grammars wait for trigger words or tokens before constraining the sampling.
+ // we still have trigger_tokens for non-lazy grammars to force printing of special trigger tokens.
+ // (useful e.g. for tool_choice=required)
+ bool lazy = false;
+ bool awaiting_trigger = false; // Initialized to true for lazy grammars only
+ std::string trigger_buffer; // Output buffered by lazy grammar. Will be cleared once trigger is found.
+ std::vector<llama_token> trigger_tokens; // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
+ std::vector<std::string> trigger_words;
  };

  //
@@ -127,7 +136,15 @@ struct llama_grammar * llama_grammar_init_impl(
  size_t n_rules,
  size_t start_rule_index);

- struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root);
+ struct llama_grammar * llama_grammar_init_impl(
+ const struct llama_vocab * vocab,
+ const char * grammar_str,
+ const char * grammar_root,
+ bool lazy,
+ const char ** trigger_words,
+ size_t num_trigger_words,
+ const llama_token * trigger_tokens,
+ size_t num_trigger_tokens);

  void llama_grammar_free_impl(struct llama_grammar * grammar);

@@ -141,3 +158,7 @@ void llama_grammar_apply_impl(
  void llama_grammar_accept_impl(
  struct llama_grammar & grammar,
  llama_token token);
+
+ void llama_grammar_accept_str(
+ struct llama_grammar & grammar,
+ const std::string & piece);
@@ -7,6 +7,7 @@
  #include <cstring>
  #include <climits>
  #include <stdexcept>
+ #include <cerrno>

  #ifdef __has_include
  #if __has_include(<unistd.h>)
@@ -819,7 +819,7 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps
  for (const auto & file : files) {
  auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
  auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
- std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn()));
+ std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
  mmaps_used.emplace_back(mapping->size(), 0);
  if (mlock_mmaps) {
  std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());