@fugood/llama.node 0.3.14 → 0.3.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/package.json +1 -1
  18. package/src/llama.cpp/.github/workflows/build.yml +30 -1
  19. package/src/llama.cpp/CMakeLists.txt +9 -1
  20. package/src/llama.cpp/cmake/common.cmake +2 -0
  21. package/src/llama.cpp/common/arg.cpp +20 -2
  22. package/src/llama.cpp/common/common.cpp +6 -3
  23. package/src/llama.cpp/common/speculative.cpp +4 -4
  24. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
  25. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +1 -1
  26. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -1
  27. package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
  28. package/src/llama.cpp/examples/imatrix/imatrix.cpp +1 -1
  29. package/src/llama.cpp/examples/infill/infill.cpp +2 -2
  30. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  31. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +4 -4
  32. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +1 -1
  33. package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -6
  34. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  35. package/src/llama.cpp/examples/main/main.cpp +6 -6
  36. package/src/llama.cpp/examples/parallel/parallel.cpp +5 -5
  37. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  38. package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
  39. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
  40. package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
  41. package/src/llama.cpp/examples/run/run.cpp +91 -46
  42. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
  43. package/src/llama.cpp/examples/server/server.cpp +32 -15
  44. package/src/llama.cpp/examples/server/utils.hpp +3 -1
  45. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  46. package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
  47. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  48. package/src/llama.cpp/examples/tts/tts.cpp +12 -9
  49. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  50. package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
  51. package/src/llama.cpp/ggml/include/ggml.h +24 -0
  52. package/src/llama.cpp/ggml/src/CMakeLists.txt +5 -27
  53. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
  54. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +0 -5
  55. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +15 -7
  56. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +150 -1
  57. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +253 -2
  58. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
  59. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
  60. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +7 -0
  61. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -4
  62. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +95 -22
  63. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +3 -0
  64. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -1
  65. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +66 -26
  66. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  67. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +12 -13
  68. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
  69. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -2
  70. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +103 -34
  71. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
  72. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +19 -20
  73. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
  74. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
  75. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -1
  76. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
  77. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
  78. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +352 -146
  79. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
  80. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +3 -0
  81. package/src/llama.cpp/ggml/src/ggml.c +85 -2
  82. package/src/llama.cpp/include/llama.h +86 -22
  83. package/src/llama.cpp/src/CMakeLists.txt +5 -2
  84. package/src/llama.cpp/src/llama-adapter.cpp +19 -20
  85. package/src/llama.cpp/src/llama-adapter.h +11 -9
  86. package/src/llama.cpp/src/llama-arch.cpp +102 -16
  87. package/src/llama.cpp/src/llama-arch.h +18 -0
  88. package/src/llama.cpp/src/llama-batch.h +2 -2
  89. package/src/llama.cpp/src/llama-context.cpp +2253 -1222
  90. package/src/llama.cpp/src/llama-context.h +214 -77
  91. package/src/llama.cpp/src/llama-cparams.h +1 -0
  92. package/src/llama.cpp/src/llama-graph.cpp +1662 -0
  93. package/src/llama.cpp/src/llama-graph.h +574 -0
  94. package/src/llama.cpp/src/llama-hparams.cpp +8 -0
  95. package/src/llama.cpp/src/llama-hparams.h +9 -0
  96. package/src/llama.cpp/src/llama-io.cpp +15 -0
  97. package/src/llama.cpp/src/llama-io.h +35 -0
  98. package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
  99. package/src/llama.cpp/src/llama-kv-cache.h +178 -110
  100. package/src/llama.cpp/src/llama-memory.cpp +1 -0
  101. package/src/llama.cpp/src/llama-memory.h +21 -0
  102. package/src/llama.cpp/src/llama-model.cpp +8207 -163
  103. package/src/llama.cpp/src/llama-model.h +34 -1
  104. package/src/llama.cpp/src/llama-quant.cpp +10 -1
  105. package/src/llama.cpp/src/llama.cpp +51 -9984
  106. package/src/llama.cpp/tests/test-backend-ops.cpp +88 -9
  107. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
  108. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
@@ -79,6 +79,7 @@ class Opt {
  ctx_params = llama_context_default_params();
  model_params = llama_model_default_params();
  context_size_default = ctx_params.n_batch;
+ n_threads_default = ctx_params.n_threads;
  ngl_default = model_params.n_gpu_layers;
  common_params_sampling sampling;
  temperature_default = sampling.temp;
@@ -104,6 +105,7 @@ class Opt {

  ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default;
  ctx_params.n_ctx = ctx_params.n_batch;
+ ctx_params.n_threads = ctx_params.n_threads_batch = n_threads >= 0 ? n_threads : n_threads_default;
  model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default;
  temperature = temperature >= 0 ? temperature : temperature_default;

@@ -116,12 +118,12 @@ class Opt {
  std::string chat_template_file;
  std::string user;
  bool use_jinja = false;
- int context_size = -1, ngl = -1;
+ int context_size = -1, ngl = -1, n_threads = -1;
  float temperature = -1;
  bool verbose = false;

  private:
- int context_size_default = -1, ngl_default = -1;
+ int context_size_default = -1, ngl_default = -1, n_threads_default = -1;
  float temperature_default = -1;
  bool help = false;

@@ -159,53 +161,94 @@ class Opt {
  return 0;
  }

+ int parse_options_with_value(int argc, const char ** argv, int & i, bool & options_parsing) {
+ if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
+ if (handle_option_with_value(argc, argv, i, context_size) == 1) {
+ return 1;
+ }
+ } else if (options_parsing &&
+ (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) {
+ if (handle_option_with_value(argc, argv, i, ngl) == 1) {
+ return 1;
+ }
+ } else if (options_parsing && (strcmp(argv[i], "-t") == 0 || strcmp(argv[i], "--threads") == 0)) {
+ if (handle_option_with_value(argc, argv, i, n_threads) == 1) {
+ return 1;
+ }
+ } else if (options_parsing && strcmp(argv[i], "--temp") == 0) {
+ if (handle_option_with_value(argc, argv, i, temperature) == 1) {
+ return 1;
+ }
+ } else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0) {
+ if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) {
+ return 1;
+ }
+ use_jinja = true;
+ } else {
+ return 2;
+ }
+
+ return 0;
+ }
+
+ int parse_options(const char ** argv, int & i, bool & options_parsing) {
+ if (options_parsing && (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
+ verbose = true;
+ } else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
+ use_jinja = true;
+ } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
+ help = true;
+ return 0;
+ } else if (options_parsing && strcmp(argv[i], "--") == 0) {
+ options_parsing = false;
+ } else {
+ return 2;
+ }
+
+ return 0;
+ }
+
+ int parse_positional_args(const char ** argv, int & i, int & positional_args_i) {
+ if (positional_args_i == 0) {
+ if (!argv[i][0] || argv[i][0] == '-') {
+ return 1;
+ }
+
+ ++positional_args_i;
+ model_ = argv[i];
+ } else if (positional_args_i == 1) {
+ ++positional_args_i;
+ user = argv[i];
+ } else {
+ user += " " + std::string(argv[i]);
+ }
+
+ return 0;
+ }
+
  int parse(int argc, const char ** argv) {
  bool options_parsing = true;
  for (int i = 1, positional_args_i = 0; i < argc; ++i) {
- if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
- if (handle_option_with_value(argc, argv, i, context_size) == 1) {
- return 1;
- }
- } else if (options_parsing &&
- (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) {
- if (handle_option_with_value(argc, argv, i, ngl) == 1) {
- return 1;
- }
- } else if (options_parsing && strcmp(argv[i], "--temp") == 0) {
- if (handle_option_with_value(argc, argv, i, temperature) == 1) {
- return 1;
- }
- } else if (options_parsing &&
- (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
- verbose = true;
- } else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
- use_jinja = true;
- } else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0){
- if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) {
- return 1;
- }
- use_jinja = true;
- } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
- help = true;
- return 0;
- } else if (options_parsing && strcmp(argv[i], "--") == 0) {
- options_parsing = false;
- } else if (positional_args_i == 0) {
- if (!argv[i][0] || argv[i][0] == '-') {
- return 1;
- }
-
- ++positional_args_i;
- model_ = argv[i];
- } else if (positional_args_i == 1) {
- ++positional_args_i;
- user = argv[i];
- } else {
- user += " " + std::string(argv[i]);
+ int ret = parse_options_with_value(argc, argv, i, options_parsing);
+ if (ret == 0) {
+ continue;
+ } else if (ret == 1) {
+ return ret;
+ }
+
+ ret = parse_options(argv, i, options_parsing);
+ if (ret == 0) {
+ continue;
+ } else if (ret == 1) {
+ return ret;
+ }
+
+ if (parse_positional_args(argv, i, positional_args_i)) {
+ return 1;
  }
  }

- if (model_.empty()){
+ if (model_.empty()) {
  return 1;
  }

@@ -232,6 +275,8 @@ class Opt {
  " Number of GPU layers (default: %d)\n"
  " --temp <value>\n"
  " Temperature (default: %.1f)\n"
+ " -t, --threads <value>\n"
+ " Number of threads to use during generation (default: %d)\n"
  " -v, --verbose, --log-verbose\n"
  " Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n"
  " -h, --help\n"
@@ -260,7 +305,7 @@ class Opt {
  " llama-run file://some-file3.gguf\n"
  " llama-run --ngl 999 some-file4.gguf\n"
  " llama-run --ngl 999 some-file5.gguf Hello World\n",
- context_size_default, ngl_default, temperature_default);
+ context_size_default, ngl_default, temperature_default, n_threads_default);
  }
  };

@@ -891,7 +936,7 @@ static int apply_chat_template(const struct common_chat_templates * tmpls, Llama
  // Function to tokenize the prompt
  static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt,
  std::vector<llama_token> & prompt_tokens, const LlamaData & llama_data) {
- const bool is_first = llama_get_kv_cache_used_cells(llama_data.context.get()) == 0;
+ const bool is_first = llama_kv_self_used_cells(llama_data.context.get()) == 0;

  const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
  prompt_tokens.resize(n_prompt_tokens);
@@ -907,7 +952,7 @@ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt
  // Check if we have enough space in the context to evaluate this batch
  static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
  const int n_ctx = llama_n_ctx(ctx.get());
- const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get());
+ const int n_ctx_used = llama_kv_self_used_cells(ctx.get());
  if (n_ctx_used + batch.n_tokens > n_ctx) {
  printf(LOG_COL_DEFAULT "\n");
  printe("context size exceeded\n");
@@ -15,7 +15,7 @@ int main(int argc, char ** argv) {
  return 1;
  }

- print_build_info();
+ common_init();

  if (params.n_predict < 0) {
  params.n_predict = 16;
@@ -196,7 +196,7 @@ int main(int argc, char ** argv) {
  fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);

  // erase whole kv
- llama_kv_cache_clear(ctx3);
+ llama_kv_self_clear(ctx3);
  fprintf(stderr, "%s : kv cache cleared\n", __func__);

  // restore kv into seq 1
@@ -1872,6 +1872,10 @@ struct server_context {
  params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
  params_dft.n_parallel = 1;

+ // force F16 KV cache for the draft model for extra performance
+ params_dft.cache_type_k = GGML_TYPE_F16;
+ params_dft.cache_type_v = GGML_TYPE_F16;
+
  llama_init_dft = common_init_from_params(params_dft);

  model_dft = llama_init_dft.model.get();
@@ -1892,10 +1896,6 @@ struct server_context {
  cparams_dft = common_context_params_to_llama(params_dft);
  cparams_dft.n_batch = n_ctx_dft;

- // force F16 KV cache for the draft model for extra performance
- cparams_dft.type_k = GGML_TYPE_F16;
- cparams_dft.type_v = GGML_TYPE_F16;
-
  // the context is not needed - we will create one for each slot
  llama_init_dft.context.reset();
  }
@@ -2040,6 +2040,18 @@ struct server_context {
  return ret;
  }

+ bool can_be_detokenized(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+ const int32_t n_vocab = llama_vocab_n_tokens(vocab);
+ for (const auto & token : tokens) {
+ if (token < 0 || token >= n_vocab) {
+ return false;
+ }
+ }
+ return true;
+ }
+
  bool launch_slot_with_task(server_slot & slot, const server_task & task) {
  slot.reset();
  slot.id_task = task.id;
@@ -2054,6 +2066,11 @@ struct server_context {
  slot.lora = task.params.lora;
  }

+ bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
+ if (!can_detokenize) {
+ send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST);
+ return false;
+ }
  SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());

  if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
@@ -2096,7 +2113,7 @@ struct server_context {
  SRV_DBG("%s", "clearing KV cache\n");

  // clear the entire KV cache
- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);
  clean_kv_cache = false;
  }

@@ -2638,8 +2655,8 @@ struct server_context {
  res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size();
  res->t_start = metrics.t_start;

- res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx);
- res->kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx);
+ res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx);
+ res->kv_cache_used_cells = llama_kv_self_used_cells(ctx);

  res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total;
  res->t_prompt_processing_total = metrics.t_prompt_processing_total;
@@ -2755,7 +2772,7 @@ struct server_context {

  // Erase token cache
  const size_t n_erased = slot->cache_tokens.size();
- llama_kv_cache_seq_rm(ctx, slot->id, -1, -1);
+ llama_kv_self_seq_rm(ctx, slot->id, -1, -1);
  slot->cache_tokens.clear();

  auto res = std::make_unique<server_task_result_slot_erase>();
@@ -2823,8 +2840,8 @@ struct server_context {

  SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);

- llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
- llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
+ llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
+ llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);

  if (slot.params.cache_prompt) {
  for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
@@ -3015,8 +3032,8 @@ struct server_context {

  const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;

- llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
- llama_kv_cache_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
+ llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c);
+ llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);

  for (size_t i = 0; i < n_match; i++) {
  slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
@@ -3054,9 +3071,9 @@ struct server_context {
  }

  // keep only the common part
- if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
+ if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) {
  // could not partially delete (likely using a non-Transformer model)
- llama_kv_cache_seq_rm(ctx, slot.id, -1, -1);
+ llama_kv_self_seq_rm(ctx, slot.id, -1, -1);

  // there is no common part left
  slot.n_past = 0;
@@ -3296,7 +3313,7 @@ struct server_context {
  slot.cache_tokens.push_back(id);
  slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);

- llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
+ llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1);

  for (size_t i = 0; i < ids.size(); ++i) {
  completion_token_output result;
@@ -621,7 +621,9 @@ static json oaicompat_completion_params_parse(

  llama_params["chat_format"] = static_cast<int>(chat_params.format);
  llama_params["prompt"] = chat_params.prompt;
- llama_params["grammar"] = chat_params.grammar;
+ if (!chat_params.grammar.empty()) {
+ llama_params["grammar"] = chat_params.grammar;
+ }
  llama_params["grammar_lazy"] = chat_params.grammar_lazy;
  auto grammar_triggers = json::array();
  for (const auto & trigger : chat_params.grammar_triggers) {
@@ -98,7 +98,7 @@ int main(int argc, char ** argv) {
  auto generate = [&](const std::string & prompt) {
  std::string response;

- const bool is_first = llama_get_kv_cache_used_cells(ctx) == 0;
+ const bool is_first = llama_kv_self_used_cells(ctx) == 0;

  // tokenize the prompt
  const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
@@ -113,7 +113,7 @@ int main(int argc, char ** argv) {
  while (true) {
  // check if we have enough space in the context to evaluate this batch
  int n_ctx = llama_n_ctx(ctx);
- int n_ctx_used = llama_get_kv_cache_used_cells(ctx);
+ int n_ctx_used = llama_kv_self_used_cells(ctx);
  if (n_ctx_used + batch.n_tokens > n_ctx) {
  printf("\033[0m\n");
  fprintf(stderr, "context size exceeded\n");
@@ -331,11 +331,11 @@ int main(int argc, char ** argv) {
  }

  active_seqs.erase(s);
- for(int i = 0; i < n_seq_dft; i++) {
+ for (int i = 0; i < n_seq_dft; i++) {
  if (i == s) {
  continue;
  }
- if (drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) {
+ if (drafts[i].active && drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) {
  // synchronize active status for sequences with the same drafted token
  drafts[i].active = drafts[i].active && accept;
  if (!drafts[i].active) {
@@ -420,14 +420,14 @@ int main(int argc, char ** argv) {
  {
  LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);

- llama_kv_cache_seq_keep(ctx_dft, s_keep);
- llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1);
- llama_kv_cache_seq_keep(ctx_dft, 0);
+ llama_kv_self_seq_keep(ctx_dft, s_keep);
+ llama_kv_self_seq_cp (ctx_dft, s_keep, 0, -1, -1);
+ llama_kv_self_seq_keep(ctx_dft, 0);

- llama_kv_cache_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1);
- llama_kv_cache_seq_keep(ctx_tgt, s_keep);
- llama_kv_cache_seq_cp (ctx_tgt, s_keep, 0, -1, -1);
- llama_kv_cache_seq_keep(ctx_tgt, 0);
+ llama_kv_self_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1);
+ llama_kv_self_seq_keep(ctx_tgt, s_keep);
+ llama_kv_self_seq_cp (ctx_tgt, s_keep, 0, -1, -1);
+ llama_kv_self_seq_keep(ctx_tgt, 0);
  }

  for (int s = 0; s < n_seq_dft; ++s) {
@@ -444,7 +444,7 @@ int main(int argc, char ** argv) {
  common_batch_clear(batch_dft);
  common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);

- llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
+ llama_kv_self_seq_rm(ctx_dft, 0, n_past_dft, -1);
  // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
  llama_decode(ctx_dft, batch_dft);

@@ -503,8 +503,8 @@ int main(int argc, char ** argv) {
  if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) {
  LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);

- llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1);
- llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
+ llama_kv_self_seq_rm(ctx_dft, n_seq_cur, -1, -1);
+ llama_kv_self_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);

  // all previous tokens from this branch are now also part of the new branch
  for (int t = 0; t < batch_tgt.n_tokens; ++t) {
@@ -585,9 +585,9 @@ int main(int argc, char ** argv) {

  // evaluate the target model on the drafted tokens
  {
- llama_kv_cache_seq_keep(ctx_tgt, 0);
+ llama_kv_self_seq_keep(ctx_tgt, 0);
  for (int s = 1; s < n_seq_dft; ++s) {
- llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
+ llama_kv_self_seq_cp(ctx_tgt, 0, s, -1, -1);
  }

  // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
@@ -217,7 +217,7 @@ int main(int argc, char ** argv) {
  {
  LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);

- llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1);
+ llama_kv_self_seq_rm(ctx_tgt, 0, n_past, -1);
  }

  if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
@@ -87,11 +87,11 @@ struct wav_header {
  uint32_t data_size;
  };

- static void save_wav16(const std::string & fname, const std::vector<float> & data, int sample_rate) {
+ static bool save_wav16(const std::string & fname, const std::vector<float> & data, int sample_rate) {
  std::ofstream file(fname, std::ios::binary);
  if (!file) {
- LOG_ERR("%s: Failed to open file '%s' for writing", __func__, fname.c_str());
- return;
+ LOG_ERR("%s: Failed to open file '%s' for writing.\n", __func__, fname.c_str());
+ return false;
  }

  wav_header header;
@@ -108,7 +108,7 @@ static void save_wav16(const std::string & fname, const std::vector<float> & dat
  file.write(reinterpret_cast<const char*>(&pcm_sample), sizeof(pcm_sample));
  }

- file.close();
+ return file.good();
  }

  static void fill_hann_window(int length, bool periodic, float * output) {
@@ -536,6 +536,7 @@ static std::string audio_data_from_speaker(json speaker, const outetts_version t
  int main(int argc, char ** argv) {
  common_params params;

+ params.out_file = "output.wav";
  params.prompt = "";

  params.n_predict = 4096;
@@ -1060,8 +1061,6 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
  }
  #endif

- const std::string fname = "output.wav";
-
  const int n_sr = 24000; // sampling rate

  // zero out first 0.25 seconds
@@ -1072,11 +1071,15 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
  LOG_INF("%s: time for spectral ops: %.3f ms\n", __func__, (ggml_time_us() - t_spec_start) / 1000.0f);
  LOG_INF("%s: total time: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f);

- save_wav16(fname, audio, n_sr);
+ int retval = 0;

- LOG_INF("%s: audio written to file '%s'\n", __func__, fname.c_str());
+ if (save_wav16(params.out_file, audio, n_sr)) {
+ LOG_INF("%s: audio written to file '%s'\n", __func__, params.out_file.c_str());
+ } else {
+ retval = ENOENT;
+ }

  llama_backend_free();

- return 0;
+ return retval;
  }
@@ -186,6 +186,7 @@ option(GGML_OPENMP "ggml: use OpenMP"
  option(GGML_RPC "ggml: use RPC" OFF)
  option(GGML_SYCL "ggml: use SYCL" OFF)
  option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
+ option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON)
  set (GGML_SYCL_TARGET "INTEL" CACHE STRING
  "ggml: sycl target device")
  set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
@@ -0,0 +1,26 @@
+ function(ggml_get_flags CCID CCVER)
+ set(C_FLAGS "")
+ set(CXX_FLAGS "")
+
+ if (CCID MATCHES "Clang")
+ set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return)
+ set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
+
+ if (
+ (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
+ (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
+ )
+ list(APPEND C_FLAGS -Wdouble-promotion)
+ endif()
+ elseif (CCID STREQUAL "GNU")
+ set(C_FLAGS -Wdouble-promotion)
+ set(CXX_FLAGS -Wno-array-bounds)
+
+ if (CCVER VERSION_GREATER_EQUAL 8.1.0)
+ list(APPEND CXX_FLAGS -Wextra-semi)
+ endif()
+ endif()
+
+ set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
+ set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
+ endfunction()
@@ -454,6 +454,7 @@ extern "C" {
  GGML_OP_RMS_NORM,
  GGML_OP_RMS_NORM_BACK,
  GGML_OP_GROUP_NORM,
+ GGML_OP_L2_NORM,

  GGML_OP_MUL_MAT,
  GGML_OP_MUL_MAT_ID,
@@ -502,6 +503,7 @@ extern "C" {
  GGML_OP_ADD_REL_POS,
  GGML_OP_RWKV_WKV6,
  GGML_OP_GATED_LINEAR_ATTN,
+ GGML_OP_RWKV_WKV7,

  GGML_OP_UNARY,

@@ -1095,6 +1097,18 @@ extern "C" {
  int n_groups,
  float eps);

+ // l2 normalize along rows
+ // used in rwkv v7
+ GGML_API struct ggml_tensor * ggml_l2_norm(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ float eps);
+
+ GGML_API struct ggml_tensor * ggml_l2_norm_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ float eps);
+
  // a - x
  // b - dy
  GGML_API struct ggml_tensor * ggml_rms_norm_back(
@@ -1890,6 +1904,16 @@ extern "C" {
  struct ggml_tensor * state,
  float scale);

+ GGML_API struct ggml_tensor * ggml_rwkv_wkv7(
+ struct ggml_context * ctx,
+ struct ggml_tensor * r,
+ struct ggml_tensor * w,
+ struct ggml_tensor * k,
+ struct ggml_tensor * v,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ struct ggml_tensor * state);
+
  // custom operators

  typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
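
The two ggml.h hunks above add ggml_l2_norm / ggml_l2_norm_inplace (row-wise L2 normalization, per the header comment used by RWKV v7) and the ggml_rwkv_wkv7 operator. A minimal sketch of wiring the new ggml_l2_norm op into a graph, assuming a plain ggml context; the tensor size and eps are placeholders, and graph execution (backend-specific) is omitted:

// hedged sketch: build, but do not execute, a graph containing the new ggml_l2_norm op
#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16 * 1024 * 1024,   // arbitrary scratch size
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    // 8 rows of 4 floats; ggml_l2_norm normalizes each row to unit L2 length
    struct ggml_tensor * a   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 8);
    struct ggml_tensor * out = ggml_l2_norm(ctx, a, 1e-6f);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);

    // executing gf requires a backend (e.g. ggml_backend_graph_compute), not shown here
    ggml_free(ctx);
    return 0;
}
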
@@ -1,4 +1,5 @@
  include(CheckCXXCompilerFlag)
+ include("../cmake/common.cmake")

  add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES})

@@ -24,33 +25,6 @@ if (NOT MSVC)
  endif()
  endif()

- function(ggml_get_flags CCID CCVER)
- set(C_FLAGS "")
- set(CXX_FLAGS "")
-
- if (CCID MATCHES "Clang")
- set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return)
- set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
-
- if (
- (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
- (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
- )
- list(APPEND C_FLAGS -Wdouble-promotion)
- endif()
- elseif (CCID STREQUAL "GNU")
- set(C_FLAGS -Wdouble-promotion)
- set(CXX_FLAGS -Wno-array-bounds)
-
- if (CCVER VERSION_GREATER_EQUAL 8.1.0)
- list(APPEND CXX_FLAGS -Wextra-semi)
- endif()
- endif()
-
- set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
- set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
- endfunction()
-
  if (GGML_FATAL_WARNINGS)
  if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
  list(APPEND C_FLAGS -Werror)
@@ -351,6 +325,10 @@ if (CMAKE_SYSTEM_NAME MATCHES "Android")
  target_link_libraries(ggml-base PRIVATE dl)
  endif()

+ if(CMAKE_SYSTEM_NAME MATCHES "visionOS")
+ target_compile_definitions(ggml-base PUBLIC _DARWIN_C_SOURCE)
+ endif()
+
  if (BUILD_SHARED_LIBS)
  foreach (target ggml-base ggml)
  set_target_properties(${target} PROPERTIES POSITION_INDEPENDENT_CODE ON)