@fugood/llama.node 0.3.14 → 0.3.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/package.json +1 -1
  18. package/src/llama.cpp/.github/workflows/build.yml +30 -1
  19. package/src/llama.cpp/CMakeLists.txt +9 -1
  20. package/src/llama.cpp/cmake/common.cmake +2 -0
  21. package/src/llama.cpp/common/arg.cpp +20 -2
  22. package/src/llama.cpp/common/common.cpp +6 -3
  23. package/src/llama.cpp/common/speculative.cpp +4 -4
  24. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
  25. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +1 -1
  26. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -1
  27. package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
  28. package/src/llama.cpp/examples/imatrix/imatrix.cpp +1 -1
  29. package/src/llama.cpp/examples/infill/infill.cpp +2 -2
  30. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  31. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +4 -4
  32. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +1 -1
  33. package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -6
  34. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  35. package/src/llama.cpp/examples/main/main.cpp +6 -6
  36. package/src/llama.cpp/examples/parallel/parallel.cpp +5 -5
  37. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  38. package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
  39. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
  40. package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
  41. package/src/llama.cpp/examples/run/run.cpp +91 -46
  42. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
  43. package/src/llama.cpp/examples/server/server.cpp +37 -15
  44. package/src/llama.cpp/examples/server/utils.hpp +3 -1
  45. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  46. package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
  47. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  48. package/src/llama.cpp/examples/tts/tts.cpp +20 -9
  49. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  50. package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
  51. package/src/llama.cpp/ggml/include/ggml.h +24 -0
  52. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -28
  53. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
  54. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +0 -5
  55. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +15 -7
  56. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +1493 -12
  57. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +150 -1
  58. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +284 -29
  59. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
  60. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
  61. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +7 -0
  62. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -4
  63. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +95 -22
  64. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +35 -12
  65. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -1
  66. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +93 -27
  67. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  68. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +12 -13
  69. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
  70. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +12 -43
  71. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -2
  72. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +109 -40
  73. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
  74. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +19 -20
  75. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
  76. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
  77. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -1
  78. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
  79. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
  80. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +398 -158
  81. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
  82. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +7 -2
  83. package/src/llama.cpp/ggml/src/ggml.c +85 -2
  84. package/src/llama.cpp/include/llama.h +86 -22
  85. package/src/llama.cpp/src/CMakeLists.txt +5 -2
  86. package/src/llama.cpp/src/llama-adapter.cpp +19 -20
  87. package/src/llama.cpp/src/llama-adapter.h +11 -9
  88. package/src/llama.cpp/src/llama-arch.cpp +103 -16
  89. package/src/llama.cpp/src/llama-arch.h +18 -0
  90. package/src/llama.cpp/src/llama-batch.h +2 -2
  91. package/src/llama.cpp/src/llama-context.cpp +2253 -1222
  92. package/src/llama.cpp/src/llama-context.h +214 -77
  93. package/src/llama.cpp/src/llama-cparams.h +1 -0
  94. package/src/llama.cpp/src/llama-graph.cpp +1662 -0
  95. package/src/llama.cpp/src/llama-graph.h +574 -0
  96. package/src/llama.cpp/src/llama-hparams.cpp +8 -0
  97. package/src/llama.cpp/src/llama-hparams.h +9 -0
  98. package/src/llama.cpp/src/llama-io.cpp +15 -0
  99. package/src/llama.cpp/src/llama-io.h +35 -0
  100. package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
  101. package/src/llama.cpp/src/llama-kv-cache.h +178 -110
  102. package/src/llama.cpp/src/llama-memory.cpp +1 -0
  103. package/src/llama.cpp/src/llama-memory.h +21 -0
  104. package/src/llama.cpp/src/llama-model.cpp +8244 -173
  105. package/src/llama.cpp/src/llama-model.h +34 -1
  106. package/src/llama.cpp/src/llama-quant.cpp +10 -1
  107. package/src/llama.cpp/src/llama.cpp +51 -9984
  108. package/src/llama.cpp/tests/test-backend-ops.cpp +145 -23
  109. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
  110. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
@@ -79,6 +79,7 @@ class Opt {
  ctx_params = llama_context_default_params();
  model_params = llama_model_default_params();
  context_size_default = ctx_params.n_batch;
+ n_threads_default = ctx_params.n_threads;
  ngl_default = model_params.n_gpu_layers;
  common_params_sampling sampling;
  temperature_default = sampling.temp;
@@ -104,6 +105,7 @@ class Opt {

  ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default;
  ctx_params.n_ctx = ctx_params.n_batch;
+ ctx_params.n_threads = ctx_params.n_threads_batch = n_threads >= 0 ? n_threads : n_threads_default;
  model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default;
  temperature = temperature >= 0 ? temperature : temperature_default;

@@ -116,12 +118,12 @@ class Opt {
  std::string chat_template_file;
  std::string user;
  bool use_jinja = false;
- int context_size = -1, ngl = -1;
+ int context_size = -1, ngl = -1, n_threads = -1;
  float temperature = -1;
  bool verbose = false;

  private:
- int context_size_default = -1, ngl_default = -1;
+ int context_size_default = -1, ngl_default = -1, n_threads_default = -1;
  float temperature_default = -1;
  bool help = false;

@@ -159,53 +161,94 @@ class Opt {
  return 0;
  }

+ int parse_options_with_value(int argc, const char ** argv, int & i, bool & options_parsing) {
+ if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
+ if (handle_option_with_value(argc, argv, i, context_size) == 1) {
+ return 1;
+ }
+ } else if (options_parsing &&
+ (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) {
+ if (handle_option_with_value(argc, argv, i, ngl) == 1) {
+ return 1;
+ }
+ } else if (options_parsing && (strcmp(argv[i], "-t") == 0 || strcmp(argv[i], "--threads") == 0)) {
+ if (handle_option_with_value(argc, argv, i, n_threads) == 1) {
+ return 1;
+ }
+ } else if (options_parsing && strcmp(argv[i], "--temp") == 0) {
+ if (handle_option_with_value(argc, argv, i, temperature) == 1) {
+ return 1;
+ }
+ } else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0) {
+ if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) {
+ return 1;
+ }
+ use_jinja = true;
+ } else {
+ return 2;
+ }
+
+ return 0;
+ }
+
+ int parse_options(const char ** argv, int & i, bool & options_parsing) {
+ if (options_parsing && (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
+ verbose = true;
+ } else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
+ use_jinja = true;
+ } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
+ help = true;
+ return 0;
+ } else if (options_parsing && strcmp(argv[i], "--") == 0) {
+ options_parsing = false;
+ } else {
+ return 2;
+ }
+
+ return 0;
+ }
+
+ int parse_positional_args(const char ** argv, int & i, int & positional_args_i) {
+ if (positional_args_i == 0) {
+ if (!argv[i][0] || argv[i][0] == '-') {
+ return 1;
+ }
+
+ ++positional_args_i;
+ model_ = argv[i];
+ } else if (positional_args_i == 1) {
+ ++positional_args_i;
+ user = argv[i];
+ } else {
+ user += " " + std::string(argv[i]);
+ }
+
+ return 0;
+ }
+
  int parse(int argc, const char ** argv) {
  bool options_parsing = true;
  for (int i = 1, positional_args_i = 0; i < argc; ++i) {
- if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
- if (handle_option_with_value(argc, argv, i, context_size) == 1) {
- return 1;
- }
- } else if (options_parsing &&
- (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) {
- if (handle_option_with_value(argc, argv, i, ngl) == 1) {
- return 1;
- }
- } else if (options_parsing && strcmp(argv[i], "--temp") == 0) {
- if (handle_option_with_value(argc, argv, i, temperature) == 1) {
- return 1;
- }
- } else if (options_parsing &&
- (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
- verbose = true;
- } else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
- use_jinja = true;
- } else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0){
- if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) {
- return 1;
- }
- use_jinja = true;
- } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
- help = true;
- return 0;
- } else if (options_parsing && strcmp(argv[i], "--") == 0) {
- options_parsing = false;
- } else if (positional_args_i == 0) {
- if (!argv[i][0] || argv[i][0] == '-') {
- return 1;
- }
-
- ++positional_args_i;
- model_ = argv[i];
- } else if (positional_args_i == 1) {
- ++positional_args_i;
- user = argv[i];
- } else {
- user += " " + std::string(argv[i]);
+ int ret = parse_options_with_value(argc, argv, i, options_parsing);
+ if (ret == 0) {
+ continue;
+ } else if (ret == 1) {
+ return ret;
+ }
+
+ ret = parse_options(argv, i, options_parsing);
+ if (ret == 0) {
+ continue;
+ } else if (ret == 1) {
+ return ret;
+ }
+
+ if (parse_positional_args(argv, i, positional_args_i)) {
+ return 1;
  }
  }

- if (model_.empty()){
+ if (model_.empty()) {
  return 1;
  }

@@ -232,6 +275,8 @@ class Opt {
  " Number of GPU layers (default: %d)\n"
  " --temp <value>\n"
  " Temperature (default: %.1f)\n"
+ " -t, --threads <value>\n"
+ " Number of threads to use during generation (default: %d)\n"
  " -v, --verbose, --log-verbose\n"
  " Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n"
  " -h, --help\n"
@@ -260,7 +305,7 @@ class Opt {
  " llama-run file://some-file3.gguf\n"
  " llama-run --ngl 999 some-file4.gguf\n"
  " llama-run --ngl 999 some-file5.gguf Hello World\n",
- context_size_default, ngl_default, temperature_default);
+ context_size_default, ngl_default, temperature_default, n_threads_default);
  }
  };
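The new -t/--threads option in llama-run simply forwards the requested thread count into llama_context_params before the context is created. The following is an illustrative sketch of that wiring, not part of the diff; the helper name, the model argument, and the use of llama_init_from_model are assumptions:

#include "llama.h"

// Sketch: map a user-supplied thread count onto llama_context_params,
// mirroring the n_threads / n_threads_default handling added above.
// A negative value falls back to the library default.
static llama_context * init_ctx_with_threads(llama_model * model, int n_threads) {
    llama_context_params cparams = llama_context_default_params();

    const int n_threads_default = cparams.n_threads;
    const int n = n_threads >= 0 ? n_threads : n_threads_default;

    cparams.n_threads       = n; // threads used for single-token generation
    cparams.n_threads_batch = n; // threads used for batch/prompt processing

    return llama_init_from_model(model, cparams); // may return nullptr on failure
}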

@@ -891,7 +936,7 @@ static int apply_chat_template(const struct common_chat_templates * tmpls, Llama
  // Function to tokenize the prompt
  static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt,
  std::vector<llama_token> & prompt_tokens, const LlamaData & llama_data) {
- const bool is_first = llama_get_kv_cache_used_cells(llama_data.context.get()) == 0;
+ const bool is_first = llama_kv_self_used_cells(llama_data.context.get()) == 0;

  const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
  prompt_tokens.resize(n_prompt_tokens);
@@ -907,7 +952,7 @@ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt
  // Check if we have enough space in the context to evaluate this batch
  static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
  const int n_ctx = llama_n_ctx(ctx.get());
- const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get());
+ const int n_ctx_used = llama_kv_self_used_cells(ctx.get());
  if (n_ctx_used + batch.n_tokens > n_ctx) {
  printf(LOG_COL_DEFAULT "\n");
  printe("context size exceeded\n");
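This release also picks up the llama.cpp KV-cache API rename: the llama_kv_cache_* / llama_get_kv_cache_* helpers become llama_kv_self_*, as seen here and throughout the example and server diffs below. A minimal sketch of the renamed calls, illustrative only and assuming a valid context created elsewhere:

#include "llama.h"

// Sketch of the renamed KV-cache ("self") API used throughout this diff.
static void kv_self_sketch(llama_context * ctx) {
    const int32_t n_used = llama_kv_self_used_cells(ctx); // was llama_get_kv_cache_used_cells

    // drop sequence 0 from position n_used/2 onward; fall back to a full clear
    // if the cache cannot be partially erased (e.g. non-Transformer models)
    if (!llama_kv_self_seq_rm(ctx, 0, n_used / 2, -1)) {
        llama_kv_self_clear(ctx);                          // was llama_kv_cache_clear
    }

    // copy the remainder of sequence 0 into sequence 1
    llama_kv_self_seq_cp(ctx, 0, 1, -1, -1);               // was llama_kv_cache_seq_cp
}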
@@ -15,7 +15,7 @@ int main(int argc, char ** argv) {
  return 1;
  }

- print_build_info();
+ common_init();

  if (params.n_predict < 0) {
  params.n_predict = 16;
@@ -196,7 +196,7 @@ int main(int argc, char ** argv) {
  fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);

  // erase whole kv
- llama_kv_cache_clear(ctx3);
+ llama_kv_self_clear(ctx3);
  fprintf(stderr, "%s : kv cache cleared\n", __func__);

  // restore kv into seq 1
@@ -830,6 +830,11 @@ struct server_task_result_cmpl_final : server_task_result {
  ret.push_back({"timings", timings.to_json()});
  }

+ // extra fields for debugging purposes
+ if (verbose) {
+ ret["__verbose"] = to_json_non_oaicompat();
+ }
+
  return ret;
  }
  };
@@ -1872,6 +1877,10 @@ struct server_context {
  params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
  params_dft.n_parallel = 1;

+ // force F16 KV cache for the draft model for extra performance
+ params_dft.cache_type_k = GGML_TYPE_F16;
+ params_dft.cache_type_v = GGML_TYPE_F16;
+
  llama_init_dft = common_init_from_params(params_dft);

  model_dft = llama_init_dft.model.get();
@@ -1892,10 +1901,6 @@ struct server_context {
  cparams_dft = common_context_params_to_llama(params_dft);
  cparams_dft.n_batch = n_ctx_dft;

- // force F16 KV cache for the draft model for extra performance
- cparams_dft.type_k = GGML_TYPE_F16;
- cparams_dft.type_v = GGML_TYPE_F16;
-
  // the context is not needed - we will create one for each slot
  llama_init_dft.context.reset();
  }
@@ -2040,6 +2045,18 @@ struct server_context {
  return ret;
  }

+ bool can_be_detokenized(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+ const int32_t n_vocab = llama_vocab_n_tokens(vocab);
+ for (const auto & token : tokens) {
+ if (token < 0 || token >= n_vocab) {
+ return false;
+ }
+ }
+ return true;
+ }
+
  bool launch_slot_with_task(server_slot & slot, const server_task & task) {
  slot.reset();
  slot.id_task = task.id;
@@ -2054,6 +2071,11 @@ struct server_context {
  slot.lora = task.params.lora;
  }

+ bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
+ if (!can_detokenize) {
+ send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST);
+ return false;
+ }
  SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());

  if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
@@ -2096,7 +2118,7 @@ struct server_context {
  SRV_DBG("%s", "clearing KV cache\n");

  // clear the entire KV cache
- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);
  clean_kv_cache = false;
  }

@@ -2638,8 +2660,8 @@ struct server_context {
  res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size();
  res->t_start = metrics.t_start;

- res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx);
- res->kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx);
+ res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx);
+ res->kv_cache_used_cells = llama_kv_self_used_cells(ctx);

  res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total;
  res->t_prompt_processing_total = metrics.t_prompt_processing_total;
@@ -2755,7 +2777,7 @@ struct server_context {

  // Erase token cache
  const size_t n_erased = slot->cache_tokens.size();
- llama_kv_cache_seq_rm(ctx, slot->id, -1, -1);
+ llama_kv_self_seq_rm(ctx, slot->id, -1, -1);
  slot->cache_tokens.clear();

  auto res = std::make_unique<server_task_result_slot_erase>();
@@ -2823,8 +2845,8 @@ struct server_context {

  SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);

- llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
- llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
+ llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
+ llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);

  if (slot.params.cache_prompt) {
  for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
@@ -3015,8 +3037,8 @@ struct server_context {

  const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;

- llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
- llama_kv_cache_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
+ llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c);
+ llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);

  for (size_t i = 0; i < n_match; i++) {
  slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
@@ -3054,9 +3076,9 @@ struct server_context {
  }

  // keep only the common part
- if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
+ if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) {
  // could not partially delete (likely using a non-Transformer model)
- llama_kv_cache_seq_rm(ctx, slot.id, -1, -1);
+ llama_kv_self_seq_rm(ctx, slot.id, -1, -1);

  // there is no common part left
  slot.n_past = 0;
@@ -3296,7 +3318,7 @@ struct server_context {
  slot.cache_tokens.push_back(id);
  slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);

- llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
+ llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1);

  for (size_t i = 0; i < ids.size(); ++i) {
  completion_token_output result;
@@ -621,7 +621,9 @@ static json oaicompat_completion_params_parse(

  llama_params["chat_format"] = static_cast<int>(chat_params.format);
  llama_params["prompt"] = chat_params.prompt;
- llama_params["grammar"] = chat_params.grammar;
+ if (!chat_params.grammar.empty()) {
+ llama_params["grammar"] = chat_params.grammar;
+ }
  llama_params["grammar_lazy"] = chat_params.grammar_lazy;
  auto grammar_triggers = json::array();
  for (const auto & trigger : chat_params.grammar_triggers) {
@@ -98,7 +98,7 @@ int main(int argc, char ** argv) {
  auto generate = [&](const std::string & prompt) {
  std::string response;

- const bool is_first = llama_get_kv_cache_used_cells(ctx) == 0;
+ const bool is_first = llama_kv_self_used_cells(ctx) == 0;

  // tokenize the prompt
  const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
@@ -113,7 +113,7 @@ int main(int argc, char ** argv) {
  while (true) {
  // check if we have enough space in the context to evaluate this batch
  int n_ctx = llama_n_ctx(ctx);
- int n_ctx_used = llama_get_kv_cache_used_cells(ctx);
+ int n_ctx_used = llama_kv_self_used_cells(ctx);
  if (n_ctx_used + batch.n_tokens > n_ctx) {
  printf("\033[0m\n");
  fprintf(stderr, "context size exceeded\n");
@@ -331,11 +331,11 @@ int main(int argc, char ** argv) {
  }

  active_seqs.erase(s);
- for(int i = 0; i < n_seq_dft; i++) {
+ for (int i = 0; i < n_seq_dft; i++) {
  if (i == s) {
  continue;
  }
- if (drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) {
+ if (drafts[i].active && drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) {
  // synchronize active status for sequences with the same drafted token
  drafts[i].active = drafts[i].active && accept;
  if (!drafts[i].active) {
@@ -420,14 +420,14 @@ int main(int argc, char ** argv) {
  {
  LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);

- llama_kv_cache_seq_keep(ctx_dft, s_keep);
- llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1);
- llama_kv_cache_seq_keep(ctx_dft, 0);
+ llama_kv_self_seq_keep(ctx_dft, s_keep);
+ llama_kv_self_seq_cp (ctx_dft, s_keep, 0, -1, -1);
+ llama_kv_self_seq_keep(ctx_dft, 0);

- llama_kv_cache_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1);
- llama_kv_cache_seq_keep(ctx_tgt, s_keep);
- llama_kv_cache_seq_cp (ctx_tgt, s_keep, 0, -1, -1);
- llama_kv_cache_seq_keep(ctx_tgt, 0);
+ llama_kv_self_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1);
+ llama_kv_self_seq_keep(ctx_tgt, s_keep);
+ llama_kv_self_seq_cp (ctx_tgt, s_keep, 0, -1, -1);
+ llama_kv_self_seq_keep(ctx_tgt, 0);
  }

  for (int s = 0; s < n_seq_dft; ++s) {
@@ -444,7 +444,7 @@ int main(int argc, char ** argv) {
  common_batch_clear(batch_dft);
  common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);

- llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
+ llama_kv_self_seq_rm(ctx_dft, 0, n_past_dft, -1);
  // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
  llama_decode(ctx_dft, batch_dft);

@@ -503,8 +503,8 @@ int main(int argc, char ** argv) {
  if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) {
  LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);

- llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1);
- llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
+ llama_kv_self_seq_rm(ctx_dft, n_seq_cur, -1, -1);
+ llama_kv_self_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);

  // all previous tokens from this branch are now also part of the new branch
  for (int t = 0; t < batch_tgt.n_tokens; ++t) {
@@ -585,9 +585,9 @@ int main(int argc, char ** argv) {

  // evaluate the target model on the drafted tokens
  {
- llama_kv_cache_seq_keep(ctx_tgt, 0);
+ llama_kv_self_seq_keep(ctx_tgt, 0);
  for (int s = 1; s < n_seq_dft; ++s) {
- llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
+ llama_kv_self_seq_cp(ctx_tgt, 0, s, -1, -1);
  }

  // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
@@ -217,7 +217,7 @@ int main(int argc, char ** argv) {
  {
  LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);

- llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1);
+ llama_kv_self_seq_rm(ctx_tgt, 0, n_past, -1);
  }

  if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
@@ -87,11 +87,11 @@ struct wav_header {
  uint32_t data_size;
  };

- static void save_wav16(const std::string & fname, const std::vector<float> & data, int sample_rate) {
+ static bool save_wav16(const std::string & fname, const std::vector<float> & data, int sample_rate) {
  std::ofstream file(fname, std::ios::binary);
  if (!file) {
- LOG_ERR("%s: Failed to open file '%s' for writing", __func__, fname.c_str());
- return;
+ LOG_ERR("%s: Failed to open file '%s' for writing.\n", __func__, fname.c_str());
+ return false;
  }

  wav_header header;
@@ -108,7 +108,7 @@ static void save_wav16(const std::string & fname, const std::vector<float> & dat
  file.write(reinterpret_cast<const char*>(&pcm_sample), sizeof(pcm_sample));
  }

- file.close();
+ return file.good();
  }

  static void fill_hann_window(int length, bool periodic, float * output) {
@@ -536,6 +536,7 @@ static std::string audio_data_from_speaker(json speaker, const outetts_version t
  int main(int argc, char ** argv) {
  common_params params;

+ params.out_file = "output.wav";
  params.prompt = "";

  params.n_predict = 4096;
@@ -570,6 +571,10 @@ int main(int argc, char ** argv) {
  model_ttc = llama_init_ttc.model.get();
  ctx_ttc = llama_init_ttc.context.get();

+ if (model_ttc == nullptr || ctx_ttc == nullptr) {
+ return ENOENT;
+ }
+
  const llama_vocab * vocab = llama_model_get_vocab(model_ttc);

  // TODO: refactor in a common struct
@@ -585,6 +590,10 @@ int main(int argc, char ** argv) {
  model_cts = llama_init_cts.model.get();
  ctx_cts = llama_init_cts.context.get();

+ if (model_cts == nullptr || ctx_cts == nullptr) {
+ return ENOENT;
+ }
+
  std::vector<common_sampler *> smpl(n_parallel);
  for (int i = 0; i < n_parallel; ++i) {
  params.sampling.no_perf = (i != 0);
@@ -1060,8 +1069,6 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
  }
  #endif

- const std::string fname = "output.wav";
-
  const int n_sr = 24000; // sampling rate

  // zero out first 0.25 seconds
@@ -1072,11 +1079,15 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
  LOG_INF("%s: time for spectral ops: %.3f ms\n", __func__, (ggml_time_us() - t_spec_start) / 1000.0f);
  LOG_INF("%s: total time: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f);

- save_wav16(fname, audio, n_sr);
+ int retval = 0;

- LOG_INF("%s: audio written to file '%s'\n", __func__, fname.c_str());
+ if (save_wav16(params.out_file, audio, n_sr)) {
+ LOG_INF("%s: audio written to file '%s'\n", __func__, params.out_file.c_str());
+ } else {
+ retval = ENOENT;
+ }

  llama_backend_free();

- return 0;
+ return retval;
  }
@@ -186,6 +186,7 @@ option(GGML_OPENMP "ggml: use OpenMP"
  option(GGML_RPC "ggml: use RPC" OFF)
  option(GGML_SYCL "ggml: use SYCL" OFF)
  option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
+ option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON)
  set (GGML_SYCL_TARGET "INTEL" CACHE STRING
  "ggml: sycl target device")
  set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
@@ -0,0 +1,26 @@
+ function(ggml_get_flags CCID CCVER)
+ set(C_FLAGS "")
+ set(CXX_FLAGS "")
+
+ if (CCID MATCHES "Clang")
+ set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return)
+ set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
+
+ if (
+ (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
+ (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
+ )
+ list(APPEND C_FLAGS -Wdouble-promotion)
+ endif()
+ elseif (CCID STREQUAL "GNU")
+ set(C_FLAGS -Wdouble-promotion)
+ set(CXX_FLAGS -Wno-array-bounds)
+
+ if (CCVER VERSION_GREATER_EQUAL 8.1.0)
+ list(APPEND CXX_FLAGS -Wextra-semi)
+ endif()
+ endif()
+
+ set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
+ set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
+ endfunction()
@@ -454,6 +454,7 @@ extern "C" {
  GGML_OP_RMS_NORM,
  GGML_OP_RMS_NORM_BACK,
  GGML_OP_GROUP_NORM,
+ GGML_OP_L2_NORM,

  GGML_OP_MUL_MAT,
  GGML_OP_MUL_MAT_ID,
@@ -502,6 +503,7 @@ extern "C" {
  GGML_OP_ADD_REL_POS,
  GGML_OP_RWKV_WKV6,
  GGML_OP_GATED_LINEAR_ATTN,
+ GGML_OP_RWKV_WKV7,

  GGML_OP_UNARY,

@@ -1095,6 +1097,18 @@ extern "C" {
  int n_groups,
  float eps);

+ // l2 normalize along rows
+ // used in rwkv v7
+ GGML_API struct ggml_tensor * ggml_l2_norm(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ float eps);
+
+ GGML_API struct ggml_tensor * ggml_l2_norm_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ float eps);
+
  // a - x
  // b - dy
  GGML_API struct ggml_tensor * ggml_rms_norm_back(
@@ -1890,6 +1904,16 @@ extern "C" {
  struct ggml_tensor * state,
  float scale);

+ GGML_API struct ggml_tensor * ggml_rwkv_wkv7(
+ struct ggml_context * ctx,
+ struct ggml_tensor * r,
+ struct ggml_tensor * w,
+ struct ggml_tensor * k,
+ struct ggml_tensor * v,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ struct ggml_tensor * state);
+
  // custom operators

  typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
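ggml_l2_norm is a new row-wise operator (GGML_OP_L2_NORM, added alongside the RWKV v7 support in this release): each row of the input is divided by its Euclidean norm, with eps guarding against zero rows. A small self-contained sketch of exercising it on the CPU path follows; it is illustrative only and assumes ggml_graph_compute_with_ctx is available from ggml-cpu.h, as in upstream ggml of this version:

#include <cstdio>
#include <cstring>
#include "ggml.h"
#include "ggml-cpu.h" // assumed location of ggml_graph_compute_with_ctx

int main() {
    // small context with data allocation enabled
    struct ggml_init_params ip = { /*mem_size*/ 16 * 1024 * 1024, /*mem_buffer*/ nullptr, /*no_alloc*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    // one row of 4 floats: [3, 4, 0, 0] has L2 norm 5
    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    const float data[4] = { 3.0f, 4.0f, 0.0f, 0.0f };
    std::memcpy(x->data, data, sizeof(data));

    // divide each row by its Euclidean norm
    struct ggml_tensor * y = ggml_l2_norm(ctx, x, 1e-12f);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, y);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads*/ 1);

    const float * out = (const float *) y->data;
    std::printf("%.3f %.3f %.3f %.3f\n", out[0], out[1], out[2], out[3]); // expected: 0.600 0.800 0.000 0.000

    ggml_free(ctx);
    return 0;
}

ggml_rwkv_wkv7 follows the same pattern as the existing ggml_rwkv_wkv6 operator but takes the additional a and b tensors used by the RWKV v7 recurrence; its expected tensor shapes are defined by the backend implementations added in this diff and are not sketched here.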