@fugood/llama.node 0.3.13 → 0.3.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +89 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/CMakeLists.txt +9 -1
  25. package/src/llama.cpp/cmake/common.cmake +2 -0
  26. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  27. package/src/llama.cpp/common/arg.cpp +132 -13
  28. package/src/llama.cpp/common/chat.cpp +960 -266
  29. package/src/llama.cpp/common/chat.h +135 -0
  30. package/src/llama.cpp/common/common.cpp +33 -174
  31. package/src/llama.cpp/common/common.h +27 -67
  32. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  33. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  34. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  35. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  36. package/src/llama.cpp/common/sampling.cpp +45 -7
  37. package/src/llama.cpp/common/speculative.cpp +10 -9
  38. package/src/llama.cpp/common/speculative.h +1 -1
  39. package/src/llama.cpp/docs/build.md +45 -7
  40. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
  41. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
  42. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
  43. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  44. package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
  45. package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
  46. package/src/llama.cpp/examples/infill/infill.cpp +2 -2
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  48. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  50. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  51. package/src/llama.cpp/examples/llava/clip.h +19 -3
  52. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  53. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  54. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  55. package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
  56. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  57. package/src/llama.cpp/examples/main/main.cpp +79 -34
  58. package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
  59. package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
  60. package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
  61. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  62. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
  63. package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +196 -108
  67. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
  68. package/src/llama.cpp/examples/server/server.cpp +113 -101
  69. package/src/llama.cpp/examples/server/utils.hpp +94 -105
  70. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  74. package/src/llama.cpp/examples/tts/tts.cpp +263 -151
  75. package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
  76. package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
  77. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  79. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  80. package/src/llama.cpp/ggml/include/ggml.h +29 -1
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
  82. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  83. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  84. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  85. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
  87. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
  88. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  89. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
  90. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  91. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  102. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
  103. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
  104. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  105. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  106. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  107. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
  108. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
  109. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  110. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
  111. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  112. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  113. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
  117. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  118. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
  124. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
  125. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
  127. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
  128. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
  129. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
  130. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
  132. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  134. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
  135. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
  139. package/src/llama.cpp/ggml/src/ggml.c +93 -5
  140. package/src/llama.cpp/include/llama.h +105 -27
  141. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  142. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  143. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  144. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  145. package/src/llama.cpp/requirements.txt +1 -0
  146. package/src/llama.cpp/src/CMakeLists.txt +5 -2
  147. package/src/llama.cpp/src/llama-adapter.cpp +19 -20
  148. package/src/llama.cpp/src/llama-adapter.h +11 -9
  149. package/src/llama.cpp/src/llama-arch.cpp +123 -16
  150. package/src/llama.cpp/src/llama-arch.h +19 -0
  151. package/src/llama.cpp/src/llama-batch.h +2 -2
  152. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  153. package/src/llama.cpp/src/llama-context.cpp +2253 -1222
  154. package/src/llama.cpp/src/llama-context.h +214 -77
  155. package/src/llama.cpp/src/llama-cparams.h +1 -0
  156. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  157. package/src/llama.cpp/src/llama-grammar.h +12 -3
  158. package/src/llama.cpp/src/llama-graph.cpp +1662 -0
  159. package/src/llama.cpp/src/llama-graph.h +574 -0
  160. package/src/llama.cpp/src/llama-hparams.cpp +8 -0
  161. package/src/llama.cpp/src/llama-hparams.h +9 -0
  162. package/src/llama.cpp/src/llama-io.cpp +15 -0
  163. package/src/llama.cpp/src/llama-io.h +35 -0
  164. package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
  165. package/src/llama.cpp/src/llama-kv-cache.h +178 -109
  166. package/src/llama.cpp/src/llama-memory.cpp +1 -0
  167. package/src/llama.cpp/src/llama-memory.h +21 -0
  168. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  169. package/src/llama.cpp/src/llama-model.cpp +8230 -122
  170. package/src/llama.cpp/src/llama-model.h +34 -1
  171. package/src/llama.cpp/src/llama-quant.cpp +10 -1
  172. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  173. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  174. package/src/llama.cpp/src/llama.cpp +51 -9837
  175. package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
  176. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  177. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  178. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  179. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  180. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  181. package/src/llama.cpp/common/chat.hpp +0 -55
  182. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
  183. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
  184. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/examples/main/main.cpp

@@ -4,7 +4,7 @@
  #include "log.h"
  #include "sampling.h"
  #include "llama.h"
- #include "chat-template.hpp"
+ #include "chat.h"

  #include <cstdio>
  #include <cstring>
@@ -31,8 +31,6 @@
  #pragma warning(disable: 4244 4267) // possible loss of data
  #endif

- static const char * DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant";
-
  static llama_context ** g_ctx;
  static llama_model ** g_model;
  static common_sampler ** g_smpl;
@@ -47,8 +45,8 @@ static void print_usage(int argc, char ** argv) {
  (void) argc;

  LOG("\nexample usage:\n");
- LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
- LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
+ LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128 -no-cnv\n", argv[0]);
+ LOG("\n chat (conversation): %s -m your_model.gguf -sys \"You are a helpful assistant\"\n", argv[0]);
  LOG("\n");
  }

@@ -158,7 +156,7 @@ int main(int argc, char ** argv) {
  }

  const llama_vocab * vocab = llama_model_get_vocab(model);
- auto chat_templates = common_chat_templates_from_model(model, params.chat_template);
+ auto chat_templates = common_chat_templates_init(model, params.chat_template);

  LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);

@@ -201,7 +199,7 @@ int main(int argc, char ** argv) {
  }

  // auto enable conversation mode if chat template is available
- const bool has_chat_template = chat_templates.has_explicit_template && chat_templates.template_default;
+ const bool has_chat_template = common_chat_templates_was_explicit(chat_templates.get());
  if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) {
  if (has_chat_template) {
  LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__);
@@ -219,7 +217,11 @@ int main(int argc, char ** argv) {
  // print chat template example in conversation mode
  if (params.conversation_mode) {
  if (params.enable_chat_template) {
- LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(*chat_templates.template_default, params.use_jinja).c_str());
+ if (!params.prompt.empty() && params.system_prompt.empty()) {
+ LOG_WRN("*** User-specified prompt will pre-start conversation, did you mean to set --system-prompt (-sys) instead?\n");
+ }
+
+ LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja).c_str());
  } else {
  LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
  }
@@ -263,21 +265,45 @@ int main(int argc, char ** argv) {

  std::vector<llama_token> embd_inp;

+ bool waiting_for_first_input = false;
  auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) {
- common_chat_msg new_msg{role, content, {}};
- auto formatted = common_chat_format_single(*chat_templates.template_default, chat_msgs, new_msg, role == "user", g_params->use_jinja);
- chat_msgs.push_back({role, content, {}});
+ common_chat_msg new_msg;
+ new_msg.role = role;
+ new_msg.content = content;
+ auto formatted = common_chat_format_single(chat_templates.get(), chat_msgs, new_msg, role == "user", g_params->use_jinja);
+ chat_msgs.push_back(new_msg);
  LOG_DBG("formatted: '%s'\n", formatted.c_str());
  return formatted;
  };

+ std::string prompt;
  {
- auto prompt = (params.conversation_mode && params.enable_chat_template)
- // format the system prompt in conversation mode (fallback to default if empty)
- ? chat_add_and_format("system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt)
+ if (params.conversation_mode && params.enable_chat_template) {
+ if (!params.system_prompt.empty()) {
+ // format the system prompt (will use template default if empty)
+ chat_add_and_format("system", params.system_prompt);
+ }
+
+ if (!params.prompt.empty()) {
+ // format and append the user prompt
+ chat_add_and_format("user", params.prompt);
+ } else {
+ waiting_for_first_input = true;
+ }
+
+ if (!params.system_prompt.empty() || !params.prompt.empty()) {
+ common_chat_templates_inputs inputs;
+ inputs.messages = chat_msgs;
+ inputs.add_generation_prompt = !params.prompt.empty();
+
+ prompt = common_chat_templates_apply(chat_templates.get(), inputs).prompt;
+ }
+ } else {
  // otherwise use the prompt as is
- : params.prompt;
- if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
+ prompt = params.prompt;
+ }
+
+ if (params.interactive_first || !prompt.empty() || session_tokens.empty()) {
  LOG_DBG("tokenize the prompt\n");
  embd_inp = common_tokenize(ctx, prompt, true, true);
  } else {
@@ -290,7 +316,7 @@ int main(int argc, char ** argv) {
  }

  // Should not run without any tokens
- if (embd_inp.empty()) {
+ if (!waiting_for_first_input && embd_inp.empty()) {
  if (add_bos) {
  embd_inp.push_back(llama_vocab_bos(vocab));
  LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
@@ -328,7 +354,7 @@ int main(int argc, char ** argv) {
  }

  // remove any "future" tokens that we might have inherited from the previous session
- llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
+ llama_kv_self_seq_rm(ctx, -1, n_matching_session_tokens, -1);
  }

  LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
@@ -350,7 +376,12 @@ int main(int argc, char ** argv) {
  }

  if (params.conversation_mode) {
- params.interactive_first = true;
+ if (params.single_turn && !params.prompt.empty()) {
+ params.interactive = false;
+ params.interactive_first = false;
+ } else {
+ params.interactive_first = true;
+ }
  }

  // enable interactive mode if interactive start is specified
@@ -474,8 +505,8 @@ int main(int argc, char ** argv) {
  LOG_INF( " - Press Ctrl+C to interject at any time.\n");
  #endif
  LOG_INF( "%s", control_message);
- if (params.conversation_mode && params.enable_chat_template && params.prompt.empty()) {
- LOG_INF( " - Using default system message. To change it, set a different value via -p PROMPT or -f FILE argument.\n");
+ if (params.conversation_mode && params.enable_chat_template && params.system_prompt.empty()) {
+ LOG_INF( " - Not using system message. To change it, set a different value via -sys PROMPT\n");
  }
  LOG_INF("\n");

@@ -571,8 +602,8 @@ int main(int argc, char ** argv) {
  LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
  n_past, n_left, n_ctx, params.n_keep, n_discard);

- llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
- llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
+ llama_kv_self_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
+ llama_kv_self_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);

  n_past -= n_discard;

@@ -595,9 +626,9 @@ int main(int argc, char ** argv) {
  LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
  LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);

- llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
- llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
- llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
+ llama_kv_self_seq_add(ctx, 0, ga_i, n_past, ib*bd);
+ llama_kv_self_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
+ llama_kv_self_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);

  n_past -= bd;

@@ -755,11 +786,14 @@ int main(int argc, char ** argv) {

  // check for reverse prompt using special tokens
  llama_token last_token = common_sampler_last(smpl);
- if (std::find(antiprompt_token.begin(), antiprompt_token.end(), last_token) != antiprompt_token.end()) {
- if (params.interactive) {
- is_interacting = true;
+ for (auto token : antiprompt_token) {
+ if (token == last_token) {
+ if (params.interactive) {
+ is_interacting = true;
+ }
+ is_antiprompt = true;
+ break;
  }
- is_antiprompt = true;
  }

  if (is_antiprompt) {
@@ -768,7 +802,7 @@ int main(int argc, char ** argv) {
  }

  // deal with end of generation tokens in interactive mode
- if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
+ if (!waiting_for_first_input && llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
  LOG_DBG("found an EOG token\n");

  if (params.interactive) {
@@ -788,12 +822,17 @@ int main(int argc, char ** argv) {
  }

  // if current token is not EOG, we add it to current assistant message
- if (params.conversation_mode) {
+ if (params.conversation_mode && !waiting_for_first_input) {
  const auto id = common_sampler_last(smpl);
  assistant_ss << common_token_to_piece(ctx, id, false);
+
+ if (!prompt.empty()) {
+ prompt.clear();
+ is_interacting = false;
+ }
  }

- if (n_past > 0 && is_interacting) {
+ if ((n_past > 0 || waiting_for_first_input) && is_interacting) {
  LOG_DBG("waiting for user input\n");

  if (params.conversation_mode) {
@@ -883,11 +922,17 @@ int main(int argc, char ** argv) {
  input_echo = false; // do not echo this again
  }

- if (n_past > 0) {
+ if (n_past > 0 || waiting_for_first_input) {
  if (is_interacting) {
  common_sampler_reset(smpl);
  }
  is_interacting = false;
+
+ if (waiting_for_first_input && params.single_turn) {
+ params.interactive = false;
+ params.interactive_first = false;
+ }
+ waiting_for_first_input = false;
  }
  }

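The main.cpp hunks above replace the old common_chat_templates_from_model(...) / *chat_templates.template_default API with an opaque handle: common_chat_templates_init(...), accessed via .get() and rendered through common_chat_templates_apply() or common_chat_format_single(). The following is a minimal sketch of the new call pattern, based only on the calls visible in this diff; the helper name build_initial_prompt and its parameters are illustrative, not part of the package:

// Sketch of the reworked chat-template API used by the updated main.cpp above.
// Assumes llama.h, common.h and chat.h from this llama.cpp revision are on the include path.
#include "llama.h"
#include "common.h"
#include "chat.h"

#include <string>
#include <vector>

// Render the initial conversation prompt: an optional system message plus an
// optional first user message, mirroring the logic added in main.cpp.
static std::string build_initial_prompt(const llama_model * model,
                                        const std::string & system_prompt,
                                        const std::string & user_prompt) {
    // the templates are now an opaque smart-pointer handle
    auto tmpls = common_chat_templates_init(model, /* template override */ "");

    std::vector<common_chat_msg> msgs;
    if (!system_prompt.empty()) {
        common_chat_msg sys;
        sys.role    = "system";
        sys.content = system_prompt;
        msgs.push_back(sys);
    }
    if (!user_prompt.empty()) {
        common_chat_msg usr;
        usr.role    = "user";
        usr.content = user_prompt;
        msgs.push_back(usr);
    }

    // per-call arguments are now grouped into an inputs struct
    common_chat_templates_inputs inputs;
    inputs.messages              = msgs;
    inputs.add_generation_prompt = !user_prompt.empty();

    // apply() renders the whole conversation; .prompt is what gets tokenized
    return common_chat_templates_apply(tmpls.get(), inputs).prompt;
}

The same handle also feeds common_chat_format_example(tmpls.get(), use_jinja) and common_chat_format_single(tmpls.get(), ...), as shown in the hunks above.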
package/src/llama.cpp/examples/parallel/parallel.cpp

@@ -12,6 +12,7 @@
  #include <string>
  #include <vector>
  #include <ctime>
+ #include <algorithm>

  // trim whitespace from the beginning and end of a string
  static std::string trim(const std::string & str) {
@@ -201,7 +202,7 @@ int main(int argc, char ** argv) {

  // assign the system KV cache to all parallel sequences
  for (int32_t i = 1; i <= n_clients; ++i) {
- llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
+ llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
  }

  LOG_INF("\n");
@@ -233,9 +234,9 @@ int main(int argc, char ** argv) {
  if (batch.n_tokens == 0) {
  // all sequences have ended - clear the entire KV cache
  for (int i = 1; i <= n_clients; ++i) {
- llama_kv_cache_seq_rm(ctx, i, -1, -1);
+ llama_kv_self_seq_rm(ctx, i, -1, -1);
  // but keep the system prompt
- llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
+ llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
  }

  LOG_INF("%s: clearing the KV cache\n", __func__);
@@ -371,8 +372,8 @@ int main(int argc, char ** argv) {
  }

  // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
- llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1);
- llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1);
+ llama_kv_self_seq_rm(ctx, client.id + 1, -1, -1);
+ llama_kv_self_seq_cp(ctx, 0, client.id + 1, -1, -1);

  const auto t_main_end = ggml_time_us();

package/src/llama.cpp/examples/passkey/passkey.cpp

@@ -7,6 +7,7 @@
  #include <cstdio>
  #include <string>
  #include <vector>
+ #include <algorithm>

  static void print_usage(int, char ** argv) {
  LOG("\nexample usage:\n");
@@ -132,11 +133,11 @@ int main(int argc, char ** argv) {
  const int ib = i/n_batch - 1;
  const int bd = n_batch_grp*(n_grp - 1);

- llama_kv_cache_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd);
- llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
- llama_kv_cache_update (ctx);
+ llama_kv_self_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd);
+ llama_kv_self_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
+ llama_kv_self_update (ctx);

- n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
+ n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
  }

  common_batch_clear(batch);
@@ -166,12 +167,12 @@ int main(int argc, char ** argv) {

  LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);

- llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
- llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
- //llama_kv_cache_defrag (ctx);
- llama_kv_cache_update (ctx);
+ llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
+ llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
+ //llama_kv_self_defrag (ctx);
+ llama_kv_self_update (ctx);

- n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
+ n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;

  common_batch_clear(batch);

@@ -197,12 +198,12 @@ int main(int argc, char ** argv) {
  if (n_discard > 0) {
  LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);

- llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
- llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
- //llama_kv_cache_defrag (ctx);
- llama_kv_cache_update (ctx);
+ llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
+ llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
+ //llama_kv_self_defrag (ctx);
+ llama_kv_self_update (ctx);

- n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
+ n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
  }
  }

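In the parallel.cpp and passkey.cpp hunks above (and in main.cpp earlier), every llama_kv_cache_* call is renamed to llama_kv_self_* with identical arguments: seq_rm, seq_add, seq_div, seq_cp, update, and seq_pos_max. Below is a minimal sketch of the context-shift pattern from the passkey hunks under the new names; the helper shift_context is illustrative, not from the package:

// Sketch only: llama_kv_cache_* -> llama_kv_self_* rename, same signatures.
// Mirrors the "shifting KV cache" pattern in the passkey.cpp hunks above.
#include "llama.h"

// Discard n_discard positions after the first n_keep, slide the rest back,
// apply the pending K/V updates, and return the new n_past.
static int shift_context(llama_context * ctx, int n_keep, int n_discard, int n_ctx) {
    llama_kv_self_seq_rm (ctx, 0, n_keep,             n_keep + n_discard); // was llama_kv_cache_seq_rm
    llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);  // was llama_kv_cache_seq_add
    llama_kv_self_update (ctx);                                            // was llama_kv_cache_update

    return llama_kv_self_seq_pos_max(ctx, 0) + 1;                          // was llama_kv_cache_seq_pos_max
}

The clear and copy variants follow the same rename (llama_kv_self_clear, llama_kv_self_seq_cp), as the perplexity.cpp and retrieval.cpp hunks below show.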
package/src/llama.cpp/examples/perplexity/perplexity.cpp

@@ -361,7 +361,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
  const auto t_start = std::chrono::high_resolution_clock::now();

  // clear the KV cache
- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  llama_batch batch = llama_batch_init(n_batch, 0, 1);

@@ -547,7 +547,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
  const auto t_start = std::chrono::high_resolution_clock::now();

  // clear the KV cache
- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  for (int j = 0; j < num_batches; ++j) {
  const int batch_start = start + j * n_batch;
@@ -924,7 +924,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
  return;
  }

- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  // decode all tasks [i0, i1)
  if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1203,7 +1203,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params)
  return;
  }

- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  // decode all tasks [i0, i1)
  if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1575,7 +1575,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
  return;
  }

- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  // decode all tasks [i0, i1)
  if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1765,7 +1765,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
  }

  // clear the KV cache
- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  llama_batch batch = llama_batch_init(n_batch, 0, 1);

package/src/llama.cpp/examples/quantize/quantize.cpp

@@ -8,6 +8,7 @@
  #include <unordered_map>
  #include <fstream>
  #include <cmath>
+ #include <cctype>

  struct quant_option {
  std::string name;
package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp

@@ -1,6 +1,6 @@
  #include "ggml.h"
  #include "llama.h"
- #include "llama-context.h"
+ #include "llama-model.h"
  #include "common.h"

  #include <algorithm>
@@ -328,7 +328,7 @@ int main(int argc, char ** argv) {
  }
  }

- const auto & tensors = llama_internal_get_tensor_map(ctx);
+ const auto & tensors = llama_internal_get_tensor_map(model);

  // check layer tensors
  int included_layers = 0;
package/src/llama.cpp/examples/retrieval/retrieval.cpp

@@ -83,7 +83,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke

  static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
  // clear previous kv_cache values (irrelevant for embeddings)
- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  // run model
  LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);