@fugood/llama.node 0.3.13 → 0.3.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +89 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/CMakeLists.txt +9 -1
  25. package/src/llama.cpp/cmake/common.cmake +2 -0
  26. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  27. package/src/llama.cpp/common/arg.cpp +132 -13
  28. package/src/llama.cpp/common/chat.cpp +960 -266
  29. package/src/llama.cpp/common/chat.h +135 -0
  30. package/src/llama.cpp/common/common.cpp +33 -174
  31. package/src/llama.cpp/common/common.h +27 -67
  32. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  33. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  34. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  35. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  36. package/src/llama.cpp/common/sampling.cpp +45 -7
  37. package/src/llama.cpp/common/speculative.cpp +10 -9
  38. package/src/llama.cpp/common/speculative.h +1 -1
  39. package/src/llama.cpp/docs/build.md +45 -7
  40. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
  41. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
  42. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
  43. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  44. package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
  45. package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
  46. package/src/llama.cpp/examples/infill/infill.cpp +2 -2
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  48. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  50. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  51. package/src/llama.cpp/examples/llava/clip.h +19 -3
  52. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  53. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  54. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  55. package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
  56. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  57. package/src/llama.cpp/examples/main/main.cpp +79 -34
  58. package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
  59. package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
  60. package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
  61. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  62. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
  63. package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +196 -108
  67. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
  68. package/src/llama.cpp/examples/server/server.cpp +113 -101
  69. package/src/llama.cpp/examples/server/utils.hpp +94 -105
  70. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  74. package/src/llama.cpp/examples/tts/tts.cpp +263 -151
  75. package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
  76. package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
  77. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  79. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  80. package/src/llama.cpp/ggml/include/ggml.h +29 -1
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
  82. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  83. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  84. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  85. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
  87. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
  88. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  89. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
  90. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  91. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  102. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
  103. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
  104. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  105. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  106. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  107. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
  108. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
  109. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  110. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
  111. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  112. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  113. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
  117. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  118. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
  124. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
  125. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
  127. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
  128. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
  129. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
  130. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
  132. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  134. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
  135. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
  139. package/src/llama.cpp/ggml/src/ggml.c +93 -5
  140. package/src/llama.cpp/include/llama.h +105 -27
  141. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  142. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  143. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  144. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  145. package/src/llama.cpp/requirements.txt +1 -0
  146. package/src/llama.cpp/src/CMakeLists.txt +5 -2
  147. package/src/llama.cpp/src/llama-adapter.cpp +19 -20
  148. package/src/llama.cpp/src/llama-adapter.h +11 -9
  149. package/src/llama.cpp/src/llama-arch.cpp +123 -16
  150. package/src/llama.cpp/src/llama-arch.h +19 -0
  151. package/src/llama.cpp/src/llama-batch.h +2 -2
  152. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  153. package/src/llama.cpp/src/llama-context.cpp +2253 -1222
  154. package/src/llama.cpp/src/llama-context.h +214 -77
  155. package/src/llama.cpp/src/llama-cparams.h +1 -0
  156. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  157. package/src/llama.cpp/src/llama-grammar.h +12 -3
  158. package/src/llama.cpp/src/llama-graph.cpp +1662 -0
  159. package/src/llama.cpp/src/llama-graph.h +574 -0
  160. package/src/llama.cpp/src/llama-hparams.cpp +8 -0
  161. package/src/llama.cpp/src/llama-hparams.h +9 -0
  162. package/src/llama.cpp/src/llama-io.cpp +15 -0
  163. package/src/llama.cpp/src/llama-io.h +35 -0
  164. package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
  165. package/src/llama.cpp/src/llama-kv-cache.h +178 -109
  166. package/src/llama.cpp/src/llama-memory.cpp +1 -0
  167. package/src/llama.cpp/src/llama-memory.h +21 -0
  168. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  169. package/src/llama.cpp/src/llama-model.cpp +8230 -122
  170. package/src/llama.cpp/src/llama-model.h +34 -1
  171. package/src/llama.cpp/src/llama-quant.cpp +10 -1
  172. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  173. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  174. package/src/llama.cpp/src/llama.cpp +51 -9837
  175. package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
  176. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  177. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  178. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  179. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  180. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  181. package/src/llama.cpp/common/chat.hpp +0 -55
  182. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
  183. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
  184. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/common/sampling.cpp

@@ -4,6 +4,7 @@
 
 #include <cmath>
 #include <unordered_map>
+#include <algorithm>
 
 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
@@ -159,16 +160,53 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
         GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
     } else {
-        std::vector<const char *> trigger_words;
-        trigger_words.reserve(params.grammar_trigger_words.size());
-        for (const auto & str : params.grammar_trigger_words) {
-            trigger_words.push_back(str.word.c_str());
+        std::vector<std::string> patterns_at_start;
+        std::vector<std::string> patterns_anywhere;
+        std::vector<llama_token> trigger_tokens;
+        for (const auto & trigger : params.grammar_triggers) {
+            switch (trigger.type) {
+                case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
+                {
+                    const auto & word = trigger.value;
+                    patterns_anywhere.push_back(regex_escape(word));
+                    break;
+                }
+                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
+                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START:
+                {
+                    const auto & pattern = trigger.value;
+                    (trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? patterns_at_start : patterns_anywhere).push_back(pattern);
+                    break;
+                }
+                case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
+                {
+                    const auto token = trigger.token;
+                    trigger_tokens.push_back(token);
+                    break;
+                }
+                default:
+                    GGML_ASSERT(false && "unknown trigger type");
+            }
+        }
+
+        std::vector<std::string> trigger_patterns;
+        if (!patterns_at_start.empty()) {
+            trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*");
+        }
+        if (!patterns_anywhere.empty()) {
+            trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
+        }
+
+        std::vector<const char *> trigger_patterns_c;
+        trigger_patterns_c.reserve(trigger_patterns.size());
+        for (const auto & regex : trigger_patterns) {
+            trigger_patterns_c.push_back(regex.c_str());
         }
 
         grmr = params.grammar_lazy
-             ? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
-                   trigger_words.data(), trigger_words.size(),
-                   params.grammar_trigger_tokens.data(), params.grammar_trigger_tokens.size())
+             ? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                   trigger_patterns_c.data(), trigger_patterns_c.size(),
+                   trigger_tokens.data(), trigger_tokens.size())
              : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
     }
 
package/src/llama.cpp/common/speculative.cpp

@@ -5,6 +5,7 @@
 #include "sampling.h"
 
 #include <cstring>
+#include <algorithm>
 
 #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@@ -172,7 +173,7 @@ llama_tokens common_speculative_gen_draft(
     result.reserve(params.n_draft);
 
     if (reuse_n == 0) {
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);
 
         prompt.clear();
     } else {
@@ -191,14 +192,14 @@ llama_tokens common_speculative_gen_draft(
     }
 
     if (reuse_i > 0) {
-        llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
-        llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
+        llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
+        llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
 
         prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
     }
 
     if (reuse_n < (int) prompt.size()) {
-        llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
+        llama_kv_self_seq_rm (ctx, 0, reuse_n, -1);
 
         prompt.erase(prompt.begin() + reuse_n, prompt.end());
     }
@@ -252,11 +253,6 @@ llama_tokens common_speculative_gen_draft(
         // add drafted token for each sequence
         const llama_token id = cur_p->data[0].id;
 
-        // only collect very high-confidence draft tokens
-        if (cur_p->data[0].p < params.p_min) {
-            break;
-        }
-
         common_sampler_accept(smpl, id, true);
 
         result.push_back(id);
@@ -265,6 +261,11 @@ llama_tokens common_speculative_gen_draft(
             break;
         }
 
+        // only collect very high-confidence draft tokens
+        if (cur_p->data[0].p < params.p_min) {
+            break;
+        }
+
         common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
 
         // evaluate the drafted tokens on the draft model
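The `llama_kv_cache_*` to `llama_kv_self_*` rename seen here recurs through the remaining hunks and reflects the updated `include/llama.h` bundled with this release. For code built against the bundled headers the update is mechanical; a minimal compile-only sketch, where the helper names and the prefix-dropping scenario are illustrative and only the `llama_kv_self_*` calls come from the hunks:

```cpp
#include "llama.h"

// Reset the entire KV cache of a context.
void reset_cache(llama_context * ctx) {
    llama_kv_self_clear(ctx);                            // was: llama_kv_cache_clear
}

// Drop the first n_drop positions of sequence 0, then shift the remaining
// entries back so the cache starts at position 0 again (the same pair of calls
// the speculative.cpp hunk above uses when reusing part of the prompt).
void drop_prefix(llama_context * ctx, llama_pos n_drop) {
    llama_kv_self_seq_rm (ctx, 0, 0, n_drop);            // was: llama_kv_cache_seq_rm
    llama_kv_self_seq_add(ctx, 0, n_drop, -1, -n_drop);  // was: llama_kv_cache_seq_add
}
```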
package/src/llama.cpp/common/speculative.h

@@ -9,7 +9,7 @@ struct common_speculative_params {
     int n_draft = 16; // max drafted tokens
     int n_reuse = 256;
 
-    float p_min = 0.9f; // min probability required to accept a token in the draft
+    float p_min = 0.75f; // min probability required to accept a token in the draft
 };
 
 struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
package/src/llama.cpp/docs/build.md

@@ -197,20 +197,52 @@ The following compilation options are also available to tweak performance:
 
 ## MUSA
 
-This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
+This provides GPU acceleration using a Moore Threads GPU. Make sure to have the [MUSA SDK](https://developer.mthreads.com/musa/musa-sdk) installed.
 
-- Using `CMake`:
+#### Download directly from Moore Threads
 
-  ```bash
-  cmake -B build -DGGML_MUSA=ON
+You may find the official downloads here: [Moore Threads developer site](https://developer.mthreads.com/sdk/download/musa).
+
+### Compilation
+
+```bash
+cmake -B build -DGGML_MUSA=ON
+cmake --build build --config Release
+```
+
+#### Override Compute Capability Specifications
+
+By default, all supported compute capabilities are enabled. To customize this behavior, you can specify the `MUSA_ARCHITECTURES` option in the CMake command:
+
+```bash
+cmake -B build -DGGML_MUSA=ON -DMUSA_ARCHITECTURES="21"
+```
+
+This configuration enables only compute capability `2.1` (MTT S80) during compilation, which can help reduce compilation time.
+
+#### Compilation options
+
+Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
+
+- For static builds, add `-DBUILD_SHARED_LIBS=OFF` and `-DCMAKE_POSITION_INDEPENDENT_CODE=ON`:
+  ```
+  cmake -B build -DGGML_MUSA=ON \
+    -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
   cmake --build build --config Release
   ```
 
-The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.
+### Runtime MUSA environmental variables
 
-The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
+You may set the [musa environmental variables](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) at runtime.
 
-Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
+```bash
+# Use `MUSA_VISIBLE_DEVICES` to hide the first compute device.
+MUSA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf
+```
+
+### Unified Memory
+
+The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
 
 ## HIP
 
@@ -227,6 +259,12 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
 However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
 
+To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.
+
+The rocWMMA library is included by default when installing the ROCm SDK using the `rocm` meta package provided by AMD. Alternatively, if you are not using the meta package, you can install the library using the `rocwmma-dev` or `rocwmma-devel` package, depending on your system's package manager.
+
+As an alternative, you can manually install the library by cloning it from the official [GitHub repository](https://github.com/ROCm/rocWMMA), checkout the corresponding version tag (e.g. `rocm-6.2.4`) and set `-DCMAKE_CXX_FLAGS="-I<path/to/rocwmma>/library/include/"` in CMake. This also works under Windows despite not officially supported by AMD.
+
 Note that if you get the following error:
 ```
 clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
package/src/llama.cpp/examples/batched-bench/batched-bench.cpp

@@ -132,7 +132,7 @@ int main(int argc, char ** argv) {
 
     const auto t_pp_start = ggml_time_us();
 
-    llama_kv_cache_clear(ctx);
+    llama_kv_self_clear(ctx);
 
     if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
         LOG_ERR("%s: llama_decode() failed\n", __func__);
@@ -141,7 +141,7 @@ int main(int argc, char ** argv) {
 
     if (is_pp_shared) {
         for (int32_t i = 1; i < pl; ++i) {
-            llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
+            llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
         }
     }
 
package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp

@@ -342,7 +342,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
 }
 
 static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
-    llama_kv_cache_clear(ctx);
+    llama_kv_self_clear(ctx);
     if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
         fprintf(stderr, "%s : failed to eval\n", __func__);
         return false;
@@ -394,6 +394,8 @@ static int prepare_entries(common_params & params, train_context & ctx_train) {
 int main(int argc, char ** argv) {
     common_params params;
 
+    params.out_file = "control_vector.gguf";
+
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
         return 1;
     }
@@ -498,7 +500,7 @@ int main(int argc, char ** argv) {
     }
 
     // write output vectors to gguf
-    export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
+    export_gguf(ctx_train.v_final, params.out_file, model_hint);
 
     llama_backend_free();
 
package/src/llama.cpp/examples/embedding/embedding.cpp

@@ -4,6 +4,7 @@
 #include "llama.h"
 
 #include <ctime>
+#include <algorithm>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -37,7 +38,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
     const struct llama_model * model = llama_get_model(ctx);
 
     // clear previous kv_cache values (irrelevant for embeddings)
-    llama_kv_cache_clear(ctx);
+    llama_kv_self_clear(ctx);
 
     // run model
     LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
package/src/llama.cpp/examples/export-lora/export-lora.cpp

@@ -413,20 +413,22 @@ static void print_usage(int, char ** argv) {
 int main(int argc, char ** argv) {
     common_params params;
 
+    params.out_file = "ggml-lora-merged-f16.gguf";
+
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
         return 1;
     }
 
     g_verbose = (params.verbosity > 1);
     try {
-        lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
+        lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
         ctx.run_merge();
     } catch (const std::exception & err) {
         fprintf(stderr, "%s\n", err.what());
         exit(EXIT_FAILURE);
     }
 
-    printf("done, output file is %s\n", params.lora_outfile.c_str());
+    printf("done, output file is %s\n", params.out_file.c_str());
 
     return 0;
 }
package/src/llama.cpp/examples/gritlm/gritlm.cpp

@@ -45,7 +45,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
     }
 
     // clear previous kv_cache values (irrelevant for embeddings)
-    llama_kv_cache_clear(ctx);
+    llama_kv_self_clear(ctx);
     llama_set_embeddings(ctx, true);
     llama_set_causal_attn(ctx, false);
 
@@ -102,7 +102,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
 
     llama_token eos_token = llama_vocab_eos(vocab);
 
-    llama_kv_cache_clear(ctx);
+    llama_kv_self_clear(ctx);
     llama_set_embeddings(ctx, false);
     llama_set_causal_attn(ctx, true);
 
package/src/llama.cpp/examples/imatrix/imatrix.cpp

@@ -206,9 +206,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 
 void IMatrixCollector::save_imatrix(int ncall) const {
     auto fname = m_params.out_file;
-    if (fname.empty()) {
-        fname = "imatrix.dat";
-    }
 
     if (ncall > 0) {
         fname += ".at_";
@@ -498,7 +495,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
     const auto t_start = std::chrono::high_resolution_clock::now();
 
     // clear the KV cache
-    llama_kv_cache_clear(ctx);
+    llama_kv_self_clear(ctx);
 
     llama_batch batch = llama_batch_init(n_batch, 0, 1);
 
@@ -583,6 +580,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
 int main(int argc, char ** argv) {
     common_params params;
 
+    params.out_file = "imatrix.dat" ;
+
     params.n_ctx = 512;
     params.logits_all = true;
     params.escape = false;
package/src/llama.cpp/examples/infill/infill.cpp

@@ -332,8 +332,8 @@ int main(int argc, char ** argv) {
         LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                 n_past, n_left, n_ctx, params.n_keep, n_discard);
 
-        llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
-        llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
+        llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
+        llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
 
         n_past -= n_discard;
 
package/src/llama.cpp/examples/llama-bench/llama-bench.cpp

@@ -1578,7 +1578,7 @@ int main(int argc, char ** argv) {
 
         test t(inst, lmodel, ctx);
 
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);
 
         // cool off before the test
        if (params.delay) {
@@ -1618,7 +1618,7 @@ int main(int argc, char ** argv) {
         }
 
         for (int i = 0; i < params.reps; i++) {
-            llama_kv_cache_clear(ctx);
+            llama_kv_self_clear(ctx);
 
             uint64_t t_start = get_time_ns();
 
package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp

@@ -194,7 +194,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
     }
 
     batch->logits[batch->n_tokens - 1] = true;
-    llama_kv_cache_clear(context);
+    llama_kv_self_clear(context);
 
     const auto t_pp_start = ggml_time_us();
     if (llama_decode(context, *batch) != 0) {
@@ -206,7 +206,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
 
     LOGi("Benchmark text generation (tg)");
 
-    llama_kv_cache_clear(context);
+    llama_kv_self_clear(context);
     const auto t_tg_start = ggml_time_us();
     for (i = 0; i < tg; i++) {
 
@@ -223,7 +223,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
 
     const auto t_tg_end = ggml_time_us();
 
-    llama_kv_cache_clear(context);
+    llama_kv_self_clear(context);
 
     const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
     const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
@@ -361,7 +361,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
     const auto tokens_list = common_tokenize(context, text, true, parse_special);
 
     auto n_ctx = llama_n_ctx(context);
-    auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
+    auto n_kv_req = tokens_list.size() + n_len;
 
     LOGi("n_len = %d, n_ctx = %d, n_kv_req = %d", n_len, n_ctx, n_kv_req);
 
@@ -448,5 +448,5 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
 extern "C"
 JNIEXPORT void JNICALL
 Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
-    llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
+    llama_kv_self_clear(reinterpret_cast<llama_context *>(context));
 }
package/src/llama.cpp/examples/llava/CMakeLists.txt

@@ -51,6 +51,13 @@ install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
+set(TARGET llama-gemma3-cli)
+add_executable(${TARGET} gemma3-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-gemma3-cli)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
 set(TARGET llama-llava-clip-quantize-cli)
 add_executable(${TARGET} clip-quantize-cli.cpp)
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-clip-quantize-cli)