@fugood/llama.node 0.3.13 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +60 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  25. package/src/llama.cpp/common/arg.cpp +112 -11
  26. package/src/llama.cpp/common/chat.cpp +960 -266
  27. package/src/llama.cpp/common/chat.h +135 -0
  28. package/src/llama.cpp/common/common.cpp +27 -171
  29. package/src/llama.cpp/common/common.h +27 -67
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  31. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  32. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  33. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  34. package/src/llama.cpp/common/sampling.cpp +45 -7
  35. package/src/llama.cpp/common/speculative.cpp +6 -5
  36. package/src/llama.cpp/common/speculative.h +1 -1
  37. package/src/llama.cpp/docs/build.md +45 -7
  38. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  39. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  40. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  41. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -3
  42. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  43. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  44. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  45. package/src/llama.cpp/examples/llava/clip.h +19 -3
  46. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  47. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  48. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  49. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  50. package/src/llama.cpp/examples/main/main.cpp +73 -28
  51. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  52. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  53. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  54. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  55. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  56. package/src/llama.cpp/examples/run/run.cpp +110 -67
  57. package/src/llama.cpp/examples/server/server.cpp +82 -87
  58. package/src/llama.cpp/examples/server/utils.hpp +94 -107
  59. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  60. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  61. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  62. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  63. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  64. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  65. package/src/llama.cpp/ggml/include/ggml.h +5 -1
  66. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  67. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  68. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  69. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  70. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  71. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  72. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  73. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  74. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  75. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  76. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  77. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  78. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1396 -386
  79. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1432 -151
  80. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  81. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  82. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  83. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  84. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  85. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  86. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  87. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  89. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  90. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  91. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  92. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +220 -116
  93. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  94. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  95. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  96. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  97. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  98. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  99. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  100. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  101. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  102. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  103. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  104. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  105. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  106. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  107. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +168 -721
  108. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  109. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  111. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  112. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +146 -42
  113. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
  114. package/src/llama.cpp/ggml/src/ggml.c +8 -3
  115. package/src/llama.cpp/include/llama.h +19 -5
  116. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  117. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  118. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  119. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  120. package/src/llama.cpp/requirements.txt +1 -0
  121. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  122. package/src/llama.cpp/src/llama-arch.h +1 -0
  123. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  124. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  125. package/src/llama.cpp/src/llama-grammar.h +12 -3
  126. package/src/llama.cpp/src/llama-kv-cache.h +1 -0
  127. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  128. package/src/llama.cpp/src/llama-model.cpp +69 -5
  129. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  130. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  131. package/src/llama.cpp/src/llama.cpp +147 -0
  132. package/src/llama.cpp/tests/test-backend-ops.cpp +166 -110
  133. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  134. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  135. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  136. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  137. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  138. package/src/llama.cpp/common/chat.hpp +0 -55
  139. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
@@ -4,6 +4,7 @@
 
 #include <cmath>
 #include <unordered_map>
+#include <algorithm>
 
 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
@@ -159,16 +160,53 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
         GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
     } else {
-        std::vector<const char *> trigger_words;
-        trigger_words.reserve(params.grammar_trigger_words.size());
-        for (const auto & str : params.grammar_trigger_words) {
-            trigger_words.push_back(str.word.c_str());
+        std::vector<std::string> patterns_at_start;
+        std::vector<std::string> patterns_anywhere;
+        std::vector<llama_token> trigger_tokens;
+        for (const auto & trigger : params.grammar_triggers) {
+            switch (trigger.type) {
+                case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
+                {
+                    const auto & word = trigger.value;
+                    patterns_anywhere.push_back(regex_escape(word));
+                    break;
+                }
+                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
+                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START:
+                {
+                    const auto & pattern = trigger.value;
+                    (trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? patterns_at_start : patterns_anywhere).push_back(pattern);
+                    break;
+                }
+                case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
+                {
+                    const auto token = trigger.token;
+                    trigger_tokens.push_back(token);
+                    break;
+                }
+                default:
+                    GGML_ASSERT(false && "unknown trigger type");
+            }
+        }
+
+        std::vector<std::string> trigger_patterns;
+        if (!patterns_at_start.empty()) {
+            trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*");
+        }
+        if (!patterns_anywhere.empty()) {
+            trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
+        }
+
+        std::vector<const char *> trigger_patterns_c;
+        trigger_patterns_c.reserve(trigger_patterns.size());
+        for (const auto & regex : trigger_patterns) {
+            trigger_patterns_c.push_back(regex.c_str());
         }
 
         grmr = params.grammar_lazy
-            ? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
-                trigger_words.data(), trigger_words.size(),
-                params.grammar_trigger_tokens.data(), params.grammar_trigger_tokens.size())
+            ? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                trigger_patterns_c.data(), trigger_patterns_c.size(),
+                trigger_tokens.data(), trigger_tokens.size())
             : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
     }
 
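Taken together, the hunk above replaces the old flat trigger-word list with two anchored regex patterns (one for triggers required at the start of the output, one for triggers allowed anywhere) that are handed to `llama_sampler_init_grammar_lazy_patterns`. The sketch below is a minimal standalone illustration of that pattern assembly, not the library code itself: `regex_escape` and `string_join` are simplified stand-ins for the helpers in llama.cpp's common library, and the trigger strings are invented for the example.

```cpp
// Minimal standalone sketch of the trigger-pattern assembly shown in the hunk above.
// regex_escape/string_join are simplified stand-ins, not the llama.cpp implementations.
#include <iostream>
#include <regex>
#include <string>
#include <vector>

static std::string regex_escape(const std::string & s) {
    static const std::regex special(R"([.^$|()\[\]{}*+?\\])");
    return std::regex_replace(s, special, R"(\$&)");   // prefix every special char with '\'
}

static std::string string_join(const std::vector<std::string> & parts, const std::string & sep) {
    std::string out;
    for (size_t i = 0; i < parts.size(); ++i) {
        if (i > 0) out += sep;
        out += parts[i];
    }
    return out;
}

int main() {
    // Hypothetical trigger words; they are escaped and may match anywhere in the output.
    std::vector<std::string> patterns_anywhere = {
        regex_escape("<tool_call>"),
        regex_escape("[TOOL_REQUEST]"),   // '[' and ']' get escaped
    };

    // Same shape as the "anywhere" pattern built in common_sampler_init.
    const std::string trigger = "^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*";

    const std::string output = "Sure, let me look that up.\n<tool_call>{\"name\":\"search\"}</tool_call>";
    std::cout << "pattern: " << trigger << "\n";
    std::cout << "matches: " << std::boolalpha << std::regex_match(output, std::regex(trigger)) << "\n";
    return 0;
}
```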
@@ -5,6 +5,7 @@
 #include "sampling.h"
 
 #include <cstring>
+#include <algorithm>
 
 #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@@ -252,11 +253,6 @@ llama_tokens common_speculative_gen_draft(
         // add drafted token for each sequence
         const llama_token id = cur_p->data[0].id;
 
-        // only collect very high-confidence draft tokens
-        if (cur_p->data[0].p < params.p_min) {
-            break;
-        }
-
         common_sampler_accept(smpl, id, true);
 
         result.push_back(id);
@@ -265,6 +261,11 @@
             break;
         }
 
+        // only collect very high-confidence draft tokens
+        if (cur_p->data[0].p < params.p_min) {
+            break;
+        }
+
         common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
 
         // evaluate the drafted tokens on the draft model
@@ -9,7 +9,7 @@ struct common_speculative_params {
     int n_draft = 16; // max drafted tokens
     int n_reuse = 256;
 
-    float p_min = 0.9f; // min probability required to accept a token in the draft
+    float p_min = 0.75f; // min probability required to accept a token in the draft
 };
 
 struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
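The two speculative.cpp hunks move the p_min confidence check to after the drafted token has been accepted and recorded, and the header lowers the default threshold from 0.9 to 0.75. The toy loop below only illustrates the new ordering (it is not the llama.cpp implementation): the first below-threshold token is now still kept in the draft before drafting stops. The candidate probabilities are made up.

```cpp
// Toy illustration of the reordered draft loop above (not the llama.cpp code):
// the p_min check runs *after* the token has been recorded, so the first
// low-confidence token is still included in the draft before stopping.
#include <cstdio>
#include <vector>

struct draft_candidate { int id; float p; };

int main() {
    const float p_min   = 0.75f;           // new default from common_speculative_params
    const int   n_draft = 16;

    // Hypothetical per-step top candidates produced by a draft model.
    const std::vector<draft_candidate> steps = {
        { 101, 0.98f }, { 102, 0.91f }, { 103, 0.70f }, { 104, 0.95f },
    };

    std::vector<int> result;
    for (const auto & c : steps) {
        result.push_back(c.id);            // the token is recorded first

        if ((int) result.size() >= n_draft) {
            break;
        }
        if (c.p < p_min) {                 // then the confidence gate is applied
            break;
        }
    }

    printf("drafted %zu tokens:", result.size());
    for (int id : result) {
        printf(" %d", id);
    }
    printf("\n");                          // drafted 3 tokens: 101 102 103
    return 0;
}
```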
@@ -197,20 +197,52 @@ The following compilation options are also available to tweak performance:
 
 ## MUSA
 
-This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
+This provides GPU acceleration using a Moore Threads GPU. Make sure to have the [MUSA SDK](https://developer.mthreads.com/musa/musa-sdk) installed.
 
-- Using `CMake`:
+#### Download directly from Moore Threads
 
-  ```bash
-  cmake -B build -DGGML_MUSA=ON
+You may find the official downloads here: [Moore Threads developer site](https://developer.mthreads.com/sdk/download/musa).
+
+### Compilation
+
+```bash
+cmake -B build -DGGML_MUSA=ON
+cmake --build build --config Release
+```
+
+#### Override Compute Capability Specifications
+
+By default, all supported compute capabilities are enabled. To customize this behavior, you can specify the `MUSA_ARCHITECTURES` option in the CMake command:
+
+```bash
+cmake -B build -DGGML_MUSA=ON -DMUSA_ARCHITECTURES="21"
+```
+
+This configuration enables only compute capability `2.1` (MTT S80) during compilation, which can help reduce compilation time.
+
+#### Compilation options
+
+Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
+
+- For static builds, add `-DBUILD_SHARED_LIBS=OFF` and `-DCMAKE_POSITION_INDEPENDENT_CODE=ON`:
+  ```
+  cmake -B build -DGGML_MUSA=ON \
+    -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
   cmake --build build --config Release
   ```
 
-The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.
+### Runtime MUSA environmental variables
 
-The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
+You may set the [musa environmental variables](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) at runtime.
 
-Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
+```bash
+# Use `MUSA_VISIBLE_DEVICES` to hide the first compute device.
+MUSA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf
+```
+
+### Unified Memory
+
+The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
 
 ## HIP
 
@@ -227,6 +259,12 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
 However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
 
+To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.
+
+The rocWMMA library is included by default when installing the ROCm SDK using the `rocm` meta package provided by AMD. Alternatively, if you are not using the meta package, you can install the library using the `rocwmma-dev` or `rocwmma-devel` package, depending on your system's package manager.
+
+As an alternative, you can manually install the library by cloning it from the official [GitHub repository](https://github.com/ROCm/rocWMMA), checkout the corresponding version tag (e.g. `rocm-6.2.4`) and set `-DCMAKE_CXX_FLAGS="-I<path/to/rocwmma>/library/include/"` in CMake. This also works under Windows despite not officially supported by AMD.
+
 Note that if you get the following error:
 ```
 clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
@@ -394,6 +394,8 @@ static int prepare_entries(common_params & params, train_context & ctx_train) {
 int main(int argc, char ** argv) {
     common_params params;
 
+    params.out_file = "control_vector.gguf";
+
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
         return 1;
     }
@@ -498,7 +500,7 @@ int main(int argc, char ** argv) {
     }
 
     // write output vectors to gguf
-    export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
+    export_gguf(ctx_train.v_final, params.out_file, model_hint);
 
     llama_backend_free();
 
@@ -4,6 +4,7 @@
 #include "llama.h"
 
 #include <ctime>
+#include <algorithm>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -413,20 +413,22 @@ static void print_usage(int, char ** argv) {
 int main(int argc, char ** argv) {
     common_params params;
 
+    params.out_file = "ggml-lora-merged-f16.gguf";
+
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
         return 1;
     }
 
     g_verbose = (params.verbosity > 1);
     try {
-        lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
+        lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
         ctx.run_merge();
     } catch (const std::exception & err) {
         fprintf(stderr, "%s\n", err.what());
         exit(EXIT_FAILURE);
     }
 
-    printf("done, output file is %s\n", params.lora_outfile.c_str());
+    printf("done, output file is %s\n", params.out_file.c_str());
 
     return 0;
 }
@@ -206,9 +206,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 
 void IMatrixCollector::save_imatrix(int ncall) const {
     auto fname = m_params.out_file;
-    if (fname.empty()) {
-        fname = "imatrix.dat";
-    }
 
     if (ncall > 0) {
         fname += ".at_";
@@ -583,6 +580,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
 int main(int argc, char ** argv) {
     common_params params;
 
+    params.out_file = "imatrix.dat" ;
+
     params.n_ctx = 512;
     params.logits_all = true;
     params.escape = false;
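The cvector-generator, export-lora and imatrix hunks all follow the same pattern: the tool-specific default is written into the shared `params.out_file` before argument parsing, so a user-supplied output flag can still override it and later code only ever reads `out_file`. A minimal sketch of that pattern, with `parse_out_file_flag` standing in (hypothetically) for `common_params_parse`:

```cpp
// Sketch of the shared out_file pattern used by the three examples above.
// parse_out_file_flag is a deliberately tiny, hypothetical stand-in for the
// real argument parser; only the default-then-override flow is shown.
#include <cstdio>
#include <cstring>
#include <string>

struct tool_params {
    std::string out_file;   // mirrors common_params::out_file
};

static bool parse_out_file_flag(int argc, char ** argv, tool_params & params) {
    for (int i = 1; i < argc - 1; ++i) {
        if (std::strcmp(argv[i], "-o") == 0) {
            params.out_file = argv[i + 1];  // a user-supplied path wins over the default
        }
    }
    return true;
}

int main(int argc, char ** argv) {
    tool_params params;

    params.out_file = "imatrix.dat";        // tool-specific default, set before parsing

    if (!parse_out_file_flag(argc, argv, params)) {
        return 1;
    }

    printf("writing output to %s\n", params.out_file.c_str());
    return 0;
}
```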
@@ -361,7 +361,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
     const auto tokens_list = common_tokenize(context, text, true, parse_special);
 
     auto n_ctx = llama_n_ctx(context);
-    auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
+    auto n_kv_req = tokens_list.size() + n_len;
 
     LOGi("n_len = %d, n_ctx = %d, n_kv_req = %d", n_len, n_ctx, n_kv_req);
 
 
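A quick arithmetic note on the llama-android change above: the old expression `tokens_list.size() + (n_len - tokens_list.size())` cancels out to just `n_len`, so the KV-cache requirement ignored the prompt length; the new form reserves room for the prompt plus the completion. A tiny check with made-up numbers:

```cpp
// Arithmetic check of the n_kv_req change above; the values are invented.
#include <cstdio>

int main() {
    const size_t prompt_tokens = 10;   // tokens_list.size()
    const size_t n_len         = 64;   // requested completion length

    const size_t old_n_kv_req = prompt_tokens + (n_len - prompt_tokens); // == 64
    const size_t new_n_kv_req = prompt_tokens + n_len;                   // == 74

    printf("old: %zu, new: %zu\n", old_n_kv_req, new_n_kv_req);
    return 0;
}
```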
@@ -51,6 +51,13 @@ install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
+set(TARGET llama-gemma3-cli)
+add_executable(${TARGET} gemma3-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-gemma3-cli)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
 set(TARGET llama-llava-clip-quantize-cli)
 add_executable(${TARGET} clip-quantize-cli.cpp)
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-clip-quantize-cli)