@fugood/llama.node 0.3.13 → 0.3.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +1 -1
- package/src/LlamaContext.cpp +98 -76
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +60 -10
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +3 -3
- package/src/llama.cpp/common/arg.cpp +112 -11
- package/src/llama.cpp/common/chat.cpp +960 -266
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +27 -171
- package/src/llama.cpp/common/common.h +27 -67
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +45 -7
- package/src/llama.cpp/common/speculative.cpp +6 -5
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +45 -7
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -3
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
- package/src/llama.cpp/examples/main/main.cpp +73 -28
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +110 -67
- package/src/llama.cpp/examples/server/server.cpp +82 -87
- package/src/llama.cpp/examples/server/utils.hpp +94 -107
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +251 -142
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
- package/src/llama.cpp/ggml/include/ggml.h +5 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1396 -386
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1432 -151
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +220 -116
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +168 -721
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +146 -42
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
- package/src/llama.cpp/ggml/src/ggml.c +8 -3
- package/src/llama.cpp/include/llama.h +19 -5
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +21 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +182 -182
- package/src/llama.cpp/src/llama-grammar.h +12 -3
- package/src/llama.cpp/src/llama-kv-cache.h +1 -0
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-model.cpp +69 -5
- package/src/llama.cpp/src/llama-sampling.cpp +43 -10
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +147 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +166 -110
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +593 -395
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -55
- /package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/common/sampling.cpp:

@@ -4,6 +4,7 @@
 
 #include <cmath>
 #include <unordered_map>
+#include <algorithm>
 
 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
@@ -159,16 +160,53 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
         GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
     } else {
-        std::vector<
-
-
-
+        std::vector<std::string> patterns_at_start;
+        std::vector<std::string> patterns_anywhere;
+        std::vector<llama_token> trigger_tokens;
+        for (const auto & trigger : params.grammar_triggers) {
+            switch (trigger.type) {
+                case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
+                {
+                    const auto & word = trigger.value;
+                    patterns_anywhere.push_back(regex_escape(word));
+                    break;
+                }
+                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
+                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START:
+                {
+                    const auto & pattern = trigger.value;
+                    (trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? patterns_at_start : patterns_anywhere).push_back(pattern);
+                    break;
+                }
+                case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
+                {
+                    const auto token = trigger.token;
+                    trigger_tokens.push_back(token);
+                    break;
+                }
+                default:
+                    GGML_ASSERT(false && "unknown trigger type");
+            }
+        }
+
+        std::vector<std::string> trigger_patterns;
+        if (!patterns_at_start.empty()) {
+            trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*");
+        }
+        if (!patterns_anywhere.empty()) {
+            trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
+        }
+
+        std::vector<const char *> trigger_patterns_c;
+        trigger_patterns_c.reserve(trigger_patterns.size());
+        for (const auto & regex : trigger_patterns) {
+            trigger_patterns_c.push_back(regex.c_str());
         }
 
         grmr = params.grammar_lazy
-            ?
-
-
+            ? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                  trigger_patterns_c.data(), trigger_patterns_c.size(),
+                  trigger_tokens.data(), trigger_tokens.size())
             : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
     }
 
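The rewritten lazy-grammar setup in `common_sampler_init` folds every WORD and PATTERN trigger into at most two regex strings: one anchored to the very start of the output (`^(...)[\s\S]*`, from `patterns_at_start`) and one allowed to match anywhere (`^[\s\S]*?(...)[\s\S]*`, from `patterns_anywhere`), which are then handed to `llama_sampler_init_grammar_lazy_patterns` together with the trigger tokens. The standalone sketch below is not part of the package; the trigger word and test strings are invented, and `std::regex` merely stands in for the sampler's own matcher to show why a PATTERN_START trigger only fires on output that begins with the trigger:

```cpp
#include <cstdio>
#include <regex>
#include <string>

int main() {
    // hypothetical trigger word, e.g. a tool-call opener registered as a WORD trigger
    const std::string word = "<tool_call>";

    // the two pattern shapes built by common_sampler_init
    const std::regex at_start("^(" + word + ")[\\s\\S]*");           // PATTERN_START triggers
    const std::regex anywhere("^[\\s\\S]*?(" + word + ")[\\s\\S]*");  // WORD / PATTERN triggers

    const std::string a = "<tool_call>{\"name\":\"get_weather\"}";
    const std::string b = "Sure, calling a tool now.\n<tool_call>{\"name\":\"get_weather\"}";

    std::printf("at_start matches a: %d\n", (int) std::regex_match(a, at_start)); // 1
    std::printf("at_start matches b: %d\n", (int) std::regex_match(b, at_start)); // 0: prose before the trigger
    std::printf("anywhere matches b: %d\n", (int) std::regex_match(b, anywhere)); // 1
    return 0;
}
```

In the real code the WORD value is passed through `regex_escape` first, so literal trigger words cannot be misread as regex syntax.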
package/src/llama.cpp/common/speculative.cpp:

@@ -5,6 +5,7 @@
 #include "sampling.h"
 
 #include <cstring>
+#include <algorithm>
 
 #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@@ -252,11 +253,6 @@ llama_tokens common_speculative_gen_draft(
         // add drafted token for each sequence
         const llama_token id = cur_p->data[0].id;
 
-        // only collect very high-confidence draft tokens
-        if (cur_p->data[0].p < params.p_min) {
-            break;
-        }
-
         common_sampler_accept(smpl, id, true);
 
         result.push_back(id);
@@ -265,6 +261,11 @@ llama_tokens common_speculative_gen_draft(
             break;
         }
 
+        // only collect very high-confidence draft tokens
+        if (cur_p->data[0].p < params.p_min) {
+            break;
+        }
+
         common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
 
         // evaluate the drafted tokens on the draft model
package/src/llama.cpp/common/speculative.h:

@@ -9,7 +9,7 @@ struct common_speculative_params {
     int n_draft = 16; // max drafted tokens
     int n_reuse = 256;
 
-    float p_min = 0.
+    float p_min = 0.75f; // min probability required to accept a token in the draft
 };
 
 struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
package/src/llama.cpp/docs/build.md:

@@ -197,20 +197,52 @@ The following compilation options are also available to tweak performance:
 
 ## MUSA
 
-This provides GPU acceleration using
+This provides GPU acceleration using a Moore Threads GPU. Make sure to have the [MUSA SDK](https://developer.mthreads.com/musa/musa-sdk) installed.
 
-
+#### Download directly from Moore Threads
 
-
-
+You may find the official downloads here: [Moore Threads developer site](https://developer.mthreads.com/sdk/download/musa).
+
+### Compilation
+
+```bash
+cmake -B build -DGGML_MUSA=ON
+cmake --build build --config Release
+```
+
+#### Override Compute Capability Specifications
+
+By default, all supported compute capabilities are enabled. To customize this behavior, you can specify the `MUSA_ARCHITECTURES` option in the CMake command:
+
+```bash
+cmake -B build -DGGML_MUSA=ON -DMUSA_ARCHITECTURES="21"
+```
+
+This configuration enables only compute capability `2.1` (MTT S80) during compilation, which can help reduce compilation time.
+
+#### Compilation options
+
+Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
+
+- For static builds, add `-DBUILD_SHARED_LIBS=OFF` and `-DCMAKE_POSITION_INDEPENDENT_CODE=ON`:
+  ```
+  cmake -B build -DGGML_MUSA=ON \
+    -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
   cmake --build build --config Release
   ```
 
-
+### Runtime MUSA environmental variables
 
-
+You may set the [musa environmental variables](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) at runtime.
 
-
+```bash
+# Use `MUSA_VISIBLE_DEVICES` to hide the first compute device.
+MUSA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf
+```
+
+### Unified Memory
+
+The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
 
 ## HIP
 
@@ -227,6 +259,12 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
 However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
 
+To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.
+
+The rocWMMA library is included by default when installing the ROCm SDK using the `rocm` meta package provided by AMD. Alternatively, if you are not using the meta package, you can install the library using the `rocwmma-dev` or `rocwmma-devel` package, depending on your system's package manager.
+
+As an alternative, you can manually install the library by cloning it from the official [GitHub repository](https://github.com/ROCm/rocWMMA), checkout the corresponding version tag (e.g. `rocm-6.2.4`) and set `-DCMAKE_CXX_FLAGS="-I<path/to/rocwmma>/library/include/"` in CMake. This also works under Windows despite not officially supported by AMD.
+
 Note that if you get the following error:
 ```
 clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp:

@@ -394,6 +394,8 @@ static int prepare_entries(common_params & params, train_context & ctx_train) {
 int main(int argc, char ** argv) {
     common_params params;
 
+    params.out_file = "control_vector.gguf";
+
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
         return 1;
     }
@@ -498,7 +500,7 @@ int main(int argc, char ** argv) {
     }
 
     // write output vectors to gguf
-    export_gguf(ctx_train.v_final, params.
+    export_gguf(ctx_train.v_final, params.out_file, model_hint);
 
     llama_backend_free();
 
package/src/llama.cpp/examples/export-lora/export-lora.cpp:

@@ -413,20 +413,22 @@ static void print_usage(int, char ** argv) {
 int main(int argc, char ** argv) {
     common_params params;
 
+    params.out_file = "ggml-lora-merged-f16.gguf";
+
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
         return 1;
     }
 
     g_verbose = (params.verbosity > 1);
     try {
-        lora_merge_ctx ctx(params.model, params.lora_adapters, params.
+        lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
         ctx.run_merge();
     } catch (const std::exception & err) {
         fprintf(stderr, "%s\n", err.what());
         exit(EXIT_FAILURE);
     }
 
-    printf("done, output file is %s\n", params.
+    printf("done, output file is %s\n", params.out_file.c_str());
 
     return 0;
 }
package/src/llama.cpp/examples/imatrix/imatrix.cpp:

@@ -206,9 +206,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 
 void IMatrixCollector::save_imatrix(int ncall) const {
     auto fname = m_params.out_file;
-    if (fname.empty()) {
-        fname = "imatrix.dat";
-    }
 
     if (ncall > 0) {
         fname += ".at_";
@@ -583,6 +580,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
 int main(int argc, char ** argv) {
     common_params params;
 
+    params.out_file = "imatrix.dat" ;
+
     params.n_ctx = 512;
     params.logits_all = true;
     params.escape = false;
package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp:

@@ -361,7 +361,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
     const auto tokens_list = common_tokenize(context, text, true, parse_special);
 
     auto n_ctx = llama_n_ctx(context);
-    auto n_kv_req = tokens_list.size() +
+    auto n_kv_req = tokens_list.size() + n_len;
 
     LOGi("n_len = %d, n_ctx = %d, n_kv_req = %d", n_len, n_ctx, n_kv_req);
 
package/src/llama.cpp/examples/llava/CMakeLists.txt:

@@ -51,6 +51,13 @@ install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
+set(TARGET llama-gemma3-cli)
+add_executable(${TARGET} gemma3-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-gemma3-cli)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
 set(TARGET llama-llava-clip-quantize-cli)
 add_executable(${TARGET} clip-quantize-cli.cpp)
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-clip-quantize-cli)