@fugood/llama.node 1.1.10 → 1.2.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. package/CMakeLists.txt +5 -8
  2. package/lib/binding.ts +20 -2
  3. package/lib/index.js +2 -2
  4. package/lib/index.ts +2 -2
  5. package/package.json +20 -16
  6. package/src/DecodeAudioTokenWorker.cpp +23 -26
  7. package/src/DecodeAudioTokenWorker.h +6 -8
  8. package/src/DetokenizeWorker.cpp +5 -8
  9. package/src/DetokenizeWorker.h +6 -5
  10. package/src/DisposeWorker.cpp +23 -3
  11. package/src/DisposeWorker.h +4 -2
  12. package/src/EmbeddingWorker.cpp +9 -35
  13. package/src/EmbeddingWorker.h +3 -2
  14. package/src/LlamaCompletionWorker.cpp +217 -315
  15. package/src/LlamaCompletionWorker.h +6 -12
  16. package/src/LlamaContext.cpp +174 -388
  17. package/src/LlamaContext.h +8 -13
  18. package/src/LoadSessionWorker.cpp +22 -19
  19. package/src/LoadSessionWorker.h +3 -2
  20. package/src/RerankWorker.h +3 -2
  21. package/src/SaveSessionWorker.cpp +22 -19
  22. package/src/SaveSessionWorker.h +3 -2
  23. package/src/TokenizeWorker.cpp +38 -35
  24. package/src/TokenizeWorker.h +12 -3
  25. package/src/common.hpp +0 -458
  26. package/src/llama.cpp/common/arg.cpp +67 -37
  27. package/src/llama.cpp/common/chat.cpp +263 -2
  28. package/src/llama.cpp/common/chat.h +4 -0
  29. package/src/llama.cpp/common/common.cpp +10 -3
  30. package/src/llama.cpp/common/common.h +5 -2
  31. package/src/llama.cpp/common/log.cpp +53 -2
  32. package/src/llama.cpp/common/log.h +10 -4
  33. package/src/llama.cpp/common/sampling.cpp +23 -2
  34. package/src/llama.cpp/common/sampling.h +3 -1
  35. package/src/llama.cpp/common/speculative.cpp +1 -1
  36. package/src/llama.cpp/ggml/CMakeLists.txt +4 -3
  37. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -0
  38. package/src/llama.cpp/ggml/include/ggml-cpu.h +0 -1
  39. package/src/llama.cpp/ggml/include/ggml.h +50 -1
  40. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +19 -16
  41. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
  42. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -7
  43. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +11 -37
  44. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -4
  45. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
  46. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
  47. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +18 -18
  48. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +234 -16
  50. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  51. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +80 -51
  52. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +161 -20
  53. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +399 -50
  54. package/src/llama.cpp/include/llama.h +32 -7
  55. package/src/llama.cpp/src/llama-adapter.cpp +101 -4
  56. package/src/llama.cpp/src/llama-adapter.h +6 -0
  57. package/src/llama.cpp/src/llama-arch.cpp +69 -2
  58. package/src/llama.cpp/src/llama-arch.h +6 -0
  59. package/src/llama.cpp/src/llama-context.cpp +92 -45
  60. package/src/llama.cpp/src/llama-context.h +1 -5
  61. package/src/llama.cpp/src/llama-graph.cpp +74 -19
  62. package/src/llama.cpp/src/llama-graph.h +10 -1
  63. package/src/llama.cpp/src/llama-hparams.cpp +37 -0
  64. package/src/llama.cpp/src/llama-hparams.h +9 -3
  65. package/src/llama.cpp/src/llama-impl.h +2 -0
  66. package/src/llama.cpp/src/llama-kv-cache.cpp +33 -120
  67. package/src/llama.cpp/src/llama-kv-cache.h +4 -13
  68. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  69. package/src/llama.cpp/src/llama-model.cpp +434 -21
  70. package/src/llama.cpp/src/llama-model.h +1 -1
  71. package/src/llama.cpp/src/llama-sampling.cpp +226 -126
  72. package/src/llama.cpp/src/llama-vocab.cpp +1 -1
  73. package/src/llama.cpp/src/llama.cpp +12 -0
  74. package/src/anyascii.c +0 -22223
  75. package/src/anyascii.h +0 -42
  76. package/src/tts_utils.cpp +0 -371
  77. package/src/tts_utils.h +0 -103
package/src/common.hpp CHANGED
@@ -4,16 +4,9 @@
  #include "common/common.h"
  #include "common/sampling.h"
  #include "llama.h"
- #include "tools/mtmd/clip.h"
- #include "tools/mtmd/mtmd.h"
- #include "tools/mtmd/mtmd-helper.h"
  #include <memory>
- #include <mutex>
  #include <napi.h>
  #include <string>
- #include <thread>
- #include <tuple>
- #include <vector>

  typedef std::unique_ptr<common_sampler, decltype(&common_sampler_free)>
  LlamaCppSampling;
@@ -64,455 +57,4 @@ constexpr T get_option(const Napi::Object &options, const std::string &name,
  } else {
  return default_value;
  }
- }
-
- class LlamaSession {
- public:
- LlamaSession(common_params params) : params_(params) {
- llama_init_ = common_init_from_params(params);
- tokens_.reserve(params.n_ctx);
- }
-
- ~LlamaSession() { dispose(); }
-
- inline llama_context *context() { return llama_init_.context.get(); }
-
- inline llama_model *model() { return llama_init_.model.get(); }
-
- inline std::vector<llama_token> *tokens_ptr() { return &tokens_; }
-
- inline void set_tokens(std::vector<llama_token> tokens) {
- tokens_ = std::move(tokens);
- }
-
- inline std::vector<std::string> *mtmd_bitmap_past_hashes_ptr() {
- return &mtmd_bitmap_past_hashes_;
- }
-
- inline void set_mtmd_bitmap_past_hashes(std::vector<std::string> hashes) {
- mtmd_bitmap_past_hashes_ = std::move(hashes);
- }
-
- inline const common_params &params() const { return params_; }
-
- inline std::mutex &get_mutex() { return mutex; }
-
- // Getter for the multimodal context
- inline mtmd_context *get_mtmd_ctx() { return _mtmd_ctx; }
-
- // Setter for the multimodal context
- inline void set_mtmd_ctx(mtmd_context *ctx) { _mtmd_ctx = ctx; }
-
- void dispose() {
- std::lock_guard<std::mutex> lock(mutex);
- tokens_.clear();
-
- // mtmd_ctx is owned by LlamaContext, so we don't free it here
- _mtmd_ctx = nullptr;
- }
-
- private:
- common_init_result llama_init_;
- const common_params params_;
- std::vector<llama_token> tokens_{};
- std::vector<std::string> mtmd_bitmap_past_hashes_{};
- std::mutex mutex;
- mtmd_context *_mtmd_ctx = nullptr;
- };
-
- typedef std::shared_ptr<LlamaSession> LlamaSessionPtr;
-
- static size_t common_tokens_part(const std::vector<llama_token> &a,
- const std::vector<llama_token> &b) {
- size_t i = 0;
- while (i < a.size() && i < b.size() && a[i] == b[i]) {
- i++;
- }
- return i;
- }
-
- // Computes FNV-1a hash of the data
- static std::string fnv_hash(const uint8_t *data, size_t len) {
- const uint64_t fnv_prime = 0x100000001b3ULL;
- uint64_t hash = 0xcbf29ce484222325ULL;
-
- for (size_t i = 0; i < len; ++i) {
- hash ^= data[i];
- hash *= fnv_prime;
- }
- return std::to_string(hash);
- }
-
- static const std::string base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
- "abcdefghijklmnopqrstuvwxyz"
- "0123456789+/";
-
- // Base64 decoding function
- static std::vector<uint8_t> base64_decode(const std::string &encoded_string) {
- std::vector<uint8_t> decoded;
- int in_len = encoded_string.size();
- int i = 0;
- int j = 0;
- int in_ = 0;
- unsigned char char_array_4[4], char_array_3[3];
-
- while (in_len-- && (encoded_string[in_] != '=')) {
- if (isspace(encoded_string[in_])) {
- in_++;
- continue;
- }
-
- if (encoded_string[in_] == '=' ||
- base64_chars.find(encoded_string[in_]) == std::string::npos) {
- break;
- }
-
- char_array_4[i++] = encoded_string[in_];
- in_++;
- if (i == 4) {
- for (i = 0; i < 4; i++) {
- char_array_4[i] = base64_chars.find(char_array_4[i]);
- }
-
- char_array_3[0] =
- (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
- char_array_3[1] =
- ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
- char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
-
- for (i = 0; i < 3; i++) {
- decoded.push_back(char_array_3[i]);
- }
- i = 0;
- }
- }
-
- if (i) {
- for (j = i; j < 4; j++) {
- char_array_4[j] = 0;
- }
-
- for (j = 0; j < 4; j++) {
- char_array_4[j] = base64_chars.find(char_array_4[j]);
- }
-
- char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
- char_array_3[1] =
- ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
- char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
-
- for (j = 0; j < i - 1; j++) {
- decoded.push_back(char_array_3[j]);
- }
- }
-
- return decoded;
- }
-
- struct TokenizeResult {
- std::vector<llama_token> tokens;
-
- bool has_media = false;
- std::vector<std::string> bitmap_hashes;
- std::vector<size_t> chunk_pos; // both text and media
- std::vector<size_t> chunk_pos_media; // media only
- mtmd_input_chunks *chunks = nullptr;
- };
-
- static TokenizeResult
- tokenizeWithMedia(mtmd_context *mtmd_ctx, const std::string &prompt,
- const std::vector<std::string> &media_paths) {
- if (mtmd_ctx == nullptr) {
- throw std::runtime_error("Multimodal context is not initialized");
- }
-
- TokenizeResult result;
- result.has_media = !media_paths.empty();
-
- mtmd::bitmaps bitmaps;
-
- // Load all media paths
- for (const auto &media_path : media_paths) {
- fprintf(
- stdout, "[DEBUG] Loading media: %s\n",
- media_path.substr(0, 50).c_str()); // Only log part of path for base64
-
- // Check if it's a base64 media
- if (media_path.compare(0, 11, "data:image/") == 0 ||
- media_path.compare(0, 11, "data:audio/") == 0) {
-
- // Parse base64 data
- std::vector<std::string> parts;
- size_t comma_pos = media_path.find(',');
- if (comma_pos == std::string::npos) {
- result.bitmap_hashes.clear();
- throw std::runtime_error(
- "Invalid base64 media format, missing comma separator");
- }
-
- std::string header = media_path.substr(0, comma_pos);
- std::string base64_data = media_path.substr(comma_pos + 1);
-
- if (header.find("base64") == std::string::npos) {
- result.bitmap_hashes.clear();
- throw std::runtime_error("Invalid base64 media");
- }
-
- // Decode base64
- try {
- // Decode base64 to binary
- std::vector<uint8_t> media_data = base64_decode(base64_data);
-
- // Load bitmap from memory buffer using direct initialization
- mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mtmd_ctx, media_data.data(),
- media_data.size()));
- if (!bmp.ptr) {
- bitmaps.entries.clear();
- throw std::runtime_error("Failed to load base64 media");
- }
-
- // Calculate bitmap hash (for KV caching)
- std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
- bmp.set_id(hash.c_str());
- bitmaps.entries.push_back(std::move(bmp));
- result.bitmap_hashes.push_back(hash.c_str());
- } catch (const std::exception &e) {
- bitmaps.entries.clear();
- throw std::runtime_error("Failed to decode base64 media");
- }
- } else if (media_path.compare(0, 7, "http://") == 0 ||
- media_path.compare(0, 8, "https://") == 0) {
- // HTTP URLs are not supported yet
- bitmaps.entries.clear();
- throw std::runtime_error("HTTP/HTTPS URLs are not supported yet");
- } else {
- // Regular file path
- // Check if file exists
- FILE *file = fopen(media_path.c_str(), "rb");
- if (file == nullptr) {
- bitmaps.entries.clear();
- throw std::runtime_error("File does not exist or cannot be opened");
- }
-
- // Get file size
- fseek(file, 0, SEEK_END);
- long file_size = ftell(file);
- fseek(file, 0, SEEK_SET);
- fclose(file);
-
- // Create bitmap directly
- mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(mtmd_ctx, media_path.c_str()));
- if (!bmp.ptr) {
- bitmaps.entries.clear();
- throw std::runtime_error("Failed to load media");
- }
-
- // Calculate bitmap hash (for KV caching)
- std::string hash = fnv_hash(bmp.data(), bmp.nx() * bmp.ny() * 3);
- bmp.set_id(hash.c_str());
- bitmaps.entries.push_back(std::move(bmp));
- result.bitmap_hashes.push_back(hash.c_str());
- }
- }
-
- result.chunks = mtmd_input_chunks_init();
- if (result.chunks == nullptr) {
- bitmaps.entries.clear();
- throw std::runtime_error("Failed to initialize input chunks");
- }
-
- // Create input text
- mtmd_input_text input_text;
- input_text.text = prompt.c_str(); // Use the full prompt with media marker
- input_text.add_special = true; // Add BOS token if this is the first message
- input_text.parse_special = true; // Parse special tokens like <__media__>
-
- // Tokenize the text and media
- fprintf(stdout, "[DEBUG] Tokenizing text and %zu media\n",
- bitmaps.entries.size());
- auto bitmaps_c_ptr = bitmaps.c_ptr();
-
- // Cast away const for mtmd_tokenize
- int32_t res =
- mtmd_tokenize(const_cast<mtmd_context *>(mtmd_ctx), result.chunks,
- &input_text, bitmaps_c_ptr.data(), bitmaps_c_ptr.size());
-
- if (res != 0) {
- mtmd_input_chunks_free(result.chunks);
- bitmaps.entries.clear();
- throw std::runtime_error("Failed to tokenize text and media");
- }
-
- // Log chunk information
- size_t num_chunks = mtmd_input_chunks_size(result.chunks);
- fprintf(stdout, "[DEBUG] Tokenization successful: num_chunks=%zu\n",
- num_chunks);
-
- // Track the total number of tokens (both text and media)
- size_t total_token_count = 0;
-
- // chunk pos
- for (size_t i = 0; i < num_chunks; i++) {
- result.chunk_pos.push_back(total_token_count);
-
- const mtmd_input_chunk *chunk = mtmd_input_chunks_get(result.chunks, i);
- mtmd_input_chunk_type chunk_type = mtmd_input_chunk_get_type(chunk);
-
- if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
- size_t n_tokens;
- const llama_token *tokens =
- mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
-
- result.tokens.insert(result.tokens.end(), tokens, tokens + n_tokens);
- total_token_count += n_tokens;
- } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ||
- chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
- result.chunk_pos_media.push_back(total_token_count);
-
- size_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk);
- size_t n_pos = mtmd_input_chunk_get_n_pos(chunk);
- fprintf(stdout, "[DEBUG] Chunk %zu: type=%s, n_tokens=%zu, n_pos=%zu\n",
- i, chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "IMAGE" : "AUDIO",
- n_tokens, n_pos);
-
- for (size_t j = 0; j < n_pos; j++) {
- result.tokens.push_back(LLAMA_TOKEN_NULL);
- }
- total_token_count += n_pos;
- }
- }
-
- bitmaps.entries.clear();
-
- return result;
- }
-
- // Process media and add them to the tokenized input
- static llama_pos
- processMediaPrompt(llama_context *ctx, mtmd_context *mtmd_ctx,
- LlamaSessionPtr sess, const common_params &params,
- const std::vector<std::string> &media_paths) {
- if (mtmd_ctx == nullptr) {
- throw std::runtime_error("Multimodal context is not initialized");
- }
-
- // Multimodal path
- std::string full_prompt = params.prompt;
- auto default_media_marker = mtmd_default_marker();
- // Add media marker if it doesn't already exist
- if (full_prompt.find(default_media_marker) == std::string::npos) {
- full_prompt += " ";
- full_prompt += default_media_marker;
- }
-
- auto result = tokenizeWithMedia(mtmd_ctx, full_prompt, media_paths);
-
- auto all_tokens = result.tokens;
- auto chunks = result.chunks;
- auto chunk_pos = result.chunk_pos;
- auto chunk_pos_media = result.chunk_pos_media;
- auto bitmap_hashes = result.bitmap_hashes;
-
- llama_pos n_past = common_tokens_part(*sess->tokens_ptr(), all_tokens);
-
- llama_pos new_n_past = n_past;
-
- // Adjust n_past to position of the text chunk
- // TODO: Edit the text chunk to remove the tokens before n_past to speed up
- // need to update the mtmd api
- auto adjusted_n_past = -1;
- for (size_t i = 0; i < chunk_pos.size(); i++) {
- if (n_past < chunk_pos[i]) {
- break;
- }
- bool is_end = i + 1 == chunk_pos.size();
- if (chunk_pos[i] < n_past && (!is_end && chunk_pos[i + 1] > n_past)
- // is_end & n_past < total_token_count:
- // don't need to adjust and it will skip eval_chunk_single, let
- // nextToken() to finish the job
- ) {
- adjusted_n_past = chunk_pos[i];
- }
- }
- if (adjusted_n_past != -1) {
- n_past = adjusted_n_past;
- new_n_past = n_past;
- fprintf(stdout, "[DEBUG] Adjusted n_past to %d\n", n_past);
- }
-
- // Compare bitmap hashes, if they are not the same, backtrack n_past to the
- // position of the first mismatch
- auto mtmd_bitmap_past_hashes = sess->mtmd_bitmap_past_hashes_ptr();
- if (mtmd_bitmap_past_hashes->size() > 0) {
- for (size_t i = 0; i < bitmap_hashes.size(); i++) {
- auto pos = chunk_pos_media[i];
- if (n_past < pos) {
- break;
- }
- if (i >= mtmd_bitmap_past_hashes->size()) {
- break;
- }
- if (bitmap_hashes[i] != (*mtmd_bitmap_past_hashes)[i]) {
- n_past = chunk_pos_media[i];
- new_n_past = n_past;
- break;
- }
- }
- }
-
- // Clear all KV cache entries after position n_past
- auto * kv = llama_get_memory(ctx);
- bool clear_result = llama_memory_seq_rm(kv, 0, n_past, -1);
- if (!clear_result) {
- fprintf(stdout, "[DEBUG] llama_memory_seq_rm failed (likely using a non-Transformer model)! Trying full clear...");
- llama_memory_clear(kv, false);
- n_past = 0;
- new_n_past = n_past;
- }
-
- size_t num_chunks = mtmd_input_chunks_size(chunks);
-
- for (size_t i = 0; i < chunk_pos.size(); i++) {
- fprintf(stdout, "[DEBUG] Evaluating chunk %zu: n_past=%d, chunk_pos=%zu\n",
- i, n_past, chunk_pos[i]);
-
- // Process chunk only if it's after the current n_past
- if (chunk_pos[i] >= new_n_past) {
- bool chunk_logits_last = (i == num_chunks - 1);
- auto chunk = mtmd_input_chunks_get(chunks, i);
-
- // Cast away const for mtmd_helper_eval_chunk_single
- int32_t res = mtmd_helper_eval_chunk_single(
- const_cast<mtmd_context *>(mtmd_ctx), ctx, chunk, n_past, 0,
- params.n_batch, // batch size
- chunk_logits_last, &new_n_past);
-
- if (res != 0) {
- mtmd_input_chunks_free(chunks);
- throw std::runtime_error("Failed to process chunk");
- }
- n_past = new_n_past;
- }
- }
-
- if (n_past == all_tokens.size() && n_past > 0 &&
- all_tokens[n_past - 1] != LLAMA_TOKEN_NULL) {
- // we have to evaluate at least 1 token to generate logits.
- n_past--;
- }
-
- // Update sampling context to process token sequences
- for (auto &token : all_tokens) {
- if (token == LLAMA_TOKEN_NULL) {
- continue;
- }
- }
- // Set the tokens
- sess->set_tokens(std::move(all_tokens));
-
- sess->set_mtmd_bitmap_past_hashes(bitmap_hashes);
-
- // Clean up media resources
- mtmd_input_chunks_free(chunks);
- return n_past;
  }
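For orientation, two of the helpers removed above are small enough to read in isolation: common_tokens_part() returns the length of the token prefix shared by the cached prompt and the new prompt (which decides how much of the KV cache can be reused), and fnv_hash() is a plain 64-bit FNV-1a hash used to identify media bitmaps for that cache check. A minimal standalone sketch of the two routines, with llama_token replaced by int so it compiles without llama.h (illustrative only, not code from the package):

    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <vector>

    // 64-bit FNV-1a, same constants as the removed fnv_hash() helper.
    static std::string fnv_hash(const uint8_t *data, size_t len) {
        uint64_t hash = 0xcbf29ce484222325ULL;
        for (size_t i = 0; i < len; ++i) {
            hash ^= data[i];
            hash *= 0x100000001b3ULL;
        }
        return std::to_string(hash);
    }

    // Length of the shared prefix of two token sequences, as in common_tokens_part().
    static size_t common_tokens_part(const std::vector<int> &a, const std::vector<int> &b) {
        size_t i = 0;
        while (i < a.size() && i < b.size() && a[i] == b[i]) {
            i++;
        }
        return i;
    }

    int main() {
        const std::string bytes = "example media bytes";
        printf("hash = %s\n", fnv_hash((const uint8_t *) bytes.data(), bytes.size()).c_str());

        std::vector<int> cached   = {1, 2, 3, 4};
        std::vector<int> incoming = {1, 2, 9};
        printf("reusable prefix = %zu tokens\n", common_tokens_part(cached, incoming)); // prints 2
        return 0;
    }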
package/src/llama.cpp/common/arg.cpp CHANGED
@@ -1106,7 +1106,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
  printf("\"\n\n");

  printf(" case \"$prev\" in\n");
- printf(" --model)\n");
+ printf(" --model|-m)\n");
  printf(" COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
  printf(" return 0\n");
  printf(" ;;\n");
@@ -1263,6 +1263,18 @@ static std::string list_builtin_chat_templates() {
  return msg.str();
  }

+ static bool is_truthy(const std::string & value) {
+ return value == "on" || value == "enabled" || value == "1";
+ }
+
+ static bool is_falsey(const std::string & value) {
+ return value == "off" || value == "disabled" || value == "0";
+ }
+
+ static bool is_autoy(const std::string & value) {
+ return value == "auto" || value == "-1";
+ }
+
  common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
  // load dynamic backends
  ggml_backend_load_all();
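The is_truthy/is_falsey/is_autoy helpers added here back the tri-state string arguments that the hunks below introduce for --flash-attn and --log-colors. A standalone sketch of that mapping, using a stand-in enum rather than the real LLAMA_FLASH_ATTN_TYPE_* / LOG_COLORS_* constants (illustrative only, not code from the package):

    #include <cstdio>
    #include <stdexcept>
    #include <string>

    // Stand-in for the tri-state enums the real options write into common_params.
    enum class tri_state { enabled, disabled, auto_detect };

    static bool is_truthy(const std::string & v) { return v == "on" || v == "enabled" || v == "1"; }
    static bool is_falsey(const std::string & v) { return v == "off" || v == "disabled" || v == "0"; }
    static bool is_autoy(const std::string & v) { return v == "auto" || v == "-1"; }

    static tri_state parse_tri_state(const std::string & value) {
        if (is_truthy(value)) return tri_state::enabled;
        if (is_falsey(value)) return tri_state::disabled;
        if (is_autoy(value))  return tri_state::auto_detect;
        throw std::invalid_argument("unknown value: " + value);
    }

    int main() {
        // e.g. "--flash-attn auto" or "--log-colors off" on the command line
        printf("auto -> %d, off -> %d\n",
               (int) parse_tri_state("auto"), (int) parse_tri_state("off"));
        return 0;
    }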
@@ -1544,13 +1556,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.n_chunks = value;
  }
  ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
- add_opt(common_arg(
- {"-fa", "--flash-attn"},
- string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
- [](common_params & params) {
- params.flash_attn = true;
- }
- ).set_env("LLAMA_ARG_FLASH_ATTN"));
+ add_opt(common_arg({ "-fa", "--flash-attn" }, "[on|off|auto]",
+ string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')",
+ llama_flash_attn_type_name(params.flash_attn_type)),
+ [](common_params & params, const std::string & value) {
+ if (is_truthy(value)) {
+ params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
+ } else if (is_falsey(value)) {
+ params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
+ } else if (is_autoy(value)) {
+ params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
+ } else {
+ throw std::runtime_error(
+ string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
+ }
+ }).set_env("LLAMA_ARG_FLASH_ATTN"));
  add_opt(common_arg(
  {"-p", "--prompt"}, "PROMPT",
  "prompt to start generation with; for system message, use -sys",
@@ -2458,7 +2478,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
  add_opt(common_arg(
  {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
- "number of layers to store in VRAM",
+ string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
  [](common_params & params, int value) {
  params.n_gpu_layers = value;
  if (!llama_supports_gpu_offload()) {
@@ -2555,7 +2575,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  {"--lora"}, "FNAME",
  "path to LoRA adapter (can be repeated to use multiple adapters)",
  [](common_params & params, const std::string & value) {
- params.lora_adapters.push_back({ std::string(value), 1.0, nullptr });
+ params.lora_adapters.push_back({ std::string(value), 1.0, "", "", nullptr });
  }
  // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
  ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
@@ -2563,7 +2583,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  {"--lora-scaled"}, "FNAME", "SCALE",
  "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
  [](common_params & params, const std::string & fname, const std::string & scale) {
- params.lora_adapters.push_back({ fname, std::stof(scale), nullptr });
+ params.lora_adapters.push_back({ fname, std::stof(scale), "", "", nullptr });
  }
  // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
  ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
@@ -2954,13 +2974,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.endpoint_metrics = true;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
- add_opt(common_arg(
- {"--slots"},
- string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
- [](common_params & params) {
- params.endpoint_slots = true;
- }
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
  add_opt(common_arg(
  {"--props"},
  string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
@@ -2968,6 +2981,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.endpoint_props = true;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
+ add_opt(common_arg(
+ {"--slots"},
+ string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+ [](common_params & params) {
+ params.endpoint_slots = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
  add_opt(common_arg(
  {"--no-slots"},
  "disables slots monitoring endpoint",
@@ -3126,13 +3146,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  common_log_set_file(common_log_main(), value.c_str());
  }
  ));
- add_opt(common_arg(
- {"--log-colors"},
- "Enable colored logging",
- [](common_params &) {
- common_log_set_colors(common_log_main(), true);
- }
- ).set_env("LLAMA_LOG_COLORS"));
+ add_opt(common_arg({ "--log-colors" }, "[on|off|auto]",
+ "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
+ "'auto' enables colors when output is to a terminal",
+ [](common_params &, const std::string & value) {
+ if (is_truthy(value)) {
+ common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
+ } else if (is_falsey(value)) {
+ common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
+ } else if (is_autoy(value)) {
+ common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
+ } else {
+ throw std::invalid_argument(
+ string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
+ }
+ }).set_env("LLAMA_LOG_COLORS"));
  add_opt(common_arg(
  {"-v", "--verbose", "--log-verbose"},
  "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
@@ -3459,8 +3487,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
  params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
  params.port = 8012;
- params.n_gpu_layers = 99;
- params.flash_attn = true;
  params.n_ubatch = 1024;
  params.n_batch = 1024;
  params.n_ctx = 0;
@@ -3475,8 +3501,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
  params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
  params.port = 8012;
- params.n_gpu_layers = 99;
- params.flash_attn = true;
  params.n_ubatch = 1024;
  params.n_batch = 1024;
  params.n_ctx = 0;
@@ -3491,8 +3515,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
  params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
  params.port = 8012;
- params.n_gpu_layers = 99;
- params.flash_attn = true;
  params.n_ubatch = 1024;
  params.n_batch = 1024;
  params.n_ctx = 0;
@@ -3508,10 +3530,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
  params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
  params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
- params.speculative.n_gpu_layers = 99;
  params.port = 8012;
- params.n_gpu_layers = 99;
- params.flash_attn = true;
  params.n_ubatch = 1024;
  params.n_batch = 1024;
  params.n_ctx = 0;
@@ -3527,10 +3546,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
  params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
  params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
- params.speculative.n_gpu_layers = 99;
  params.port = 8012;
- params.n_gpu_layers = 99;
- params.flash_attn = true;
+ params.n_ubatch = 1024;
+ params.n_batch = 1024;
+ params.n_ctx = 0;
+ params.n_cache_reuse = 256;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+ add_opt(common_arg(
+ {"--fim-qwen-30b-default"},
+ string_format("use default Qwen 3 Coder 30B A3B Instruct (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF";
+ params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf";
+ params.port = 8012;
  params.n_ubatch = 1024;
  params.n_batch = 1024;
  params.n_ctx = 0;