@fugood/llama.node 0.4.6 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +22 -4
  18. package/lib/index.js +42 -18
  19. package/lib/index.ts +57 -23
  20. package/package.json +1 -1
  21. package/src/LlamaCompletionWorker.cpp +22 -381
  22. package/src/LlamaCompletionWorker.h +2 -4
  23. package/src/LlamaContext.cpp +40 -100
  24. package/src/LlamaContext.h +1 -0
  25. package/src/TokenizeWorker.cpp +33 -4
  26. package/src/TokenizeWorker.h +2 -5
  27. package/src/common.hpp +389 -0
  28. package/src/llama.cpp/.github/workflows/build.yml +2 -2
  29. package/src/llama.cpp/.github/workflows/release.yml +152 -129
  30. package/src/llama.cpp/.github/workflows/winget.yml +42 -0
  31. package/src/llama.cpp/common/arg.cpp +14 -13
  32. package/src/llama.cpp/common/common.cpp +4 -75
  33. package/src/llama.cpp/common/common.h +7 -12
  34. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
  35. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
  36. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
  37. package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
  38. package/src/llama.cpp/examples/simple/simple.cpp +1 -1
  39. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  40. package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
  41. package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
  42. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  43. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
  44. package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
  45. package/src/llama.cpp/ggml/include/ggml.h +11 -0
  46. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
  47. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
  48. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
  51. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
  52. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  53. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
  54. package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
  55. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
  56. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
  57. package/src/llama.cpp/ggml/src/ggml.c +64 -18
  58. package/src/llama.cpp/include/llama.h +24 -124
  59. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  60. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  61. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  62. package/src/llama.cpp/src/llama-batch.cpp +3 -1
  63. package/src/llama.cpp/src/llama-context.cpp +60 -110
  64. package/src/llama.cpp/src/llama-graph.cpp +137 -233
  65. package/src/llama.cpp/src/llama-graph.h +49 -7
  66. package/src/llama.cpp/src/llama-hparams.cpp +17 -1
  67. package/src/llama.cpp/src/llama-hparams.h +34 -5
  68. package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
  69. package/src/llama.cpp/src/llama-kv-cache.h +201 -85
  70. package/src/llama.cpp/src/llama-memory.h +3 -2
  71. package/src/llama.cpp/src/llama-model.cpp +273 -94
  72. package/src/llama.cpp/src/llama-model.h +4 -1
  73. package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
  74. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
  75. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
  76. package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
  77. package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
  78. package/src/llama.cpp/tools/mtmd/clip.h +6 -4
  79. package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
  80. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  81. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
  82. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
  83. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
  84. package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
  85. package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
  86. package/src/llama.cpp/tools/run/run.cpp +2 -2
  87. package/src/llama.cpp/tools/server/server.cpp +158 -47
  88. package/src/llama.cpp/tools/server/utils.hpp +71 -43
  89. package/src/llama.cpp/tools/tts/tts.cpp +4 -2
package/src/LlamaCompletionWorker.cpp
@@ -1,367 +1,6 @@
 #include "LlamaCompletionWorker.h"
 #include "LlamaContext.h"
 
-// Computes FNV-1a hash of the data
-static std::string fnv_hash(const uint8_t * data, size_t len) {
-  const uint64_t fnv_prime = 0x100000001b3ULL;
-  uint64_t hash = 0xcbf29ce484222325ULL;
-
-  for (size_t i = 0; i < len; ++i) {
-    hash ^= data[i];
-    hash *= fnv_prime;
-  }
-  return std::to_string(hash);
-}
-
-static const std::string base64_chars =
-    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    "abcdefghijklmnopqrstuvwxyz"
-    "0123456789+/";
-
-// Base64 decoding function
-static std::vector<uint8_t> base64_decode(const std::string &encoded_string) {
-  std::vector<uint8_t> decoded;
-  int in_len = encoded_string.size();
-  int i = 0;
-  int j = 0;
-  int in_ = 0;
-  unsigned char char_array_4[4], char_array_3[3];
-
-  while (in_len-- && (encoded_string[in_] != '=')) {
-    if (isspace(encoded_string[in_])) {
-      in_++;
-      continue;
-    }
-
-    if (encoded_string[in_] == '=' || base64_chars.find(encoded_string[in_]) == std::string::npos) {
-      break;
-    }
-
-    char_array_4[i++] = encoded_string[in_]; in_++;
-    if (i == 4) {
-      for (i = 0; i < 4; i++) {
-        char_array_4[i] = base64_chars.find(char_array_4[i]);
-      }
-
-      char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
-      char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
-      char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
-
-      for (i = 0; i < 3; i++) {
-        decoded.push_back(char_array_3[i]);
-      }
-      i = 0;
-    }
-  }
-
-  if (i) {
-    for (j = i; j < 4; j++) {
-      char_array_4[j] = 0;
-    }
-
-    for (j = 0; j < 4; j++) {
-      char_array_4[j] = base64_chars.find(char_array_4[j]);
-    }
-
-    char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
-    char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
-    char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
-
-    for (j = 0; j < i - 1; j++) {
-      decoded.push_back(char_array_3[j]);
-    }
-  }
-
-  return decoded;
-}
-
-size_t common_part(const std::vector<llama_token> &a,
-                   const std::vector<llama_token> &b) {
-  size_t i = 0;
-  while (i < a.size() && i < b.size() && a[i] == b[i]) {
-    i++;
-  }
-  return i;
-}
-
-// Process images and add them to the tokenized input
-llama_pos processImage(
-    const mtmd_context* mtmd_ctx,
-    llama_context* ctx,
-    LlamaSessionPtr sess,
-    const std::vector<std::string>& image_paths,
-    const common_params& params,
-    std::vector<llama_token>& text_tokens
-) {
-  if (mtmd_ctx == nullptr) {
-    return false;
-  }
-
-  // Multimodal path
-  std::string full_prompt = params.prompt;
-  // Add image marker if it doesn't already exist
-  if (full_prompt.find("<__image__>") == std::string::npos) {
-    full_prompt += " <__image__>";
-  }
-
-  // Prepare bitmaps array for all images
-  mtmd::bitmaps bitmaps;
-
-  std::vector<std::string> bitmap_hashes;
-
-  // Load all images
-  for (const auto& image_path : image_paths) {
-    fprintf(stdout, "[DEBUG] Loading image: %s\n",
-            image_path.substr(0, 50).c_str()); // Only log part of path for base64
-
-    // Check if it's a base64 image
-    if (image_path.compare(0, 11, "data:image/") == 0) {
-
-      // Parse base64 data
-      std::vector<std::string> parts;
-      size_t comma_pos = image_path.find(',');
-      if (comma_pos == std::string::npos) {
-        bitmaps.entries.clear();
-        return false;
-      }
-
-      std::string header = image_path.substr(0, comma_pos);
-      std::string base64_data = image_path.substr(comma_pos + 1);
-
-      if (header.find("base64") == std::string::npos) {
-        bitmaps.entries.clear();
-        return false;
-      }
-
-      // Decode base64
-      try {
-        // Decode base64 to binary
-        std::vector<uint8_t> image_data = base64_decode(base64_data);
-
-        // Load bitmap from memory buffer using direct initialization
-        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(image_data.data(), image_data.size()));
-        if (!bmp.ptr) {
-          bitmaps.entries.clear();
-          return false;
-        }
-
-        // Calculate bitmap hash (for KV caching)
-        std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
-        bmp.set_id(hash.c_str());
-        bitmaps.entries.push_back(std::move(bmp));
-        bitmap_hashes.push_back(hash.c_str());
-      } catch (const std::exception& e) {
-        bitmaps.entries.clear();
-        return false;
-      }
-    } else if (image_path.compare(0, 7, "http://") == 0 || image_path.compare(0, 8, "https://") == 0) {
-      // HTTP URLs are not supported yet
-      bitmaps.entries.clear();
-      return false;
-    } else {
-      // Check if file exists
-      FILE* file = fopen(image_path.c_str(), "rb");
-      if (file == nullptr) {
-        bitmaps.entries.clear();
-        return false;
-      }
-
-      // Get file size
-      fseek(file, 0, SEEK_END);
-      long file_size = ftell(file);
-      fseek(file, 0, SEEK_SET);
-      fclose(file);
-
-      // Create bitmap directly
-      mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(image_path.c_str()));
-      if (!bmp.ptr) {
-        bitmaps.entries.clear();
-        return false;
-      }
-
-      // Calculate bitmap hash (for KV caching)
-      std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
-      bmp.set_id(hash.c_str());
-      bitmaps.entries.push_back(std::move(bmp));
-      bitmap_hashes.push_back(hash.c_str());
-    }
-  }
-
-  mtmd_input_chunks* chunks = mtmd_input_chunks_init();
-  if (chunks == nullptr) {
-    bitmaps.entries.clear();
-    return false;
-  }
-
-  // Create input text
-  mtmd_input_text input_text;
-  input_text.text = full_prompt.c_str(); // Use the full prompt with image marker
-  input_text.add_special = true;   // Add BOS token if this is the first message
-  input_text.parse_special = true; // Parse special tokens like <__image__>
-
-  // Tokenize the text and images
-  fprintf(stdout, "[DEBUG] Tokenizing text and %zu images\n", bitmaps.entries.size());
-  auto bitmaps_c_ptr = bitmaps.c_ptr();
-
-  // Cast away const for mtmd_tokenize
-  int32_t res = mtmd_tokenize(
-      const_cast<mtmd_context*>(mtmd_ctx),
-      chunks,
-      &input_text,
-      bitmaps_c_ptr.data(),
-      bitmaps_c_ptr.size()
-  );
-
-  if (res != 0) {
-    mtmd_input_chunks_free(chunks);
-    bitmaps.entries.clear();
-    return false;
-  }
-
-  // Log chunk information
-  size_t num_chunks = mtmd_input_chunks_size(chunks);
-  fprintf(stdout, "[DEBUG] Tokenization successful: num_chunks=%zu\n", num_chunks);
-
-  // Clear text_tokens before adding new tokens
-  text_tokens.clear();
-
-  // Create a vector to store all tokens (both text and image)
-  std::vector<llama_token> all_tokens;
-
-  // Track the total number of tokens (both text and image)
-  size_t total_token_count = 0;
-
-  // chunk pos
-  std::vector<size_t> chunk_pos;
-  std::vector<size_t> chunk_pos_images;
-  for (size_t i = 0; i < num_chunks; i++) {
-    chunk_pos.push_back(total_token_count);
-
-    const mtmd_input_chunk* chunk = mtmd_input_chunks_get(chunks, i);
-    mtmd_input_chunk_type chunk_type = mtmd_input_chunk_get_type(chunk);
-
-    if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
-      size_t n_tokens;
-      const llama_token* tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
-
-      // Add text tokens
-      text_tokens.insert(text_tokens.end(), tokens, tokens + n_tokens);
-      all_tokens.insert(all_tokens.end(), tokens, tokens + n_tokens);
-      total_token_count += n_tokens;
-    } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
-      chunk_pos_images.push_back(total_token_count);
-
-      const mtmd_image_tokens* img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
-      size_t n_tokens = mtmd_image_tokens_get_n_tokens(img_tokens);
-      size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
-
-      for (size_t j = 0; j < n_pos; j++) {
-        all_tokens.push_back(LLAMA_TOKEN_NULL);
-      }
-      total_token_count += n_pos;
-    }
-  }
-
-  llama_pos n_past = common_part(*sess->tokens_ptr(), all_tokens);
-
-  llama_pos new_n_past = n_past;
-
-  // Adjust n_past to position of the text chunk
-  // TODO: Edit the text chunk to remove the tokens before n_past to speed up
-  // need to update the mtmd api
-  auto adjusted_n_past = -1;
-  for (size_t i = 0; i < chunk_pos.size(); i++) {
-    if (n_past < chunk_pos[i]) {
-      break;
-    }
-    bool is_end = i + 1 == chunk_pos.size();
-    if (
-      chunk_pos[i] < n_past &&
-      (!is_end && chunk_pos[i + 1] > n_past)
-      // is_end & n_past < total_token_count:
-      // don't need to adjust and it will skip eval_chunk_single, let nextToken() to finish the job
-    ) {
-      adjusted_n_past = chunk_pos[i];
-    }
-  }
-  if (adjusted_n_past != -1) {
-    n_past = adjusted_n_past;
-    new_n_past = n_past;
-    fprintf(stdout, "[DEBUG] Adjusted n_past to %d\n", n_past);
-  }
-
-  // Compare bitmap hashes, if they are not the same, backtrack n_past to the position of the first mismatch
-  auto mtmd_bitmap_past_hashes = sess->mtmd_bitmap_past_hashes_ptr();
-  if (mtmd_bitmap_past_hashes->size() > 0) {
-    for (size_t i = 0; i < bitmap_hashes.size(); i++) {
-      auto pos = chunk_pos_images[i];
-      if (n_past < pos) {
-        break;
-      }
-      if (i >= mtmd_bitmap_past_hashes->size()) {
-        break;
-      }
-      if (bitmap_hashes[i] != (*mtmd_bitmap_past_hashes)[i]) {
-        n_past = chunk_pos_images[i];
-        new_n_past = n_past;
-        break;
-      }
-    }
-  }
-
-  // Clear all KV cache entries after position n_past
-  llama_kv_self_seq_rm(ctx, 0, n_past, -1);
-
-  for (size_t i = 0; i < chunk_pos.size(); i++) {
-    fprintf(stdout, "[DEBUG] Evaluating chunk %zu: n_past=%d, chunk_pos=%zu\n", i, n_past, chunk_pos[i]);
-
-    // Process chunk only if it's after the current n_past
-    if (chunk_pos[i] >= new_n_past) {
-      bool chunk_logits_last = (i == num_chunks - 1);
-      auto chunk = mtmd_input_chunks_get(chunks, i);
-
-      // Cast away const for mtmd_helper_eval_chunk_single
-      int32_t res = mtmd_helper_eval_chunk_single(
-          const_cast<mtmd_context*>(mtmd_ctx),
-          ctx,
-          chunk,
-          n_past,
-          0,
-          params.n_batch, // batch size
-          chunk_logits_last,
-          &new_n_past
-      );
-
-      if (res != 0) {
-        mtmd_input_chunks_free(chunks);
-        bitmaps.entries.clear();
-        return false;
-      }
-      n_past = new_n_past;
-    }
-  }
-
-  if (n_past == total_token_count && n_past > 0 && all_tokens[n_past - 1] != LLAMA_TOKEN_NULL) {
-    // we have to evaluate at least 1 token to generate logits.
-    n_past--;
-  }
-
-  // Update sampling context to process token sequences
-  for (auto & token : all_tokens) {
-    if (token == LLAMA_TOKEN_NULL) {
-      continue;
-    }
-  }
-  // Set the tokens
-  sess->set_tokens(std::move(all_tokens));
-
-  sess->set_mtmd_bitmap_past_hashes(bitmap_hashes);
-
-  // Clean up image resources
-  mtmd_input_chunks_free(chunks);
-  bitmaps.entries.clear();
-  return n_past;
-}
 
 size_t findStoppingStrings(const std::string &text,
                            const size_t last_token_size,
@@ -390,10 +29,10 @@ LlamaCompletionWorker::LlamaCompletionWorker(
     Napi::Function callback, common_params params,
     std::vector<std::string> stop_words,
     int32_t chat_format,
-    std::vector<std::string> image_paths)
+    std::vector<std::string> media_paths)
     : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
       _params(params), _stop_words(stop_words), _chat_format(chat_format),
-      _image_paths(image_paths) {
+      _media_paths(media_paths) {
   if (!callback.IsEmpty()) {
     _tsfn = Napi::ThreadSafeFunction::New(info.Env(), callback,
                                           "LlamaCompletionCallback", 0, 1);
@@ -425,30 +64,33 @@ void LlamaCompletionWorker::Execute() {
   LlamaCppSampling sampling{common_sampler_init(model, _params.sampling),
                             common_sampler_free};
 
-  std::vector<llama_token> prompt_tokens;
-
-  // Process images if any are provided
-  if (!_image_paths.empty()) {
+  // Process media if any are provided
+  if (!_media_paths.empty()) {
     const auto* mtmd_ctx = _sess->get_mtmd_ctx();
 
     if (mtmd_ctx != nullptr) {
-      // Process the images and get the tokens
-      n_cur = processImage(
-          mtmd_ctx,
-          ctx,
-          _sess,
-          _image_paths,
-          _params,
-          prompt_tokens
-      );
+      // Process the media and get the tokens
+      try {
+        n_cur = processMediaPrompt(
+            ctx,
+            mtmd_ctx,
+            _sess,
+            _params,
+            _media_paths
+        );
+      } catch (const std::exception& e) {
+        SetError(e.what());
+        _sess->get_mutex().unlock();
+        return;
+      }
 
       if (n_cur <= 0) {
-        SetError("Failed to process images");
+        SetError("Failed to process media");
         _sess->get_mutex().unlock();
         return;
       }
 
-      fprintf(stdout, "[DEBUG] Image processing successful, n_cur=%zu, tokens=%zu\n",
+      fprintf(stdout, "[DEBUG] Media processing successful, n_cur=%zu, tokens=%zu\n",
               n_cur, _sess->tokens_ptr()->size());
 
       n_input = _sess->tokens_ptr()->size();
@@ -456,7 +98,6 @@ void LlamaCompletionWorker::Execute() {
        --n_cur;
      }
      n_input -= n_cur;
-      llama_kv_self_seq_rm(ctx, 0, n_cur, -1);
    } else {
      SetError("Multimodal context not initialized");
      _sess->get_mutex().unlock();
@@ -464,11 +105,11 @@ void LlamaCompletionWorker::Execute() {
    }
  } else {
    // Text-only path
-    prompt_tokens = ::common_tokenize(ctx, _params.prompt, add_bos);
+    std::vector<llama_token> prompt_tokens = ::common_tokenize(ctx, _params.prompt, add_bos);
    n_input = prompt_tokens.size();
 
    if (_sess->tokens_ptr()->size() > 0) {
-      n_cur = common_part(*(_sess->tokens_ptr()), prompt_tokens);
+      n_cur = common_tokens_part(*(_sess->tokens_ptr()), prompt_tokens);
      if (n_cur == n_input) {
        --n_cur;
      }
package/src/LlamaCompletionWorker.h
@@ -4,8 +4,6 @@
 #include <atomic>
 #include <functional>
 #include <napi.h>
-#include "tools/mtmd/mtmd.h"
-#include "tools/mtmd/clip.h"
 
 struct CompletionResult {
   std::string text = "";
@@ -22,7 +20,7 @@ public:
                         Napi::Function callback, common_params params,
                         std::vector<std::string> stop_words,
                         int32_t chat_format,
-                        std::vector<std::string> image_paths = {});
+                        std::vector<std::string> media_paths = {});
 
   ~LlamaCompletionWorker();
 
@@ -46,7 +44,7 @@ private:
   common_params _params;
   std::vector<std::string> _stop_words;
   int32_t _chat_format;
-  std::vector<std::string> _image_paths;
+  std::vector<std::string> _media_paths;
   std::function<void()> _onComplete;
   bool _has_callback = false;
   bool _stop = false;
package/src/LlamaContext.cpp
@@ -27,80 +27,6 @@ static std::string format_string(const std::string& format, Args ... args) {
   return std::string(buf.get(), buf.get() + size - 1); // -1 to exclude null terminator
 }
 
-// Computes FNV-1a hash of the data
-static std::string fnv_hash(const uint8_t* data, size_t len) {
-  const uint64_t fnv_prime = 0x100000001b3ULL;
-  uint64_t hash = 0xcbf29ce484222325ULL;
-
-  for (size_t i = 0; i < len; ++i) {
-    hash ^= data[i];
-    hash *= fnv_prime;
-  }
-  return std::to_string(hash);
-}
-
-static const std::string base64_chars =
-    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    "abcdefghijklmnopqrstuvwxyz"
-    "0123456789+/";
-
-// Base64 decoding function
-static std::vector<uint8_t> base64_decode(const std::string &encoded_string) {
-  std::vector<uint8_t> decoded;
-  int in_len = encoded_string.size();
-  int i = 0;
-  int j = 0;
-  int in_ = 0;
-  unsigned char char_array_4[4], char_array_3[3];
-
-  while (in_len-- && (encoded_string[in_] != '=')) {
-    if (isspace(encoded_string[in_])) {
-      in_++;
-      continue;
-    }
-
-    if (encoded_string[in_] == '=' || base64_chars.find(encoded_string[in_]) == std::string::npos) {
-      break;
-    }
-
-    char_array_4[i++] = encoded_string[in_]; in_++;
-    if (i == 4) {
-      for (i = 0; i < 4; i++) {
-        char_array_4[i] = base64_chars.find(char_array_4[i]);
-      }
-
-      char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
-      char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
-      char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
-
-      for (i = 0; i < 3; i++) {
-        decoded.push_back(char_array_3[i]);
-      }
-      i = 0;
-    }
-  }
-
-  if (i) {
-    for (j = i; j < 4; j++) {
-      char_array_4[j] = 0;
-    }
-
-    for (j = 0; j < 4; j++) {
-      char_array_4[j] = base64_chars.find(char_array_4[j]);
-    }
-
-    char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
-    char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
-    char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
-
-    for (j = 0; j < i - 1; j++) {
-      decoded.push_back(char_array_3[j]);
-    }
-  }
-
-  return decoded;
-}
-
 using json = nlohmann::ordered_json;
 
 // loadModelInfo(path: string): object
@@ -153,18 +79,6 @@ Napi::Value LlamaContext::ModelInfo(const Napi::CallbackInfo& info) {
   return metadata;
 }
 
-std::vector<common_chat_msg> get_messages(Napi::Array messages) {
-  std::vector<common_chat_msg> chat;
-  for (size_t i = 0; i < messages.Length(); i++) {
-    auto message = messages.Get(i).As<Napi::Object>();
-    chat.push_back({
-      get_option<std::string>(message, "role", ""),
-      get_option<std::string>(message, "content", ""),
-    });
-  }
-  return std::move(chat);
-}
-
 void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
   Napi::Function func = DefineClass(
       env, "LlamaContext",
@@ -221,6 +135,9 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
            static_cast<napi_property_attributes>(napi_enumerable)),
        StaticMethod<&LlamaContext::ToggleNativeLog>(
            "toggleNativeLog",
+           static_cast<napi_property_attributes>(napi_enumerable)),
+       InstanceMethod<&LlamaContext::GetMultimodalSupport>(
+           "getMultimodalSupport",
            static_cast<napi_property_attributes>(napi_enumerable))});
   Napi::FunctionReference *constructor = new Napi::FunctionReference();
   *constructor = Napi::Persistent(func);
@@ -693,22 +610,22 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
     }
   }
 
-  // Process image_paths parameter
-  std::vector<std::string> image_paths;
-  if (options.Has("image_paths")) {
-    if (options.Get("image_paths").IsArray()) {
-      auto image_paths_array = options.Get("image_paths").As<Napi::Array>();
-      for (size_t i = 0; i < image_paths_array.Length(); i++) {
-        image_paths.push_back(image_paths_array.Get(i).ToString().Utf8Value());
+  // Process media_paths parameter
+  std::vector<std::string> media_paths;
+  if (options.Has("media_paths")) {
+    if (options.Get("media_paths").IsArray()) {
+      auto media_paths_array = options.Get("media_paths").As<Napi::Array>();
+      for (size_t i = 0; i < media_paths_array.Length(); i++) {
+        media_paths.push_back(media_paths_array.Get(i).ToString().Utf8Value());
       }
-    } else if (options.Get("image_paths").IsString()) {
-      image_paths.push_back(options.Get("image_paths").ToString().Utf8Value());
+    } else if (options.Get("media_paths").IsString()) {
+      media_paths.push_back(options.Get("media_paths").ToString().Utf8Value());
     }
   }
 
-  // Check if multimodal is enabled when image_paths are provided
-  if (!image_paths.empty() && !(_has_multimodal && _mtmd_ctx != nullptr)) {
-    Napi::Error::New(env, "Multimodal support must be enabled via initMultimodal to use image_paths").ThrowAsJavaScriptException();
+  // Check if multimodal is enabled when media_paths are provided
+  if (!media_paths.empty() && !(_has_multimodal && _mtmd_ctx != nullptr)) {
+    Napi::Error::New(env, "Multimodal support must be enabled via initMultimodal to use media_paths").ThrowAsJavaScriptException();
     return env.Undefined();
   }
 
@@ -894,7 +811,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   }
 
   auto *worker =
-      new LlamaCompletionWorker(info, _sess, callback, params, stop_words, chat_format, image_paths);
+      new LlamaCompletionWorker(info, _sess, callback, params, stop_words, chat_format, media_paths);
   worker->Queue();
   _wip = worker;
   worker->OnComplete([this]() { _wip = nullptr; });
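
The `image_paths` → `media_paths` rename above is a breaking change for callers of `completion()`. A minimal TypeScript sketch of the updated call shape, assuming the `loadModel` export and `initMultimodal` option names from `lib/index.ts` (the exact wrapper API may differ slightly):

import { loadModel } from '@fugood/llama.node' // assumed export name

const context = await loadModel({ model: 'path/to/model.gguf' })

// Multimodal must be enabled first, otherwise completion() rejects with
// "Multimodal support must be enabled via initMultimodal" (see the check above).
await context.initMultimodal({ path: 'path/to/mmproj.gguf' }) // assumed option shape

const result = await context.completion({
  prompt: 'Describe this image.',
  media_paths: ['./photo.jpg'], // was `image_paths` in 0.4.x; string or string[]
})
console.log(result.text)

Per the native code, entries may be local file paths or base64 `data:` URIs; HTTP(S) URLs were not supported in 0.4.x.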
@@ -919,7 +836,14 @@ Napi::Value LlamaContext::Tokenize(const Napi::CallbackInfo &info) {
         .ThrowAsJavaScriptException();
   }
   auto text = info[0].ToString().Utf8Value();
-  auto *worker = new TokenizeWorker(info, _sess, text);
+  std::vector<std::string> media_paths;
+  if (info.Length() >= 2 && info[1].IsArray()) {
+    auto media_paths_array = info[1].As<Napi::Array>();
+    for (size_t i = 0; i < media_paths_array.Length(); i++) {
+      media_paths.push_back(media_paths_array.Get(i).ToString().Utf8Value());
+    }
+  }
+  auto *worker = new TokenizeWorker(info, _sess, text, media_paths);
   worker->Queue();
   return worker->Promise();
 }
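
`tokenize()` now accepts an optional array of media paths as a second argument, forwarded to `TokenizeWorker`. A hedged sketch of the new call (the result field name is an assumption; check `lib/binding.ts` for the actual shape):

// Without the second argument this behaves as in 0.4.x.
const tokenized = await context.tokenize(
  'Describe this image.',
  ['./photo.jpg'], // optional; routed through the multimodal (mtmd) tokenizer
)
console.log(tokenized.tokens) // assumed field name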
@@ -1160,6 +1084,22 @@ Napi::Value LlamaContext::IsMultimodalEnabled(const Napi::CallbackInfo &info) {
   return Napi::Boolean::New(info.Env(), _has_multimodal && _mtmd_ctx != nullptr);
 }
 
+// getMultimodalSupport(): Promise<{ vision: boolean, audio: boolean }>
+Napi::Value LlamaContext::GetMultimodalSupport(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  auto result = Napi::Object::New(env);
+
+  if (_has_multimodal && _mtmd_ctx != nullptr) {
+    result.Set("vision", Napi::Boolean::New(env, mtmd_support_vision(_mtmd_ctx)));
+    result.Set("audio", Napi::Boolean::New(env, mtmd_support_audio(_mtmd_ctx)));
+  } else {
+    result.Set("vision", Napi::Boolean::New(env, false));
+    result.Set("audio", Napi::Boolean::New(env, false));
+  }
+
+  return result;
+}
+
 // releaseMultimodal(): void
 void LlamaContext::ReleaseMultimodal(const Napi::CallbackInfo &info) {
   if (_mtmd_ctx != nullptr) {
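
The new `getMultimodalSupport()` method reports which modalities the loaded projector handles, via `mtmd_support_vision` / `mtmd_support_audio`. A small usage sketch in the same assumed wrapper API:

// Both flags are false until initMultimodal() has been called with a
// projector (mmproj) model.
const { vision, audio } = await context.getMultimodalSupport()
if (audio) {
  // Audio files in media_paths become usable here; the vendored llama.cpp
  // now bundles miniaudio.h and mtmd-audio.cpp for decoding (see file list).
  console.log('audio input supported')
}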
package/src/LlamaContext.h
@@ -31,6 +31,7 @@ private:
   // Multimodal methods
   Napi::Value InitMultimodal(const Napi::CallbackInfo &info);
   Napi::Value IsMultimodalEnabled(const Napi::CallbackInfo &info);
+  Napi::Value GetMultimodalSupport(const Napi::CallbackInfo &info);
   void ReleaseMultimodal(const Napi::CallbackInfo &info);
 
   std::string _info;