@fugood/llama.node 0.4.6 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +22 -4
  18. package/lib/index.js +42 -18
  19. package/lib/index.ts +57 -23
  20. package/package.json +1 -1
  21. package/src/LlamaCompletionWorker.cpp +22 -381
  22. package/src/LlamaCompletionWorker.h +2 -4
  23. package/src/LlamaContext.cpp +40 -100
  24. package/src/LlamaContext.h +1 -0
  25. package/src/TokenizeWorker.cpp +33 -4
  26. package/src/TokenizeWorker.h +2 -5
  27. package/src/common.hpp +389 -0
  28. package/src/llama.cpp/.github/workflows/build.yml +2 -2
  29. package/src/llama.cpp/.github/workflows/release.yml +152 -129
  30. package/src/llama.cpp/.github/workflows/winget.yml +42 -0
  31. package/src/llama.cpp/common/arg.cpp +14 -13
  32. package/src/llama.cpp/common/common.cpp +4 -75
  33. package/src/llama.cpp/common/common.h +7 -12
  34. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
  35. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
  36. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
  37. package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
  38. package/src/llama.cpp/examples/simple/simple.cpp +1 -1
  39. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  40. package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
  41. package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
  42. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  43. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
  44. package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
  45. package/src/llama.cpp/ggml/include/ggml.h +11 -0
  46. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
  47. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
  48. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
  51. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
  52. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  53. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
  54. package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
  55. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
  56. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
  57. package/src/llama.cpp/ggml/src/ggml.c +64 -18
  58. package/src/llama.cpp/include/llama.h +24 -124
  59. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  60. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  61. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  62. package/src/llama.cpp/src/llama-batch.cpp +3 -1
  63. package/src/llama.cpp/src/llama-context.cpp +60 -110
  64. package/src/llama.cpp/src/llama-graph.cpp +137 -233
  65. package/src/llama.cpp/src/llama-graph.h +49 -7
  66. package/src/llama.cpp/src/llama-hparams.cpp +17 -1
  67. package/src/llama.cpp/src/llama-hparams.h +34 -5
  68. package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
  69. package/src/llama.cpp/src/llama-kv-cache.h +201 -85
  70. package/src/llama.cpp/src/llama-memory.h +3 -2
  71. package/src/llama.cpp/src/llama-model.cpp +273 -94
  72. package/src/llama.cpp/src/llama-model.h +4 -1
  73. package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
  74. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
  75. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
  76. package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
  77. package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
  78. package/src/llama.cpp/tools/mtmd/clip.h +6 -4
  79. package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
  80. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  81. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
  82. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
  83. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
  84. package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
  85. package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
  86. package/src/llama.cpp/tools/run/run.cpp +2 -2
  87. package/src/llama.cpp/tools/server/server.cpp +158 -47
  88. package/src/llama.cpp/tools/server/utils.hpp +71 -43
  89. package/src/llama.cpp/tools/tts/tts.cpp +4 -2
@@ -2,12 +2,23 @@
2
2
  #include "LlamaContext.h"
3
3
 
4
4
  TokenizeWorker::TokenizeWorker(const Napi::CallbackInfo &info,
5
- LlamaSessionPtr &sess, std::string text)
6
- : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text) {}
5
+ LlamaSessionPtr &sess, std::string text, std::vector<std::string> media_paths)
6
+ : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text), _media_paths(media_paths) {}
7
7
 
8
8
  void TokenizeWorker::Execute() {
9
- const auto tokens = ::common_tokenize(_sess->context(), _text, false);
10
- _result.tokens = std::move(tokens);
9
+ auto mtmd_ctx = _sess->get_mtmd_ctx();
10
+ if (!_media_paths.empty()) {
11
+ try {
12
+ _result = tokenizeWithMedia(mtmd_ctx, _text, _media_paths);
13
+ mtmd_input_chunks_free(_result.chunks);
14
+ } catch (const std::exception &e) {
15
+ SetError(e.what());
16
+ }
17
+ } else {
18
+ const auto tokens = common_tokenize(_sess->context(), _text, false);
19
+ _result.tokens = tokens;
20
+ _result.has_media = false;
21
+ }
11
22
  }
12
23
 
13
24
  void TokenizeWorker::OnOK() {
@@ -18,6 +29,24 @@ void TokenizeWorker::OnOK() {
18
29
  memcpy(tokens.Data(), _result.tokens.data(),
19
30
  _result.tokens.size() * sizeof(llama_token));
20
31
  result.Set("tokens", tokens);
32
+ result.Set("has_media", _result.has_media);
33
+ if (_result.has_media) {
34
+ auto bitmap_hashes = Napi::Array::New(Napi::AsyncWorker::Env(), _result.bitmap_hashes.size());
35
+ for (size_t i = 0; i < _result.bitmap_hashes.size(); i++) {
36
+ bitmap_hashes.Set(i, _result.bitmap_hashes[i]);
37
+ }
38
+ result.Set("bitmap_hashes", bitmap_hashes);
39
+ auto chunk_pos = Napi::Array::New(Napi::AsyncWorker::Env(), _result.chunk_pos.size());
40
+ for (size_t i = 0; i < _result.chunk_pos.size(); i++) {
41
+ chunk_pos.Set(i, _result.chunk_pos[i]);
42
+ }
43
+ result.Set("chunk_pos", chunk_pos);
44
+ auto chunk_pos_media = Napi::Array::New(Napi::AsyncWorker::Env(), _result.chunk_pos_media.size());
45
+ for (size_t i = 0; i < _result.chunk_pos_media.size(); i++) {
46
+ chunk_pos_media.Set(i, _result.chunk_pos_media[i]);
47
+ }
48
+ result.Set("chunk_pos_media", chunk_pos_media);
49
+ }
21
50
  Napi::Promise::Deferred::Resolve(result);
22
51
  }
23
52
 
@@ -1,15 +1,11 @@
1
1
  #include "common.hpp"
2
2
  #include <vector>
3
3
 
4
- struct TokenizeResult {
5
- std::vector<llama_token> tokens;
6
- };
7
-
8
4
  class TokenizeWorker : public Napi::AsyncWorker,
9
5
  public Napi::Promise::Deferred {
10
6
  public:
11
7
  TokenizeWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
12
- std::string text);
8
+ std::string text, std::vector<std::string> media_paths);
13
9
 
14
10
  protected:
15
11
  void Execute();
@@ -19,5 +15,6 @@ protected:
19
15
  private:
20
16
  LlamaSessionPtr _sess;
21
17
  std::string _text;
18
+ std::vector<std::string> _media_paths;
22
19
  TokenizeResult _result;
23
20
  };
package/src/common.hpp CHANGED
@@ -2,6 +2,8 @@
2
2
 
3
3
  #include "common/common.h"
4
4
  #include "common/sampling.h"
5
+ #include "tools/mtmd/mtmd.h"
6
+ #include "tools/mtmd/clip.h"
5
7
  #include "chat.h"
6
8
  #include "llama.h"
7
9
  #include "tools/mtmd/mtmd.h"
@@ -120,3 +122,390 @@ private:
120
122
  };
121
123
 
122
124
  typedef std::shared_ptr<LlamaSession> LlamaSessionPtr;
125
+
126
+ static size_t common_tokens_part(const std::vector<llama_token> &a,
127
+ const std::vector<llama_token> &b) {
128
+ size_t i = 0;
129
+ while (i < a.size() && i < b.size() && a[i] == b[i]) {
130
+ i++;
131
+ }
132
+ return i;
133
+ }
134
+
135
// 64-bit FNV-1a digest of the `len` bytes starting at `data`.
// The digest is returned as its base-10 string representation; an empty
// input yields the FNV offset basis.
static std::string fnv_hash(const uint8_t * data, size_t len) {
  constexpr uint64_t kOffsetBasis = 0xcbf29ce484222325ULL;
  constexpr uint64_t kPrime = 0x100000001b3ULL;

  uint64_t digest = kOffsetBasis;
  const uint8_t * cursor = data;
  for (size_t remaining = len; remaining > 0; --remaining) {
    // FNV-1a order: xor the byte in first, then multiply by the prime.
    digest = (digest ^ *cursor++) * kPrime;
  }
  return std::to_string(digest);
}
146
+
147
// Standard base64 alphabet; a character's index in this string is its 6-bit value.
static const std::string base64_chars =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789+/";

// Decodes a base64 string into raw bytes.
//
// - Whitespace characters are skipped.
// - Decoding stops at the first '=' or at the first character that is not in
//   the base64 alphabet; everything decoded up to that point is returned.
// - A trailing partial group of 2 or 3 characters yields 1 or 2 bytes; a
//   single dangling character carries fewer than 8 bits and yields nothing.
//
// Never reports an error: malformed input simply produces a shorter result.
static std::vector<uint8_t> base64_decode(const std::string &encoded_string) {
  std::vector<uint8_t> decoded;
  decoded.reserve((encoded_string.size() / 4 + 1) * 3);

  uint8_t sextets[4] = {0, 0, 0, 0};
  size_t pending = 0; // number of sextets collected for the current group

  for (const char ch : encoded_string) {
    if (ch == '=') {
      break;
    }
    // isspace() requires a value representable as unsigned char; passing a
    // negative plain char (any byte >= 0x80) is undefined behaviour.
    if (isspace(static_cast<unsigned char>(ch))) {
      continue;
    }
    const size_t value = base64_chars.find(ch);
    if (value == std::string::npos) {
      break;
    }

    sextets[pending++] = static_cast<uint8_t>(value);
    if (pending == 4) {
      decoded.push_back(static_cast<uint8_t>((sextets[0] << 2) | (sextets[1] >> 4)));
      decoded.push_back(static_cast<uint8_t>(((sextets[1] & 0x0f) << 4) | (sextets[2] >> 2)));
      decoded.push_back(static_cast<uint8_t>(((sextets[2] & 0x03) << 6) | sextets[3]));
      pending = 0;
    }
  }

  if (pending > 1) {
    // Zero the missing sextets so the partial group can reuse the same bit layout.
    for (size_t k = pending; k < 4; ++k) {
      sextets[k] = 0;
    }
    const uint8_t tail[2] = {
        static_cast<uint8_t>((sextets[0] << 2) | (sextets[1] >> 4)),
        static_cast<uint8_t>(((sextets[1] & 0x0f) << 4) | (sextets[2] >> 2)),
    };
    // n collected sextets hold n - 1 complete bytes.
    for (size_t k = 0; k + 1 < pending; ++k) {
      decoded.push_back(tail[k]);
    }
  }

  return decoded;
}
208
+
209
+ struct TokenizeResult {
210
+ std::vector<llama_token> tokens;
211
+
212
+ bool has_media = false;
213
+ std::vector<std::string> bitmap_hashes;
214
+ std::vector<size_t> chunk_pos; // both text and media
215
+ std::vector<size_t> chunk_pos_media; // media only
216
+ mtmd_input_chunks* chunks = nullptr;
217
+ };
218
+
219
+ static TokenizeResult tokenizeWithMedia(
220
+ const mtmd_context* mtmd_ctx,
221
+ const std::string &prompt,
222
+ const std::vector<std::string> &media_paths
223
+ ) {
224
+ if (mtmd_ctx == nullptr) {
225
+ throw std::runtime_error("Multimodal context is not initialized");
226
+ }
227
+
228
+ TokenizeResult result;
229
+ result.has_media = !media_paths.empty();
230
+
231
+ mtmd::bitmaps bitmaps;
232
+
233
+ // Load all media paths
234
+ for (const auto& media_path : media_paths) {
235
+ fprintf(stdout, "[DEBUG] Loading media: %s\n",
236
+ media_path.substr(0, 50).c_str()); // Only log part of path for base64
237
+
238
+ // Check if it's a base64 media
239
+ if (media_path.compare(0, 11, "data:image/") == 0 || media_path.compare(0, 11, "data:audio/") == 0) {
240
+
241
+ // Parse base64 data
242
+ std::vector<std::string> parts;
243
+ size_t comma_pos = media_path.find(',');
244
+ if (comma_pos == std::string::npos) {
245
+ result.bitmap_hashes.clear();
246
+ throw std::runtime_error("Invalid base64 media format, missing comma separator");
247
+ }
248
+
249
+ std::string header = media_path.substr(0, comma_pos);
250
+ std::string base64_data = media_path.substr(comma_pos + 1);
251
+
252
+ if (header.find("base64") == std::string::npos) {
253
+ result.bitmap_hashes.clear();
254
+ throw std::runtime_error("Invalid base64 media");
255
+ }
256
+
257
+ // Decode base64
258
+ try {
259
+ // Decode base64 to binary
260
+ std::vector<uint8_t> media_data = base64_decode(base64_data);
261
+
262
+ // Load bitmap from memory buffer using direct initialization
263
+ mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(media_data.data(), media_data.size()));
264
+ if (!bmp.ptr) {
265
+ bitmaps.entries.clear();
266
+ throw std::runtime_error("Failed to load base64 media");
267
+ }
268
+
269
+ // Calculate bitmap hash (for KV caching)
270
+ std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
271
+ bmp.set_id(hash.c_str());
272
+ bitmaps.entries.push_back(std::move(bmp));
273
+ result.bitmap_hashes.push_back(hash.c_str());
274
+ } catch (const std::exception& e) {
275
+ bitmaps.entries.clear();
276
+ throw std::runtime_error("Failed to decode base64 media");
277
+ }
278
+ } else if (media_path.compare(0, 7, "http://") == 0 || media_path.compare(0, 8, "https://") == 0) {
279
+ // HTTP URLs are not supported yet
280
+ bitmaps.entries.clear();
281
+ throw std::runtime_error("HTTP/HTTPS URLs are not supported yet");
282
+ } else {
283
+ // Regular file path
284
+ // Check if file exists
285
+ FILE* file = fopen(media_path.c_str(), "rb");
286
+ if (file == nullptr) {
287
+ bitmaps.entries.clear();
288
+ throw std::runtime_error("File does not exist or cannot be opened");
289
+ }
290
+
291
+ // Get file size
292
+ fseek(file, 0, SEEK_END);
293
+ long file_size = ftell(file);
294
+ fseek(file, 0, SEEK_SET);
295
+ fclose(file);
296
+
297
+ // Create bitmap directly
298
+ mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(media_path.c_str()));
299
+ if (!bmp.ptr) {
300
+ bitmaps.entries.clear();
301
+ throw std::runtime_error("Failed to load media");
302
+ }
303
+
304
+ // Calculate bitmap hash (for KV caching)
305
+ std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
306
+ bmp.set_id(hash.c_str());
307
+ bitmaps.entries.push_back(std::move(bmp));
308
+ result.bitmap_hashes.push_back(hash.c_str());
309
+ }
310
+ }
311
+
312
+ result.chunks = mtmd_input_chunks_init();
313
+ if (result.chunks == nullptr) {
314
+ bitmaps.entries.clear();
315
+ throw std::runtime_error("Failed to initialize input chunks");
316
+ }
317
+
318
+ // Create input text
319
+ mtmd_input_text input_text;
320
+ input_text.text = prompt.c_str(); // Use the full prompt with media marker
321
+ input_text.add_special = true; // Add BOS token if this is the first message
322
+ input_text.parse_special = true; // Parse special tokens like <__media__>
323
+
324
+ // Tokenize the text and media
325
+ fprintf(stdout, "[DEBUG] Tokenizing text and %zu media\n", bitmaps.entries.size());
326
+ auto bitmaps_c_ptr = bitmaps.c_ptr();
327
+
328
+ // Cast away const for mtmd_tokenize
329
+ int32_t res = mtmd_tokenize(
330
+ const_cast<mtmd_context*>(mtmd_ctx),
331
+ result.chunks,
332
+ &input_text,
333
+ bitmaps_c_ptr.data(),
334
+ bitmaps_c_ptr.size()
335
+ );
336
+
337
+ if (res != 0) {
338
+ mtmd_input_chunks_free(result.chunks);
339
+ bitmaps.entries.clear();
340
+ throw std::runtime_error("Failed to tokenize text and media");
341
+ }
342
+
343
+ // Log chunk information
344
+ size_t num_chunks = mtmd_input_chunks_size(result.chunks);
345
+ fprintf(stdout, "[DEBUG] Tokenization successful: num_chunks=%zu\n", num_chunks);
346
+
347
+ // Track the total number of tokens (both text and media)
348
+ size_t total_token_count = 0;
349
+
350
+ // chunk pos
351
+ for (size_t i = 0; i < num_chunks; i++) {
352
+ result.chunk_pos.push_back(total_token_count);
353
+
354
+ const mtmd_input_chunk* chunk = mtmd_input_chunks_get(result.chunks, i);
355
+ mtmd_input_chunk_type chunk_type = mtmd_input_chunk_get_type(chunk);
356
+
357
+ if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
358
+ size_t n_tokens;
359
+ const llama_token* tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
360
+
361
+ result.tokens.insert(result.tokens.end(), tokens, tokens + n_tokens);
362
+ total_token_count += n_tokens;
363
+ } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE || chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
364
+ result.chunk_pos_media.push_back(total_token_count);
365
+
366
+ size_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk);
367
+ size_t n_pos = mtmd_input_chunk_get_n_pos(chunk);
368
+ fprintf(stdout, "[DEBUG] Chunk %zu: type=%s, n_tokens=%zu, n_pos=%zu\n",
369
+ i, chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "IMAGE" : "AUDIO", n_tokens, n_pos);
370
+
371
+ for (size_t j = 0; j < n_pos; j++) {
372
+ result.tokens.push_back(LLAMA_TOKEN_NULL);
373
+ }
374
+ total_token_count += n_pos;
375
+ }
376
+ }
377
+
378
+ bitmaps.entries.clear();
379
+
380
+ return result;
381
+ }
382
+
383
+ // Process media and add them to the tokenized input
384
+ static llama_pos processMediaPrompt(
385
+ llama_context* ctx,
386
+ const mtmd_context* mtmd_ctx,
387
+ LlamaSessionPtr sess,
388
+ const common_params& params,
389
+ const std::vector<std::string>& media_paths
390
+ ) {
391
+ if (mtmd_ctx == nullptr) {
392
+ throw std::runtime_error("Multimodal context is not initialized");
393
+ }
394
+
395
+ // Multimodal path
396
+ std::string full_prompt = params.prompt;
397
+ auto default_media_marker = mtmd_default_marker();
398
+ // Add media marker if it doesn't already exist
399
+ if (full_prompt.find(default_media_marker) == std::string::npos) {
400
+ full_prompt += " ";
401
+ full_prompt += default_media_marker;
402
+ }
403
+
404
+ auto result = tokenizeWithMedia(mtmd_ctx, full_prompt, media_paths);
405
+
406
+ auto all_tokens = result.tokens;
407
+ auto chunks = result.chunks;
408
+ auto chunk_pos = result.chunk_pos;
409
+ auto chunk_pos_media = result.chunk_pos_media;
410
+ auto bitmap_hashes = result.bitmap_hashes;
411
+
412
+ llama_pos n_past = common_tokens_part(*sess->tokens_ptr(), all_tokens);
413
+
414
+ llama_pos new_n_past = n_past;
415
+
416
+ // Adjust n_past to position of the text chunk
417
+ // TODO: Edit the text chunk to remove the tokens before n_past to speed up
418
+ // need to update the mtmd api
419
+ auto adjusted_n_past = -1;
420
+ for (size_t i = 0; i < chunk_pos.size(); i++) {
421
+ if (n_past < chunk_pos[i]) {
422
+ break;
423
+ }
424
+ bool is_end = i + 1 == chunk_pos.size();
425
+ if (
426
+ chunk_pos[i] < n_past &&
427
+ (!is_end && chunk_pos[i + 1] > n_past)
428
+ // is_end & n_past < total_token_count:
429
+ // don't need to adjust and it will skip eval_chunk_single, let nextToken() to finish the job
430
+ ) {
431
+ adjusted_n_past = chunk_pos[i];
432
+ }
433
+ }
434
+ if (adjusted_n_past != -1) {
435
+ n_past = adjusted_n_past;
436
+ new_n_past = n_past;
437
+ fprintf(stdout, "[DEBUG] Adjusted n_past to %d\n", n_past);
438
+ }
439
+
440
+ // Compare bitmap hashes, if they are not the same, backtrack n_past to the position of the first mismatch
441
+ auto mtmd_bitmap_past_hashes = sess->mtmd_bitmap_past_hashes_ptr();
442
+ if (mtmd_bitmap_past_hashes->size() > 0) {
443
+ for (size_t i = 0; i < bitmap_hashes.size(); i++) {
444
+ auto pos = chunk_pos_media[i];
445
+ if (n_past < pos) {
446
+ break;
447
+ }
448
+ if (i >= mtmd_bitmap_past_hashes->size()) {
449
+ break;
450
+ }
451
+ if (bitmap_hashes[i] != (*mtmd_bitmap_past_hashes)[i]) {
452
+ n_past = chunk_pos_media[i];
453
+ new_n_past = n_past;
454
+ break;
455
+ }
456
+ }
457
+ }
458
+
459
+ // Clear all KV cache entries after position n_past
460
+ llama_kv_self_seq_rm(ctx, 0, n_past, -1);
461
+
462
+ size_t num_chunks = mtmd_input_chunks_size(chunks);
463
+
464
+ for (size_t i = 0; i < chunk_pos.size(); i++) {
465
+ fprintf(stdout, "[DEBUG] Evaluating chunk %zu: n_past=%d, chunk_pos=%zu\n", i, n_past, chunk_pos[i]);
466
+
467
+ // Process chunk only if it's after the current n_past
468
+ if (chunk_pos[i] >= new_n_past) {
469
+ bool chunk_logits_last = (i == num_chunks - 1);
470
+ auto chunk = mtmd_input_chunks_get(chunks, i);
471
+
472
+ // Cast away const for mtmd_helper_eval_chunk_single
473
+ int32_t res = mtmd_helper_eval_chunk_single(
474
+ const_cast<mtmd_context*>(mtmd_ctx),
475
+ ctx,
476
+ chunk,
477
+ n_past,
478
+ 0,
479
+ params.n_batch, // batch size
480
+ chunk_logits_last,
481
+ &new_n_past
482
+ );
483
+
484
+ if (res != 0) {
485
+ mtmd_input_chunks_free(chunks);
486
+ throw std::runtime_error("Failed to process chunk");
487
+ }
488
+ n_past = new_n_past;
489
+ }
490
+ }
491
+
492
+ if (n_past == all_tokens.size() && n_past > 0 && all_tokens[n_past - 1] != LLAMA_TOKEN_NULL) {
493
+ // we have to evaluate at least 1 token to generate logits.
494
+ n_past--;
495
+ }
496
+
497
+ // Update sampling context to process token sequences
498
+ for (auto & token : all_tokens) {
499
+ if (token == LLAMA_TOKEN_NULL) {
500
+ continue;
501
+ }
502
+ }
503
+ // Set the tokens
504
+ sess->set_tokens(std::move(all_tokens));
505
+
506
+ sess->set_mtmd_bitmap_past_hashes(bitmap_hashes);
507
+
508
+ // Clean up media resources
509
+ mtmd_input_chunks_free(chunks);
510
+ return n_past;
511
+ }
@@ -351,7 +351,7 @@ jobs:
351
351
 
352
352
  ubuntu-22-cmake-musa:
353
353
  runs-on: ubuntu-22.04
354
- container: mthreads/musa:rc3.1.1-devel-ubuntu22.04
354
+ container: mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
355
355
 
356
356
  steps:
357
357
  - name: Clone
@@ -899,7 +899,7 @@ jobs:
899
899
  shell: bash
900
900
 
901
901
  env:
902
- WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
902
+ WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe
903
903
  WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
904
904
  ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
905
905
  steps: