@fugood/llama.node 0.4.7 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. package/CMakeLists.txt +4 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/lib/binding.ts +66 -6
  11. package/lib/index.js +59 -17
  12. package/lib/index.ts +74 -23
  13. package/package.json +1 -1
  14. package/src/DecodeAudioTokenWorker.cpp +40 -0
  15. package/src/DecodeAudioTokenWorker.h +22 -0
  16. package/src/EmbeddingWorker.cpp +7 -5
  17. package/src/LlamaCompletionWorker.cpp +68 -54
  18. package/src/LlamaCompletionWorker.h +7 -8
  19. package/src/LlamaContext.cpp +551 -235
  20. package/src/LlamaContext.h +26 -4
  21. package/src/LoadSessionWorker.cpp +4 -2
  22. package/src/SaveSessionWorker.cpp +10 -6
  23. package/src/TokenizeWorker.cpp +23 -14
  24. package/src/TokenizeWorker.h +2 -2
  25. package/src/addons.cc +8 -11
  26. package/src/common.hpp +129 -126
  27. package/src/llama.cpp/.github/workflows/build.yml +2 -2
  28. package/src/llama.cpp/.github/workflows/release.yml +152 -129
  29. package/src/llama.cpp/.github/workflows/winget.yml +42 -0
  30. package/src/llama.cpp/common/arg.cpp +14 -13
  31. package/src/llama.cpp/common/common.cpp +4 -75
  32. package/src/llama.cpp/common/common.h +7 -12
  33. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
  34. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
  35. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
  36. package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
  37. package/src/llama.cpp/examples/simple/simple.cpp +1 -1
  38. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  39. package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
  40. package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
  41. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  42. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
  43. package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
  44. package/src/llama.cpp/ggml/include/ggml.h +11 -0
  45. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
  46. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
  47. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
  48. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
  51. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  52. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
  53. package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
  54. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
  55. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
  56. package/src/llama.cpp/ggml/src/ggml.c +64 -18
  57. package/src/llama.cpp/include/llama.h +24 -124
  58. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  59. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  60. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  61. package/src/llama.cpp/src/llama-batch.cpp +3 -1
  62. package/src/llama.cpp/src/llama-context.cpp +60 -110
  63. package/src/llama.cpp/src/llama-graph.cpp +137 -233
  64. package/src/llama.cpp/src/llama-graph.h +49 -7
  65. package/src/llama.cpp/src/llama-hparams.cpp +17 -1
  66. package/src/llama.cpp/src/llama-hparams.h +34 -5
  67. package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
  68. package/src/llama.cpp/src/llama-kv-cache.h +201 -85
  69. package/src/llama.cpp/src/llama-memory.h +3 -2
  70. package/src/llama.cpp/src/llama-model.cpp +273 -94
  71. package/src/llama.cpp/src/llama-model.h +4 -1
  72. package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
  73. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
  74. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
  75. package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
  76. package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
  77. package/src/llama.cpp/tools/mtmd/clip.h +6 -4
  78. package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
  79. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  80. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
  81. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
  82. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
  83. package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
  84. package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
  85. package/src/llama.cpp/tools/run/run.cpp +2 -2
  86. package/src/llama.cpp/tools/server/server.cpp +158 -47
  87. package/src/llama.cpp/tools/server/utils.hpp +71 -43
  88. package/src/llama.cpp/tools/tts/tts.cpp +4 -2
  89. package/src/tts_utils.cpp +342 -0
  90. package/src/tts_utils.h +62 -0
  91. package/bin/win32/arm64/llama-node.node +0 -0
  92. package/bin/win32/arm64/node.lib +0 -0
  93. package/bin/win32/x64/llama-node.node +0 -0
  94. package/bin/win32/x64/node.lib +0 -0
  95. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  96. package/bin/win32-vulkan/arm64/node.lib +0 -0
  97. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  98. package/bin/win32-vulkan/x64/node.lib +0 -0
package/src/common.hpp CHANGED
@@ -1,11 +1,10 @@
1
1
  #pragma once
2
2
 
3
+ #include "chat.h"
3
4
  #include "common/common.h"
4
5
  #include "common/sampling.h"
5
- #include "tools/mtmd/mtmd.h"
6
- #include "tools/mtmd/clip.h"
7
- #include "chat.h"
8
6
  #include "llama.h"
7
+ #include "tools/mtmd/clip.h"
9
8
  #include "tools/mtmd/mtmd.h"
10
9
  #include <memory>
11
10
  #include <mutex>
@@ -27,13 +26,17 @@ static std::string json_stringify(const Napi::Object &obj) {
27
26
  Napi::Env env = obj.Env();
28
27
  Napi::Object json = env.Global().Get("JSON").As<Napi::Object>();
29
28
  Napi::Function stringify = json.Get("stringify").As<Napi::Function>();
30
- return stringify.Call(json, { obj }).As<Napi::String>().ToString();
29
+ return stringify.Call(json, {obj}).As<Napi::String>().ToString();
31
30
  }
32
31
 
33
- static void console_log(Napi::Env env, const std::string& message) {
34
- Napi::Function consoleLog = env.Global().Get("console").As<Napi::Object>().Get("log").As<Napi::Function>();
35
- consoleLog.Call({ Napi::String::New(env, message) });
36
- }
32
+ static void console_log(Napi::Env env, const std::string &message) {
33
+ Napi::Function consoleLog = env.Global()
34
+ .Get("console")
35
+ .As<Napi::Object>()
36
+ .Get("log")
37
+ .As<Napi::Function>();
38
+ consoleLog.Call({Napi::String::New(env, message)});
39
+ }
37
40
 
38
41
  template <typename T>
39
42
  constexpr T get_option(const Napi::Object &options, const std::string &name,
@@ -64,8 +67,7 @@ constexpr T get_option(const Napi::Object &options, const std::string &name,
64
67
 
65
68
  class LlamaSession {
66
69
  public:
67
- LlamaSession(common_params params)
68
- : params_(params) {
70
+ LlamaSession(common_params params) : params_(params) {
69
71
  llama_init_ = common_init_from_params(params);
70
72
  tokens_.reserve(params.n_ctx);
71
73
  }
@@ -93,21 +95,17 @@ public:
93
95
  inline const common_params &params() const { return params_; }
94
96
 
95
97
  inline std::mutex &get_mutex() { return mutex; }
96
-
98
+
97
99
  // Getter for the multimodal context
98
- inline const mtmd_context* get_mtmd_ctx() const {
99
- return _mtmd_ctx;
100
- }
101
-
100
+ inline const mtmd_context *get_mtmd_ctx() const { return _mtmd_ctx; }
101
+
102
102
  // Setter for the multimodal context
103
- inline void set_mtmd_ctx(mtmd_context* ctx) {
104
- _mtmd_ctx = ctx;
105
- }
103
+ inline void set_mtmd_ctx(mtmd_context *ctx) { _mtmd_ctx = ctx; }
106
104
 
107
105
  void dispose() {
108
106
  std::lock_guard<std::mutex> lock(mutex);
109
107
  tokens_.clear();
110
-
108
+
111
109
  // mtmd_ctx is owned by LlamaContext, so we don't free it here
112
110
  _mtmd_ctx = nullptr;
113
111
  }
@@ -118,13 +116,13 @@ private:
118
116
  std::vector<llama_token> tokens_{};
119
117
  std::vector<std::string> mtmd_bitmap_past_hashes_{};
120
118
  std::mutex mutex;
121
- mtmd_context* _mtmd_ctx = nullptr;
119
+ mtmd_context *_mtmd_ctx = nullptr;
122
120
  };
123
121
 
124
122
  typedef std::shared_ptr<LlamaSession> LlamaSessionPtr;
125
123
 
126
124
  static size_t common_tokens_part(const std::vector<llama_token> &a,
127
- const std::vector<llama_token> &b) {
125
+ const std::vector<llama_token> &b) {
128
126
  size_t i = 0;
129
127
  while (i < a.size() && i < b.size() && a[i] == b[i]) {
130
128
  i++;
@@ -133,7 +131,7 @@ static size_t common_tokens_part(const std::vector<llama_token> &a,
133
131
  }
134
132
 
135
133
  // Computes FNV-1a hash of the data
136
- static std::string fnv_hash(const uint8_t * data, size_t len) {
134
+ static std::string fnv_hash(const uint8_t *data, size_t len) {
137
135
  const uint64_t fnv_prime = 0x100000001b3ULL;
138
136
  uint64_t hash = 0xcbf29ce484222325ULL;
139
137
 
@@ -144,10 +142,9 @@ static std::string fnv_hash(const uint8_t * data, size_t len) {
144
142
  return std::to_string(hash);
145
143
  }
146
144
 
147
- static const std::string base64_chars =
148
- "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
149
- "abcdefghijklmnopqrstuvwxyz"
150
- "0123456789+/";
145
+ static const std::string base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
146
+ "abcdefghijklmnopqrstuvwxyz"
147
+ "0123456789+/";
151
148
 
152
149
  // Base64 decoding function
153
150
  static std::vector<uint8_t> base64_decode(const std::string &encoded_string) {
@@ -164,18 +161,22 @@ static std::vector<uint8_t> base64_decode(const std::string &encoded_string) {
164
161
  continue;
165
162
  }
166
163
 
167
- if (encoded_string[in_] == '=' || base64_chars.find(encoded_string[in_]) == std::string::npos) {
164
+ if (encoded_string[in_] == '=' ||
165
+ base64_chars.find(encoded_string[in_]) == std::string::npos) {
168
166
  break;
169
167
  }
170
168
 
171
- char_array_4[i++] = encoded_string[in_]; in_++;
169
+ char_array_4[i++] = encoded_string[in_];
170
+ in_++;
172
171
  if (i == 4) {
173
172
  for (i = 0; i < 4; i++) {
174
173
  char_array_4[i] = base64_chars.find(char_array_4[i]);
175
174
  }
176
175
 
177
- char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
178
- char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
176
+ char_array_3[0] =
177
+ (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
178
+ char_array_3[1] =
179
+ ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
179
180
  char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
180
181
 
181
182
  for (i = 0; i < 3; i++) {
@@ -195,7 +196,8 @@ static std::vector<uint8_t> base64_decode(const std::string &encoded_string) {
195
196
  }
196
197
 
197
198
  char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
198
- char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
199
+ char_array_3[1] =
200
+ ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
199
201
  char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
200
202
 
201
203
  for (j = 0; j < i - 1; j++) {
@@ -209,82 +211,86 @@ static std::vector<uint8_t> base64_decode(const std::string &encoded_string) {
209
211
  struct TokenizeResult {
210
212
  std::vector<llama_token> tokens;
211
213
 
212
- bool has_image = false;
214
+ bool has_media = false;
213
215
  std::vector<std::string> bitmap_hashes;
214
- std::vector<size_t> chunk_pos; // both text and image
215
- std::vector<size_t> chunk_pos_images; // image only
216
- mtmd_input_chunks* chunks = nullptr;
216
+ std::vector<size_t> chunk_pos; // both text and media
217
+ std::vector<size_t> chunk_pos_media; // media only
218
+ mtmd_input_chunks *chunks = nullptr;
217
219
  };
218
220
 
219
- static TokenizeResult tokenizeWithImages(
220
- const mtmd_context* mtmd_ctx,
221
- const std::string &prompt,
222
- const std::vector<std::string> &image_paths
223
- ) {
221
+ static TokenizeResult
222
+ tokenizeWithMedia(const mtmd_context *mtmd_ctx, const std::string &prompt,
223
+ const std::vector<std::string> &media_paths) {
224
224
  if (mtmd_ctx == nullptr) {
225
225
  throw std::runtime_error("Multimodal context is not initialized");
226
226
  }
227
227
 
228
228
  TokenizeResult result;
229
- result.has_image = !image_paths.empty();
229
+ result.has_media = !media_paths.empty();
230
230
 
231
231
  mtmd::bitmaps bitmaps;
232
232
 
233
- // Load all images
234
- for (const auto& image_path : image_paths) {
235
- fprintf(stdout, "[DEBUG] Loading image: %s\n",
236
- image_path.substr(0, 50).c_str()); // Only log part of path for base64
233
+ // Load all media paths
234
+ for (const auto &media_path : media_paths) {
235
+ fprintf(
236
+ stdout, "[DEBUG] Loading media: %s\n",
237
+ media_path.substr(0, 50).c_str()); // Only log part of path for base64
237
238
 
238
- // Check if it's a base64 image
239
- if (image_path.compare(0, 11, "data:image/") == 0) {
239
+ // Check if it's a base64 media
240
+ if (media_path.compare(0, 11, "data:image/") == 0 ||
241
+ media_path.compare(0, 11, "data:audio/") == 0) {
240
242
 
241
243
  // Parse base64 data
242
244
  std::vector<std::string> parts;
243
- size_t comma_pos = image_path.find(',');
245
+ size_t comma_pos = media_path.find(',');
244
246
  if (comma_pos == std::string::npos) {
245
247
  result.bitmap_hashes.clear();
246
- throw std::runtime_error("Invalid base64 image");
248
+ throw std::runtime_error(
249
+ "Invalid base64 media format, missing comma separator");
247
250
  }
248
251
 
249
- std::string header = image_path.substr(0, comma_pos);
250
- std::string base64_data = image_path.substr(comma_pos + 1);
252
+ std::string header = media_path.substr(0, comma_pos);
253
+ std::string base64_data = media_path.substr(comma_pos + 1);
251
254
 
252
255
  if (header.find("base64") == std::string::npos) {
253
256
  result.bitmap_hashes.clear();
254
- throw std::runtime_error("Invalid base64 image");
257
+ throw std::runtime_error("Invalid base64 media");
255
258
  }
256
259
 
257
260
  // Decode base64
258
261
  try {
259
262
  // Decode base64 to binary
260
- std::vector<uint8_t> image_data = base64_decode(base64_data);
263
+ std::vector<uint8_t> media_data = base64_decode(base64_data);
261
264
 
262
265
  // Load bitmap from memory buffer using direct initialization
263
- mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(image_data.data(), image_data.size()));
266
+ mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(media_data.data(),
267
+ media_data.size()));
264
268
  if (!bmp.ptr) {
265
269
  bitmaps.entries.clear();
266
- throw std::runtime_error("Failed to decode base64 image");
270
+ throw std::runtime_error("Failed to load base64 media");
267
271
  }
268
272
 
269
273
  // Calculate bitmap hash (for KV caching)
270
- std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
274
+ std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
271
275
  bmp.set_id(hash.c_str());
272
276
  bitmaps.entries.push_back(std::move(bmp));
273
277
  result.bitmap_hashes.push_back(hash.c_str());
274
- } catch (const std::exception& e) {
278
+ } catch (const std::exception &e) {
275
279
  bitmaps.entries.clear();
276
- throw std::runtime_error("Failed to decode base64 image");
280
+ throw std::runtime_error("Failed to decode base64 media");
277
281
  }
278
- } else if (image_path.compare(0, 7, "http://") == 0 || image_path.compare(0, 8, "https://") == 0) {
282
+ } else if (media_path.compare(0, 7, "http://") == 0 ||
283
+ media_path.compare(0, 8, "https://") == 0) {
279
284
  // HTTP URLs are not supported yet
280
285
  bitmaps.entries.clear();
281
- throw std::runtime_error("HTTP URLs are not supported yet");
286
+ throw std::runtime_error("HTTP/HTTPS URLs are not supported yet");
282
287
  } else {
288
+ // Regular file path
283
289
  // Check if file exists
284
- FILE* file = fopen(image_path.c_str(), "rb");
290
+ FILE *file = fopen(media_path.c_str(), "rb");
285
291
  if (file == nullptr) {
286
292
  bitmaps.entries.clear();
287
- throw std::runtime_error("Failed to open image file");
293
+ throw std::runtime_error("File does not exist or cannot be opened");
288
294
  }
289
295
 
290
296
  // Get file size
@@ -294,14 +300,14 @@ static TokenizeResult tokenizeWithImages(
294
300
  fclose(file);
295
301
 
296
302
  // Create bitmap directly
297
- mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(image_path.c_str()));
303
+ mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(media_path.c_str()));
298
304
  if (!bmp.ptr) {
299
305
  bitmaps.entries.clear();
300
- throw std::runtime_error("Failed to create bitmap from image file");
306
+ throw std::runtime_error("Failed to load media");
301
307
  }
302
308
 
303
309
  // Calculate bitmap hash (for KV caching)
304
- std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
310
+ std::string hash = fnv_hash(bmp.data(), bmp.nx() * bmp.ny() * 3);
305
311
  bmp.set_id(hash.c_str());
306
312
  bitmaps.entries.push_back(std::move(bmp));
307
313
  result.bitmap_hashes.push_back(hash.c_str());
@@ -313,58 +319,60 @@ static TokenizeResult tokenizeWithImages(
313
319
  bitmaps.entries.clear();
314
320
  throw std::runtime_error("Failed to initialize input chunks");
315
321
  }
316
-
322
+
317
323
  // Create input text
318
324
  mtmd_input_text input_text;
319
- input_text.text = prompt.c_str(); // Use the full prompt with image marker
320
- input_text.add_special = true; // Add BOS token if this is the first message
321
- input_text.parse_special = true; // Parse special tokens like <__image__>
325
+ input_text.text = prompt.c_str(); // Use the full prompt with media marker
326
+ input_text.add_special = true; // Add BOS token if this is the first message
327
+ input_text.parse_special = true; // Parse special tokens like <__media__>
322
328
 
323
- // Tokenize the text and images
324
- fprintf(stdout, "[DEBUG] Tokenizing text and %zu images\n", bitmaps.entries.size());
329
+ // Tokenize the text and media
330
+ fprintf(stdout, "[DEBUG] Tokenizing text and %zu media\n",
331
+ bitmaps.entries.size());
325
332
  auto bitmaps_c_ptr = bitmaps.c_ptr();
326
-
333
+
327
334
  // Cast away const for mtmd_tokenize
328
- int32_t res = mtmd_tokenize(
329
- const_cast<mtmd_context*>(mtmd_ctx),
330
- result.chunks,
331
- &input_text,
332
- bitmaps_c_ptr.data(),
333
- bitmaps_c_ptr.size()
334
- );
335
-
335
+ int32_t res =
336
+ mtmd_tokenize(const_cast<mtmd_context *>(mtmd_ctx), result.chunks,
337
+ &input_text, bitmaps_c_ptr.data(), bitmaps_c_ptr.size());
338
+
336
339
  if (res != 0) {
337
340
  mtmd_input_chunks_free(result.chunks);
338
341
  bitmaps.entries.clear();
339
- throw std::runtime_error("Failed to tokenize text and images");
342
+ throw std::runtime_error("Failed to tokenize text and media");
340
343
  }
341
344
 
342
345
  // Log chunk information
343
346
  size_t num_chunks = mtmd_input_chunks_size(result.chunks);
344
- fprintf(stdout, "[DEBUG] Tokenization successful: num_chunks=%zu\n", num_chunks);
347
+ fprintf(stdout, "[DEBUG] Tokenization successful: num_chunks=%zu\n",
348
+ num_chunks);
345
349
 
346
- // Track the total number of tokens (both text and image)
350
+ // Track the total number of tokens (both text and media)
347
351
  size_t total_token_count = 0;
348
352
 
349
353
  // chunk pos
350
354
  for (size_t i = 0; i < num_chunks; i++) {
351
355
  result.chunk_pos.push_back(total_token_count);
352
356
 
353
- const mtmd_input_chunk* chunk = mtmd_input_chunks_get(result.chunks, i);
357
+ const mtmd_input_chunk *chunk = mtmd_input_chunks_get(result.chunks, i);
354
358
  mtmd_input_chunk_type chunk_type = mtmd_input_chunk_get_type(chunk);
355
359
 
356
360
  if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
357
361
  size_t n_tokens;
358
- const llama_token* tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
362
+ const llama_token *tokens =
363
+ mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
359
364
 
360
365
  result.tokens.insert(result.tokens.end(), tokens, tokens + n_tokens);
361
366
  total_token_count += n_tokens;
362
- } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
363
- result.chunk_pos_images.push_back(total_token_count);
367
+ } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ||
368
+ chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
369
+ result.chunk_pos_media.push_back(total_token_count);
364
370
 
365
- const mtmd_image_tokens* img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
366
- size_t n_tokens = mtmd_image_tokens_get_n_tokens(img_tokens);
367
- size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
371
+ size_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk);
372
+ size_t n_pos = mtmd_input_chunk_get_n_pos(chunk);
373
+ fprintf(stdout, "[DEBUG] Chunk %zu: type=%s, n_tokens=%zu, n_pos=%zu\n",
374
+ i, chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "IMAGE" : "AUDIO",
375
+ n_tokens, n_pos);
368
376
 
369
377
  for (size_t j = 0; j < n_pos; j++) {
370
378
  result.tokens.push_back(LLAMA_TOKEN_NULL);
@@ -374,35 +382,34 @@ static TokenizeResult tokenizeWithImages(
374
382
  }
375
383
 
376
384
  bitmaps.entries.clear();
377
-
385
+
378
386
  return result;
379
387
  }
380
388
 
381
- // Process images and add them to the tokenized input
382
- static llama_pos process_image_prompt(
383
- llama_context* ctx,
384
- const mtmd_context* mtmd_ctx,
385
- LlamaSessionPtr sess,
386
- const common_params& params,
387
- const std::vector<std::string>& image_paths
388
- ) {
389
+ // Process media and add them to the tokenized input
390
+ static llama_pos
391
+ processMediaPrompt(llama_context *ctx, const mtmd_context *mtmd_ctx,
392
+ LlamaSessionPtr sess, const common_params &params,
393
+ const std::vector<std::string> &media_paths) {
389
394
  if (mtmd_ctx == nullptr) {
390
395
  throw std::runtime_error("Multimodal context is not initialized");
391
396
  }
392
397
 
393
398
  // Multimodal path
394
399
  std::string full_prompt = params.prompt;
395
- // Add image marker if it doesn't already exist
396
- if (full_prompt.find("<__image__>") == std::string::npos) {
397
- full_prompt += " <__image__>";
400
+ auto default_media_marker = mtmd_default_marker();
401
+ // Add media marker if it doesn't already exist
402
+ if (full_prompt.find(default_media_marker) == std::string::npos) {
403
+ full_prompt += " ";
404
+ full_prompt += default_media_marker;
398
405
  }
399
406
 
400
- auto result = tokenizeWithImages(mtmd_ctx, full_prompt, image_paths);
407
+ auto result = tokenizeWithMedia(mtmd_ctx, full_prompt, media_paths);
401
408
 
402
409
  auto all_tokens = result.tokens;
403
410
  auto chunks = result.chunks;
404
411
  auto chunk_pos = result.chunk_pos;
405
- auto chunk_pos_images = result.chunk_pos_images;
412
+ auto chunk_pos_media = result.chunk_pos_media;
406
413
  auto bitmap_hashes = result.bitmap_hashes;
407
414
 
408
415
  llama_pos n_past = common_tokens_part(*sess->tokens_ptr(), all_tokens);
@@ -418,11 +425,10 @@ static llama_pos process_image_prompt(
418
425
  break;
419
426
  }
420
427
  bool is_end = i + 1 == chunk_pos.size();
421
- if (
422
- chunk_pos[i] < n_past &&
423
- (!is_end && chunk_pos[i + 1] > n_past)
424
- // is_end & n_past < total_token_count:
425
- // don't need to adjust and it will skip eval_chunk_single, let nextToken() to finish the job
428
+ if (chunk_pos[i] < n_past && (!is_end && chunk_pos[i + 1] > n_past)
429
+ // is_end & n_past < total_token_count:
430
+ // don't need to adjust and it will skip eval_chunk_single, let
431
+ // nextToken() to finish the job
426
432
  ) {
427
433
  adjusted_n_past = chunk_pos[i];
428
434
  }
@@ -433,11 +439,12 @@ static llama_pos process_image_prompt(
433
439
  fprintf(stdout, "[DEBUG] Adjusted n_past to %d\n", n_past);
434
440
  }
435
441
 
436
- // Compare bitmap hashes, if they are not the same, backtrack n_past to the position of the first mismatch
442
+ // Compare bitmap hashes, if they are not the same, backtrack n_past to the
443
+ // position of the first mismatch
437
444
  auto mtmd_bitmap_past_hashes = sess->mtmd_bitmap_past_hashes_ptr();
438
445
  if (mtmd_bitmap_past_hashes->size() > 0) {
439
446
  for (size_t i = 0; i < bitmap_hashes.size(); i++) {
440
- auto pos = chunk_pos_images[i];
447
+ auto pos = chunk_pos_media[i];
441
448
  if (n_past < pos) {
442
449
  break;
443
450
  }
@@ -445,7 +452,7 @@ static llama_pos process_image_prompt(
445
452
  break;
446
453
  }
447
454
  if (bitmap_hashes[i] != (*mtmd_bitmap_past_hashes)[i]) {
448
- n_past = chunk_pos_images[i];
455
+ n_past = chunk_pos_media[i];
449
456
  new_n_past = n_past;
450
457
  break;
451
458
  }
@@ -458,7 +465,8 @@ static llama_pos process_image_prompt(
458
465
  size_t num_chunks = mtmd_input_chunks_size(chunks);
459
466
 
460
467
  for (size_t i = 0; i < chunk_pos.size(); i++) {
461
- fprintf(stdout, "[DEBUG] Evaluating chunk %zu: n_past=%d, chunk_pos=%zu\n", i, n_past, chunk_pos[i]);
468
+ fprintf(stdout, "[DEBUG] Evaluating chunk %zu: n_past=%d, chunk_pos=%zu\n",
469
+ i, n_past, chunk_pos[i]);
462
470
 
463
471
  // Process chunk only if it's after the current n_past
464
472
  if (chunk_pos[i] >= new_n_past) {
@@ -467,16 +475,10 @@ static llama_pos process_image_prompt(
467
475
 
468
476
  // Cast away const for mtmd_helper_eval_chunk_single
469
477
  int32_t res = mtmd_helper_eval_chunk_single(
470
- const_cast<mtmd_context*>(mtmd_ctx),
471
- ctx,
472
- chunk,
473
- n_past,
474
- 0,
475
- params.n_batch, // batch size
476
- chunk_logits_last,
477
- &new_n_past
478
- );
479
-
478
+ const_cast<mtmd_context *>(mtmd_ctx), ctx, chunk, n_past, 0,
479
+ params.n_batch, // batch size
480
+ chunk_logits_last, &new_n_past);
481
+
480
482
  if (res != 0) {
481
483
  mtmd_input_chunks_free(chunks);
482
484
  throw std::runtime_error("Failed to process chunk");
@@ -485,13 +487,14 @@ static llama_pos process_image_prompt(
485
487
  }
486
488
  }
487
489
 
488
- if (n_past == all_tokens.size() && n_past > 0 && all_tokens[n_past - 1] != LLAMA_TOKEN_NULL) {
490
+ if (n_past == all_tokens.size() && n_past > 0 &&
491
+ all_tokens[n_past - 1] != LLAMA_TOKEN_NULL) {
489
492
  // we have to evaluate at least 1 token to generate logits.
490
493
  n_past--;
491
494
  }
492
495
 
493
496
  // Update sampling context to process token sequences
494
- for (auto & token : all_tokens) {
497
+ for (auto &token : all_tokens) {
495
498
  if (token == LLAMA_TOKEN_NULL) {
496
499
  continue;
497
500
  }
@@ -501,7 +504,7 @@ static llama_pos process_image_prompt(
501
504
 
502
505
  sess->set_mtmd_bitmap_past_hashes(bitmap_hashes);
503
506
 
504
- // Clean up image resources
507
+ // Clean up media resources
505
508
  mtmd_input_chunks_free(chunks);
506
509
  return n_past;
507
510
  }
@@ -351,7 +351,7 @@ jobs:
351
351
 
352
352
  ubuntu-22-cmake-musa:
353
353
  runs-on: ubuntu-22.04
354
- container: mthreads/musa:rc3.1.1-devel-ubuntu22.04
354
+ container: mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
355
355
 
356
356
  steps:
357
357
  - name: Clone
@@ -899,7 +899,7 @@ jobs:
899
899
  shell: bash
900
900
 
901
901
  env:
902
- WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
902
+ WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe
903
903
  WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
904
904
  ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
905
905
  steps: