@fugood/llama.node 0.4.7 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +20 -6
  18. package/lib/index.js +41 -17
  19. package/lib/index.ts +50 -23
  20. package/package.json +1 -1
  21. package/src/LlamaCompletionWorker.cpp +9 -9
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +37 -18
  24. package/src/LlamaContext.h +1 -0
  25. package/src/TokenizeWorker.cpp +16 -12
  26. package/src/TokenizeWorker.h +2 -2
  27. package/src/common.hpp +54 -50
  28. package/src/llama.cpp/.github/workflows/build.yml +2 -2
  29. package/src/llama.cpp/.github/workflows/release.yml +152 -129
  30. package/src/llama.cpp/.github/workflows/winget.yml +42 -0
  31. package/src/llama.cpp/common/arg.cpp +14 -13
  32. package/src/llama.cpp/common/common.cpp +4 -75
  33. package/src/llama.cpp/common/common.h +7 -12
  34. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
  35. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
  36. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
  37. package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
  38. package/src/llama.cpp/examples/simple/simple.cpp +1 -1
  39. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  40. package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
  41. package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
  42. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  43. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
  44. package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
  45. package/src/llama.cpp/ggml/include/ggml.h +11 -0
  46. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
  47. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
  48. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
  51. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
  52. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  53. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
  54. package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
  55. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
  56. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
  57. package/src/llama.cpp/ggml/src/ggml.c +64 -18
  58. package/src/llama.cpp/include/llama.h +24 -124
  59. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  60. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  61. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  62. package/src/llama.cpp/src/llama-batch.cpp +3 -1
  63. package/src/llama.cpp/src/llama-context.cpp +60 -110
  64. package/src/llama.cpp/src/llama-graph.cpp +137 -233
  65. package/src/llama.cpp/src/llama-graph.h +49 -7
  66. package/src/llama.cpp/src/llama-hparams.cpp +17 -1
  67. package/src/llama.cpp/src/llama-hparams.h +34 -5
  68. package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
  69. package/src/llama.cpp/src/llama-kv-cache.h +201 -85
  70. package/src/llama.cpp/src/llama-memory.h +3 -2
  71. package/src/llama.cpp/src/llama-model.cpp +273 -94
  72. package/src/llama.cpp/src/llama-model.h +4 -1
  73. package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
  74. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
  75. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
  76. package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
  77. package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
  78. package/src/llama.cpp/tools/mtmd/clip.h +6 -4
  79. package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
  80. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  81. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
  82. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
  83. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
  84. package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
  85. package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
  86. package/src/llama.cpp/tools/run/run.cpp +2 -2
  87. package/src/llama.cpp/tools/server/server.cpp +158 -47
  88. package/src/llama.cpp/tools/server/utils.hpp +71 -43
  89. package/src/llama.cpp/tools/tts/tts.cpp +4 -2
Binary files (prebuilt llama-node.node and node.lib binaries) changed; contents not shown.
package/lib/binding.ts CHANGED
@@ -6,6 +6,11 @@ export type MessagePart = {
   text?: string,
   image_url?: {
     url?: string
+  },
+  input_audio?: {
+    format: string
+    data?: string
+    url?: string
   }
 }
 
@@ -103,12 +108,12 @@ export type LlamaCompletionOptions = {
   grammar_triggers?: { type: number; word: string; at_start: boolean }[]
   preserved_tokens?: string[]
   /**
-   * Path(s) to image file(s) to process before generating text.
-   * When provided, the image(s) will be processed and added to the context.
+   * Path(s) to media file(s) to process before generating text.
+   * When provided, the media will be processed and added to the context.
    * Requires multimodal support to be enabled via initMultimodal.
    * Supports both file paths and base64 data URLs.
    */
-  image_paths?: string | string[]
+  media_paths?: string | string[]
 }
 
 export type LlamaCompletionResult = {
@@ -137,10 +142,10 @@ export type LlamaCompletionToken = {
 
 export type TokenizeResult = {
   tokens: Int32Array
-  has_image: boolean
+  has_media: boolean
   bitmap_hashes: string[]
   chunk_pos: number[]
-  chunk_pos_images: number[]
+  chunk_pos_media: number[]
 }
 
 export type EmbeddingResult = {
@@ -167,7 +172,7 @@ export interface LlamaContext {
     callback?: (token: LlamaCompletionToken) => void,
   ): Promise<LlamaCompletionResult>
   stopCompletion(): void
-  tokenize(text: string, image_paths?: string[]): Promise<TokenizeResult>
+  tokenize(text: string, media_paths?: string[]): Promise<TokenizeResult>
   detokenize(tokens: number[]): Promise<string>
   embedding(text: string): Promise<EmbeddingResult>
   saveSession(path: string): Promise<void>
@@ -189,6 +194,15 @@ export interface LlamaContext {
    */
  isMultimodalEnabled(): Promise<boolean>
 
+  /**
+   * Get multimodal support capabilities
+   * @returns Promise resolving to an object with vision and audio support
+   */
+  getMultimodalSupport(): Promise<{
+    vision: boolean
+    audio: boolean
+  }>
+
   /**
    * Release multimodal support
    */
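
Taken together, these binding changes rename image_paths / has_image to media_paths / has_media and add an input_audio message part that accepts wav or mp3, either as a file or base64 URL or as base64 data. A minimal TypeScript sketch of the new surface, assuming multimodal support was already enabled via initMultimodal; the model and media paths are placeholders, not part of this diff:

import { loadModel } from '@fugood/llama.node'

// Placeholder paths; loadModel resolves to the context wrapper patched later in this diff.
const ctx = await loadModel({ model: '/path/to/model.gguf' })

const result = await ctx.completion({
  messages: [
    {
      role: 'user',
      content: [
        { type: 'text', text: 'Describe what you see and hear.' },
        { type: 'image_url', image_url: { url: '/tmp/photo.jpg' } },
        { type: 'input_audio', input_audio: { format: 'wav', url: 'file:///tmp/clip.wav' } },
      ],
    },
  ],
})
console.log(result)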
package/lib/index.js CHANGED
@@ -23,10 +23,11 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
     });
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = void 0;
+exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = exports.MTMD_DEFAULT_MEDIA_MARKER = void 0;
 exports.addNativeLogListener = addNativeLogListener;
 const binding_1 = require("./binding");
 __exportStar(require("./binding"), exports);
+exports.MTMD_DEFAULT_MEDIA_MARKER = '<__media__>';
 const mods = {};
 const logListeners = [];
 const logCallback = (level, text) => {
@@ -78,13 +79,13 @@ class LlamaContextWrapper {
     isLlamaChatSupported() {
         return !!this.ctx.getModelInfo().chatTemplates.llamaChat;
     }
-    _formatImageChat(messages) {
+    _formatMediaChat(messages) {
         if (!messages)
             return {
                 messages,
-                has_image: false,
+                has_media: false,
             };
-        const imagePaths = [];
+        const mediaPaths = [];
         return {
             messages: messages.map((msg) => {
                 if (Array.isArray(msg.content)) {
@@ -93,10 +94,30 @@
                     // Handle multimodal content
                     if (part.type === 'image_url') {
                         let path = ((_a = part.image_url) === null || _a === void 0 ? void 0 : _a.url) || '';
-                        imagePaths.push(path);
+                        mediaPaths.push(path);
                         return {
                             type: 'text',
-                            text: '<__image__>',
+                            text: exports.MTMD_DEFAULT_MEDIA_MARKER,
+                        };
+                    }
+                    else if (part.type === 'input_audio') {
+                        const { input_audio: audio } = part;
+                        if (!audio)
+                            throw new Error('input_audio is required');
+                        const { format } = audio;
+                        if (format != 'wav' && format != 'mp3') {
+                            throw new Error(`Unsupported audio format: ${format}`);
+                        }
+                        if (audio.url) {
+                            const path = audio.url.replace(/file:\/\//, '');
+                            mediaPaths.push(path);
+                        }
+                        else if (audio.data) {
+                            mediaPaths.push(audio.data);
+                        }
+                        return {
+                            type: 'text',
+                            text: exports.MTMD_DEFAULT_MEDIA_MARKER,
                         };
                     }
                     return part;
@@ -105,12 +126,12 @@
                 }
                 return msg;
             }),
-            has_image: imagePaths.length > 0,
-            image_paths: imagePaths,
+            has_media: mediaPaths.length > 0,
+            media_paths: mediaPaths,
         };
     }
     getFormattedChat(messages, template, params) {
-        const { messages: chat, has_image, image_paths, } = this._formatImageChat(messages);
+        const { messages: chat, has_media, media_paths, } = this._formatMediaChat(messages);
         const useJinja = this.isJinjaSupported() && (params === null || params === void 0 ? void 0 : params.jinja);
         let tmpl;
         if (template)
@@ -127,25 +148,25 @@
            return {
                type: 'llama-chat',
                prompt: result,
-                has_image,
-                image_paths,
+                has_media,
+                media_paths,
            };
        }
        const jinjaResult = result;
        jinjaResult.type = 'jinja';
-        jinjaResult.has_image = has_image;
-        jinjaResult.image_paths = image_paths;
+        jinjaResult.has_media = has_media;
+        jinjaResult.media_paths = media_paths;
        return jinjaResult;
    }
    completion(options, callback) {
-        const { messages, image_paths = options.image_paths } = this._formatImageChat(options.messages);
-        return this.ctx.completion(Object.assign(Object.assign({}, options), { messages, image_paths: options.image_paths || image_paths }), callback || (() => { }));
+        const { messages, media_paths = options.media_paths } = this._formatMediaChat(options.messages);
+        return this.ctx.completion(Object.assign(Object.assign({}, options), { messages, media_paths: options.media_paths || media_paths }), callback || (() => { }));
    }
    stopCompletion() {
        return this.ctx.stopCompletion();
    }
-    tokenize(text, { image_paths } = {}) {
-        return this.ctx.tokenize(text, image_paths);
+    tokenize(text, { media_paths } = {}) {
+        return this.ctx.tokenize(text, media_paths);
    }
    detokenize(tokens) {
        return this.ctx.detokenize(tokens);
@@ -180,6 +201,9 @@
    releaseMultimodal() {
        return this.ctx.releaseMultimodal();
    }
+    getMultimodalSupport() {
+        return this.ctx.getMultimodalSupport();
+    }
 }
 const loadModel = (options) => __awaiter(void 0, void 0, void 0, function* () {
     var _a, _b;
package/lib/index.ts CHANGED
@@ -14,6 +14,8 @@ import type {
 
 export * from './binding'
 
+export const MTMD_DEFAULT_MEDIA_MARKER = '<__media__>'
+
 export interface LlamaModelOptionsExtended extends LlamaModelOptions {
   lib_variant?: LibVariant
 }
@@ -63,8 +65,8 @@ const getJsonSchema = (responseFormat?: CompletionResponseFormat) => {
 export type FormattedChatResult = {
   type: 'jinja' | 'llama-chat'
   prompt: string
-  has_image: boolean
-  image_paths?: Array<string>
+  has_media: boolean
+  media_paths?: Array<string>
 }
 
 class LlamaContextWrapper {
@@ -91,17 +93,17 @@ class LlamaContextWrapper {
     return !!this.ctx.getModelInfo().chatTemplates.llamaChat
   }
 
-  _formatImageChat(messages: ChatMessage[] | undefined): {
+  _formatMediaChat(messages: ChatMessage[] | undefined): {
     messages: ChatMessage[] | undefined
-    has_image: boolean
-    image_paths?: string[]
+    has_media: boolean
+    media_paths?: string[]
   } {
     if (!messages)
       return {
         messages,
-        has_image: false,
+        has_media: false,
       }
-    const imagePaths: string[] = []
+    const mediaPaths: string[] = []
     return {
       messages: messages.map((msg) => {
         if (Array.isArray(msg.content)) {
@@ -109,10 +111,28 @@ class LlamaContextWrapper {
           // Handle multimodal content
           if (part.type === 'image_url') {
             let path = part.image_url?.url || ''
-            imagePaths.push(path)
+            mediaPaths.push(path)
             return {
               type: 'text',
-              text: '<__image__>',
+              text: MTMD_DEFAULT_MEDIA_MARKER,
+            }
+          } else if (part.type === 'input_audio') {
+            const { input_audio: audio } = part
+            if (!audio) throw new Error('input_audio is required')
+
+            const { format } = audio
+            if (format != 'wav' && format != 'mp3') {
+              throw new Error(`Unsupported audio format: ${format}`)
+            }
+            if (audio.url) {
+              const path = audio.url.replace(/file:\/\//, '')
+              mediaPaths.push(path)
+            } else if (audio.data) {
+              mediaPaths.push(audio.data)
+            }
+            return {
+              type: 'text',
+              text: MTMD_DEFAULT_MEDIA_MARKER,
             }
           }
           return part
@@ -125,8 +145,8 @@ class LlamaContextWrapper {
         }
         return msg
       }),
-      has_image: imagePaths.length > 0,
-      image_paths: imagePaths,
+      has_media: mediaPaths.length > 0,
+      media_paths: mediaPaths,
     }
   }
 
@@ -143,9 +163,9 @@ class LlamaContextWrapper {
   ): FormattedChatResult {
     const {
       messages: chat,
-      has_image,
-      image_paths,
-    } = this._formatImageChat(messages)
+      has_media,
+      media_paths,
+    } = this._formatMediaChat(messages)
 
     const useJinja = this.isJinjaSupported() && params?.jinja
     let tmpl
@@ -164,14 +184,14 @@ class LlamaContextWrapper {
       return {
         type: 'llama-chat',
         prompt: result as string,
-        has_image,
-        image_paths,
+        has_media,
+        media_paths,
       }
     }
     const jinjaResult = result
     jinjaResult.type = 'jinja'
-    jinjaResult.has_image = has_image
-    jinjaResult.image_paths = image_paths
+    jinjaResult.has_media = has_media
+    jinjaResult.media_paths = media_paths
     return jinjaResult
   }
 
@@ -179,12 +199,12 @@ class LlamaContextWrapper {
     options: LlamaCompletionOptions,
     callback?: (token: LlamaCompletionToken) => void,
   ): Promise<LlamaCompletionResult> {
-    const { messages, image_paths = options.image_paths } =
-      this._formatImageChat(options.messages)
+    const { messages, media_paths = options.media_paths } =
+      this._formatMediaChat(options.messages)
     return this.ctx.completion({
       ...options,
       messages,
-      image_paths: options.image_paths || image_paths,
+      media_paths: options.media_paths || media_paths,
     }, callback || (() => {}))
   }
 
@@ -192,8 +212,8 @@ class LlamaContextWrapper {
     return this.ctx.stopCompletion()
   }
 
-  tokenize(text: string, { image_paths }: { image_paths?: string[] } = {}): Promise<TokenizeResult> {
-    return this.ctx.tokenize(text, image_paths)
+  tokenize(text: string, { media_paths }: { media_paths?: string[] } = {}): Promise<TokenizeResult> {
+    return this.ctx.tokenize(text, media_paths)
  }
 
   detokenize(tokens: number[]): Promise<string> {
@@ -242,6 +262,13 @@ class LlamaContextWrapper {
   releaseMultimodal(): Promise<void> {
     return this.ctx.releaseMultimodal()
   }
+
+  getMultimodalSupport(): Promise<{
+    vision: boolean
+    audio: boolean
+  }> {
+    return this.ctx.getMultimodalSupport()
+  }
 }
 
 export const loadModel = async (
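
The wrapper also forwards the new capability probe. A short sketch, assuming ctx is a wrapper returned by loadModel and an mmproj has already been attached via initMultimodal (neither call is shown in this hunk):

import { MTMD_DEFAULT_MEDIA_MARKER } from '@fugood/llama.node'

const support = await ctx.getMultimodalSupport()
if (!support.audio) {
  // Fail fast instead of letting the native side reject input_audio parts.
  console.warn('Loaded mmproj has no audio support')
}

// _formatMediaChat swaps every image_url / input_audio part for this marker
// before the chat template is applied:
console.log(MTMD_DEFAULT_MEDIA_MARKER) // '<__media__>'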
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "0.4.7",
+  "version": "0.5.0",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
package/src/LlamaCompletionWorker.cpp CHANGED
@@ -29,10 +29,10 @@ LlamaCompletionWorker::LlamaCompletionWorker(
     Napi::Function callback, common_params params,
     std::vector<std::string> stop_words,
     int32_t chat_format,
-    std::vector<std::string> image_paths)
+    std::vector<std::string> media_paths)
     : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
       _params(params), _stop_words(stop_words), _chat_format(chat_format),
-      _image_paths(image_paths) {
+      _media_paths(media_paths) {
   if (!callback.IsEmpty()) {
     _tsfn = Napi::ThreadSafeFunction::New(info.Env(), callback,
                                           "LlamaCompletionCallback", 0, 1);
@@ -64,19 +64,19 @@ void LlamaCompletionWorker::Execute() {
   LlamaCppSampling sampling{common_sampler_init(model, _params.sampling),
                             common_sampler_free};
 
-  // Process images if any are provided
-  if (!_image_paths.empty()) {
+  // Process media if any are provided
+  if (!_media_paths.empty()) {
     const auto* mtmd_ctx = _sess->get_mtmd_ctx();
 
     if (mtmd_ctx != nullptr) {
-      // Process the images and get the tokens
+      // Process the media and get the tokens
       try {
-        n_cur = process_image_prompt(
+        n_cur = processMediaPrompt(
           ctx,
           mtmd_ctx,
           _sess,
           _params,
-          _image_paths
+          _media_paths
         );
       } catch (const std::exception& e) {
         SetError(e.what());
@@ -85,12 +85,12 @@
   }
 
   if (n_cur <= 0) {
-    SetError("Failed to process images");
+    SetError("Failed to process media");
     _sess->get_mutex().unlock();
     return;
   }
 
-  fprintf(stdout, "[DEBUG] Image processing successful, n_cur=%zu, tokens=%zu\n",
+  fprintf(stdout, "[DEBUG] Media processing successful, n_cur=%zu, tokens=%zu\n",
           n_cur, _sess->tokens_ptr()->size());
 
   n_input = _sess->tokens_ptr()->size();
package/src/LlamaCompletionWorker.h CHANGED
@@ -20,7 +20,7 @@ public:
                         Napi::Function callback, common_params params,
                         std::vector<std::string> stop_words,
                         int32_t chat_format,
-                        std::vector<std::string> image_paths = {});
+                        std::vector<std::string> media_paths = {});
 
   ~LlamaCompletionWorker();
 
@@ -44,7 +44,7 @@ private:
   common_params _params;
   std::vector<std::string> _stop_words;
   int32_t _chat_format;
-  std::vector<std::string> _image_paths;
+  std::vector<std::string> _media_paths;
   std::function<void()> _onComplete;
   bool _has_callback = false;
   bool _stop = false;
package/src/LlamaContext.cpp CHANGED
@@ -135,6 +135,9 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
            static_cast<napi_property_attributes>(napi_enumerable)),
        StaticMethod<&LlamaContext::ToggleNativeLog>(
            "toggleNativeLog",
+           static_cast<napi_property_attributes>(napi_enumerable)),
+       InstanceMethod<&LlamaContext::GetMultimodalSupport>(
+           "getMultimodalSupport",
            static_cast<napi_property_attributes>(napi_enumerable))});
   Napi::FunctionReference *constructor = new Napi::FunctionReference();
   *constructor = Napi::Persistent(func);
@@ -607,22 +610,22 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
     }
   }
 
-  // Process image_paths parameter
-  std::vector<std::string> image_paths;
-  if (options.Has("image_paths")) {
-    if (options.Get("image_paths").IsArray()) {
-      auto image_paths_array = options.Get("image_paths").As<Napi::Array>();
-      for (size_t i = 0; i < image_paths_array.Length(); i++) {
-        image_paths.push_back(image_paths_array.Get(i).ToString().Utf8Value());
+  // Process media_paths parameter
+  std::vector<std::string> media_paths;
+  if (options.Has("media_paths")) {
+    if (options.Get("media_paths").IsArray()) {
+      auto media_paths_array = options.Get("media_paths").As<Napi::Array>();
+      for (size_t i = 0; i < media_paths_array.Length(); i++) {
+        media_paths.push_back(media_paths_array.Get(i).ToString().Utf8Value());
       }
-    } else if (options.Get("image_paths").IsString()) {
-      image_paths.push_back(options.Get("image_paths").ToString().Utf8Value());
+    } else if (options.Get("media_paths").IsString()) {
+      media_paths.push_back(options.Get("media_paths").ToString().Utf8Value());
     }
   }
 
-  // Check if multimodal is enabled when image_paths are provided
-  if (!image_paths.empty() && !(_has_multimodal && _mtmd_ctx != nullptr)) {
-    Napi::Error::New(env, "Multimodal support must be enabled via initMultimodal to use image_paths").ThrowAsJavaScriptException();
+  // Check if multimodal is enabled when media_paths are provided
+  if (!media_paths.empty() && !(_has_multimodal && _mtmd_ctx != nullptr)) {
+    Napi::Error::New(env, "Multimodal support must be enabled via initMultimodal to use media_paths").ThrowAsJavaScriptException();
     return env.Undefined();
   }
 
@@ -808,7 +811,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   }
 
   auto *worker =
-      new LlamaCompletionWorker(info, _sess, callback, params, stop_words, chat_format, image_paths);
+      new LlamaCompletionWorker(info, _sess, callback, params, stop_words, chat_format, media_paths);
   worker->Queue();
   _wip = worker;
   worker->OnComplete([this]() { _wip = nullptr; });
@@ -833,14 +836,14 @@ Napi::Value LlamaContext::Tokenize(const Napi::CallbackInfo &info) {
         .ThrowAsJavaScriptException();
   }
   auto text = info[0].ToString().Utf8Value();
-  std::vector<std::string> image_paths;
+  std::vector<std::string> media_paths;
   if (info.Length() >= 2 && info[1].IsArray()) {
-    auto image_paths_array = info[1].As<Napi::Array>();
-    for (size_t i = 0; i < image_paths_array.Length(); i++) {
-      image_paths.push_back(image_paths_array.Get(i).ToString().Utf8Value());
+    auto media_paths_array = info[1].As<Napi::Array>();
+    for (size_t i = 0; i < media_paths_array.Length(); i++) {
+      media_paths.push_back(media_paths_array.Get(i).ToString().Utf8Value());
     }
   }
-  auto *worker = new TokenizeWorker(info, _sess, text, image_paths);
+  auto *worker = new TokenizeWorker(info, _sess, text, media_paths);
   worker->Queue();
   return worker->Promise();
 }
@@ -1081,6 +1084,22 @@ Napi::Value LlamaContext::IsMultimodalEnabled(const Napi::CallbackInfo &info) {
   return Napi::Boolean::New(info.Env(), _has_multimodal && _mtmd_ctx != nullptr);
 }
 
+// getMultimodalSupport(): Promise<{ vision: boolean, audio: boolean }>
+Napi::Value LlamaContext::GetMultimodalSupport(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  auto result = Napi::Object::New(env);
+
+  if (_has_multimodal && _mtmd_ctx != nullptr) {
+    result.Set("vision", Napi::Boolean::New(env, mtmd_support_vision(_mtmd_ctx)));
+    result.Set("audio", Napi::Boolean::New(env, mtmd_support_audio(_mtmd_ctx)));
+  } else {
+    result.Set("vision", Napi::Boolean::New(env, false));
+    result.Set("audio", Napi::Boolean::New(env, false));
+  }
+
+  return result;
+}
+
 // releaseMultimodal(): void
 void LlamaContext::ReleaseMultimodal(const Napi::CallbackInfo &info) {
   if (_mtmd_ctx != nullptr) {
package/src/LlamaContext.h CHANGED
@@ -31,6 +31,7 @@ private:
   // Multimodal methods
   Napi::Value InitMultimodal(const Napi::CallbackInfo &info);
   Napi::Value IsMultimodalEnabled(const Napi::CallbackInfo &info);
+  Napi::Value GetMultimodalSupport(const Napi::CallbackInfo &info);
   void ReleaseMultimodal(const Napi::CallbackInfo &info);
 
   std::string _info;
package/src/TokenizeWorker.cpp CHANGED
@@ -2,17 +2,22 @@
 #include "LlamaContext.h"
 
 TokenizeWorker::TokenizeWorker(const Napi::CallbackInfo &info,
-                               LlamaSessionPtr &sess, std::string text, std::vector<std::string> image_paths)
-    : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text), _image_paths(image_paths) {}
+                               LlamaSessionPtr &sess, std::string text, std::vector<std::string> media_paths)
+    : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text), _media_paths(media_paths) {}
 
 void TokenizeWorker::Execute() {
   auto mtmd_ctx = _sess->get_mtmd_ctx();
-  if (!_image_paths.empty()) {
-    _result = tokenizeWithImages(mtmd_ctx, _text, _image_paths);
+  if (!_media_paths.empty()) {
+    try {
+      _result = tokenizeWithMedia(mtmd_ctx, _text, _media_paths);
+      mtmd_input_chunks_free(_result.chunks);
+    } catch (const std::exception &e) {
+      SetError(e.what());
+    }
   } else {
     const auto tokens = common_tokenize(_sess->context(), _text, false);
     _result.tokens = tokens;
-    _result.has_image = false;
+    _result.has_media = false;
   }
 }
 
@@ -24,9 +29,8 @@ void TokenizeWorker::OnOK() {
   memcpy(tokens.Data(), _result.tokens.data(),
          _result.tokens.size() * sizeof(llama_token));
   result.Set("tokens", tokens);
-  if (_result.has_image) {
-    result.Set("has_image", _result.has_image);
-
+  result.Set("has_media", _result.has_media);
+  if (_result.has_media) {
     auto bitmap_hashes = Napi::Array::New(Napi::AsyncWorker::Env(), _result.bitmap_hashes.size());
     for (size_t i = 0; i < _result.bitmap_hashes.size(); i++) {
       bitmap_hashes.Set(i, _result.bitmap_hashes[i]);
@@ -37,11 +41,11 @@ void TokenizeWorker::OnOK() {
       chunk_pos.Set(i, _result.chunk_pos[i]);
     }
     result.Set("chunk_pos", chunk_pos);
-    auto chunk_pos_images = Napi::Array::New(Napi::AsyncWorker::Env(), _result.chunk_pos_images.size());
-    for (size_t i = 0; i < _result.chunk_pos_images.size(); i++) {
-      chunk_pos_images.Set(i, _result.chunk_pos_images[i]);
+    auto chunk_pos_media = Napi::Array::New(Napi::AsyncWorker::Env(), _result.chunk_pos_media.size());
+    for (size_t i = 0; i < _result.chunk_pos_media.size(); i++) {
+      chunk_pos_media.Set(i, _result.chunk_pos_media[i]);
     }
-    result.Set("chunk_pos_images", chunk_pos_images);
+    result.Set("chunk_pos_media", chunk_pos_media);
   }
   Napi::Promise::Deferred::Resolve(result);
 }
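
On the JS side the renamed tokenize result looks like this. A hedged sketch where '/tmp/photo.jpg' is a placeholder, and embedding the media marker in the prompt text is an assumption about what tokenizeWithMedia expects, not something this diff spells out:

const res = await ctx.tokenize(`Describe this: ${MTMD_DEFAULT_MEDIA_MARKER}`, {
  media_paths: ['/tmp/photo.jpg'],
})
if (res.has_media) {
  // chunk_pos marks where each chunk starts; chunk_pos_media marks the media chunks.
  console.log(res.chunk_pos, res.chunk_pos_media, res.bitmap_hashes)
}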
package/src/TokenizeWorker.h CHANGED
@@ -5,7 +5,7 @@ class TokenizeWorker : public Napi::AsyncWorker,
                        public Napi::Promise::Deferred {
 public:
   TokenizeWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
-                 std::string text, std::vector<std::string> image_paths);
+                 std::string text, std::vector<std::string> media_paths);
 
 protected:
   void Execute();
@@ -15,6 +15,6 @@ protected:
 private:
   LlamaSessionPtr _sess;
   std::string _text;
-  std::vector<std::string> _image_paths;
+  std::vector<std::string> _media_paths;
   TokenizeResult _result;
 };