@fugood/llama.node 0.4.6 → 0.4.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
package/lib/binding.ts CHANGED
@@ -137,6 +137,10 @@ export type LlamaCompletionToken = {
137
137
 
138
138
  export type TokenizeResult = {
139
139
  tokens: Int32Array
140
+ has_image: boolean
141
+ bitmap_hashes: string[]
142
+ chunk_pos: number[]
143
+ chunk_pos_images: number[]
140
144
  }
141
145
 
142
146
  export type EmbeddingResult = {
@@ -163,7 +167,7 @@ export interface LlamaContext {
163
167
  callback?: (token: LlamaCompletionToken) => void,
164
168
  ): Promise<LlamaCompletionResult>
165
169
  stopCompletion(): void
166
- tokenize(text: string): Promise<TokenizeResult>
170
+ tokenize(text: string, image_paths?: string[]): Promise<TokenizeResult>
167
171
  detokenize(tokens: number[]): Promise<string>
168
172
  embedding(text: string): Promise<EmbeddingResult>
169
173
  saveSession(path: string): Promise<void>
package/lib/index.js CHANGED
@@ -112,7 +112,7 @@ class LlamaContextWrapper {
112
112
  getFormattedChat(messages, template, params) {
113
113
  const { messages: chat, has_image, image_paths, } = this._formatImageChat(messages);
114
114
  const useJinja = this.isJinjaSupported() && (params === null || params === void 0 ? void 0 : params.jinja);
115
- let tmpl = this.isLlamaChatSupported() || useJinja ? undefined : 'chatml';
115
+ let tmpl;
116
116
  if (template)
117
117
  tmpl = template; // Force replace if provided
118
118
  const jsonSchema = getJsonSchema(params === null || params === void 0 ? void 0 : params.response_format);
@@ -144,8 +144,8 @@ class LlamaContextWrapper {
144
144
  stopCompletion() {
145
145
  return this.ctx.stopCompletion();
146
146
  }
147
- tokenize(text) {
148
- return this.ctx.tokenize(text);
147
+ tokenize(text, { image_paths } = {}) {
148
+ return this.ctx.tokenize(text, image_paths);
149
149
  }
150
150
  detokenize(tokens) {
151
151
  return this.ctx.detokenize(tokens);
package/lib/index.ts CHANGED
@@ -60,6 +60,13 @@ const getJsonSchema = (responseFormat?: CompletionResponseFormat) => {
60
60
  return null
61
61
  }
62
62
 
63
+ export type FormattedChatResult = {
64
+ type: 'jinja' | 'llama-chat'
65
+ prompt: string
66
+ has_image: boolean
67
+ image_paths?: Array<string>
68
+ }
69
+
63
70
  class LlamaContextWrapper {
64
71
  ctx: any
65
72
 
@@ -133,7 +140,7 @@ class LlamaContextWrapper {
133
140
  parallel_tool_calls?: object
134
141
  tool_choice?: string
135
142
  },
136
- ): object {
143
+ ): FormattedChatResult {
137
144
  const {
138
145
  messages: chat,
139
146
  has_image,
@@ -141,7 +148,7 @@ class LlamaContextWrapper {
141
148
  } = this._formatImageChat(messages)
142
149
 
143
150
  const useJinja = this.isJinjaSupported() && params?.jinja
144
- let tmpl = this.isLlamaChatSupported() || useJinja ? undefined : 'chatml'
151
+ let tmpl
145
152
  if (template) tmpl = template // Force replace if provided
146
153
  const jsonSchema = getJsonSchema(params?.response_format)
147
154
 
@@ -185,8 +192,8 @@ class LlamaContextWrapper {
185
192
  return this.ctx.stopCompletion()
186
193
  }
187
194
 
188
- tokenize(text: string): Promise<TokenizeResult> {
189
- return this.ctx.tokenize(text)
195
+ tokenize(text: string, { image_paths }: { image_paths?: string[] } = {}): Promise<TokenizeResult> {
196
+ return this.ctx.tokenize(text, image_paths)
190
197
  }
191
198
 
192
199
  detokenize(tokens: number[]): Promise<string> {
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@fugood/llama.node",
3
3
  "access": "public",
4
- "version": "0.4.6",
4
+ "version": "0.4.7",
5
5
  "description": "An another Node binding of llama.cpp",
6
6
  "main": "lib/index.js",
7
7
  "scripts": {
@@ -1,367 +1,6 @@
1
1
  #include "LlamaCompletionWorker.h"
2
2
  #include "LlamaContext.h"
3
3
 
4
- // Computes FNV-1a hash of the data
5
- static std::string fnv_hash(const uint8_t * data, size_t len) {
6
- const uint64_t fnv_prime = 0x100000001b3ULL;
7
- uint64_t hash = 0xcbf29ce484222325ULL;
8
-
9
- for (size_t i = 0; i < len; ++i) {
10
- hash ^= data[i];
11
- hash *= fnv_prime;
12
- }
13
- return std::to_string(hash);
14
- }
15
-
16
- static const std::string base64_chars =
17
- "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
18
- "abcdefghijklmnopqrstuvwxyz"
19
- "0123456789+/";
20
-
21
- // Base64 decoding function
22
- static std::vector<uint8_t> base64_decode(const std::string &encoded_string) {
23
- std::vector<uint8_t> decoded;
24
- int in_len = encoded_string.size();
25
- int i = 0;
26
- int j = 0;
27
- int in_ = 0;
28
- unsigned char char_array_4[4], char_array_3[3];
29
-
30
- while (in_len-- && (encoded_string[in_] != '=')) {
31
- if (isspace(encoded_string[in_])) {
32
- in_++;
33
- continue;
34
- }
35
-
36
- if (encoded_string[in_] == '=' || base64_chars.find(encoded_string[in_]) == std::string::npos) {
37
- break;
38
- }
39
-
40
- char_array_4[i++] = encoded_string[in_]; in_++;
41
- if (i == 4) {
42
- for (i = 0; i < 4; i++) {
43
- char_array_4[i] = base64_chars.find(char_array_4[i]);
44
- }
45
-
46
- char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
47
- char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
48
- char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
49
-
50
- for (i = 0; i < 3; i++) {
51
- decoded.push_back(char_array_3[i]);
52
- }
53
- i = 0;
54
- }
55
- }
56
-
57
- if (i) {
58
- for (j = i; j < 4; j++) {
59
- char_array_4[j] = 0;
60
- }
61
-
62
- for (j = 0; j < 4; j++) {
63
- char_array_4[j] = base64_chars.find(char_array_4[j]);
64
- }
65
-
66
- char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
67
- char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
68
- char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
69
-
70
- for (j = 0; j < i - 1; j++) {
71
- decoded.push_back(char_array_3[j]);
72
- }
73
- }
74
-
75
- return decoded;
76
- }
77
-
78
- size_t common_part(const std::vector<llama_token> &a,
79
- const std::vector<llama_token> &b) {
80
- size_t i = 0;
81
- while (i < a.size() && i < b.size() && a[i] == b[i]) {
82
- i++;
83
- }
84
- return i;
85
- }
86
-
87
- // Process images and add them to the tokenized input
88
- llama_pos processImage(
89
- const mtmd_context* mtmd_ctx,
90
- llama_context* ctx,
91
- LlamaSessionPtr sess,
92
- const std::vector<std::string>& image_paths,
93
- const common_params& params,
94
- std::vector<llama_token>& text_tokens
95
- ) {
96
- if (mtmd_ctx == nullptr) {
97
- return false;
98
- }
99
-
100
- // Multimodal path
101
- std::string full_prompt = params.prompt;
102
- // Add image marker if it doesn't already exist
103
- if (full_prompt.find("<__image__>") == std::string::npos) {
104
- full_prompt += " <__image__>";
105
- }
106
-
107
- // Prepare bitmaps array for all images
108
- mtmd::bitmaps bitmaps;
109
-
110
- std::vector<std::string> bitmap_hashes;
111
-
112
- // Load all images
113
- for (const auto& image_path : image_paths) {
114
- fprintf(stdout, "[DEBUG] Loading image: %s\n",
115
- image_path.substr(0, 50).c_str()); // Only log part of path for base64
116
-
117
- // Check if it's a base64 image
118
- if (image_path.compare(0, 11, "data:image/") == 0) {
119
-
120
- // Parse base64 data
121
- std::vector<std::string> parts;
122
- size_t comma_pos = image_path.find(',');
123
- if (comma_pos == std::string::npos) {
124
- bitmaps.entries.clear();
125
- return false;
126
- }
127
-
128
- std::string header = image_path.substr(0, comma_pos);
129
- std::string base64_data = image_path.substr(comma_pos + 1);
130
-
131
- if (header.find("base64") == std::string::npos) {
132
- bitmaps.entries.clear();
133
- return false;
134
- }
135
-
136
- // Decode base64
137
- try {
138
- // Decode base64 to binary
139
- std::vector<uint8_t> image_data = base64_decode(base64_data);
140
-
141
- // Load bitmap from memory buffer using direct initialization
142
- mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(image_data.data(), image_data.size()));
143
- if (!bmp.ptr) {
144
- bitmaps.entries.clear();
145
- return false;
146
- }
147
-
148
- // Calculate bitmap hash (for KV caching)
149
- std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
150
- bmp.set_id(hash.c_str());
151
- bitmaps.entries.push_back(std::move(bmp));
152
- bitmap_hashes.push_back(hash.c_str());
153
- } catch (const std::exception& e) {
154
- bitmaps.entries.clear();
155
- return false;
156
- }
157
- } else if (image_path.compare(0, 7, "http://") == 0 || image_path.compare(0, 8, "https://") == 0) {
158
- // HTTP URLs are not supported yet
159
- bitmaps.entries.clear();
160
- return false;
161
- } else {
162
- // Check if file exists
163
- FILE* file = fopen(image_path.c_str(), "rb");
164
- if (file == nullptr) {
165
- bitmaps.entries.clear();
166
- return false;
167
- }
168
-
169
- // Get file size
170
- fseek(file, 0, SEEK_END);
171
- long file_size = ftell(file);
172
- fseek(file, 0, SEEK_SET);
173
- fclose(file);
174
-
175
- // Create bitmap directly
176
- mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(image_path.c_str()));
177
- if (!bmp.ptr) {
178
- bitmaps.entries.clear();
179
- return false;
180
- }
181
-
182
- // Calculate bitmap hash (for KV caching)
183
- std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
184
- bmp.set_id(hash.c_str());
185
- bitmaps.entries.push_back(std::move(bmp));
186
- bitmap_hashes.push_back(hash.c_str());
187
- }
188
- }
189
-
190
- mtmd_input_chunks* chunks = mtmd_input_chunks_init();
191
- if (chunks == nullptr) {
192
- bitmaps.entries.clear();
193
- return false;
194
- }
195
-
196
- // Create input text
197
- mtmd_input_text input_text;
198
- input_text.text = full_prompt.c_str(); // Use the full prompt with image marker
199
- input_text.add_special = true; // Add BOS token if this is the first message
200
- input_text.parse_special = true; // Parse special tokens like <__image__>
201
-
202
- // Tokenize the text and images
203
- fprintf(stdout, "[DEBUG] Tokenizing text and %zu images\n", bitmaps.entries.size());
204
- auto bitmaps_c_ptr = bitmaps.c_ptr();
205
-
206
- // Cast away const for mtmd_tokenize
207
- int32_t res = mtmd_tokenize(
208
- const_cast<mtmd_context*>(mtmd_ctx),
209
- chunks,
210
- &input_text,
211
- bitmaps_c_ptr.data(),
212
- bitmaps_c_ptr.size()
213
- );
214
-
215
- if (res != 0) {
216
- mtmd_input_chunks_free(chunks);
217
- bitmaps.entries.clear();
218
- return false;
219
- }
220
-
221
- // Log chunk information
222
- size_t num_chunks = mtmd_input_chunks_size(chunks);
223
- fprintf(stdout, "[DEBUG] Tokenization successful: num_chunks=%zu\n", num_chunks);
224
-
225
- // Clear text_tokens before adding new tokens
226
- text_tokens.clear();
227
-
228
- // Create a vector to store all tokens (both text and image)
229
- std::vector<llama_token> all_tokens;
230
-
231
- // Track the total number of tokens (both text and image)
232
- size_t total_token_count = 0;
233
-
234
- // chunk pos
235
- std::vector<size_t> chunk_pos;
236
- std::vector<size_t> chunk_pos_images;
237
- for (size_t i = 0; i < num_chunks; i++) {
238
- chunk_pos.push_back(total_token_count);
239
-
240
- const mtmd_input_chunk* chunk = mtmd_input_chunks_get(chunks, i);
241
- mtmd_input_chunk_type chunk_type = mtmd_input_chunk_get_type(chunk);
242
-
243
- if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
244
- size_t n_tokens;
245
- const llama_token* tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
246
-
247
- // Add text tokens
248
- text_tokens.insert(text_tokens.end(), tokens, tokens + n_tokens);
249
- all_tokens.insert(all_tokens.end(), tokens, tokens + n_tokens);
250
- total_token_count += n_tokens;
251
- } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
252
- chunk_pos_images.push_back(total_token_count);
253
-
254
- const mtmd_image_tokens* img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
255
- size_t n_tokens = mtmd_image_tokens_get_n_tokens(img_tokens);
256
- size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
257
-
258
- for (size_t j = 0; j < n_pos; j++) {
259
- all_tokens.push_back(LLAMA_TOKEN_NULL);
260
- }
261
- total_token_count += n_pos;
262
- }
263
- }
264
-
265
- llama_pos n_past = common_part(*sess->tokens_ptr(), all_tokens);
266
-
267
- llama_pos new_n_past = n_past;
268
-
269
- // Adjust n_past to position of the text chunk
270
- // TODO: Edit the text chunk to remove the tokens before n_past to speed up
271
- // need to update the mtmd api
272
- auto adjusted_n_past = -1;
273
- for (size_t i = 0; i < chunk_pos.size(); i++) {
274
- if (n_past < chunk_pos[i]) {
275
- break;
276
- }
277
- bool is_end = i + 1 == chunk_pos.size();
278
- if (
279
- chunk_pos[i] < n_past &&
280
- (!is_end && chunk_pos[i + 1] > n_past)
281
- // is_end & n_past < total_token_count:
282
- // don't need to adjust and it will skip eval_chunk_single, let nextToken() to finish the job
283
- ) {
284
- adjusted_n_past = chunk_pos[i];
285
- }
286
- }
287
- if (adjusted_n_past != -1) {
288
- n_past = adjusted_n_past;
289
- new_n_past = n_past;
290
- fprintf(stdout, "[DEBUG] Adjusted n_past to %d\n", n_past);
291
- }
292
-
293
- // Compare bitmap hashes, if they are not the same, backtrack n_past to the position of the first mismatch
294
- auto mtmd_bitmap_past_hashes = sess->mtmd_bitmap_past_hashes_ptr();
295
- if (mtmd_bitmap_past_hashes->size() > 0) {
296
- for (size_t i = 0; i < bitmap_hashes.size(); i++) {
297
- auto pos = chunk_pos_images[i];
298
- if (n_past < pos) {
299
- break;
300
- }
301
- if (i >= mtmd_bitmap_past_hashes->size()) {
302
- break;
303
- }
304
- if (bitmap_hashes[i] != (*mtmd_bitmap_past_hashes)[i]) {
305
- n_past = chunk_pos_images[i];
306
- new_n_past = n_past;
307
- break;
308
- }
309
- }
310
- }
311
-
312
- // Clear all KV cache entries after position n_past
313
- llama_kv_self_seq_rm(ctx, 0, n_past, -1);
314
-
315
- for (size_t i = 0; i < chunk_pos.size(); i++) {
316
- fprintf(stdout, "[DEBUG] Evaluating chunk %zu: n_past=%d, chunk_pos=%zu\n", i, n_past, chunk_pos[i]);
317
-
318
- // Process chunk only if it's after the current n_past
319
- if (chunk_pos[i] >= new_n_past) {
320
- bool chunk_logits_last = (i == num_chunks - 1);
321
- auto chunk = mtmd_input_chunks_get(chunks, i);
322
-
323
- // Cast away const for mtmd_helper_eval_chunk_single
324
- int32_t res = mtmd_helper_eval_chunk_single(
325
- const_cast<mtmd_context*>(mtmd_ctx),
326
- ctx,
327
- chunk,
328
- n_past,
329
- 0,
330
- params.n_batch, // batch size
331
- chunk_logits_last,
332
- &new_n_past
333
- );
334
-
335
- if (res != 0) {
336
- mtmd_input_chunks_free(chunks);
337
- bitmaps.entries.clear();
338
- return false;
339
- }
340
- n_past = new_n_past;
341
- }
342
- }
343
-
344
- if (n_past == total_token_count && n_past > 0 && all_tokens[n_past - 1] != LLAMA_TOKEN_NULL) {
345
- // we have to evaluate at least 1 token to generate logits.
346
- n_past--;
347
- }
348
-
349
- // Update sampling context to process token sequences
350
- for (auto & token : all_tokens) {
351
- if (token == LLAMA_TOKEN_NULL) {
352
- continue;
353
- }
354
- }
355
- // Set the tokens
356
- sess->set_tokens(std::move(all_tokens));
357
-
358
- sess->set_mtmd_bitmap_past_hashes(bitmap_hashes);
359
-
360
- // Clean up image resources
361
- mtmd_input_chunks_free(chunks);
362
- bitmaps.entries.clear();
363
- return n_past;
364
- }
365
4
 
366
5
  size_t findStoppingStrings(const std::string &text,
367
6
  const size_t last_token_size,
@@ -425,22 +64,25 @@ void LlamaCompletionWorker::Execute() {
425
64
  LlamaCppSampling sampling{common_sampler_init(model, _params.sampling),
426
65
  common_sampler_free};
427
66
 
428
- std::vector<llama_token> prompt_tokens;
429
-
430
67
  // Process images if any are provided
431
68
  if (!_image_paths.empty()) {
432
69
  const auto* mtmd_ctx = _sess->get_mtmd_ctx();
433
70
 
434
71
  if (mtmd_ctx != nullptr) {
435
72
  // Process the images and get the tokens
436
- n_cur = processImage(
437
- mtmd_ctx,
438
- ctx,
439
- _sess,
440
- _image_paths,
441
- _params,
442
- prompt_tokens
443
- );
73
+ try {
74
+ n_cur = process_image_prompt(
75
+ ctx,
76
+ mtmd_ctx,
77
+ _sess,
78
+ _params,
79
+ _image_paths
80
+ );
81
+ } catch (const std::exception& e) {
82
+ SetError(e.what());
83
+ _sess->get_mutex().unlock();
84
+ return;
85
+ }
444
86
 
445
87
  if (n_cur <= 0) {
446
88
  SetError("Failed to process images");
@@ -456,7 +98,6 @@ void LlamaCompletionWorker::Execute() {
456
98
  --n_cur;
457
99
  }
458
100
  n_input -= n_cur;
459
- llama_kv_self_seq_rm(ctx, 0, n_cur, -1);
460
101
  } else {
461
102
  SetError("Multimodal context not initialized");
462
103
  _sess->get_mutex().unlock();
@@ -464,11 +105,11 @@ void LlamaCompletionWorker::Execute() {
464
105
  }
465
106
  } else {
466
107
  // Text-only path
467
- prompt_tokens = ::common_tokenize(ctx, _params.prompt, add_bos);
108
+ std::vector<llama_token> prompt_tokens = ::common_tokenize(ctx, _params.prompt, add_bos);
468
109
  n_input = prompt_tokens.size();
469
110
 
470
111
  if (_sess->tokens_ptr()->size() > 0) {
471
- n_cur = common_part(*(_sess->tokens_ptr()), prompt_tokens);
112
+ n_cur = common_tokens_part(*(_sess->tokens_ptr()), prompt_tokens);
472
113
  if (n_cur == n_input) {
473
114
  --n_cur;
474
115
  }
@@ -4,8 +4,6 @@
4
4
  #include <atomic>
5
5
  #include <functional>
6
6
  #include <napi.h>
7
- #include "tools/mtmd/mtmd.h"
8
- #include "tools/mtmd/clip.h"
9
7
 
10
8
  struct CompletionResult {
11
9
  std::string text = "";
@@ -27,80 +27,6 @@ static std::string format_string(const std::string& format, Args ... args) {
27
27
  return std::string(buf.get(), buf.get() + size - 1); // -1 to exclude null terminator
28
28
  }
29
29
 
30
- // Computes FNV-1a hash of the data
31
- static std::string fnv_hash(const uint8_t* data, size_t len) {
32
- const uint64_t fnv_prime = 0x100000001b3ULL;
33
- uint64_t hash = 0xcbf29ce484222325ULL;
34
-
35
- for (size_t i = 0; i < len; ++i) {
36
- hash ^= data[i];
37
- hash *= fnv_prime;
38
- }
39
- return std::to_string(hash);
40
- }
41
-
42
- static const std::string base64_chars =
43
- "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
44
- "abcdefghijklmnopqrstuvwxyz"
45
- "0123456789+/";
46
-
47
- // Base64 decoding function
48
- static std::vector<uint8_t> base64_decode(const std::string &encoded_string) {
49
- std::vector<uint8_t> decoded;
50
- int in_len = encoded_string.size();
51
- int i = 0;
52
- int j = 0;
53
- int in_ = 0;
54
- unsigned char char_array_4[4], char_array_3[3];
55
-
56
- while (in_len-- && (encoded_string[in_] != '=')) {
57
- if (isspace(encoded_string[in_])) {
58
- in_++;
59
- continue;
60
- }
61
-
62
- if (encoded_string[in_] == '=' || base64_chars.find(encoded_string[in_]) == std::string::npos) {
63
- break;
64
- }
65
-
66
- char_array_4[i++] = encoded_string[in_]; in_++;
67
- if (i == 4) {
68
- for (i = 0; i < 4; i++) {
69
- char_array_4[i] = base64_chars.find(char_array_4[i]);
70
- }
71
-
72
- char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
73
- char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
74
- char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
75
-
76
- for (i = 0; i < 3; i++) {
77
- decoded.push_back(char_array_3[i]);
78
- }
79
- i = 0;
80
- }
81
- }
82
-
83
- if (i) {
84
- for (j = i; j < 4; j++) {
85
- char_array_4[j] = 0;
86
- }
87
-
88
- for (j = 0; j < 4; j++) {
89
- char_array_4[j] = base64_chars.find(char_array_4[j]);
90
- }
91
-
92
- char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
93
- char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
94
- char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
95
-
96
- for (j = 0; j < i - 1; j++) {
97
- decoded.push_back(char_array_3[j]);
98
- }
99
- }
100
-
101
- return decoded;
102
- }
103
-
104
30
  using json = nlohmann::ordered_json;
105
31
 
106
32
  // loadModelInfo(path: string): object
@@ -153,18 +79,6 @@ Napi::Value LlamaContext::ModelInfo(const Napi::CallbackInfo& info) {
153
79
  return metadata;
154
80
  }
155
81
 
156
- std::vector<common_chat_msg> get_messages(Napi::Array messages) {
157
- std::vector<common_chat_msg> chat;
158
- for (size_t i = 0; i < messages.Length(); i++) {
159
- auto message = messages.Get(i).As<Napi::Object>();
160
- chat.push_back({
161
- get_option<std::string>(message, "role", ""),
162
- get_option<std::string>(message, "content", ""),
163
- });
164
- }
165
- return std::move(chat);
166
- }
167
-
168
82
  void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
169
83
  Napi::Function func = DefineClass(
170
84
  env, "LlamaContext",
@@ -919,7 +833,14 @@ Napi::Value LlamaContext::Tokenize(const Napi::CallbackInfo &info) {
919
833
  .ThrowAsJavaScriptException();
920
834
  }
921
835
  auto text = info[0].ToString().Utf8Value();
922
- auto *worker = new TokenizeWorker(info, _sess, text);
836
+ std::vector<std::string> image_paths;
837
+ if (info.Length() >= 2 && info[1].IsArray()) {
838
+ auto image_paths_array = info[1].As<Napi::Array>();
839
+ for (size_t i = 0; i < image_paths_array.Length(); i++) {
840
+ image_paths.push_back(image_paths_array.Get(i).ToString().Utf8Value());
841
+ }
842
+ }
843
+ auto *worker = new TokenizeWorker(info, _sess, text, image_paths);
923
844
  worker->Queue();
924
845
  return worker->Promise();
925
846
  }
@@ -2,12 +2,18 @@
2
2
  #include "LlamaContext.h"
3
3
 
4
4
  TokenizeWorker::TokenizeWorker(const Napi::CallbackInfo &info,
5
- LlamaSessionPtr &sess, std::string text)
6
- : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text) {}
5
+ LlamaSessionPtr &sess, std::string text, std::vector<std::string> image_paths)
6
+ : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text), _image_paths(image_paths) {}
7
7
 
8
8
  void TokenizeWorker::Execute() {
9
- const auto tokens = ::common_tokenize(_sess->context(), _text, false);
10
- _result.tokens = std::move(tokens);
9
+ auto mtmd_ctx = _sess->get_mtmd_ctx();
10
+ if (!_image_paths.empty()) {
11
+ _result = tokenizeWithImages(mtmd_ctx, _text, _image_paths);
12
+ } else {
13
+ const auto tokens = common_tokenize(_sess->context(), _text, false);
14
+ _result.tokens = tokens;
15
+ _result.has_image = false;
16
+ }
11
17
  }
12
18
 
13
19
  void TokenizeWorker::OnOK() {
@@ -18,6 +24,25 @@ void TokenizeWorker::OnOK() {
18
24
  memcpy(tokens.Data(), _result.tokens.data(),
19
25
  _result.tokens.size() * sizeof(llama_token));
20
26
  result.Set("tokens", tokens);
27
+ if (_result.has_image) {
28
+ result.Set("has_image", _result.has_image);
29
+
30
+ auto bitmap_hashes = Napi::Array::New(Napi::AsyncWorker::Env(), _result.bitmap_hashes.size());
31
+ for (size_t i = 0; i < _result.bitmap_hashes.size(); i++) {
32
+ bitmap_hashes.Set(i, _result.bitmap_hashes[i]);
33
+ }
34
+ result.Set("bitmap_hashes", bitmap_hashes);
35
+ auto chunk_pos = Napi::Array::New(Napi::AsyncWorker::Env(), _result.chunk_pos.size());
36
+ for (size_t i = 0; i < _result.chunk_pos.size(); i++) {
37
+ chunk_pos.Set(i, _result.chunk_pos[i]);
38
+ }
39
+ result.Set("chunk_pos", chunk_pos);
40
+ auto chunk_pos_images = Napi::Array::New(Napi::AsyncWorker::Env(), _result.chunk_pos_images.size());
41
+ for (size_t i = 0; i < _result.chunk_pos_images.size(); i++) {
42
+ chunk_pos_images.Set(i, _result.chunk_pos_images[i]);
43
+ }
44
+ result.Set("chunk_pos_images", chunk_pos_images);
45
+ }
21
46
  Napi::Promise::Deferred::Resolve(result);
22
47
  }
23
48
 
@@ -1,15 +1,11 @@
1
1
  #include "common.hpp"
2
2
  #include <vector>
3
3
 
4
- struct TokenizeResult {
5
- std::vector<llama_token> tokens;
6
- };
7
-
8
4
  class TokenizeWorker : public Napi::AsyncWorker,
9
5
  public Napi::Promise::Deferred {
10
6
  public:
11
7
  TokenizeWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
12
- std::string text);
8
+ std::string text, std::vector<std::string> image_paths);
13
9
 
14
10
  protected:
15
11
  void Execute();
@@ -19,5 +15,6 @@ protected:
19
15
  private:
20
16
  LlamaSessionPtr _sess;
21
17
  std::string _text;
18
+ std::vector<std::string> _image_paths;
22
19
  TokenizeResult _result;
23
20
  };
package/src/common.hpp CHANGED
@@ -2,6 +2,8 @@
2
2
 
3
3
  #include "common/common.h"
4
4
  #include "common/sampling.h"
5
+ #include "tools/mtmd/mtmd.h"
6
+ #include "tools/mtmd/clip.h"
5
7
  #include "chat.h"
6
8
  #include "llama.h"
7
9
  #include "tools/mtmd/mtmd.h"
@@ -120,3 +122,386 @@ private:
120
122
  };
121
123
 
122
124
  typedef std::shared_ptr<LlamaSession> LlamaSessionPtr;
125
+
126
+ static size_t common_tokens_part(const std::vector<llama_token> &a,
127
+ const std::vector<llama_token> &b) {
128
+ size_t i = 0;
129
+ while (i < a.size() && i < b.size() && a[i] == b[i]) {
130
+ i++;
131
+ }
132
+ return i;
133
+ }
134
+
135
+ // Computes FNV-1a hash of the data
136
+ static std::string fnv_hash(const uint8_t * data, size_t len) {
137
+ const uint64_t fnv_prime = 0x100000001b3ULL;
138
+ uint64_t hash = 0xcbf29ce484222325ULL;
139
+
140
+ for (size_t i = 0; i < len; ++i) {
141
+ hash ^= data[i];
142
+ hash *= fnv_prime;
143
+ }
144
+ return std::to_string(hash);
145
+ }
146
+
147
+ static const std::string base64_chars =
148
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
149
+ "abcdefghijklmnopqrstuvwxyz"
150
+ "0123456789+/";
151
+
152
+ // Base64 decoding function
153
+ static std::vector<uint8_t> base64_decode(const std::string &encoded_string) {
154
+ std::vector<uint8_t> decoded;
155
+ int in_len = encoded_string.size();
156
+ int i = 0;
157
+ int j = 0;
158
+ int in_ = 0;
159
+ unsigned char char_array_4[4], char_array_3[3];
160
+
161
+ while (in_len-- && (encoded_string[in_] != '=')) {
162
+ if (isspace(encoded_string[in_])) {
163
+ in_++;
164
+ continue;
165
+ }
166
+
167
+ if (encoded_string[in_] == '=' || base64_chars.find(encoded_string[in_]) == std::string::npos) {
168
+ break;
169
+ }
170
+
171
+ char_array_4[i++] = encoded_string[in_]; in_++;
172
+ if (i == 4) {
173
+ for (i = 0; i < 4; i++) {
174
+ char_array_4[i] = base64_chars.find(char_array_4[i]);
175
+ }
176
+
177
+ char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
178
+ char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
179
+ char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
180
+
181
+ for (i = 0; i < 3; i++) {
182
+ decoded.push_back(char_array_3[i]);
183
+ }
184
+ i = 0;
185
+ }
186
+ }
187
+
188
+ if (i) {
189
+ for (j = i; j < 4; j++) {
190
+ char_array_4[j] = 0;
191
+ }
192
+
193
+ for (j = 0; j < 4; j++) {
194
+ char_array_4[j] = base64_chars.find(char_array_4[j]);
195
+ }
196
+
197
+ char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
198
+ char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
199
+ char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
200
+
201
+ for (j = 0; j < i - 1; j++) {
202
+ decoded.push_back(char_array_3[j]);
203
+ }
204
+ }
205
+
206
+ return decoded;
207
+ }
208
+
209
+ struct TokenizeResult {
210
+ std::vector<llama_token> tokens;
211
+
212
+ bool has_image = false;
213
+ std::vector<std::string> bitmap_hashes;
214
+ std::vector<size_t> chunk_pos; // both text and image
215
+ std::vector<size_t> chunk_pos_images; // image only
216
+ mtmd_input_chunks* chunks = nullptr;
217
+ };
218
+
219
+ static TokenizeResult tokenizeWithImages(
220
+ const mtmd_context* mtmd_ctx,
221
+ const std::string &prompt,
222
+ const std::vector<std::string> &image_paths
223
+ ) {
224
+ if (mtmd_ctx == nullptr) {
225
+ throw std::runtime_error("Multimodal context is not initialized");
226
+ }
227
+
228
+ TokenizeResult result;
229
+ result.has_image = !image_paths.empty();
230
+
231
+ mtmd::bitmaps bitmaps;
232
+
233
+ // Load all images
234
+ for (const auto& image_path : image_paths) {
235
+ fprintf(stdout, "[DEBUG] Loading image: %s\n",
236
+ image_path.substr(0, 50).c_str()); // Only log part of path for base64
237
+
238
+ // Check if it's a base64 image
239
+ if (image_path.compare(0, 11, "data:image/") == 0) {
240
+
241
+ // Parse base64 data
242
+ std::vector<std::string> parts;
243
+ size_t comma_pos = image_path.find(',');
244
+ if (comma_pos == std::string::npos) {
245
+ result.bitmap_hashes.clear();
246
+ throw std::runtime_error("Invalid base64 image");
247
+ }
248
+
249
+ std::string header = image_path.substr(0, comma_pos);
250
+ std::string base64_data = image_path.substr(comma_pos + 1);
251
+
252
+ if (header.find("base64") == std::string::npos) {
253
+ result.bitmap_hashes.clear();
254
+ throw std::runtime_error("Invalid base64 image");
255
+ }
256
+
257
+ // Decode base64
258
+ try {
259
+ // Decode base64 to binary
260
+ std::vector<uint8_t> image_data = base64_decode(base64_data);
261
+
262
+ // Load bitmap from memory buffer using direct initialization
263
+ mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(image_data.data(), image_data.size()));
264
+ if (!bmp.ptr) {
265
+ bitmaps.entries.clear();
266
+ throw std::runtime_error("Failed to decode base64 image");
267
+ }
268
+
269
+ // Calculate bitmap hash (for KV caching)
270
+ std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
271
+ bmp.set_id(hash.c_str());
272
+ bitmaps.entries.push_back(std::move(bmp));
273
+ result.bitmap_hashes.push_back(hash.c_str());
274
+ } catch (const std::exception& e) {
275
+ bitmaps.entries.clear();
276
+ throw std::runtime_error("Failed to decode base64 image");
277
+ }
278
+ } else if (image_path.compare(0, 7, "http://") == 0 || image_path.compare(0, 8, "https://") == 0) {
279
+ // HTTP URLs are not supported yet
280
+ bitmaps.entries.clear();
281
+ throw std::runtime_error("HTTP URLs are not supported yet");
282
+ } else {
283
+ // Check if file exists
284
+ FILE* file = fopen(image_path.c_str(), "rb");
285
+ if (file == nullptr) {
286
+ bitmaps.entries.clear();
287
+ throw std::runtime_error("Failed to open image file");
288
+ }
289
+
290
+ // Get file size
291
+ fseek(file, 0, SEEK_END);
292
+ long file_size = ftell(file);
293
+ fseek(file, 0, SEEK_SET);
294
+ fclose(file);
295
+
296
+ // Create bitmap directly
297
+ mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(image_path.c_str()));
298
+ if (!bmp.ptr) {
299
+ bitmaps.entries.clear();
300
+ throw std::runtime_error("Failed to create bitmap from image file");
301
+ }
302
+
303
+ // Calculate bitmap hash (for KV caching)
304
+ std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
305
+ bmp.set_id(hash.c_str());
306
+ bitmaps.entries.push_back(std::move(bmp));
307
+ result.bitmap_hashes.push_back(hash.c_str());
308
+ }
309
+ }
310
+
311
+ result.chunks = mtmd_input_chunks_init();
312
+ if (result.chunks == nullptr) {
313
+ bitmaps.entries.clear();
314
+ throw std::runtime_error("Failed to initialize input chunks");
315
+ }
316
+
317
+ // Create input text
318
+ mtmd_input_text input_text;
319
+ input_text.text = prompt.c_str(); // Use the full prompt with image marker
320
+ input_text.add_special = true; // Add BOS token if this is the first message
321
+ input_text.parse_special = true; // Parse special tokens like <__image__>
322
+
323
+ // Tokenize the text and images
324
+ fprintf(stdout, "[DEBUG] Tokenizing text and %zu images\n", bitmaps.entries.size());
325
+ auto bitmaps_c_ptr = bitmaps.c_ptr();
326
+
327
+ // Cast away const for mtmd_tokenize
328
+ int32_t res = mtmd_tokenize(
329
+ const_cast<mtmd_context*>(mtmd_ctx),
330
+ result.chunks,
331
+ &input_text,
332
+ bitmaps_c_ptr.data(),
333
+ bitmaps_c_ptr.size()
334
+ );
335
+
336
+ if (res != 0) {
337
+ mtmd_input_chunks_free(result.chunks);
338
+ bitmaps.entries.clear();
339
+ throw std::runtime_error("Failed to tokenize text and images");
340
+ }
341
+
342
+ // Log chunk information
343
+ size_t num_chunks = mtmd_input_chunks_size(result.chunks);
344
+ fprintf(stdout, "[DEBUG] Tokenization successful: num_chunks=%zu\n", num_chunks);
345
+
346
+ // Track the total number of tokens (both text and image)
347
+ size_t total_token_count = 0;
348
+
349
+ // chunk pos
350
+ for (size_t i = 0; i < num_chunks; i++) {
351
+ result.chunk_pos.push_back(total_token_count);
352
+
353
+ const mtmd_input_chunk* chunk = mtmd_input_chunks_get(result.chunks, i);
354
+ mtmd_input_chunk_type chunk_type = mtmd_input_chunk_get_type(chunk);
355
+
356
+ if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
357
+ size_t n_tokens;
358
+ const llama_token* tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
359
+
360
+ result.tokens.insert(result.tokens.end(), tokens, tokens + n_tokens);
361
+ total_token_count += n_tokens;
362
+ } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
363
+ result.chunk_pos_images.push_back(total_token_count);
364
+
365
+ const mtmd_image_tokens* img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
366
+ size_t n_tokens = mtmd_image_tokens_get_n_tokens(img_tokens);
367
+ size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
368
+
369
+ for (size_t j = 0; j < n_pos; j++) {
370
+ result.tokens.push_back(LLAMA_TOKEN_NULL);
371
+ }
372
+ total_token_count += n_pos;
373
+ }
374
+ }
375
+
376
+ bitmaps.entries.clear();
377
+
378
+ return result;
379
+ }
380
+
381
+ // Process images and add them to the tokenized input
382
+ static llama_pos process_image_prompt(
383
+ llama_context* ctx,
384
+ const mtmd_context* mtmd_ctx,
385
+ LlamaSessionPtr sess,
386
+ const common_params& params,
387
+ const std::vector<std::string>& image_paths
388
+ ) {
389
+ if (mtmd_ctx == nullptr) {
390
+ throw std::runtime_error("Multimodal context is not initialized");
391
+ }
392
+
393
+ // Multimodal path
394
+ std::string full_prompt = params.prompt;
395
+ // Add image marker if it doesn't already exist
396
+ if (full_prompt.find("<__image__>") == std::string::npos) {
397
+ full_prompt += " <__image__>";
398
+ }
399
+
400
+ auto result = tokenizeWithImages(mtmd_ctx, full_prompt, image_paths);
401
+
402
+ auto all_tokens = result.tokens;
403
+ auto chunks = result.chunks;
404
+ auto chunk_pos = result.chunk_pos;
405
+ auto chunk_pos_images = result.chunk_pos_images;
406
+ auto bitmap_hashes = result.bitmap_hashes;
407
+
408
+ llama_pos n_past = common_tokens_part(*sess->tokens_ptr(), all_tokens);
409
+
410
+ llama_pos new_n_past = n_past;
411
+
412
+ // Adjust n_past to position of the text chunk
413
+ // TODO: Edit the text chunk to remove the tokens before n_past to speed up
414
+ // need to update the mtmd api
415
+ auto adjusted_n_past = -1;
416
+ for (size_t i = 0; i < chunk_pos.size(); i++) {
417
+ if (n_past < chunk_pos[i]) {
418
+ break;
419
+ }
420
+ bool is_end = i + 1 == chunk_pos.size();
421
+ if (
422
+ chunk_pos[i] < n_past &&
423
+ (!is_end && chunk_pos[i + 1] > n_past)
424
+ // is_end & n_past < total_token_count:
425
+ // don't need to adjust and it will skip eval_chunk_single, let nextToken() to finish the job
426
+ ) {
427
+ adjusted_n_past = chunk_pos[i];
428
+ }
429
+ }
430
+ if (adjusted_n_past != -1) {
431
+ n_past = adjusted_n_past;
432
+ new_n_past = n_past;
433
+ fprintf(stdout, "[DEBUG] Adjusted n_past to %d\n", n_past);
434
+ }
435
+
436
+ // Compare bitmap hashes, if they are not the same, backtrack n_past to the position of the first mismatch
437
+ auto mtmd_bitmap_past_hashes = sess->mtmd_bitmap_past_hashes_ptr();
438
+ if (mtmd_bitmap_past_hashes->size() > 0) {
439
+ for (size_t i = 0; i < bitmap_hashes.size(); i++) {
440
+ auto pos = chunk_pos_images[i];
441
+ if (n_past < pos) {
442
+ break;
443
+ }
444
+ if (i >= mtmd_bitmap_past_hashes->size()) {
445
+ break;
446
+ }
447
+ if (bitmap_hashes[i] != (*mtmd_bitmap_past_hashes)[i]) {
448
+ n_past = chunk_pos_images[i];
449
+ new_n_past = n_past;
450
+ break;
451
+ }
452
+ }
453
+ }
454
+
455
+ // Clear all KV cache entries after position n_past
456
+ llama_kv_self_seq_rm(ctx, 0, n_past, -1);
457
+
458
+ size_t num_chunks = mtmd_input_chunks_size(chunks);
459
+
460
+ for (size_t i = 0; i < chunk_pos.size(); i++) {
461
+ fprintf(stdout, "[DEBUG] Evaluating chunk %zu: n_past=%d, chunk_pos=%zu\n", i, n_past, chunk_pos[i]);
462
+
463
+ // Process chunk only if it's after the current n_past
464
+ if (chunk_pos[i] >= new_n_past) {
465
+ bool chunk_logits_last = (i == num_chunks - 1);
466
+ auto chunk = mtmd_input_chunks_get(chunks, i);
467
+
468
+ // Cast away const for mtmd_helper_eval_chunk_single
469
+ int32_t res = mtmd_helper_eval_chunk_single(
470
+ const_cast<mtmd_context*>(mtmd_ctx),
471
+ ctx,
472
+ chunk,
473
+ n_past,
474
+ 0,
475
+ params.n_batch, // batch size
476
+ chunk_logits_last,
477
+ &new_n_past
478
+ );
479
+
480
+ if (res != 0) {
481
+ mtmd_input_chunks_free(chunks);
482
+ throw std::runtime_error("Failed to process chunk");
483
+ }
484
+ n_past = new_n_past;
485
+ }
486
+ }
487
+
488
+ if (n_past == all_tokens.size() && n_past > 0 && all_tokens[n_past - 1] != LLAMA_TOKEN_NULL) {
489
+ // we have to evaluate at least 1 token to generate logits.
490
+ n_past--;
491
+ }
492
+
493
+ // Update sampling context to process token sequences
494
+ for (auto & token : all_tokens) {
495
+ if (token == LLAMA_TOKEN_NULL) {
496
+ continue;
497
+ }
498
+ }
499
+ // Set the tokens
500
+ sess->set_tokens(std::move(all_tokens));
501
+
502
+ sess->set_mtmd_bitmap_past_hashes(bitmap_hashes);
503
+
504
+ // Clean up image resources
505
+ mtmd_input_chunks_free(chunks);
506
+ return n_past;
507
+ }