@fugood/llama.node 0.4.6 → 0.4.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +5 -1
- package/lib/index.js +3 -3
- package/lib/index.ts +11 -4
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +15 -374
- package/src/LlamaCompletionWorker.h +0 -2
- package/src/LlamaContext.cpp +8 -87
- package/src/TokenizeWorker.cpp +29 -4
- package/src/TokenizeWorker.h +2 -5
- package/src/common.hpp +385 -0
package/bin/darwin/arm64/llama-node.node
CHANGED
Binary file

package/bin/darwin/x64/llama-node.node
CHANGED
Binary file

package/bin/linux/arm64/llama-node.node
CHANGED
Binary file

package/bin/linux/x64/llama-node.node
CHANGED
Binary file

package/bin/linux-cuda/arm64/llama-node.node
CHANGED
Binary file

package/bin/linux-cuda/x64/llama-node.node
CHANGED
Binary file

package/bin/linux-vulkan/arm64/llama-node.node
CHANGED
Binary file

package/bin/linux-vulkan/x64/llama-node.node
CHANGED
Binary file

package/bin/win32/arm64/llama-node.node
CHANGED
Binary file

package/bin/win32/arm64/node.lib
CHANGED
Binary file

package/bin/win32/x64/llama-node.node
CHANGED
Binary file

package/bin/win32/x64/node.lib
CHANGED
Binary file

package/bin/win32-vulkan/arm64/llama-node.node
CHANGED
Binary file

package/bin/win32-vulkan/arm64/node.lib
CHANGED
Binary file

package/bin/win32-vulkan/x64/llama-node.node
CHANGED
Binary file

package/bin/win32-vulkan/x64/node.lib
CHANGED
Binary file
package/lib/binding.ts
CHANGED
@@ -137,6 +137,10 @@ export type LlamaCompletionToken = {
 
 export type TokenizeResult = {
   tokens: Int32Array
+  has_image: boolean
+  bitmap_hashes: string[]
+  chunk_pos: number[]
+  chunk_pos_images: number[]
 }
 
 export type EmbeddingResult = {
@@ -163,7 +167,7 @@ export interface LlamaContext {
     callback?: (token: LlamaCompletionToken) => void,
   ): Promise<LlamaCompletionResult>
   stopCompletion(): void
-  tokenize(text: string): Promise<TokenizeResult>
+  tokenize(text: string, image_paths?: string[]): Promise<TokenizeResult>
   detokenize(tokens: number[]): Promise<string>
   embedding(text: string): Promise<EmbeddingResult>
   saveSession(path: string): Promise<void>
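
The extended tokenize surface can be exercised directly against the binding types above. The sketch below is illustrative only: the import path and the helper name are assumptions, but the call signature and result fields follow the updated TokenizeResult and LlamaContext declarations.

import type { LlamaContext, TokenizeResult } from './binding'  // path assumed

// Hypothetical helper: tokenize a prompt, optionally with images, and inspect the new fields.
async function inspectTokens(ctx: LlamaContext, prompt: string, imagePaths?: string[]): Promise<TokenizeResult> {
  const result = await ctx.tokenize(prompt, imagePaths)
  if (result.has_image) {
    // bitmap_hashes identify each image bitmap; chunk_pos / chunk_pos_images give the
    // token offsets of every chunk and of the image chunks specifically.
    console.log(result.bitmap_hashes, result.chunk_pos, result.chunk_pos_images)
  }
  return result
}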
package/lib/index.js
CHANGED
@@ -112,7 +112,7 @@ class LlamaContextWrapper {
     getFormattedChat(messages, template, params) {
         const { messages: chat, has_image, image_paths, } = this._formatImageChat(messages);
         const useJinja = this.isJinjaSupported() && (params === null || params === void 0 ? void 0 : params.jinja);
-        let tmpl
+        let tmpl;
         if (template)
             tmpl = template; // Force replace if provided
         const jsonSchema = getJsonSchema(params === null || params === void 0 ? void 0 : params.response_format);
@@ -144,8 +144,8 @@ class LlamaContextWrapper {
     stopCompletion() {
         return this.ctx.stopCompletion();
     }
-    tokenize(text) {
-        return this.ctx.tokenize(text);
+    tokenize(text, { image_paths } = {}) {
+        return this.ctx.tokenize(text, image_paths);
     }
     detokenize(tokens) {
         return this.ctx.detokenize(tokens);
package/lib/index.ts
CHANGED
@@ -60,6 +60,13 @@ const getJsonSchema = (responseFormat?: CompletionResponseFormat) => {
   return null
 }
 
+export type FormattedChatResult = {
+  type: 'jinja' | 'llama-chat'
+  prompt: string
+  has_image: boolean
+  image_paths?: Array<string>
+}
+
 class LlamaContextWrapper {
   ctx: any
 
@@ -133,7 +140,7 @@ class LlamaContextWrapper {
      parallel_tool_calls?: object
      tool_choice?: string
    },
-  ):
+  ): FormattedChatResult {
    const {
      messages: chat,
      has_image,
@@ -141,7 +148,7 @@ class LlamaContextWrapper {
    } = this._formatImageChat(messages)
 
    const useJinja = this.isJinjaSupported() && params?.jinja
-    let tmpl
+    let tmpl
    if (template) tmpl = template // Force replace if provided
    const jsonSchema = getJsonSchema(params?.response_format)
 
@@ -185,8 +192,8 @@ class LlamaContextWrapper {
    return this.ctx.stopCompletion()
  }
 
-  tokenize(text: string): Promise<TokenizeResult> {
-    return this.ctx.tokenize(text)
+  tokenize(text: string, { image_paths }: { image_paths?: string[] } = {}): Promise<TokenizeResult> {
+    return this.ctx.tokenize(text, image_paths)
  }
 
  detokenize(tokens: number[]): Promise<string> {
package/package.json
CHANGED

-  "version": "0.4.6",
+  "version": "0.4.7",

package/src/LlamaCompletionWorker.cpp
CHANGED

@@ -1,367 +1,6 @@
 #include "LlamaCompletionWorker.h"
 #include "LlamaContext.h"
 
-// Computes FNV-1a hash of the data
-static std::string fnv_hash(const uint8_t * data, size_t len) {
-  const uint64_t fnv_prime = 0x100000001b3ULL;
-  uint64_t hash = 0xcbf29ce484222325ULL;
-
-  for (size_t i = 0; i < len; ++i) {
-    hash ^= data[i];
-    hash *= fnv_prime;
-  }
-  return std::to_string(hash);
-}
-
-static const std::string base64_chars =
-    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    "abcdefghijklmnopqrstuvwxyz"
-    "0123456789+/";
-
-// Base64 decoding function
-static std::vector<uint8_t> base64_decode(const std::string &encoded_string) {
-  std::vector<uint8_t> decoded;
-  int in_len = encoded_string.size();
-  int i = 0;
-  int j = 0;
-  int in_ = 0;
-  unsigned char char_array_4[4], char_array_3[3];
-
-  while (in_len-- && (encoded_string[in_] != '=')) {
-    if (isspace(encoded_string[in_])) {
-      in_++;
-      continue;
-    }
-
-    if (encoded_string[in_] == '=' || base64_chars.find(encoded_string[in_]) == std::string::npos) {
-      break;
-    }
-
-    char_array_4[i++] = encoded_string[in_]; in_++;
-    if (i == 4) {
-      for (i = 0; i < 4; i++) {
-        char_array_4[i] = base64_chars.find(char_array_4[i]);
-      }
-
-      char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
-      char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
-      char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
-
-      for (i = 0; i < 3; i++) {
-        decoded.push_back(char_array_3[i]);
-      }
-      i = 0;
-    }
-  }
-
-  if (i) {
-    for (j = i; j < 4; j++) {
-      char_array_4[j] = 0;
-    }
-
-    for (j = 0; j < 4; j++) {
-      char_array_4[j] = base64_chars.find(char_array_4[j]);
-    }
-
-    char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
-    char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
-    char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
-
-    for (j = 0; j < i - 1; j++) {
-      decoded.push_back(char_array_3[j]);
-    }
-  }
-
-  return decoded;
-}
-
-size_t common_part(const std::vector<llama_token> &a,
-                   const std::vector<llama_token> &b) {
-  size_t i = 0;
-  while (i < a.size() && i < b.size() && a[i] == b[i]) {
-    i++;
-  }
-  return i;
-}
-
-// Process images and add them to the tokenized input
-llama_pos processImage(
-    const mtmd_context* mtmd_ctx,
-    llama_context* ctx,
-    LlamaSessionPtr sess,
-    const std::vector<std::string>& image_paths,
-    const common_params& params,
-    std::vector<llama_token>& text_tokens
-) {
-  if (mtmd_ctx == nullptr) {
-    return false;
-  }
-
-  // Multimodal path
-  std::string full_prompt = params.prompt;
-  // Add image marker if it doesn't already exist
-  if (full_prompt.find("<__image__>") == std::string::npos) {
-    full_prompt += " <__image__>";
-  }
-
-  // Prepare bitmaps array for all images
-  mtmd::bitmaps bitmaps;
-
-  std::vector<std::string> bitmap_hashes;
-
-  // Load all images
-  for (const auto& image_path : image_paths) {
-    fprintf(stdout, "[DEBUG] Loading image: %s\n",
-            image_path.substr(0, 50).c_str()); // Only log part of path for base64
-
-    // Check if it's a base64 image
-    if (image_path.compare(0, 11, "data:image/") == 0) {
-
-      // Parse base64 data
-      std::vector<std::string> parts;
-      size_t comma_pos = image_path.find(',');
-      if (comma_pos == std::string::npos) {
-        bitmaps.entries.clear();
-        return false;
-      }
-
-      std::string header = image_path.substr(0, comma_pos);
-      std::string base64_data = image_path.substr(comma_pos + 1);
-
-      if (header.find("base64") == std::string::npos) {
-        bitmaps.entries.clear();
-        return false;
-      }
-
-      // Decode base64
-      try {
-        // Decode base64 to binary
-        std::vector<uint8_t> image_data = base64_decode(base64_data);
-
-        // Load bitmap from memory buffer using direct initialization
-        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(image_data.data(), image_data.size()));
-        if (!bmp.ptr) {
-          bitmaps.entries.clear();
-          return false;
-        }
-
-        // Calculate bitmap hash (for KV caching)
-        std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
-        bmp.set_id(hash.c_str());
-        bitmaps.entries.push_back(std::move(bmp));
-        bitmap_hashes.push_back(hash.c_str());
-      } catch (const std::exception& e) {
-        bitmaps.entries.clear();
-        return false;
-      }
-    } else if (image_path.compare(0, 7, "http://") == 0 || image_path.compare(0, 8, "https://") == 0) {
-      // HTTP URLs are not supported yet
-      bitmaps.entries.clear();
-      return false;
-    } else {
-      // Check if file exists
-      FILE* file = fopen(image_path.c_str(), "rb");
-      if (file == nullptr) {
-        bitmaps.entries.clear();
-        return false;
-      }
-
-      // Get file size
-      fseek(file, 0, SEEK_END);
-      long file_size = ftell(file);
-      fseek(file, 0, SEEK_SET);
-      fclose(file);
-
-      // Create bitmap directly
-      mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(image_path.c_str()));
-      if (!bmp.ptr) {
-        bitmaps.entries.clear();
-        return false;
-      }
-
-      // Calculate bitmap hash (for KV caching)
-      std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
-      bmp.set_id(hash.c_str());
-      bitmaps.entries.push_back(std::move(bmp));
-      bitmap_hashes.push_back(hash.c_str());
-    }
-  }
-
-  mtmd_input_chunks* chunks = mtmd_input_chunks_init();
-  if (chunks == nullptr) {
-    bitmaps.entries.clear();
-    return false;
-  }
-
-  // Create input text
-  mtmd_input_text input_text;
-  input_text.text = full_prompt.c_str(); // Use the full prompt with image marker
-  input_text.add_special = true;   // Add BOS token if this is the first message
-  input_text.parse_special = true; // Parse special tokens like <__image__>
-
-  // Tokenize the text and images
-  fprintf(stdout, "[DEBUG] Tokenizing text and %zu images\n", bitmaps.entries.size());
-  auto bitmaps_c_ptr = bitmaps.c_ptr();
-
-  // Cast away const for mtmd_tokenize
-  int32_t res = mtmd_tokenize(
-      const_cast<mtmd_context*>(mtmd_ctx),
-      chunks,
-      &input_text,
-      bitmaps_c_ptr.data(),
-      bitmaps_c_ptr.size()
-  );
-
-  if (res != 0) {
-    mtmd_input_chunks_free(chunks);
-    bitmaps.entries.clear();
-    return false;
-  }
-
-  // Log chunk information
-  size_t num_chunks = mtmd_input_chunks_size(chunks);
-  fprintf(stdout, "[DEBUG] Tokenization successful: num_chunks=%zu\n", num_chunks);
-
-  // Clear text_tokens before adding new tokens
-  text_tokens.clear();
-
-  // Create a vector to store all tokens (both text and image)
-  std::vector<llama_token> all_tokens;
-
-  // Track the total number of tokens (both text and image)
-  size_t total_token_count = 0;
-
-  // chunk pos
-  std::vector<size_t> chunk_pos;
-  std::vector<size_t> chunk_pos_images;
-  for (size_t i = 0; i < num_chunks; i++) {
-    chunk_pos.push_back(total_token_count);
-
-    const mtmd_input_chunk* chunk = mtmd_input_chunks_get(chunks, i);
-    mtmd_input_chunk_type chunk_type = mtmd_input_chunk_get_type(chunk);
-
-    if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
-      size_t n_tokens;
-      const llama_token* tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
-
-      // Add text tokens
-      text_tokens.insert(text_tokens.end(), tokens, tokens + n_tokens);
-      all_tokens.insert(all_tokens.end(), tokens, tokens + n_tokens);
-      total_token_count += n_tokens;
-    } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
-      chunk_pos_images.push_back(total_token_count);
-
-      const mtmd_image_tokens* img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
-      size_t n_tokens = mtmd_image_tokens_get_n_tokens(img_tokens);
-      size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
-
-      for (size_t j = 0; j < n_pos; j++) {
-        all_tokens.push_back(LLAMA_TOKEN_NULL);
-      }
-      total_token_count += n_pos;
-    }
-  }
-
-  llama_pos n_past = common_part(*sess->tokens_ptr(), all_tokens);
-
-  llama_pos new_n_past = n_past;
-
-  // Adjust n_past to position of the text chunk
-  // TODO: Edit the text chunk to remove the tokens before n_past to speed up
-  // need to update the mtmd api
-  auto adjusted_n_past = -1;
-  for (size_t i = 0; i < chunk_pos.size(); i++) {
-    if (n_past < chunk_pos[i]) {
-      break;
-    }
-    bool is_end = i + 1 == chunk_pos.size();
-    if (
-      chunk_pos[i] < n_past &&
-      (!is_end && chunk_pos[i + 1] > n_past)
-      // is_end & n_past < total_token_count:
-      // don't need to adjust and it will skip eval_chunk_single, let nextToken() to finish the job
-    ) {
-      adjusted_n_past = chunk_pos[i];
-    }
-  }
-  if (adjusted_n_past != -1) {
-    n_past = adjusted_n_past;
-    new_n_past = n_past;
-    fprintf(stdout, "[DEBUG] Adjusted n_past to %d\n", n_past);
-  }
-
-  // Compare bitmap hashes, if they are not the same, backtrack n_past to the position of the first mismatch
-  auto mtmd_bitmap_past_hashes = sess->mtmd_bitmap_past_hashes_ptr();
-  if (mtmd_bitmap_past_hashes->size() > 0) {
-    for (size_t i = 0; i < bitmap_hashes.size(); i++) {
-      auto pos = chunk_pos_images[i];
-      if (n_past < pos) {
-        break;
-      }
-      if (i >= mtmd_bitmap_past_hashes->size()) {
-        break;
-      }
-      if (bitmap_hashes[i] != (*mtmd_bitmap_past_hashes)[i]) {
-        n_past = chunk_pos_images[i];
-        new_n_past = n_past;
-        break;
-      }
-    }
-  }
-
-  // Clear all KV cache entries after position n_past
-  llama_kv_self_seq_rm(ctx, 0, n_past, -1);
-
-  for (size_t i = 0; i < chunk_pos.size(); i++) {
-    fprintf(stdout, "[DEBUG] Evaluating chunk %zu: n_past=%d, chunk_pos=%zu\n", i, n_past, chunk_pos[i]);
-
-    // Process chunk only if it's after the current n_past
-    if (chunk_pos[i] >= new_n_past) {
-      bool chunk_logits_last = (i == num_chunks - 1);
-      auto chunk = mtmd_input_chunks_get(chunks, i);
-
-      // Cast away const for mtmd_helper_eval_chunk_single
-      int32_t res = mtmd_helper_eval_chunk_single(
-          const_cast<mtmd_context*>(mtmd_ctx),
-          ctx,
-          chunk,
-          n_past,
-          0,
-          params.n_batch, // batch size
-          chunk_logits_last,
-          &new_n_past
-      );
-
-      if (res != 0) {
-        mtmd_input_chunks_free(chunks);
-        bitmaps.entries.clear();
-        return false;
-      }
-      n_past = new_n_past;
-    }
-  }
-
-  if (n_past == total_token_count && n_past > 0 && all_tokens[n_past - 1] != LLAMA_TOKEN_NULL) {
-    // we have to evaluate at least 1 token to generate logits.
-    n_past--;
-  }
-
-  // Update sampling context to process token sequences
-  for (auto & token : all_tokens) {
-    if (token == LLAMA_TOKEN_NULL) {
-      continue;
-    }
-  }
-  // Set the tokens
-  sess->set_tokens(std::move(all_tokens));
-
-  sess->set_mtmd_bitmap_past_hashes(bitmap_hashes);
-
-  // Clean up image resources
-  mtmd_input_chunks_free(chunks);
-  bitmaps.entries.clear();
-  return n_past;
-}
 
 size_t findStoppingStrings(const std::string &text,
                            const size_t last_token_size,
@@ -425,22 +64,25 @@ void LlamaCompletionWorker::Execute() {
   LlamaCppSampling sampling{common_sampler_init(model, _params.sampling),
                             common_sampler_free};
 
-  std::vector<llama_token> prompt_tokens;
-
   // Process images if any are provided
   if (!_image_paths.empty()) {
    const auto* mtmd_ctx = _sess->get_mtmd_ctx();
 
    if (mtmd_ctx != nullptr) {
      // Process the images and get the tokens
-
-
-
-
-
-
-
-
+      try {
+        n_cur = process_image_prompt(
+            ctx,
+            mtmd_ctx,
+            _sess,
+            _params,
+            _image_paths
+        );
+      } catch (const std::exception& e) {
+        SetError(e.what());
+        _sess->get_mutex().unlock();
+        return;
+      }
 
      if (n_cur <= 0) {
        SetError("Failed to process images");
@@ -456,7 +98,6 @@ void LlamaCompletionWorker::Execute() {
        --n_cur;
      }
      n_input -= n_cur;
-      llama_kv_self_seq_rm(ctx, 0, n_cur, -1);
    } else {
      SetError("Multimodal context not initialized");
      _sess->get_mutex().unlock();
@@ -464,11 +105,11 @@ void LlamaCompletionWorker::Execute() {
    }
  } else {
    // Text-only path
-    prompt_tokens = ::common_tokenize(ctx, _params.prompt, add_bos);
+    std::vector<llama_token> prompt_tokens = ::common_tokenize(ctx, _params.prompt, add_bos);
    n_input = prompt_tokens.size();
 
    if (_sess->tokens_ptr()->size() > 0) {
-      n_cur =
+      n_cur = common_tokens_part(*(_sess->tokens_ptr()), prompt_tokens);
      if (n_cur == n_input) {
        --n_cur;
      }
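
One practical consequence of routing image handling through process_image_prompt is that image failures now throw and are reported via SetError, so on the JavaScript side they should surface as a rejected completion promise rather than a silent bad result. A hedged sketch; the completion option name for images at the JS layer is an assumption:

async function completeWithImages(ctx: any, prompt: string, imagePaths: string[]) {
  try {
    return await ctx.completion({ prompt, image_paths: imagePaths }, () => {})  // option name assumed
  } catch (err) {
    // Expected messages include "Multimodal context not initialized" and "Failed to process images",
    // plus the specific errors thrown while loading images (e.g. "Failed to open image file").
    console.error('completion failed:', err)
    throw err
  }
}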
package/src/LlamaContext.cpp
CHANGED
@@ -27,80 +27,6 @@ static std::string format_string(const std::string& format, Args ... args) {
   return std::string(buf.get(), buf.get() + size - 1); // -1 to exclude null terminator
 }
 
-// Computes FNV-1a hash of the data
-static std::string fnv_hash(const uint8_t* data, size_t len) {
-  const uint64_t fnv_prime = 0x100000001b3ULL;
-  uint64_t hash = 0xcbf29ce484222325ULL;
-
-  for (size_t i = 0; i < len; ++i) {
-    hash ^= data[i];
-    hash *= fnv_prime;
-  }
-  return std::to_string(hash);
-}
-
-static const std::string base64_chars =
-    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    "abcdefghijklmnopqrstuvwxyz"
-    "0123456789+/";
-
-// Base64 decoding function
-static std::vector<uint8_t> base64_decode(const std::string &encoded_string) {
-  std::vector<uint8_t> decoded;
-  int in_len = encoded_string.size();
-  int i = 0;
-  int j = 0;
-  int in_ = 0;
-  unsigned char char_array_4[4], char_array_3[3];
-
-  while (in_len-- && (encoded_string[in_] != '=')) {
-    if (isspace(encoded_string[in_])) {
-      in_++;
-      continue;
-    }
-
-    if (encoded_string[in_] == '=' || base64_chars.find(encoded_string[in_]) == std::string::npos) {
-      break;
-    }
-
-    char_array_4[i++] = encoded_string[in_]; in_++;
-    if (i == 4) {
-      for (i = 0; i < 4; i++) {
-        char_array_4[i] = base64_chars.find(char_array_4[i]);
-      }
-
-      char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
-      char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
-      char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
-
-      for (i = 0; i < 3; i++) {
-        decoded.push_back(char_array_3[i]);
-      }
-      i = 0;
-    }
-  }
-
-  if (i) {
-    for (j = i; j < 4; j++) {
-      char_array_4[j] = 0;
-    }
-
-    for (j = 0; j < 4; j++) {
-      char_array_4[j] = base64_chars.find(char_array_4[j]);
-    }
-
-    char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
-    char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
-    char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
-
-    for (j = 0; j < i - 1; j++) {
-      decoded.push_back(char_array_3[j]);
-    }
-  }
-
-  return decoded;
-}
-
 using json = nlohmann::ordered_json;
 
 // loadModelInfo(path: string): object
@@ -153,18 +79,6 @@ Napi::Value LlamaContext::ModelInfo(const Napi::CallbackInfo& info) {
   return metadata;
 }
 
-std::vector<common_chat_msg> get_messages(Napi::Array messages) {
-  std::vector<common_chat_msg> chat;
-  for (size_t i = 0; i < messages.Length(); i++) {
-    auto message = messages.Get(i).As<Napi::Object>();
-    chat.push_back({
-        get_option<std::string>(message, "role", ""),
-        get_option<std::string>(message, "content", ""),
-    });
-  }
-  return std::move(chat);
-}
-
 void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
   Napi::Function func = DefineClass(
       env, "LlamaContext",
@@ -919,7 +833,14 @@ Napi::Value LlamaContext::Tokenize(const Napi::CallbackInfo &info) {
        .ThrowAsJavaScriptException();
  }
  auto text = info[0].ToString().Utf8Value();
-
+  std::vector<std::string> image_paths;
+  if (info.Length() >= 2 && info[1].IsArray()) {
+    auto image_paths_array = info[1].As<Napi::Array>();
+    for (size_t i = 0; i < image_paths_array.Length(); i++) {
+      image_paths.push_back(image_paths_array.Get(i).ToString().Utf8Value());
+    }
+  }
+  auto *worker = new TokenizeWorker(info, _sess, text, image_paths);
  worker->Queue();
  return worker->Promise();
 }
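
The native Tokenize entry point now reads an optional second argument as an array of image references. Based on the image handling shown elsewhere in this diff, each entry can be a local file path or a data: URI carrying a base64 payload, while http(s) URLs are rejected. A small sketch for building such a data URI from a local file (Node's fs API, not part of this package):

import { readFileSync } from 'node:fs'

// Convert a local image into the data:image/...;base64 form accepted by the native layer.
function toDataUri(filePath: string, mime = 'image/png'): string {
  const base64 = readFileSync(filePath).toString('base64')
  return `data:${mime};base64,${base64}`
}

// e.g. ctx.tokenize(text, ['/tmp/cat.png']) or ctx.tokenize(text, [toDataUri('/tmp/cat.png')])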
package/src/TokenizeWorker.cpp
CHANGED
@@ -2,12 +2,18 @@
 #include "LlamaContext.h"
 
 TokenizeWorker::TokenizeWorker(const Napi::CallbackInfo &info,
-                               LlamaSessionPtr &sess, std::string text)
-    : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text) {}
+                               LlamaSessionPtr &sess, std::string text, std::vector<std::string> image_paths)
+    : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text), _image_paths(image_paths) {}
 
 void TokenizeWorker::Execute() {
-
-
+  auto mtmd_ctx = _sess->get_mtmd_ctx();
+  if (!_image_paths.empty()) {
+    _result = tokenizeWithImages(mtmd_ctx, _text, _image_paths);
+  } else {
+    const auto tokens = common_tokenize(_sess->context(), _text, false);
+    _result.tokens = tokens;
+    _result.has_image = false;
+  }
 }
 
 void TokenizeWorker::OnOK() {
@@ -18,6 +24,25 @@ void TokenizeWorker::OnOK() {
   memcpy(tokens.Data(), _result.tokens.data(),
          _result.tokens.size() * sizeof(llama_token));
   result.Set("tokens", tokens);
+  if (_result.has_image) {
+    result.Set("has_image", _result.has_image);
+
+    auto bitmap_hashes = Napi::Array::New(Napi::AsyncWorker::Env(), _result.bitmap_hashes.size());
+    for (size_t i = 0; i < _result.bitmap_hashes.size(); i++) {
+      bitmap_hashes.Set(i, _result.bitmap_hashes[i]);
+    }
+    result.Set("bitmap_hashes", bitmap_hashes);
+    auto chunk_pos = Napi::Array::New(Napi::AsyncWorker::Env(), _result.chunk_pos.size());
+    for (size_t i = 0; i < _result.chunk_pos.size(); i++) {
+      chunk_pos.Set(i, _result.chunk_pos[i]);
+    }
+    result.Set("chunk_pos", chunk_pos);
+    auto chunk_pos_images = Napi::Array::New(Napi::AsyncWorker::Env(), _result.chunk_pos_images.size());
+    for (size_t i = 0; i < _result.chunk_pos_images.size(); i++) {
+      chunk_pos_images.Set(i, _result.chunk_pos_images[i]);
+    }
+    result.Set("chunk_pos_images", chunk_pos_images);
+  }
   Napi::Promise::Deferred::Resolve(result);
 }
 
package/src/TokenizeWorker.h
CHANGED
@@ -1,15 +1,11 @@
 #include "common.hpp"
 #include <vector>
 
-struct TokenizeResult {
-  std::vector<llama_token> tokens;
-};
-
 class TokenizeWorker : public Napi::AsyncWorker,
                        public Napi::Promise::Deferred {
 public:
   TokenizeWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
-                 std::string text);
+                 std::string text, std::vector<std::string> image_paths);
 
 protected:
   void Execute();
@@ -19,5 +15,6 @@ protected:
 private:
   LlamaSessionPtr _sess;
   std::string _text;
+  std::vector<std::string> _image_paths;
   TokenizeResult _result;
 };
package/src/common.hpp
CHANGED
@@ -2,6 +2,8 @@
 
 #include "common/common.h"
 #include "common/sampling.h"
+#include "tools/mtmd/mtmd.h"
+#include "tools/mtmd/clip.h"
 #include "chat.h"
 #include "llama.h"
 #include "tools/mtmd/mtmd.h"
@@ -120,3 +122,386 @@ private:
 };
 
 typedef std::shared_ptr<LlamaSession> LlamaSessionPtr;
+
+static size_t common_tokens_part(const std::vector<llama_token> &a,
+                                 const std::vector<llama_token> &b) {
+  size_t i = 0;
+  while (i < a.size() && i < b.size() && a[i] == b[i]) {
+    i++;
+  }
+  return i;
+}
+
+// Computes FNV-1a hash of the data
+static std::string fnv_hash(const uint8_t * data, size_t len) {
+  const uint64_t fnv_prime = 0x100000001b3ULL;
+  uint64_t hash = 0xcbf29ce484222325ULL;
+
+  for (size_t i = 0; i < len; ++i) {
+    hash ^= data[i];
+    hash *= fnv_prime;
+  }
+  return std::to_string(hash);
+}
+
+static const std::string base64_chars =
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+    "abcdefghijklmnopqrstuvwxyz"
+    "0123456789+/";
+
+// Base64 decoding function
+static std::vector<uint8_t> base64_decode(const std::string &encoded_string) {
+  std::vector<uint8_t> decoded;
+  int in_len = encoded_string.size();
+  int i = 0;
+  int j = 0;
+  int in_ = 0;
+  unsigned char char_array_4[4], char_array_3[3];
+
+  while (in_len-- && (encoded_string[in_] != '=')) {
+    if (isspace(encoded_string[in_])) {
+      in_++;
+      continue;
+    }
+
+    if (encoded_string[in_] == '=' || base64_chars.find(encoded_string[in_]) == std::string::npos) {
+      break;
+    }
+
+    char_array_4[i++] = encoded_string[in_]; in_++;
+    if (i == 4) {
+      for (i = 0; i < 4; i++) {
+        char_array_4[i] = base64_chars.find(char_array_4[i]);
+      }
+
+      char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
+      char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+      char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
+
+      for (i = 0; i < 3; i++) {
+        decoded.push_back(char_array_3[i]);
+      }
+      i = 0;
+    }
+  }
+
+  if (i) {
+    for (j = i; j < 4; j++) {
+      char_array_4[j] = 0;
+    }
+
+    for (j = 0; j < 4; j++) {
+      char_array_4[j] = base64_chars.find(char_array_4[j]);
+    }
+
+    char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
+    char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+    char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
+
+    for (j = 0; j < i - 1; j++) {
+      decoded.push_back(char_array_3[j]);
+    }
+  }
+
+  return decoded;
+}
+
+struct TokenizeResult {
+  std::vector<llama_token> tokens;
+
+  bool has_image = false;
+  std::vector<std::string> bitmap_hashes;
+  std::vector<size_t> chunk_pos; // both text and image
+  std::vector<size_t> chunk_pos_images; // image only
+  mtmd_input_chunks* chunks = nullptr;
+};
+
+static TokenizeResult tokenizeWithImages(
+    const mtmd_context* mtmd_ctx,
+    const std::string &prompt,
+    const std::vector<std::string> &image_paths
+) {
+  if (mtmd_ctx == nullptr) {
+    throw std::runtime_error("Multimodal context is not initialized");
+  }
+
+  TokenizeResult result;
+  result.has_image = !image_paths.empty();
+
+  mtmd::bitmaps bitmaps;
+
+  // Load all images
+  for (const auto& image_path : image_paths) {
+    fprintf(stdout, "[DEBUG] Loading image: %s\n",
+            image_path.substr(0, 50).c_str()); // Only log part of path for base64
+
+    // Check if it's a base64 image
+    if (image_path.compare(0, 11, "data:image/") == 0) {
+
+      // Parse base64 data
+      std::vector<std::string> parts;
+      size_t comma_pos = image_path.find(',');
+      if (comma_pos == std::string::npos) {
+        result.bitmap_hashes.clear();
+        throw std::runtime_error("Invalid base64 image");
+      }
+
+      std::string header = image_path.substr(0, comma_pos);
+      std::string base64_data = image_path.substr(comma_pos + 1);
+
+      if (header.find("base64") == std::string::npos) {
+        result.bitmap_hashes.clear();
+        throw std::runtime_error("Invalid base64 image");
+      }
+
+      // Decode base64
+      try {
+        // Decode base64 to binary
+        std::vector<uint8_t> image_data = base64_decode(base64_data);
+
+        // Load bitmap from memory buffer using direct initialization
+        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(image_data.data(), image_data.size()));
+        if (!bmp.ptr) {
+          bitmaps.entries.clear();
+          throw std::runtime_error("Failed to decode base64 image");
+        }
+
+        // Calculate bitmap hash (for KV caching)
+        std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
+        bmp.set_id(hash.c_str());
+        bitmaps.entries.push_back(std::move(bmp));
+        result.bitmap_hashes.push_back(hash.c_str());
+      } catch (const std::exception& e) {
+        bitmaps.entries.clear();
+        throw std::runtime_error("Failed to decode base64 image");
+      }
+    } else if (image_path.compare(0, 7, "http://") == 0 || image_path.compare(0, 8, "https://") == 0) {
+      // HTTP URLs are not supported yet
+      bitmaps.entries.clear();
+      throw std::runtime_error("HTTP URLs are not supported yet");
+    } else {
+      // Check if file exists
+      FILE* file = fopen(image_path.c_str(), "rb");
+      if (file == nullptr) {
+        bitmaps.entries.clear();
+        throw std::runtime_error("Failed to open image file");
+      }
+
+      // Get file size
+      fseek(file, 0, SEEK_END);
+      long file_size = ftell(file);
+      fseek(file, 0, SEEK_SET);
+      fclose(file);
+
+      // Create bitmap directly
+      mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(image_path.c_str()));
+      if (!bmp.ptr) {
+        bitmaps.entries.clear();
+        throw std::runtime_error("Failed to create bitmap from image file");
+      }
+
+      // Calculate bitmap hash (for KV caching)
+      std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
+      bmp.set_id(hash.c_str());
+      bitmaps.entries.push_back(std::move(bmp));
+      result.bitmap_hashes.push_back(hash.c_str());
+    }
+  }
+
+  result.chunks = mtmd_input_chunks_init();
+  if (result.chunks == nullptr) {
+    bitmaps.entries.clear();
+    throw std::runtime_error("Failed to initialize input chunks");
+  }
+
+  // Create input text
+  mtmd_input_text input_text;
+  input_text.text = prompt.c_str(); // Use the full prompt with image marker
+  input_text.add_special = true;   // Add BOS token if this is the first message
+  input_text.parse_special = true; // Parse special tokens like <__image__>
+
+  // Tokenize the text and images
+  fprintf(stdout, "[DEBUG] Tokenizing text and %zu images\n", bitmaps.entries.size());
+  auto bitmaps_c_ptr = bitmaps.c_ptr();
+
+  // Cast away const for mtmd_tokenize
+  int32_t res = mtmd_tokenize(
+      const_cast<mtmd_context*>(mtmd_ctx),
+      result.chunks,
+      &input_text,
+      bitmaps_c_ptr.data(),
+      bitmaps_c_ptr.size()
+  );
+
+  if (res != 0) {
+    mtmd_input_chunks_free(result.chunks);
+    bitmaps.entries.clear();
+    throw std::runtime_error("Failed to tokenize text and images");
+  }
+
+  // Log chunk information
+  size_t num_chunks = mtmd_input_chunks_size(result.chunks);
+  fprintf(stdout, "[DEBUG] Tokenization successful: num_chunks=%zu\n", num_chunks);
+
+  // Track the total number of tokens (both text and image)
+  size_t total_token_count = 0;
+
+  // chunk pos
+  for (size_t i = 0; i < num_chunks; i++) {
+    result.chunk_pos.push_back(total_token_count);
+
+    const mtmd_input_chunk* chunk = mtmd_input_chunks_get(result.chunks, i);
+    mtmd_input_chunk_type chunk_type = mtmd_input_chunk_get_type(chunk);
+
+    if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+      size_t n_tokens;
+      const llama_token* tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
+
+      result.tokens.insert(result.tokens.end(), tokens, tokens + n_tokens);
+      total_token_count += n_tokens;
+    } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+      result.chunk_pos_images.push_back(total_token_count);
+
+      const mtmd_image_tokens* img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
+      size_t n_tokens = mtmd_image_tokens_get_n_tokens(img_tokens);
+      size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
+
+      for (size_t j = 0; j < n_pos; j++) {
+        result.tokens.push_back(LLAMA_TOKEN_NULL);
+      }
+      total_token_count += n_pos;
+    }
+  }
+
+  bitmaps.entries.clear();
+
+  return result;
+}
+
+// Process images and add them to the tokenized input
+static llama_pos process_image_prompt(
+    llama_context* ctx,
+    const mtmd_context* mtmd_ctx,
+    LlamaSessionPtr sess,
+    const common_params& params,
+    const std::vector<std::string>& image_paths
+) {
+  if (mtmd_ctx == nullptr) {
+    throw std::runtime_error("Multimodal context is not initialized");
+  }
+
+  // Multimodal path
+  std::string full_prompt = params.prompt;
+  // Add image marker if it doesn't already exist
+  if (full_prompt.find("<__image__>") == std::string::npos) {
+    full_prompt += " <__image__>";
+  }
+
+  auto result = tokenizeWithImages(mtmd_ctx, full_prompt, image_paths);
+
+  auto all_tokens = result.tokens;
+  auto chunks = result.chunks;
+  auto chunk_pos = result.chunk_pos;
+  auto chunk_pos_images = result.chunk_pos_images;
+  auto bitmap_hashes = result.bitmap_hashes;
+
+  llama_pos n_past = common_tokens_part(*sess->tokens_ptr(), all_tokens);
+
+  llama_pos new_n_past = n_past;
+
+  // Adjust n_past to position of the text chunk
+  // TODO: Edit the text chunk to remove the tokens before n_past to speed up
+  // need to update the mtmd api
+  auto adjusted_n_past = -1;
+  for (size_t i = 0; i < chunk_pos.size(); i++) {
+    if (n_past < chunk_pos[i]) {
+      break;
+    }
+    bool is_end = i + 1 == chunk_pos.size();
+    if (
+      chunk_pos[i] < n_past &&
+      (!is_end && chunk_pos[i + 1] > n_past)
+      // is_end & n_past < total_token_count:
+      // don't need to adjust and it will skip eval_chunk_single, let nextToken() to finish the job
+    ) {
+      adjusted_n_past = chunk_pos[i];
+    }
+  }
+  if (adjusted_n_past != -1) {
+    n_past = adjusted_n_past;
+    new_n_past = n_past;
+    fprintf(stdout, "[DEBUG] Adjusted n_past to %d\n", n_past);
+  }
+
+  // Compare bitmap hashes, if they are not the same, backtrack n_past to the position of the first mismatch
+  auto mtmd_bitmap_past_hashes = sess->mtmd_bitmap_past_hashes_ptr();
+  if (mtmd_bitmap_past_hashes->size() > 0) {
+    for (size_t i = 0; i < bitmap_hashes.size(); i++) {
+      auto pos = chunk_pos_images[i];
+      if (n_past < pos) {
+        break;
+      }
+      if (i >= mtmd_bitmap_past_hashes->size()) {
+        break;
+      }
+      if (bitmap_hashes[i] != (*mtmd_bitmap_past_hashes)[i]) {
+        n_past = chunk_pos_images[i];
+        new_n_past = n_past;
+        break;
+      }
+    }
+  }
+
+  // Clear all KV cache entries after position n_past
+  llama_kv_self_seq_rm(ctx, 0, n_past, -1);
+
+  size_t num_chunks = mtmd_input_chunks_size(chunks);
+
+  for (size_t i = 0; i < chunk_pos.size(); i++) {
+    fprintf(stdout, "[DEBUG] Evaluating chunk %zu: n_past=%d, chunk_pos=%zu\n", i, n_past, chunk_pos[i]);
+
+    // Process chunk only if it's after the current n_past
+    if (chunk_pos[i] >= new_n_past) {
+      bool chunk_logits_last = (i == num_chunks - 1);
+      auto chunk = mtmd_input_chunks_get(chunks, i);
+
+      // Cast away const for mtmd_helper_eval_chunk_single
+      int32_t res = mtmd_helper_eval_chunk_single(
+          const_cast<mtmd_context*>(mtmd_ctx),
+          ctx,
+          chunk,
+          n_past,
+          0,
+          params.n_batch, // batch size
+          chunk_logits_last,
+          &new_n_past
+      );
+
+      if (res != 0) {
+        mtmd_input_chunks_free(chunks);
+        throw std::runtime_error("Failed to process chunk");
+      }
+      n_past = new_n_past;
+    }
+  }
+
+  if (n_past == all_tokens.size() && n_past > 0 && all_tokens[n_past - 1] != LLAMA_TOKEN_NULL) {
+    // we have to evaluate at least 1 token to generate logits.
+    n_past--;
+  }
+
+  // Update sampling context to process token sequences
+  for (auto & token : all_tokens) {
+    if (token == LLAMA_TOKEN_NULL) {
+      continue;
+    }
+  }
+  // Set the tokens
+  sess->set_tokens(std::move(all_tokens));
+
+  sess->set_mtmd_bitmap_past_hashes(bitmap_hashes);
+
+  // Clean up image resources
+  mtmd_input_chunks_free(chunks);
+  return n_past;
+}
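
For reference, the bitmap_hashes values produced above come from a 64-bit FNV-1a hash over the decoded RGB bytes of each image, which the session later compares to decide how much of the KV cache can be reused. A TypeScript rendering of the same hash, shown only as an illustration (it is not exported by the package):

// 64-bit FNV-1a over raw bytes, mirroring the fnv_hash helper in common.hpp.
function fnvHash(data: Uint8Array): string {
  const FNV_PRIME = 0x100000001b3n
  let hash = 0xcbf29ce484222325n
  for (const byte of data) {
    hash ^= BigInt(byte)
    hash = (hash * FNV_PRIME) & 0xffffffffffffffffn  // emulate uint64_t wraparound
  }
  return hash.toString()  // decimal string, like std::to_string(hash)
}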