@fugood/llama.node 0.4.7 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +4 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/lib/binding.ts +66 -6
- package/lib/index.js +59 -17
- package/lib/index.ts +74 -23
- package/package.json +1 -1
- package/src/DecodeAudioTokenWorker.cpp +40 -0
- package/src/DecodeAudioTokenWorker.h +22 -0
- package/src/EmbeddingWorker.cpp +7 -5
- package/src/LlamaCompletionWorker.cpp +68 -54
- package/src/LlamaCompletionWorker.h +7 -8
- package/src/LlamaContext.cpp +551 -235
- package/src/LlamaContext.h +26 -4
- package/src/LoadSessionWorker.cpp +4 -2
- package/src/SaveSessionWorker.cpp +10 -6
- package/src/TokenizeWorker.cpp +23 -14
- package/src/TokenizeWorker.h +2 -2
- package/src/addons.cc +8 -11
- package/src/common.hpp +129 -126
- package/src/llama.cpp/.github/workflows/build.yml +2 -2
- package/src/llama.cpp/.github/workflows/release.yml +152 -129
- package/src/llama.cpp/.github/workflows/winget.yml +42 -0
- package/src/llama.cpp/common/arg.cpp +14 -13
- package/src/llama.cpp/common/common.cpp +4 -75
- package/src/llama.cpp/common/common.h +7 -12
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
- package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
- package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
- package/src/llama.cpp/examples/simple/simple.cpp +1 -1
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
- package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
- package/src/llama.cpp/ggml/include/ggml.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
- package/src/llama.cpp/ggml/src/ggml.c +64 -18
- package/src/llama.cpp/include/llama.h +24 -124
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/src/llama-batch.cpp +3 -1
- package/src/llama.cpp/src/llama-context.cpp +60 -110
- package/src/llama.cpp/src/llama-graph.cpp +137 -233
- package/src/llama.cpp/src/llama-graph.h +49 -7
- package/src/llama.cpp/src/llama-hparams.cpp +17 -1
- package/src/llama.cpp/src/llama-hparams.h +34 -5
- package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
- package/src/llama.cpp/src/llama-kv-cache.h +201 -85
- package/src/llama.cpp/src/llama-memory.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +273 -94
- package/src/llama.cpp/src/llama-model.h +4 -1
- package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
- package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
- package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
- package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
- package/src/llama.cpp/tools/mtmd/clip.h +6 -4
- package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
- package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
- package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
- package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
- package/src/llama.cpp/tools/run/run.cpp +2 -2
- package/src/llama.cpp/tools/server/server.cpp +158 -47
- package/src/llama.cpp/tools/server/utils.hpp +71 -43
- package/src/llama.cpp/tools/tts/tts.cpp +4 -2
- package/src/tts_utils.cpp +342 -0
- package/src/tts_utils.h +62 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
package/src/common.hpp
CHANGED
|
@@ -1,11 +1,10 @@
|
|
|
1
1
|
#pragma once
|
|
2
2
|
|
|
3
|
+
#include "chat.h"
|
|
3
4
|
#include "common/common.h"
|
|
4
5
|
#include "common/sampling.h"
|
|
5
|
-
#include "tools/mtmd/mtmd.h"
|
|
6
|
-
#include "tools/mtmd/clip.h"
|
|
7
|
-
#include "chat.h"
|
|
8
6
|
#include "llama.h"
|
|
7
|
+
#include "tools/mtmd/clip.h"
|
|
9
8
|
#include "tools/mtmd/mtmd.h"
|
|
10
9
|
#include <memory>
|
|
11
10
|
#include <mutex>
|
|
@@ -27,13 +26,17 @@ static std::string json_stringify(const Napi::Object &obj) {
|
|
|
27
26
|
Napi::Env env = obj.Env();
|
|
28
27
|
Napi::Object json = env.Global().Get("JSON").As<Napi::Object>();
|
|
29
28
|
Napi::Function stringify = json.Get("stringify").As<Napi::Function>();
|
|
30
|
-
return stringify.Call(json, {
|
|
29
|
+
return stringify.Call(json, {obj}).As<Napi::String>().ToString();
|
|
31
30
|
}
|
|
32
31
|
|
|
33
|
-
static void console_log(Napi::Env env, const std::string&
|
|
34
|
-
Napi::Function consoleLog = env.Global()
|
|
35
|
-
|
|
36
|
-
|
|
32
|
+
static void console_log(Napi::Env env, const std::string &message) {
|
|
33
|
+
Napi::Function consoleLog = env.Global()
|
|
34
|
+
.Get("console")
|
|
35
|
+
.As<Napi::Object>()
|
|
36
|
+
.Get("log")
|
|
37
|
+
.As<Napi::Function>();
|
|
38
|
+
consoleLog.Call({Napi::String::New(env, message)});
|
|
39
|
+
}
|
|
37
40
|
|
|
38
41
|
template <typename T>
|
|
39
42
|
constexpr T get_option(const Napi::Object &options, const std::string &name,
|
|
@@ -64,8 +67,7 @@ constexpr T get_option(const Napi::Object &options, const std::string &name,
|
|
|
64
67
|
|
|
65
68
|
class LlamaSession {
|
|
66
69
|
public:
|
|
67
|
-
LlamaSession(common_params params)
|
|
68
|
-
: params_(params) {
|
|
70
|
+
LlamaSession(common_params params) : params_(params) {
|
|
69
71
|
llama_init_ = common_init_from_params(params);
|
|
70
72
|
tokens_.reserve(params.n_ctx);
|
|
71
73
|
}
|
|
@@ -93,21 +95,17 @@ public:
|
|
|
93
95
|
inline const common_params &params() const { return params_; }
|
|
94
96
|
|
|
95
97
|
inline std::mutex &get_mutex() { return mutex; }
|
|
96
|
-
|
|
98
|
+
|
|
97
99
|
// Getter for the multimodal context
|
|
98
|
-
inline const mtmd_context*
|
|
99
|
-
|
|
100
|
-
}
|
|
101
|
-
|
|
100
|
+
inline const mtmd_context *get_mtmd_ctx() const { return _mtmd_ctx; }
|
|
101
|
+
|
|
102
102
|
// Setter for the multimodal context
|
|
103
|
-
inline void set_mtmd_ctx(mtmd_context*
|
|
104
|
-
_mtmd_ctx = ctx;
|
|
105
|
-
}
|
|
103
|
+
inline void set_mtmd_ctx(mtmd_context *ctx) { _mtmd_ctx = ctx; }
|
|
106
104
|
|
|
107
105
|
void dispose() {
|
|
108
106
|
std::lock_guard<std::mutex> lock(mutex);
|
|
109
107
|
tokens_.clear();
|
|
110
|
-
|
|
108
|
+
|
|
111
109
|
// mtmd_ctx is owned by LlamaContext, so we don't free it here
|
|
112
110
|
_mtmd_ctx = nullptr;
|
|
113
111
|
}
|
|
@@ -118,13 +116,13 @@ private:
|
|
|
118
116
|
std::vector<llama_token> tokens_{};
|
|
119
117
|
std::vector<std::string> mtmd_bitmap_past_hashes_{};
|
|
120
118
|
std::mutex mutex;
|
|
121
|
-
mtmd_context*
|
|
119
|
+
mtmd_context *_mtmd_ctx = nullptr;
|
|
122
120
|
};
|
|
123
121
|
|
|
124
122
|
typedef std::shared_ptr<LlamaSession> LlamaSessionPtr;
|
|
125
123
|
|
|
126
124
|
static size_t common_tokens_part(const std::vector<llama_token> &a,
|
|
127
|
-
|
|
125
|
+
const std::vector<llama_token> &b) {
|
|
128
126
|
size_t i = 0;
|
|
129
127
|
while (i < a.size() && i < b.size() && a[i] == b[i]) {
|
|
130
128
|
i++;
|
|
@@ -133,7 +131,7 @@ static size_t common_tokens_part(const std::vector<llama_token> &a,
|
|
|
133
131
|
}
|
|
134
132
|
|
|
135
133
|
// Computes FNV-1a hash of the data
|
|
136
|
-
static std::string fnv_hash(const uint8_t *
|
|
134
|
+
static std::string fnv_hash(const uint8_t *data, size_t len) {
|
|
137
135
|
const uint64_t fnv_prime = 0x100000001b3ULL;
|
|
138
136
|
uint64_t hash = 0xcbf29ce484222325ULL;
|
|
139
137
|
|
|
@@ -144,10 +142,9 @@ static std::string fnv_hash(const uint8_t * data, size_t len) {
|
|
|
144
142
|
return std::to_string(hash);
|
|
145
143
|
}
|
|
146
144
|
|
|
147
|
-
static const std::string base64_chars =
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
"0123456789+/";
|
|
145
|
+
static const std::string base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
|
146
|
+
"abcdefghijklmnopqrstuvwxyz"
|
|
147
|
+
"0123456789+/";
|
|
151
148
|
|
|
152
149
|
// Base64 decoding function
|
|
153
150
|
static std::vector<uint8_t> base64_decode(const std::string &encoded_string) {
|
|
@@ -164,18 +161,22 @@ static std::vector<uint8_t> base64_decode(const std::string &encoded_string) {
|
|
|
164
161
|
continue;
|
|
165
162
|
}
|
|
166
163
|
|
|
167
|
-
if (encoded_string[in_] == '=' ||
|
|
164
|
+
if (encoded_string[in_] == '=' ||
|
|
165
|
+
base64_chars.find(encoded_string[in_]) == std::string::npos) {
|
|
168
166
|
break;
|
|
169
167
|
}
|
|
170
168
|
|
|
171
|
-
char_array_4[i++] = encoded_string[in_];
|
|
169
|
+
char_array_4[i++] = encoded_string[in_];
|
|
170
|
+
in_++;
|
|
172
171
|
if (i == 4) {
|
|
173
172
|
for (i = 0; i < 4; i++) {
|
|
174
173
|
char_array_4[i] = base64_chars.find(char_array_4[i]);
|
|
175
174
|
}
|
|
176
175
|
|
|
177
|
-
char_array_3[0] =
|
|
178
|
-
|
|
176
|
+
char_array_3[0] =
|
|
177
|
+
(char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
|
|
178
|
+
char_array_3[1] =
|
|
179
|
+
((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
|
|
179
180
|
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
|
|
180
181
|
|
|
181
182
|
for (i = 0; i < 3; i++) {
|
|
@@ -195,7 +196,8 @@ static std::vector<uint8_t> base64_decode(const std::string &encoded_string) {
|
|
|
195
196
|
}
|
|
196
197
|
|
|
197
198
|
char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
|
|
198
|
-
char_array_3[1] =
|
|
199
|
+
char_array_3[1] =
|
|
200
|
+
((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
|
|
199
201
|
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
|
|
200
202
|
|
|
201
203
|
for (j = 0; j < i - 1; j++) {
|
|
@@ -209,82 +211,86 @@ static std::vector<uint8_t> base64_decode(const std::string &encoded_string) {
|
|
|
209
211
|
struct TokenizeResult {
|
|
210
212
|
std::vector<llama_token> tokens;
|
|
211
213
|
|
|
212
|
-
bool
|
|
214
|
+
bool has_media = false;
|
|
213
215
|
std::vector<std::string> bitmap_hashes;
|
|
214
|
-
std::vector<size_t> chunk_pos;
|
|
215
|
-
std::vector<size_t>
|
|
216
|
-
mtmd_input_chunks*
|
|
216
|
+
std::vector<size_t> chunk_pos; // both text and media
|
|
217
|
+
std::vector<size_t> chunk_pos_media; // media only
|
|
218
|
+
mtmd_input_chunks *chunks = nullptr;
|
|
217
219
|
};
|
|
218
220
|
|
|
219
|
-
static TokenizeResult
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
const std::vector<std::string> &image_paths
|
|
223
|
-
) {
|
|
221
|
+
static TokenizeResult
|
|
222
|
+
tokenizeWithMedia(const mtmd_context *mtmd_ctx, const std::string &prompt,
|
|
223
|
+
const std::vector<std::string> &media_paths) {
|
|
224
224
|
if (mtmd_ctx == nullptr) {
|
|
225
225
|
throw std::runtime_error("Multimodal context is not initialized");
|
|
226
226
|
}
|
|
227
227
|
|
|
228
228
|
TokenizeResult result;
|
|
229
|
-
result.
|
|
229
|
+
result.has_media = !media_paths.empty();
|
|
230
230
|
|
|
231
231
|
mtmd::bitmaps bitmaps;
|
|
232
232
|
|
|
233
|
-
// Load all
|
|
234
|
-
for (const auto&
|
|
235
|
-
fprintf(
|
|
236
|
-
|
|
233
|
+
// Load all media paths
|
|
234
|
+
for (const auto &media_path : media_paths) {
|
|
235
|
+
fprintf(
|
|
236
|
+
stdout, "[DEBUG] Loading media: %s\n",
|
|
237
|
+
media_path.substr(0, 50).c_str()); // Only log part of path for base64
|
|
237
238
|
|
|
238
|
-
// Check if it's a base64
|
|
239
|
-
if (
|
|
239
|
+
// Check if it's a base64 media
|
|
240
|
+
if (media_path.compare(0, 11, "data:image/") == 0 ||
|
|
241
|
+
media_path.compare(0, 11, "data:audio/") == 0) {
|
|
240
242
|
|
|
241
243
|
// Parse base64 data
|
|
242
244
|
std::vector<std::string> parts;
|
|
243
|
-
size_t comma_pos =
|
|
245
|
+
size_t comma_pos = media_path.find(',');
|
|
244
246
|
if (comma_pos == std::string::npos) {
|
|
245
247
|
result.bitmap_hashes.clear();
|
|
246
|
-
throw std::runtime_error(
|
|
248
|
+
throw std::runtime_error(
|
|
249
|
+
"Invalid base64 media format, missing comma separator");
|
|
247
250
|
}
|
|
248
251
|
|
|
249
|
-
std::string header =
|
|
250
|
-
std::string base64_data =
|
|
252
|
+
std::string header = media_path.substr(0, comma_pos);
|
|
253
|
+
std::string base64_data = media_path.substr(comma_pos + 1);
|
|
251
254
|
|
|
252
255
|
if (header.find("base64") == std::string::npos) {
|
|
253
256
|
result.bitmap_hashes.clear();
|
|
254
|
-
throw std::runtime_error("Invalid base64
|
|
257
|
+
throw std::runtime_error("Invalid base64 media");
|
|
255
258
|
}
|
|
256
259
|
|
|
257
260
|
// Decode base64
|
|
258
261
|
try {
|
|
259
262
|
// Decode base64 to binary
|
|
260
|
-
std::vector<uint8_t>
|
|
263
|
+
std::vector<uint8_t> media_data = base64_decode(base64_data);
|
|
261
264
|
|
|
262
265
|
// Load bitmap from memory buffer using direct initialization
|
|
263
|
-
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(
|
|
266
|
+
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(media_data.data(),
|
|
267
|
+
media_data.size()));
|
|
264
268
|
if (!bmp.ptr) {
|
|
265
269
|
bitmaps.entries.clear();
|
|
266
|
-
throw std::runtime_error("Failed to
|
|
270
|
+
throw std::runtime_error("Failed to load base64 media");
|
|
267
271
|
}
|
|
268
272
|
|
|
269
273
|
// Calculate bitmap hash (for KV caching)
|
|
270
|
-
std::string hash = fnv_hash(bmp.data(), bmp.
|
|
274
|
+
std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
|
|
271
275
|
bmp.set_id(hash.c_str());
|
|
272
276
|
bitmaps.entries.push_back(std::move(bmp));
|
|
273
277
|
result.bitmap_hashes.push_back(hash.c_str());
|
|
274
|
-
} catch (const std::exception&
|
|
278
|
+
} catch (const std::exception &e) {
|
|
275
279
|
bitmaps.entries.clear();
|
|
276
|
-
throw std::runtime_error("Failed to decode base64
|
|
280
|
+
throw std::runtime_error("Failed to decode base64 media");
|
|
277
281
|
}
|
|
278
|
-
} else if (
|
|
282
|
+
} else if (media_path.compare(0, 7, "http://") == 0 ||
|
|
283
|
+
media_path.compare(0, 8, "https://") == 0) {
|
|
279
284
|
// HTTP URLs are not supported yet
|
|
280
285
|
bitmaps.entries.clear();
|
|
281
|
-
throw std::runtime_error("HTTP URLs are not supported yet");
|
|
286
|
+
throw std::runtime_error("HTTP/HTTPS URLs are not supported yet");
|
|
282
287
|
} else {
|
|
288
|
+
// Regular file path
|
|
283
289
|
// Check if file exists
|
|
284
|
-
FILE*
|
|
290
|
+
FILE *file = fopen(media_path.c_str(), "rb");
|
|
285
291
|
if (file == nullptr) {
|
|
286
292
|
bitmaps.entries.clear();
|
|
287
|
-
throw std::runtime_error("
|
|
293
|
+
throw std::runtime_error("File does not exist or cannot be opened");
|
|
288
294
|
}
|
|
289
295
|
|
|
290
296
|
// Get file size
|
|
@@ -294,14 +300,14 @@ static TokenizeResult tokenizeWithImages(
|
|
|
294
300
|
fclose(file);
|
|
295
301
|
|
|
296
302
|
// Create bitmap directly
|
|
297
|
-
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(
|
|
303
|
+
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(media_path.c_str()));
|
|
298
304
|
if (!bmp.ptr) {
|
|
299
305
|
bitmaps.entries.clear();
|
|
300
|
-
throw std::runtime_error("Failed to
|
|
306
|
+
throw std::runtime_error("Failed to load media");
|
|
301
307
|
}
|
|
302
308
|
|
|
303
309
|
// Calculate bitmap hash (for KV caching)
|
|
304
|
-
std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
|
|
310
|
+
std::string hash = fnv_hash(bmp.data(), bmp.nx() * bmp.ny() * 3);
|
|
305
311
|
bmp.set_id(hash.c_str());
|
|
306
312
|
bitmaps.entries.push_back(std::move(bmp));
|
|
307
313
|
result.bitmap_hashes.push_back(hash.c_str());
|
|
@@ -313,58 +319,60 @@ static TokenizeResult tokenizeWithImages(
|
|
|
313
319
|
bitmaps.entries.clear();
|
|
314
320
|
throw std::runtime_error("Failed to initialize input chunks");
|
|
315
321
|
}
|
|
316
|
-
|
|
322
|
+
|
|
317
323
|
// Create input text
|
|
318
324
|
mtmd_input_text input_text;
|
|
319
|
-
input_text.text = prompt.c_str(); // Use the full prompt with
|
|
320
|
-
input_text.add_special = true;
|
|
321
|
-
input_text.parse_special = true;
|
|
325
|
+
input_text.text = prompt.c_str(); // Use the full prompt with media marker
|
|
326
|
+
input_text.add_special = true; // Add BOS token if this is the first message
|
|
327
|
+
input_text.parse_special = true; // Parse special tokens like <__media__>
|
|
322
328
|
|
|
323
|
-
// Tokenize the text and
|
|
324
|
-
fprintf(stdout, "[DEBUG] Tokenizing text and %zu
|
|
329
|
+
// Tokenize the text and media
|
|
330
|
+
fprintf(stdout, "[DEBUG] Tokenizing text and %zu media\n",
|
|
331
|
+
bitmaps.entries.size());
|
|
325
332
|
auto bitmaps_c_ptr = bitmaps.c_ptr();
|
|
326
|
-
|
|
333
|
+
|
|
327
334
|
// Cast away const for mtmd_tokenize
|
|
328
|
-
int32_t res =
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
bitmaps_c_ptr.data(),
|
|
333
|
-
bitmaps_c_ptr.size()
|
|
334
|
-
);
|
|
335
|
-
|
|
335
|
+
int32_t res =
|
|
336
|
+
mtmd_tokenize(const_cast<mtmd_context *>(mtmd_ctx), result.chunks,
|
|
337
|
+
&input_text, bitmaps_c_ptr.data(), bitmaps_c_ptr.size());
|
|
338
|
+
|
|
336
339
|
if (res != 0) {
|
|
337
340
|
mtmd_input_chunks_free(result.chunks);
|
|
338
341
|
bitmaps.entries.clear();
|
|
339
|
-
throw std::runtime_error("Failed to tokenize text and
|
|
342
|
+
throw std::runtime_error("Failed to tokenize text and media");
|
|
340
343
|
}
|
|
341
344
|
|
|
342
345
|
// Log chunk information
|
|
343
346
|
size_t num_chunks = mtmd_input_chunks_size(result.chunks);
|
|
344
|
-
fprintf(stdout, "[DEBUG] Tokenization successful: num_chunks=%zu\n",
|
|
347
|
+
fprintf(stdout, "[DEBUG] Tokenization successful: num_chunks=%zu\n",
|
|
348
|
+
num_chunks);
|
|
345
349
|
|
|
346
|
-
// Track the total number of tokens (both text and
|
|
350
|
+
// Track the total number of tokens (both text and media)
|
|
347
351
|
size_t total_token_count = 0;
|
|
348
352
|
|
|
349
353
|
// chunk pos
|
|
350
354
|
for (size_t i = 0; i < num_chunks; i++) {
|
|
351
355
|
result.chunk_pos.push_back(total_token_count);
|
|
352
356
|
|
|
353
|
-
const mtmd_input_chunk*
|
|
357
|
+
const mtmd_input_chunk *chunk = mtmd_input_chunks_get(result.chunks, i);
|
|
354
358
|
mtmd_input_chunk_type chunk_type = mtmd_input_chunk_get_type(chunk);
|
|
355
359
|
|
|
356
360
|
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
|
357
361
|
size_t n_tokens;
|
|
358
|
-
const llama_token*
|
|
362
|
+
const llama_token *tokens =
|
|
363
|
+
mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
|
|
359
364
|
|
|
360
365
|
result.tokens.insert(result.tokens.end(), tokens, tokens + n_tokens);
|
|
361
366
|
total_token_count += n_tokens;
|
|
362
|
-
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE
|
|
363
|
-
|
|
367
|
+
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ||
|
|
368
|
+
chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
|
369
|
+
result.chunk_pos_media.push_back(total_token_count);
|
|
364
370
|
|
|
365
|
-
|
|
366
|
-
size_t
|
|
367
|
-
|
|
371
|
+
size_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk);
|
|
372
|
+
size_t n_pos = mtmd_input_chunk_get_n_pos(chunk);
|
|
373
|
+
fprintf(stdout, "[DEBUG] Chunk %zu: type=%s, n_tokens=%zu, n_pos=%zu\n",
|
|
374
|
+
i, chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "IMAGE" : "AUDIO",
|
|
375
|
+
n_tokens, n_pos);
|
|
368
376
|
|
|
369
377
|
for (size_t j = 0; j < n_pos; j++) {
|
|
370
378
|
result.tokens.push_back(LLAMA_TOKEN_NULL);
|
|
@@ -374,35 +382,34 @@ static TokenizeResult tokenizeWithImages(
|
|
|
374
382
|
}
|
|
375
383
|
|
|
376
384
|
bitmaps.entries.clear();
|
|
377
|
-
|
|
385
|
+
|
|
378
386
|
return result;
|
|
379
387
|
}
|
|
380
388
|
|
|
381
|
-
// Process
|
|
382
|
-
static llama_pos
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
const common_params& params,
|
|
387
|
-
const std::vector<std::string>& image_paths
|
|
388
|
-
) {
|
|
389
|
+
// Process media and add them to the tokenized input
|
|
390
|
+
static llama_pos
|
|
391
|
+
processMediaPrompt(llama_context *ctx, const mtmd_context *mtmd_ctx,
|
|
392
|
+
LlamaSessionPtr sess, const common_params &params,
|
|
393
|
+
const std::vector<std::string> &media_paths) {
|
|
389
394
|
if (mtmd_ctx == nullptr) {
|
|
390
395
|
throw std::runtime_error("Multimodal context is not initialized");
|
|
391
396
|
}
|
|
392
397
|
|
|
393
398
|
// Multimodal path
|
|
394
399
|
std::string full_prompt = params.prompt;
|
|
395
|
-
|
|
396
|
-
if
|
|
397
|
-
|
|
400
|
+
auto default_media_marker = mtmd_default_marker();
|
|
401
|
+
// Add media marker if it doesn't already exist
|
|
402
|
+
if (full_prompt.find(default_media_marker) == std::string::npos) {
|
|
403
|
+
full_prompt += " ";
|
|
404
|
+
full_prompt += default_media_marker;
|
|
398
405
|
}
|
|
399
406
|
|
|
400
|
-
auto result =
|
|
407
|
+
auto result = tokenizeWithMedia(mtmd_ctx, full_prompt, media_paths);
|
|
401
408
|
|
|
402
409
|
auto all_tokens = result.tokens;
|
|
403
410
|
auto chunks = result.chunks;
|
|
404
411
|
auto chunk_pos = result.chunk_pos;
|
|
405
|
-
auto
|
|
412
|
+
auto chunk_pos_media = result.chunk_pos_media;
|
|
406
413
|
auto bitmap_hashes = result.bitmap_hashes;
|
|
407
414
|
|
|
408
415
|
llama_pos n_past = common_tokens_part(*sess->tokens_ptr(), all_tokens);
|
|
@@ -418,11 +425,10 @@ static llama_pos process_image_prompt(
|
|
|
418
425
|
break;
|
|
419
426
|
}
|
|
420
427
|
bool is_end = i + 1 == chunk_pos.size();
|
|
421
|
-
if (
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
// don't need to adjust and it will skip eval_chunk_single, let nextToken() to finish the job
|
|
428
|
+
if (chunk_pos[i] < n_past && (!is_end && chunk_pos[i + 1] > n_past)
|
|
429
|
+
// is_end & n_past < total_token_count:
|
|
430
|
+
// don't need to adjust and it will skip eval_chunk_single, let
|
|
431
|
+
// nextToken() to finish the job
|
|
426
432
|
) {
|
|
427
433
|
adjusted_n_past = chunk_pos[i];
|
|
428
434
|
}
|
|
@@ -433,11 +439,12 @@ static llama_pos process_image_prompt(
|
|
|
433
439
|
fprintf(stdout, "[DEBUG] Adjusted n_past to %d\n", n_past);
|
|
434
440
|
}
|
|
435
441
|
|
|
436
|
-
// Compare bitmap hashes, if they are not the same, backtrack n_past to the
|
|
442
|
+
// Compare bitmap hashes, if they are not the same, backtrack n_past to the
|
|
443
|
+
// position of the first mismatch
|
|
437
444
|
auto mtmd_bitmap_past_hashes = sess->mtmd_bitmap_past_hashes_ptr();
|
|
438
445
|
if (mtmd_bitmap_past_hashes->size() > 0) {
|
|
439
446
|
for (size_t i = 0; i < bitmap_hashes.size(); i++) {
|
|
440
|
-
auto pos =
|
|
447
|
+
auto pos = chunk_pos_media[i];
|
|
441
448
|
if (n_past < pos) {
|
|
442
449
|
break;
|
|
443
450
|
}
|
|
@@ -445,7 +452,7 @@ static llama_pos process_image_prompt(
|
|
|
445
452
|
break;
|
|
446
453
|
}
|
|
447
454
|
if (bitmap_hashes[i] != (*mtmd_bitmap_past_hashes)[i]) {
|
|
448
|
-
n_past =
|
|
455
|
+
n_past = chunk_pos_media[i];
|
|
449
456
|
new_n_past = n_past;
|
|
450
457
|
break;
|
|
451
458
|
}
|
|
@@ -458,7 +465,8 @@ static llama_pos process_image_prompt(
|
|
|
458
465
|
size_t num_chunks = mtmd_input_chunks_size(chunks);
|
|
459
466
|
|
|
460
467
|
for (size_t i = 0; i < chunk_pos.size(); i++) {
|
|
461
|
-
fprintf(stdout, "[DEBUG] Evaluating chunk %zu: n_past=%d, chunk_pos=%zu\n",
|
|
468
|
+
fprintf(stdout, "[DEBUG] Evaluating chunk %zu: n_past=%d, chunk_pos=%zu\n",
|
|
469
|
+
i, n_past, chunk_pos[i]);
|
|
462
470
|
|
|
463
471
|
// Process chunk only if it's after the current n_past
|
|
464
472
|
if (chunk_pos[i] >= new_n_past) {
|
|
@@ -467,16 +475,10 @@ static llama_pos process_image_prompt(
|
|
|
467
475
|
|
|
468
476
|
// Cast away const for mtmd_helper_eval_chunk_single
|
|
469
477
|
int32_t res = mtmd_helper_eval_chunk_single(
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
0,
|
|
475
|
-
params.n_batch, // batch size
|
|
476
|
-
chunk_logits_last,
|
|
477
|
-
&new_n_past
|
|
478
|
-
);
|
|
479
|
-
|
|
478
|
+
const_cast<mtmd_context *>(mtmd_ctx), ctx, chunk, n_past, 0,
|
|
479
|
+
params.n_batch, // batch size
|
|
480
|
+
chunk_logits_last, &new_n_past);
|
|
481
|
+
|
|
480
482
|
if (res != 0) {
|
|
481
483
|
mtmd_input_chunks_free(chunks);
|
|
482
484
|
throw std::runtime_error("Failed to process chunk");
|
|
@@ -485,13 +487,14 @@ static llama_pos process_image_prompt(
|
|
|
485
487
|
}
|
|
486
488
|
}
|
|
487
489
|
|
|
488
|
-
if (n_past == all_tokens.size() && n_past > 0 &&
|
|
490
|
+
if (n_past == all_tokens.size() && n_past > 0 &&
|
|
491
|
+
all_tokens[n_past - 1] != LLAMA_TOKEN_NULL) {
|
|
489
492
|
// we have to evaluate at least 1 token to generate logits.
|
|
490
493
|
n_past--;
|
|
491
494
|
}
|
|
492
495
|
|
|
493
496
|
// Update sampling context to process token sequences
|
|
494
|
-
for (auto &
|
|
497
|
+
for (auto &token : all_tokens) {
|
|
495
498
|
if (token == LLAMA_TOKEN_NULL) {
|
|
496
499
|
continue;
|
|
497
500
|
}
|
|
@@ -501,7 +504,7 @@ static llama_pos process_image_prompt(
|
|
|
501
504
|
|
|
502
505
|
sess->set_mtmd_bitmap_past_hashes(bitmap_hashes);
|
|
503
506
|
|
|
504
|
-
// Clean up
|
|
507
|
+
// Clean up media resources
|
|
505
508
|
mtmd_input_chunks_free(chunks);
|
|
506
509
|
return n_past;
|
|
507
510
|
}
|
|
@@ -351,7 +351,7 @@ jobs:
|
|
|
351
351
|
|
|
352
352
|
ubuntu-22-cmake-musa:
|
|
353
353
|
runs-on: ubuntu-22.04
|
|
354
|
-
container: mthreads/musa:
|
|
354
|
+
container: mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
|
|
355
355
|
|
|
356
356
|
steps:
|
|
357
357
|
- name: Clone
|
|
@@ -899,7 +899,7 @@ jobs:
|
|
|
899
899
|
shell: bash
|
|
900
900
|
|
|
901
901
|
env:
|
|
902
|
-
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/
|
|
902
|
+
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe
|
|
903
903
|
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
|
904
904
|
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
|
905
905
|
steps:
|