llama-cpp-capacitor 0.0.5 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. package/cpp/LICENSE +21 -0
  2. package/cpp/README.md +4 -0
  3. package/cpp/anyascii.c +22223 -0
  4. package/cpp/anyascii.h +42 -0
  5. package/cpp/chat-parser.cpp +393 -0
  6. package/cpp/chat-parser.h +120 -0
  7. package/cpp/chat.cpp +2315 -0
  8. package/cpp/chat.h +221 -0
  9. package/cpp/common.cpp +1619 -0
  10. package/cpp/common.h +744 -0
  11. package/cpp/ggml-alloc.c +1028 -0
  12. package/cpp/ggml-alloc.h +76 -0
  13. package/cpp/ggml-backend-impl.h +255 -0
  14. package/cpp/ggml-backend-reg.cpp +600 -0
  15. package/cpp/ggml-backend.cpp +2118 -0
  16. package/cpp/ggml-backend.h +354 -0
  17. package/cpp/ggml-common.h +1878 -0
  18. package/cpp/ggml-cpp.h +39 -0
  19. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  20. package/cpp/ggml-cpu/amx/amx.h +8 -0
  21. package/cpp/ggml-cpu/amx/common.h +91 -0
  22. package/cpp/ggml-cpu/amx/mmq.cpp +2512 -0
  23. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  24. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  25. package/cpp/ggml-cpu/arch/arm/quants.c +3650 -0
  26. package/cpp/ggml-cpu/arch/arm/repack.cpp +1891 -0
  27. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  28. package/cpp/ggml-cpu/arch/x86/quants.c +3820 -0
  29. package/cpp/ggml-cpu/arch/x86/repack.cpp +6307 -0
  30. package/cpp/ggml-cpu/arch-fallback.h +215 -0
  31. package/cpp/ggml-cpu/binary-ops.cpp +158 -0
  32. package/cpp/ggml-cpu/binary-ops.h +16 -0
  33. package/cpp/ggml-cpu/common.h +73 -0
  34. package/cpp/ggml-cpu/ggml-cpu-impl.h +525 -0
  35. package/cpp/ggml-cpu/ggml-cpu.c +3578 -0
  36. package/cpp/ggml-cpu/ggml-cpu.cpp +672 -0
  37. package/cpp/ggml-cpu/ops.cpp +10587 -0
  38. package/cpp/ggml-cpu/ops.h +114 -0
  39. package/cpp/ggml-cpu/quants.c +1193 -0
  40. package/cpp/ggml-cpu/quants.h +97 -0
  41. package/cpp/ggml-cpu/repack.cpp +1982 -0
  42. package/cpp/ggml-cpu/repack.h +120 -0
  43. package/cpp/ggml-cpu/simd-mappings.h +1184 -0
  44. package/cpp/ggml-cpu/traits.cpp +36 -0
  45. package/cpp/ggml-cpu/traits.h +38 -0
  46. package/cpp/ggml-cpu/unary-ops.cpp +186 -0
  47. package/cpp/ggml-cpu/unary-ops.h +28 -0
  48. package/cpp/ggml-cpu/vec.cpp +348 -0
  49. package/cpp/ggml-cpu/vec.h +1121 -0
  50. package/cpp/ggml-cpu.h +145 -0
  51. package/cpp/ggml-impl.h +622 -0
  52. package/cpp/ggml-metal-impl.h +688 -0
  53. package/cpp/ggml-metal.h +66 -0
  54. package/cpp/ggml-metal.m +6833 -0
  55. package/cpp/ggml-opt.cpp +1093 -0
  56. package/cpp/ggml-opt.h +256 -0
  57. package/cpp/ggml-quants.c +5324 -0
  58. package/cpp/ggml-quants.h +106 -0
  59. package/cpp/ggml-threading.cpp +12 -0
  60. package/cpp/ggml-threading.h +14 -0
  61. package/cpp/ggml.c +7108 -0
  62. package/cpp/ggml.h +2492 -0
  63. package/cpp/gguf.cpp +1358 -0
  64. package/cpp/gguf.h +202 -0
  65. package/cpp/json-partial.cpp +256 -0
  66. package/cpp/json-partial.h +38 -0
  67. package/cpp/json-schema-to-grammar.cpp +985 -0
  68. package/cpp/json-schema-to-grammar.h +21 -0
  69. package/cpp/llama-adapter.cpp +388 -0
  70. package/cpp/llama-adapter.h +76 -0
  71. package/cpp/llama-arch.cpp +2355 -0
  72. package/cpp/llama-arch.h +499 -0
  73. package/cpp/llama-batch.cpp +875 -0
  74. package/cpp/llama-batch.h +160 -0
  75. package/cpp/llama-chat.cpp +783 -0
  76. package/cpp/llama-chat.h +65 -0
  77. package/cpp/llama-context.cpp +2748 -0
  78. package/cpp/llama-context.h +306 -0
  79. package/cpp/llama-cparams.cpp +5 -0
  80. package/cpp/llama-cparams.h +41 -0
  81. package/cpp/llama-cpp.h +30 -0
  82. package/cpp/llama-grammar.cpp +1229 -0
  83. package/cpp/llama-grammar.h +173 -0
  84. package/cpp/llama-graph.cpp +1891 -0
  85. package/cpp/llama-graph.h +810 -0
  86. package/cpp/llama-hparams.cpp +180 -0
  87. package/cpp/llama-hparams.h +233 -0
  88. package/cpp/llama-impl.cpp +167 -0
  89. package/cpp/llama-impl.h +61 -0
  90. package/cpp/llama-io.cpp +15 -0
  91. package/cpp/llama-io.h +35 -0
  92. package/cpp/llama-kv-cache-iswa.cpp +318 -0
  93. package/cpp/llama-kv-cache-iswa.h +135 -0
  94. package/cpp/llama-kv-cache.cpp +2059 -0
  95. package/cpp/llama-kv-cache.h +374 -0
  96. package/cpp/llama-kv-cells.h +491 -0
  97. package/cpp/llama-memory-hybrid.cpp +258 -0
  98. package/cpp/llama-memory-hybrid.h +137 -0
  99. package/cpp/llama-memory-recurrent.cpp +1146 -0
  100. package/cpp/llama-memory-recurrent.h +179 -0
  101. package/cpp/llama-memory.cpp +59 -0
  102. package/cpp/llama-memory.h +119 -0
  103. package/cpp/llama-mmap.cpp +600 -0
  104. package/cpp/llama-mmap.h +68 -0
  105. package/cpp/llama-model-loader.cpp +1164 -0
  106. package/cpp/llama-model-loader.h +170 -0
  107. package/cpp/llama-model-saver.cpp +282 -0
  108. package/cpp/llama-model-saver.h +37 -0
  109. package/cpp/llama-model.cpp +19042 -0
  110. package/cpp/llama-model.h +491 -0
  111. package/cpp/llama-sampling.cpp +2575 -0
  112. package/cpp/llama-sampling.h +32 -0
  113. package/cpp/llama-vocab.cpp +3792 -0
  114. package/cpp/llama-vocab.h +176 -0
  115. package/cpp/llama.cpp +358 -0
  116. package/cpp/llama.h +1373 -0
  117. package/cpp/log.cpp +427 -0
  118. package/cpp/log.h +103 -0
  119. package/cpp/minja/chat-template.hpp +550 -0
  120. package/cpp/minja/minja.hpp +3009 -0
  121. package/cpp/nlohmann/json.hpp +25526 -0
  122. package/cpp/nlohmann/json_fwd.hpp +187 -0
  123. package/cpp/regex-partial.cpp +204 -0
  124. package/cpp/regex-partial.h +56 -0
  125. package/cpp/rn-completion.cpp +681 -0
  126. package/cpp/rn-completion.h +116 -0
  127. package/cpp/rn-llama.cpp +345 -0
  128. package/cpp/rn-llama.h +149 -0
  129. package/cpp/rn-mtmd.hpp +602 -0
  130. package/cpp/rn-tts.cpp +591 -0
  131. package/cpp/rn-tts.h +59 -0
  132. package/cpp/sampling.cpp +579 -0
  133. package/cpp/sampling.h +107 -0
  134. package/cpp/tools/mtmd/clip-impl.h +473 -0
  135. package/cpp/tools/mtmd/clip.cpp +4322 -0
  136. package/cpp/tools/mtmd/clip.h +106 -0
  137. package/cpp/tools/mtmd/miniaudio/miniaudio.h +93468 -0
  138. package/cpp/tools/mtmd/mtmd-audio.cpp +769 -0
  139. package/cpp/tools/mtmd/mtmd-audio.h +47 -0
  140. package/cpp/tools/mtmd/mtmd-helper.cpp +460 -0
  141. package/cpp/tools/mtmd/mtmd-helper.h +91 -0
  142. package/cpp/tools/mtmd/mtmd.cpp +1066 -0
  143. package/cpp/tools/mtmd/mtmd.h +298 -0
  144. package/cpp/tools/mtmd/stb/stb_image.h +7988 -0
  145. package/cpp/unicode-data.cpp +7034 -0
  146. package/cpp/unicode-data.h +20 -0
  147. package/cpp/unicode.cpp +1061 -0
  148. package/cpp/unicode.h +68 -0
  149. package/package.json +2 -1
@@ -0,0 +1,602 @@
1
+ #pragma once
2
+
3
#include "rn-llama.h"
#include "tools/mtmd/mtmd.h"
#include "tools/mtmd/mtmd-helper.h"
#include "tools/mtmd/clip.h"

#include <algorithm>
#include <cctype>
#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <string>
#include <vector>
10
+
11
+ namespace rnllama {
12
+
13
// MTMD (multimodal) context wrapper.
//
// Owns an mtmd_context created from an mmproj file and remembers the hashes
// of the media bitmaps evaluated most recently so that a subsequent call can
// reuse the KV cache when the same media is submitted again.
struct llama_rn_context_mtmd {
    // Owned mtmd context, released in the destructor. Non-null after a
    // successful construction (the constructor throws on failure).
    mtmd_context *mtmd_ctx = nullptr;

    // State fields
    // FNV-1a hashes (decimal strings) of the bitmaps evaluated by the most
    // recent processMedia() call, in chunk order. Compared against the new
    // media to decide how far n_past can be kept.
    std::vector<std::string> bitmap_past_hashes;

    // Constructor - Initialize multimodal
    // Throws std::runtime_error when `model` is null or the mtmd context
    // cannot be created from `mmproj_path`. On success sets
    // has_multimodal = true and disables context shifting on mutable_params
    // (media chunks must stay contiguous in the context).
    llama_rn_context_mtmd(
        const std::string &mmproj_path,
        bool use_gpu,
        llama_model *model,
        llama_context *ctx,
        const common_params &params,
        bool &has_multimodal,
        common_params &mutable_params
    );

    // Destructor - Release multimodal resources
    ~llama_rn_context_mtmd();

    // Process media: tokenize `prompt` plus `media_paths`, reconcile with the
    // current KV cache, evaluate all chunks, and update n_past / embd /
    // context_full in place. Accepted text tokens are fed to ctx_sampling.
    // Throws std::runtime_error on tokenization or evaluation failure.
    void processMedia(
        llama_context *ctx,
        const std::string &prompt,
        const std::vector<std::string> &media_paths,
        int n_ctx,
        int n_batch,
        llama_pos &n_past,
        std::vector<llama_token> &embd,
        bool &context_full,
        common_sampler *ctx_sampling
    );

    // Check if multimodal is enabled
    bool isEnabled(bool has_multimodal) const;

    // Check if multimodal supports vision
    bool supportVision() const;

    // Check if multimodal supports audio
    bool supportAudio() const;
};
56
+
57
// Helper function to find the common part (longest common prefix) of two
// token vectors.
//
// Generalized over the element type so the helper works for any token-like
// vector (std::vector<llama_token> in practice, which keeps existing callers
// source-compatible). Returns the number of leading elements that compare
// equal in both vectors; 0 when either vector is empty.
template <typename TokenT>
inline size_t common_part(const std::vector<TokenT> &a, const std::vector<TokenT> &b)
{
    // std::mismatch stops at the first differing element or at the end of the
    // shorter range — exactly the common-prefix length.
    const auto first_diff = std::mismatch(a.begin(), a.end(), b.begin(), b.end());
    return static_cast<size_t>(std::distance(a.begin(), first_diff.first));
}
66
+
67
// FNV-1a (64-bit) hash of a byte buffer, rendered as a decimal string.
// Used to fingerprint media bitmaps for KV-cache reuse checks.
inline std::string fnv_hash(const uint8_t * data, size_t len) {
    // Standard 64-bit FNV-1a parameters.
    const uint64_t fnv_prime  = 0x100000001b3ULL;
    const uint64_t fnv_offset = 0xcbf29ce484222325ULL;

    uint64_t acc = fnv_offset;
    for (const uint8_t * p = data; p != data + len; ++p) {
        // FNV-1a: XOR the byte in first, then multiply by the prime.
        acc = (acc ^ *p) * fnv_prime;
    }
    return std::to_string(acc);
}
78
+
79
// Base64 encoding/decoding utilities

// Alphabet for standard (RFC 4648) base64.
static const std::string base64_chars =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789+/";

// True when `c` is a character of the standard base64 alphabet.
inline bool is_base64(uint8_t c) {
    return (isalnum(c) || (c == '+') || (c == '/'));
}

using raw_buffer = std::vector<uint8_t>;

// Decode a base64 string into raw bytes.
//
// Decoding stops at the first '=' padding character or the first character
// outside the base64 alphabet; everything decoded up to that point is
// returned. Empty or invalid-only input yields an empty buffer.
//
// Fixes the previous implementation's narrowing of the input length into an
// `int` (which broke inputs larger than INT_MAX) by iterating the string
// directly with size_t bookkeeping.
inline raw_buffer base64_decode(const std::string & encoded_string) {
    raw_buffer decoded;
    decoded.reserve((encoded_string.size() / 4) * 3);

    uint8_t quartet[4];
    size_t  filled = 0;

    // Emit the first `count` of the 3 output bytes produced by one group of
    // 4 alphabet characters (count < 3 only for the final partial group).
    auto emit = [&](size_t count) {
        uint8_t idx[4];
        for (size_t k = 0; k < 4; ++k) {
            // For a partial tail, positions >= `filled` hold '\0'; their
            // (out-of-alphabet) indices are never part of the bytes emitted.
            idx[k] = (uint8_t) base64_chars.find((char) quartet[k]);
        }
        const uint8_t bytes[3] = {
            (uint8_t) ( (idx[0]        << 2) | ((idx[1] & 0x30) >> 4)),
            (uint8_t) (((idx[1] & 0x0f) << 4) | ((idx[2] & 0x3c) >> 2)),
            (uint8_t) (((idx[2] & 0x03) << 6) |   idx[3]),
        };
        for (size_t k = 0; k < count; ++k) {
            decoded.push_back(bytes[k]);
        }
    };

    for (const char ch : encoded_string) {
        if (ch == '=' || !is_base64((uint8_t) ch)) {
            break; // padding or a foreign character ends the encoded data
        }
        quartet[filled++] = (uint8_t) ch;
        if (filled == 4) {
            emit(3);
            filled = 0;
        }
    }

    // Partial final group: 2 chars -> 1 byte, 3 chars -> 2 bytes,
    // 1 char -> nothing (not decodable).
    if (filled > 0) {
        for (size_t k = filled; k < 4; ++k) {
            quartet[k] = 0;
        }
        emit(filled - 1);
    }

    return decoded;
}
142
+
143
// MTMD tokenization result structure.
// Produced by tokenizeWithMedia(); describes the prompt after text/media
// splitting.
struct mtmd_tokenize_result {
    // FNV-1a hash (decimal string) of each media bitmap, in input order.
    std::vector<std::string> bitmap_hashes;
    // Flattened token stream: real text tokens, with LLAMA_TOKEN_NULL
    // placeholders occupying every media position.
    std::vector<llama_token> tokens;
    std::vector<size_t> chunk_pos; // start offset of every chunk within `tokens` — both text and media
    std::vector<size_t> chunk_pos_media; // start offsets of media chunks only
    // Raw chunk list; ownership passes to the caller, who must release it
    // with mtmd_input_chunks_free().
    mtmd_input_chunks* chunks = nullptr;
};
151
+
152
+ // Forward declaration for llama_rn_context
153
+ struct llama_rn_context;
154
+
155
+ // Tokenize text with media function
156
+ inline mtmd_tokenize_result tokenizeWithMedia(llama_rn_context_mtmd *mtmd_wrapper, const std::string &prompt, const std::vector<std::string> &media_paths) {
157
+ mtmd_tokenize_result result;
158
+ mtmd::bitmaps bitmaps;
159
+
160
+ // Load all media paths
161
+ for (const auto& media_path : media_paths) {
162
+ LOG_INFO("[DEBUG] Loading media: %s",
163
+ media_path.substr(0, 50).c_str()); // Only log part of path for base64
164
+
165
+ // Check if it's a base64 media
166
+ if (media_path.compare(0, 11, "data:image/") == 0 || media_path.compare(0, 11, "data:audio/") == 0) {
167
+ LOG_INFO("[DEBUG] Detected base64 encoded media");
168
+
169
+ // Parse base64 data
170
+ std::vector<std::string> parts;
171
+ size_t comma_pos = media_path.find(',');
172
+ if (comma_pos == std::string::npos) {
173
+ throw std::runtime_error("Invalid base64 media format, missing comma separator");
174
+ }
175
+
176
+ std::string header = media_path.substr(0, comma_pos);
177
+ std::string base64_data = media_path.substr(comma_pos + 1);
178
+
179
+ if (header.find("base64") == std::string::npos) {
180
+ bitmaps.entries.clear();
181
+ throw std::runtime_error("Image must be base64 encoded");
182
+ }
183
+
184
+ // Decode base64
185
+ raw_buffer media_data = base64_decode(base64_data);
186
+ LOG_INFO("[DEBUG] Base64 decoded, size: %zu bytes", media_data.size());
187
+
188
+ // Load bitmap from memory buffer using direct initialization
189
+ mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mtmd_wrapper->mtmd_ctx, media_data.data(), media_data.size()));
190
+ if (!bmp.ptr) {
191
+ bitmaps.entries.clear();
192
+ throw std::runtime_error("Failed to load base64 media");
193
+ }
194
+
195
+ // Calculate bitmap hash (for KV caching)
196
+ std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
197
+ bmp.set_id(hash.c_str());
198
+ LOG_INFO("[DEBUG] Bitmap hash: %s", hash.c_str());
199
+ bitmaps.entries.push_back(std::move(bmp));
200
+ result.bitmap_hashes.push_back(hash.c_str());
201
+ } else if (media_path.compare(0, 7, "http://") == 0 || media_path.compare(0, 8, "https://") == 0) {
202
+ // HTTP URLs are not supported yet
203
+ LOG_ERROR("[DEBUG] HTTP/HTTPS URLs are not supported yet: %s", media_path.c_str());
204
+ throw std::runtime_error("HTTP/HTTPS URLs are not supported yet");
205
+ } else {
206
+ // Regular file path
207
+ LOG_INFO("[DEBUG] Loading media from file");
208
+
209
+ // Check if file exists
210
+ FILE* file = fopen(media_path.c_str(), "rb");
211
+ if (file == nullptr) {
212
+ bitmaps.entries.clear();
213
+ throw std::runtime_error("File does not exist or cannot be opened");
214
+ }
215
+
216
+ // Get file size
217
+ fseek(file, 0, SEEK_END);
218
+ long file_size = ftell(file);
219
+ fseek(file, 0, SEEK_SET);
220
+ LOG_INFO("[DEBUG] File exists and size is %ld bytes", file_size);
221
+ fclose(file);
222
+
223
+ // Create bitmap directly
224
+ mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(mtmd_wrapper->mtmd_ctx, media_path.c_str()));
225
+ if (!bmp.ptr) {
226
+ bitmaps.entries.clear();
227
+ throw std::runtime_error("Failed to load media");
228
+ }
229
+
230
+ // Calculate bitmap hash (for KV caching)
231
+ std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
232
+ bmp.set_id(hash.c_str());
233
+ LOG_INFO("[DEBUG] Bitmap hash: %s", hash.c_str());
234
+ bitmaps.entries.push_back(std::move(bmp));
235
+ result.bitmap_hashes.push_back(hash.c_str());
236
+ }
237
+ }
238
+
239
+ // Create input chunks
240
+ LOG_INFO("[DEBUG] Initializing input chunks");
241
+ result.chunks = mtmd_input_chunks_init();
242
+ if (result.chunks == nullptr) {
243
+ bitmaps.entries.clear();
244
+ throw std::runtime_error("Failed to initialize input chunks");
245
+ }
246
+
247
+ mtmd_input_text input_text;
248
+ input_text.text = prompt.c_str(); // Use the full prompt with image marker
249
+ input_text.add_special = true; // Add BOS token if this is the first message
250
+ input_text.parse_special = true; // Parse special tokens like <__media__>
251
+
252
+ /**
253
+ * Tokenize the text and media together.
254
+ *
255
+ * Example of tokenization for "foo bar <__media__> baz <__media__>":
256
+ *
257
+ * 1. Input text with media markers:
258
+ *
259
+ * "foo bar <__media__> baz <__media__>"
260
+ *
261
+ * 2. Model-specific markers are added.
262
+ *
263
+ * 3. Text is split and tokenized into chunks:
264
+ *
265
+ * ┌─────────────┐ ┌─────────────────────────┐ ┌─────────┐ ┌─────────────────────────┐
266
+ * │ TEXT CHUNK │ │ IMAGE CHUNK │ │ TEXT │ │ IMAGE CHUNK │
267
+ * │ "foo bar " │ │ │ │ " baz " │ │ │
268
+ * └─────────────┘ └─────────────────────────┘ └─────────┘ └─────────────────────────┘
269
+ * │ │ │ │
270
+ * ▼ ▼ ▼ ▼
271
+ * ┌─────────────┐ ┌─────────────────────────┐ ┌─────────┐ ┌─────────────────────────┐
272
+ * │ [1234,5678] │ │ Image Data Structure │ │ [9012] │ │ Image Data Structure │
273
+ * └─────────────┘ └─────────────────────────┘ └─────────┘ └─────────────────────────┘
274
+ *
275
+ * 4. Image token structure differences:
276
+ *
277
+ * For Qwen2VL (uses M-RoPE with 2D positions):
278
+ * ┌─────────────────────────────────────────┐
279
+ * │ MEDIA_CHUNK │
280
+ * │ ┌───────────────────────────────────┐ │
281
+ * │ │ mtmd_image_tokens: │ │
282
+ * │ │ nx = 16, ny = 16 │ │ ← 2D grid (16×16 = 256 tokens)
283
+ * │ │ use_mrope_pos = true │ │ ← Uses M-RoPE positioning
284
+ * │ │ batch_f32 = [image_embeddings] │ │
285
+ * │ └───────────────────────────────────┘ │
286
+ * └─────────────────────────────────────────┘
287
+ *
288
+ * For other models (uses 1D positions):
289
+ * ┌─────────────────────────────────────────┐
290
+ * │ MEDIA_CHUNK │
291
+ * │ ┌───────────────────────────────────┐ │
292
+ * │ │ mtmd_image_tokens: │ │
293
+ * │ │ nx = 256, ny = 1 │ │ ← 1D sequence (256 tokens)
294
+ * │ │ use_mrope_pos = false │ │ ← Uses standard positioning
295
+ * │ │ batch_f32 = [image_embeddings] │ │
296
+ * │ └───────────────────────────────────┘ │
297
+ * └─────────────────────────────────────────┘
298
+ *
299
+ * 5. Final chunks array:
300
+ * chunks[0] = TEXT_CHUNK([1234, 5678])
301
+ * chunks[1] = MEDIA_CHUNK(first_image)
302
+ * chunks[2] = TEXT_CHUNK([9012])
303
+ * chunks[3] = MEDIA_CHUNK(second_image)
304
+ */
305
+ LOG_INFO("[DEBUG] Tokenizing text and %zu media", bitmaps.entries.size());
306
+ auto bitmaps_c_ptr = bitmaps.c_ptr();
307
+ int32_t res = mtmd_tokenize(mtmd_wrapper->mtmd_ctx, result.chunks, &input_text, bitmaps_c_ptr.data(), bitmaps_c_ptr.size());
308
+ if (res != 0) {
309
+ mtmd_input_chunks_free(result.chunks);
310
+ bitmaps.entries.clear();
311
+ throw std::runtime_error("Failed to tokenize text and media");
312
+ }
313
+
314
+ // Log chunk information
315
+ size_t num_chunks = mtmd_input_chunks_size(result.chunks);
316
+ LOG_INFO("[DEBUG] Tokenization successful: num_chunks=%zu", num_chunks);
317
+
318
+ // Track the total number of tokens (both text and image)
319
+ size_t total_token_count = 0;
320
+
321
+ /**
322
+ * Evaluate the chunks.
323
+ *
324
+ * For our example "foo bar <__media__> baz <__media__>":
325
+ *
326
+ * Token organization in memory:
327
+ *
328
+ * all_tokens: [t0][t1][NULL][NULL]...[NULL][t2][NULL][NULL]...[NULL]
329
+ * positions: 0 1 2 3 ... 257 258 259 260 ... 514
330
+ * chunk_pos: 0 2 258 259
331
+ *
332
+ * Where:
333
+ * - [t0][t1] are text tokens for "foo bar " (positions 0-1)
334
+ * - [NULL]x256 are placeholder tokens for the first image (positions 2-257)
335
+ * - [t2] is the text token for " baz " (position 258)
336
+ * - [NULL]x256 are placeholder tokens for the second image (positions 259-514)
337
+ */
338
+ for (size_t i = 0; i < num_chunks; i++) {
339
+ result.chunk_pos.push_back(total_token_count);
340
+
341
+ const mtmd_input_chunk* chunk = mtmd_input_chunks_get(result.chunks, i);
342
+ mtmd_input_chunk_type chunk_type = mtmd_input_chunk_get_type(chunk);
343
+
344
+ if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
345
+ size_t n_tokens;
346
+ const llama_token* tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
347
+ LOG_INFO("[DEBUG] Chunk %zu: type=TEXT, n_tokens=%zu", i, n_tokens);
348
+
349
+ // Add text tokens
350
+ result.tokens.insert(result.tokens.end(), tokens, tokens + n_tokens);
351
+ total_token_count += n_tokens;
352
+ } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE || chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
353
+ result.chunk_pos_media.push_back(total_token_count);
354
+
355
+ size_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk);
356
+ size_t n_pos = mtmd_input_chunk_get_n_pos(chunk);
357
+ LOG_INFO("[DEBUG] Chunk %zu: type=%s, n_tokens=%zu, n_pos=%zu",
358
+ i, chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "IMAGE" : "AUDIO", n_tokens, n_pos);
359
+
360
+ for (size_t j = 0; j < n_pos; j++) {
361
+ result.tokens.push_back(LLAMA_TOKEN_NULL); // Placeholder token
362
+ }
363
+ total_token_count += n_pos;
364
+ }
365
+ }
366
+
367
+ bitmaps.entries.clear();
368
+
369
+ return result;
370
+ }
371
+
372
// Tokenize `prompt` plus `media_paths`, reconcile with the current KV cache,
// evaluate every chunk, and leave the context ready to generate.
//
// In/out parameters:
//   n_past       - advanced to the number of positions already evaluated.
//   embd         - replaced with the new flattened token stream.
//   context_full - set to true when the tokens would not fit in n_ctx.
//   ctx_sampling - receives every real (non-placeholder) token via
//                  common_sampler_accept.
// Throws std::runtime_error when the context is full or chunk evaluation /
// tokenization fails.
inline void llama_rn_context_mtmd::processMedia(
    llama_context *ctx,
    const std::string &prompt,
    const std::vector<std::string> &media_paths,
    int n_ctx,
    int n_batch,
    llama_pos &n_past,
    std::vector<llama_token> &embd,
    bool &context_full,
    common_sampler *ctx_sampling
) {
    // Multimodal path
    std::string full_prompt = prompt;
    auto default_media_marker = mtmd_default_marker();
    // Add media marker if it doesn't already exist
    if (full_prompt.find(default_media_marker) == std::string::npos) {
        full_prompt += " ";
        full_prompt += default_media_marker;
    }

    LOG_INFO("[DEBUG] Processing message with role=user, content=%s", full_prompt.c_str());
    LOG_INFO("[DEBUG] Processing %zu media with prompt: %s", media_paths.size(), prompt.c_str());
    LOG_INFO("[DEBUG] Current context state: n_past=%d, n_ctx=%d", n_past, n_ctx);

    auto result = tokenizeWithMedia(this, full_prompt, media_paths);

    auto all_tokens = result.tokens;
    auto chunks = result.chunks; // owned here; freed before every return/throw below
    auto chunk_pos = result.chunk_pos;
    auto chunk_pos_media = result.chunk_pos_media;
    auto bitmap_hashes = result.bitmap_hashes;

    // Check if we have enough context space for all tokens
    if (all_tokens.size() >= (size_t)n_ctx) {
        mtmd_input_chunks_free(chunks);
        context_full = true;
        throw std::runtime_error("Not enough context space");
    }

    // Longest prefix shared with the previously evaluated tokens — that part
    // of the KV cache can be kept.
    n_past = common_part(embd, all_tokens);

    llama_pos new_n_past = n_past;

    // Adjust n_past to position of the text chunk
    // TODO: Edit the text chunk to remove the tokens before n_past to speed up
    // need to update the mtmd api
    // NOTE(review): comparisons below mix llama_pos (signed) with size_t
    // chunk positions; n_past is non-negative here so the implicit
    // conversions are safe — confirm if that invariant ever changes.
    auto adjusted_n_past = -1;
    for (size_t i = 0; i < chunk_pos.size(); i++) {
        if (n_past < chunk_pos[i]) {
            break;
        }
        bool is_end = i + 1 == chunk_pos.size();
        if (
            chunk_pos[i] < n_past &&
            (!is_end && chunk_pos[i + 1] > n_past)
            // is_end & n_past < total_token_count:
            // don't need to adjust and it will skip eval_chunk_single, let nextToken() to finish the job
        ) {
            // n_past lands inside chunk i: rewind to the chunk start so the
            // whole chunk is re-evaluated.
            adjusted_n_past = chunk_pos[i];
        }
    }
    if (adjusted_n_past != -1) {
        n_past = adjusted_n_past;
        new_n_past = n_past;
        LOG_INFO("[DEBUG] Adjusted n_past to %d", n_past);
    }

    // Compare bitmap hashes, if they are not the same, backtrack n_past to the position of the first mismatch
    if (bitmap_past_hashes.size() > 0) {
        for (size_t i = 0; i < bitmap_hashes.size(); i++) {
            auto pos = chunk_pos_media[i];
            if (n_past < pos) {
                break; // this media was not reached by the kept prefix
            }
            if (i >= bitmap_past_hashes.size()) {
                break; // more media than last time; nothing to compare against
            }
            if (bitmap_hashes[i] != bitmap_past_hashes[i]) {
                LOG_INFO(
                    "[DEBUG] Bitmap hash mismatch at position %zu, %s != %s",
                    i, bitmap_hashes[i].c_str(), bitmap_past_hashes[i].c_str()
                );
                n_past = chunk_pos_media[i];
                new_n_past = n_past;
                break;
            }
        }
    }

    // Clear all KV cache entries after position n_past
    auto * kv = llama_get_memory(ctx);

    bool clear_result = llama_memory_seq_rm(kv, 0, n_past, -1);
    if (!clear_result) {
        // Recurrent / non-Transformer memories cannot remove a suffix; fall
        // back to a full clear and re-evaluate from scratch.
        LOG_ERROR("[DEBUG] llama_memory_seq_rm failed (likely using a non-Transformer model)! Trying full clear...");
        llama_memory_clear(kv, false);
        n_past = 0;
        new_n_past = n_past;
    }


    LOG_INFO("[DEBUG] Evaluating chunks: n_past=%d, n_batch=%d", n_past, n_batch);

    size_t num_chunks = mtmd_input_chunks_size(chunks);

    for (size_t i = 0; i < chunk_pos.size(); i++) {

        LOG_INFO("[DEBUG] Evaluating chunk %zu: n_past=%d, chunk_pos=%zu", i, n_past, chunk_pos[i]);

        // Process chunk only if it's after the current n_past
        if (chunk_pos[i] >= n_past) {
            // Request logits only for the final chunk.
            bool chunk_logits_last = (i == num_chunks - 1);
            auto chunk = mtmd_input_chunks_get(chunks, i);

            int32_t res = mtmd_helper_eval_chunk_single(
                this->mtmd_ctx,
                ctx,
                chunk,
                n_past,
                0,          // seq_id
                n_batch,
                chunk_logits_last,
                &new_n_past
            );
            if (res != 0) {
                mtmd_input_chunks_free(chunks);
                throw std::runtime_error("Failed to evaluate chunks");
            }
            n_past = new_n_past;
        }
    }

    // NOTE(review): int/size_t comparison — n_past is non-negative so the
    // implicit conversion is safe.
    if (n_past == all_tokens.size() && n_past > 0 && all_tokens[n_past - 1] != LLAMA_TOKEN_NULL) {
        // we have to evaluate at least 1 token to generate logits.
        n_past--;
    }

    // Update embd with all tokens (both text and media)
    embd = all_tokens;

    // Remember this call's media hashes for the next reconciliation pass.
    bitmap_past_hashes = bitmap_hashes;

    // Update sampling context with text tokens only
    for (auto & token : all_tokens) {
        if (token == LLAMA_TOKEN_NULL) {
            continue; // media placeholder, not a real token
        }
        common_sampler_accept(ctx_sampling, token, false);
    }

    // Clean up media resources
    LOG_INFO("[DEBUG] Cleaning up resources");
    mtmd_input_chunks_free(chunks);
}
526
+
527
// Initialize multimodal support from an mmproj (multimodal projector) file.
//
// On success: this->mtmd_ctx is set, has_multimodal becomes true, and
// context shifting is disabled on mutable_params. Throws std::runtime_error
// when `model` is null or mtmd initialization fails (has_multimodal is left
// untouched in that case).
inline llama_rn_context_mtmd::llama_rn_context_mtmd(
    const std::string &mmproj_path,
    bool use_gpu,
    llama_model *model,
    llama_context *ctx,
    const common_params &params,
    bool &has_multimodal,
    common_params &mutable_params
) {
    LOG_INFO("[DEBUG] Initializing multimodal with mmproj path: %s", mmproj_path.c_str());

    if (model == nullptr) {
        LOG_ERROR("[DEBUG] Model not loaded, cannot initialize multimodal", "");
        throw std::runtime_error("Model not loaded, cannot initialize multimodal");
    }

    LOG_INFO("[DEBUG] Model info: n_ctx=%d, n_embd=%d",
        llama_n_ctx(ctx),
        llama_model_n_embd(model));

    // Initialize mtmd context
    mtmd_context_params mtmd_params = mtmd_context_params_default();
    mtmd_params.use_gpu = use_gpu;
    mtmd_params.print_timings = false;
    mtmd_params.n_threads = params.cpuparams.n_threads;
    mtmd_params.verbosity = (lm_ggml_log_level)LM_GGML_LOG_LEVEL_INFO;

    LOG_INFO("[DEBUG] Initializing mtmd context with threads=%d", mtmd_params.n_threads);

    // NOTE: local `mtmd_ctx` shadows the member of the same name; the member
    // is only assigned after the null check below.
    auto mtmd_ctx = mtmd_init_from_file(mmproj_path.c_str(), model, mtmd_params);
    if (mtmd_ctx == nullptr) {
        LOG_ERROR("[DEBUG] Failed to initialize multimodal context with mmproj: %s", mmproj_path.c_str());
        throw std::runtime_error("Failed to initialize multimodal context");
    }
    this->mtmd_ctx = mtmd_ctx;

    has_multimodal = true;

    // Check if the model uses M-RoPE or non-causal attention (logged for
    // diagnostics only; the values are not stored).
    bool uses_mrope = mtmd_decode_use_mrope(mtmd_ctx);
    bool uses_non_causal = mtmd_decode_use_non_causal(mtmd_ctx);
    LOG_INFO("[DEBUG] Model multimodal properties: uses_mrope=%d, uses_non_causal=%d",
        uses_mrope ? 1 : 0,
        uses_non_causal ? 1 : 0);

    // Disable context shifting when multimodal is enabled
    // This is because an media chunk may contain multiple tokens
    // and context shifting could break the media representation
    mutable_params.ctx_shift = false;

    // params.n_cache_reuse = 0;

    LOG_INFO("Multimodal context initialized successfully with mmproj: %s", mmproj_path.c_str());
    LOG_INFO("Context shifting disabled for multimodal support");
}
582
+
583
+ inline llama_rn_context_mtmd::~llama_rn_context_mtmd() {
584
+ if (mtmd_ctx != nullptr) {
585
+ mtmd_free(mtmd_ctx);
586
+ mtmd_ctx = nullptr;
587
+ }
588
+ }
589
+
590
+ inline bool llama_rn_context_mtmd::isEnabled(bool has_multimodal) const {
591
+ return has_multimodal && this != nullptr;
592
+ }
593
+
594
+ inline bool llama_rn_context_mtmd::supportVision() const {
595
+ return mtmd_ctx != nullptr && mtmd_support_vision(mtmd_ctx);
596
+ }
597
+
598
+ inline bool llama_rn_context_mtmd::supportAudio() const {
599
+ return mtmd_ctx != nullptr && mtmd_support_audio(mtmd_ctx);
600
+ }
601
+
602
+ } // namespace rnllama