@fugood/llama.node 0.3.17 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193)
  1. package/CMakeLists.txt +3 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +39 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +366 -19
  24. package/src/LlamaCompletionWorker.h +30 -10
  25. package/src/LlamaContext.cpp +213 -5
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
  29. package/src/llama.cpp/.github/workflows/build.yml +41 -762
  30. package/src/llama.cpp/.github/workflows/docker.yml +5 -2
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +12 -12
  33. package/src/llama.cpp/CMakeLists.txt +5 -17
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +31 -3
  37. package/src/llama.cpp/common/arg.cpp +48 -29
  38. package/src/llama.cpp/common/chat.cpp +128 -106
  39. package/src/llama.cpp/common/chat.h +2 -0
  40. package/src/llama.cpp/common/common.cpp +37 -1
  41. package/src/llama.cpp/common/common.h +18 -9
  42. package/src/llama.cpp/common/llguidance.cpp +1 -0
  43. package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
  44. package/src/llama.cpp/common/minja/minja.hpp +69 -36
  45. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  46. package/src/llama.cpp/common/regex-partial.h +56 -0
  47. package/src/llama.cpp/common/sampling.cpp +57 -50
  48. package/src/llama.cpp/examples/CMakeLists.txt +2 -23
  49. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
  50. package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
  51. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  52. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  53. package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
  54. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  55. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  56. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  57. package/src/llama.cpp/ggml/include/ggml.h +10 -7
  58. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  60. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  61. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
  62. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
  63. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
  64. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
  65. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
  66. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  67. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  68. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  69. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
  70. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
  71. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
  72. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  73. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
  74. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
  75. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  76. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  77. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
  78. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  79. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
  80. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
  81. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
  82. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  83. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  84. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  85. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  86. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
  87. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  88. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
  89. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  90. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  91. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  92. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
  93. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
  94. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
  95. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
  96. package/src/llama.cpp/ggml/src/ggml.c +29 -20
  97. package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
  98. package/src/llama.cpp/include/llama.h +52 -11
  99. package/src/llama.cpp/requirements/requirements-all.txt +3 -3
  100. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  101. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  102. package/src/llama.cpp/src/llama-adapter.cpp +6 -0
  103. package/src/llama.cpp/src/llama-arch.cpp +3 -0
  104. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  105. package/src/llama.cpp/src/llama-batch.h +2 -1
  106. package/src/llama.cpp/src/llama-chat.cpp +17 -7
  107. package/src/llama.cpp/src/llama-chat.h +1 -0
  108. package/src/llama.cpp/src/llama-context.cpp +389 -501
  109. package/src/llama.cpp/src/llama-context.h +44 -32
  110. package/src/llama.cpp/src/llama-cparams.h +1 -0
  111. package/src/llama.cpp/src/llama-graph.cpp +20 -38
  112. package/src/llama.cpp/src/llama-graph.h +12 -8
  113. package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
  114. package/src/llama.cpp/src/llama-kv-cache.h +271 -85
  115. package/src/llama.cpp/src/llama-memory.h +11 -1
  116. package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
  117. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  118. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  119. package/src/llama.cpp/src/llama-model.cpp +316 -69
  120. package/src/llama.cpp/src/llama-model.h +8 -1
  121. package/src/llama.cpp/src/llama-quant.cpp +15 -13
  122. package/src/llama.cpp/src/llama-sampling.cpp +18 -6
  123. package/src/llama.cpp/src/llama-vocab.cpp +42 -4
  124. package/src/llama.cpp/src/llama-vocab.h +6 -0
  125. package/src/llama.cpp/src/llama.cpp +14 -0
  126. package/src/llama.cpp/tests/CMakeLists.txt +10 -2
  127. package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
  128. package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
  129. package/src/llama.cpp/tests/test-chat.cpp +3 -1
  130. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  131. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  132. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  133. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  134. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  135. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
  136. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  137. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
  138. package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
  139. package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
  140. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
  141. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
  142. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  143. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
  144. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  145. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
  146. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  147. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
  148. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
  149. package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
  150. package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
  151. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  152. package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
  153. package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
  154. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  155. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  156. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  157. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  158. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  159. package/src/llama.cpp/examples/llava/clip.h +0 -135
  160. package/src/llama.cpp/examples/llava/llava.cpp +0 -586
  161. package/src/llama.cpp/examples/llava/llava.h +0 -49
  162. package/src/llama.cpp/examples/llava/mtmd.h +0 -168
  163. package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
  164. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  165. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  166. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  167. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  168. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  169. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  170. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  171. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  172. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  173. /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
  174. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  175. /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
  176. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  177. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  178. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  179. /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
  180. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  181. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  182. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  183. /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
  184. /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
  185. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  186. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  187. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  188. /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
  189. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  190. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  191. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  192. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
  193. /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
package/src/LlamaCompletionWorker.cpp
@@ -1,6 +1,80 @@
 #include "LlamaCompletionWorker.h"
 #include "LlamaContext.h"
 
+// Computes FNV-1a hash of the data
+static std::string fnv_hash(const uint8_t * data, size_t len) {
+  const uint64_t fnv_prime = 0x100000001b3ULL;
+  uint64_t hash = 0xcbf29ce484222325ULL;
+
+  for (size_t i = 0; i < len; ++i) {
+    hash ^= data[i];
+    hash *= fnv_prime;
+  }
+  return std::to_string(hash);
+}
+
+static const std::string base64_chars =
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+    "abcdefghijklmnopqrstuvwxyz"
+    "0123456789+/";
+
+// Base64 decoding function
+static std::vector<uint8_t> base64_decode(const std::string &encoded_string) {
+  std::vector<uint8_t> decoded;
+  int in_len = encoded_string.size();
+  int i = 0;
+  int j = 0;
+  int in_ = 0;
+  unsigned char char_array_4[4], char_array_3[3];
+
+  while (in_len-- && (encoded_string[in_] != '=')) {
+    if (isspace(encoded_string[in_])) {
+      in_++;
+      continue;
+    }
+
+    if (encoded_string[in_] == '=' || base64_chars.find(encoded_string[in_]) == std::string::npos) {
+      break;
+    }
+
+    char_array_4[i++] = encoded_string[in_]; in_++;
+    if (i == 4) {
+      for (i = 0; i < 4; i++) {
+        char_array_4[i] = base64_chars.find(char_array_4[i]);
+      }
+
+      char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
+      char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+      char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
+
+      for (i = 0; i < 3; i++) {
+        decoded.push_back(char_array_3[i]);
+      }
+      i = 0;
+    }
+  }
+
+  if (i) {
+    for (j = i; j < 4; j++) {
+      char_array_4[j] = 0;
+    }
+
+    for (j = 0; j < 4; j++) {
+      char_array_4[j] = base64_chars.find(char_array_4[j]);
+    }
+
+    char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
+    char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+    char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
+
+    for (j = 0; j < i - 1; j++) {
+      decoded.push_back(char_array_3[j]);
+    }
+  }
+
+  return decoded;
+}
+
 size_t common_part(const std::vector<llama_token> &a,
                    const std::vector<llama_token> &b) {
   size_t i = 0;
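The two helpers added above feed the multimodal KV-cache keys: an image arrives as a file path or a data: URI, its base64 payload is decoded to bytes, and an FNV-1a digest becomes the bitmap's cache ID. A minimal sketch of that flow, reusing fnv_hash() and base64_decode() from the hunk above (note the worker itself hashes the decoded RGB bitmap, nx*ny*3 bytes, not the raw file bytes as done here for brevity):

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Assumes fnv_hash() and base64_decode() from the hunk above are in scope.
int main() {
  std::string uri = "data:image/png;base64,iVBORw0KGgo=";  // 8-byte PNG signature
  std::string payload = uri.substr(uri.find(',') + 1);     // strip the data: header
  std::vector<uint8_t> bytes = base64_decode(payload);
  printf("%zu bytes, cache id %s\n", bytes.size(),
         fnv_hash(bytes.data(), bytes.size()).c_str());
  return 0;
}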
@@ -10,6 +84,230 @@ size_t common_part(const std::vector<llama_token> &a,
   return i;
 }
 
+// Process images and add them to the tokenized input
+llama_pos processImage(
+    const mtmd_context* mtmd_ctx,
+    llama_context* ctx,
+    LlamaSessionPtr sess,
+    const std::vector<std::string>& image_paths,
+    const common_params& params,
+    std::vector<llama_token>& text_tokens
+) {
+  if (mtmd_ctx == nullptr) {
+    return false;
+  }
+
+  // Multimodal path
+  std::string full_prompt = params.prompt;
+  // Add image marker if it doesn't already exist
+  if (full_prompt.find("<__image__>") == std::string::npos) {
+    full_prompt += " <__image__>";
+  }
+
+  // Prepare bitmaps array for all images
+  mtmd::bitmaps bitmaps;
+
+  // Load all images
+  for (const auto& image_path : image_paths) {
+    fprintf(stdout, "[DEBUG] Loading image: %s\n",
+            image_path.substr(0, 50).c_str()); // Only log part of path for base64
+
+    // Check if it's a base64 image
+    if (image_path.compare(0, 11, "data:image/") == 0) {
+
+      // Parse base64 data
+      std::vector<std::string> parts;
+      size_t comma_pos = image_path.find(',');
+      if (comma_pos == std::string::npos) {
+        bitmaps.entries.clear();
+        return false;
+      }
+
+      std::string header = image_path.substr(0, comma_pos);
+      std::string base64_data = image_path.substr(comma_pos + 1);
+
+      if (header.find("base64") == std::string::npos) {
+        bitmaps.entries.clear();
+        return false;
+      }
+
+      // Decode base64
+      try {
+        // Decode base64 to binary
+        std::vector<uint8_t> image_data = base64_decode(base64_data);
+
+        // Load bitmap from memory buffer using direct initialization
+        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(image_data.data(), image_data.size()));
+        if (!bmp.ptr) {
+          bitmaps.entries.clear();
+          return false;
+        }
+
+        // Calculate bitmap hash (for KV caching)
+        std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
+        bmp.set_id(hash.c_str());
+        bitmaps.entries.push_back(std::move(bmp));
+      } catch (const std::exception& e) {
+        bitmaps.entries.clear();
+        return false;
+      }
+    } else if (image_path.compare(0, 7, "http://") == 0 || image_path.compare(0, 8, "https://") == 0) {
+      // HTTP URLs are not supported yet
+      bitmaps.entries.clear();
+      return false;
+    } else {
+      // Check if file exists
+      FILE* file = fopen(image_path.c_str(), "rb");
+      if (file == nullptr) {
+        bitmaps.entries.clear();
+        return false;
+      }
+
+      // Get file size
+      fseek(file, 0, SEEK_END);
+      long file_size = ftell(file);
+      fseek(file, 0, SEEK_SET);
+      fclose(file);
+
+      // Create bitmap directly
+      mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(image_path.c_str()));
+      if (!bmp.ptr) {
+        bitmaps.entries.clear();
+        return false;
+      }
+
+      // Calculate bitmap hash (for KV caching)
+      std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
+      bmp.set_id(hash.c_str());
+      bitmaps.entries.push_back(std::move(bmp));
+    }
+  }
+
+  mtmd_input_chunks* chunks = mtmd_input_chunks_init();
+  if (chunks == nullptr) {
+    bitmaps.entries.clear();
+    return false;
+  }
+
+  // Create input text
+  mtmd_input_text input_text;
+  input_text.text = full_prompt.c_str(); // Use the full prompt with image marker
+  input_text.add_special = true;   // Add BOS token if this is the first message
+  input_text.parse_special = true; // Parse special tokens like <__image__>
+
+  // Tokenize the text and images
+  fprintf(stdout, "[DEBUG] Tokenizing text and %zu images\n", bitmaps.entries.size());
+  auto bitmaps_c_ptr = bitmaps.c_ptr();
+
+  // Cast away const for mtmd_tokenize
+  int32_t res = mtmd_tokenize(
+      const_cast<mtmd_context*>(mtmd_ctx),
+      chunks,
+      &input_text,
+      bitmaps_c_ptr.data(),
+      bitmaps_c_ptr.size()
+  );
+
+  if (res != 0) {
+    mtmd_input_chunks_free(chunks);
+    bitmaps.entries.clear();
+    return false;
+  }
+
+  // Log chunk information
+  size_t num_chunks = mtmd_input_chunks_size(chunks);
+  fprintf(stdout, "[DEBUG] Tokenization successful: num_chunks=%zu\n", num_chunks);
+
+  // Clear text_tokens before adding new tokens
+  text_tokens.clear();
+
+  // Create a vector to store all tokens (both text and image)
+  std::vector<llama_token> all_tokens;
+
+  // Track the total number of tokens (both text and image)
+  size_t total_token_count = 0;
+
+  // chunk pos
+  std::vector<size_t> chunk_pos;
+  for (size_t i = 0; i < num_chunks; i++) {
+    chunk_pos.push_back(total_token_count);
+
+    const mtmd_input_chunk* chunk = mtmd_input_chunks_get(chunks, i);
+    mtmd_input_chunk_type chunk_type = mtmd_input_chunk_get_type(chunk);
+
+    if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+      size_t n_tokens;
+      const llama_token* tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
+
+      // Add text tokens
+      text_tokens.insert(text_tokens.end(), tokens, tokens + n_tokens);
+      all_tokens.insert(all_tokens.end(), tokens, tokens + n_tokens);
+      total_token_count += n_tokens;
+    } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+      const mtmd_image_tokens* img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
+      size_t n_tokens = mtmd_image_tokens_get_n_tokens(img_tokens);
+      size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
+
+      for (size_t j = 0; j < n_pos; j++) {
+        all_tokens.push_back(LLAMA_TOKEN_NULL);
+      }
+      total_token_count += n_pos;
+    }
+  }
+
+  llama_pos n_past = common_part(*sess->tokens_ptr(), all_tokens);
+
+  llama_pos new_n_past = n_past;
+
+  for (size_t i = 0; i < chunk_pos.size(); i++) {
+    fprintf(stdout, "[DEBUG] Evaluating chunk %zu: n_past=%d, chunk_pos=%zu\n", i, n_past, chunk_pos[i]);
+
+    // Process chunk only if it's after the current n_past
+    if (chunk_pos[i] >= new_n_past) {
+      bool chunk_logits_last = (i == num_chunks - 1);
+      auto chunk = mtmd_input_chunks_get(chunks, i);
+
+      // Cast away const for mtmd_helper_eval_chunk_single
+      int32_t res = mtmd_helper_eval_chunk_single(
+          const_cast<mtmd_context*>(mtmd_ctx),
+          ctx,
+          chunk,
+          n_past,
+          0,
+          params.n_batch, // batch size
+          chunk_logits_last,
+          &new_n_past
+      );
+
+      if (res != 0) {
+        mtmd_input_chunks_free(chunks);
+        bitmaps.entries.clear();
+        return false;
+      }
+      n_past = new_n_past;
+    }
+  }
+
+  if (n_past == total_token_count) {
+    // we have to evaluate at least 1 token to generate logits.
+    n_past--;
+  }
+
+  // Update sampling context to process token sequences
+  for (auto & token : all_tokens) {
+    if (token == LLAMA_TOKEN_NULL) {
+      continue;
+    }
+  }
+  // Set the tokens
+  sess->set_tokens(std::move(all_tokens));
+
+  // Clean up image resources
+  mtmd_input_chunks_free(chunks);
+  bitmaps.entries.clear();
+  return n_past;
+}
+
 size_t findStoppingStrings(const std::string &text,
                            const size_t last_token_size,
                            const std::vector<std::string> &stop_words) {
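processImage() flattens every chunk into one token vector: text chunks keep their real ids, while an image chunk occupying n_pos positions is padded with LLAMA_TOKEN_NULL. That alignment is what lets common_part() find a reusable KV-cache prefix on a follow-up request. A self-contained sketch of the idea (common_part() copied from this file; token values are made up):

#include <cstdint>
#include <cstdio>
#include <vector>

using llama_token = int32_t;
constexpr llama_token LLAMA_TOKEN_NULL = -1; // as in llama.h

// Longest common prefix, same rule as common_part() above.
static size_t common_part(const std::vector<llama_token> &a,
                          const std::vector<llama_token> &b) {
  size_t i = 0;
  while (i < a.size() && i < b.size() && a[i] == b[i]) i++;
  return i;
}

int main() {
  // A text chunk, then an image chunk occupying 3 positions, then new text:
  // the placeholders keep both vectors position-aligned, so everything up
  // to the first differing token can stay in the KV cache.
  std::vector<llama_token> prev = {1, 15, 27, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, 99};
  std::vector<llama_token> next = {1, 15, 27, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, 42};
  printf("reusable prefix: %zu of %zu positions\n", common_part(prev, next), next.size());
  return 0;
}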
@@ -36,9 +334,11 @@ LlamaCompletionWorker::LlamaCompletionWorker(
     const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
     Napi::Function callback, common_params params,
     std::vector<std::string> stop_words,
-    int32_t chat_format)
+    int32_t chat_format,
+    std::vector<std::string> image_paths)
     : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
-      _params(params), _stop_words(stop_words), _chat_format(chat_format) {
+      _params(params), _stop_words(stop_words), _chat_format(chat_format),
+      _image_paths(image_paths) {
   if (!callback.IsEmpty()) {
     _tsfn = Napi::ThreadSafeFunction::New(info.Env(), callback,
                                           "LlamaCompletionCallback", 0, 1);
@@ -70,18 +370,59 @@ void LlamaCompletionWorker::Execute() {
   LlamaCppSampling sampling{common_sampler_init(model, _params.sampling),
                             common_sampler_free};
 
-  std::vector<llama_token> prompt_tokens =
-      ::common_tokenize(ctx, _params.prompt, add_bos);
-  n_input = prompt_tokens.size();
-  if (_sess->tokens_ptr()->size() > 0) {
-    n_cur = common_part(*(_sess->tokens_ptr()), prompt_tokens);
-    if (n_cur == n_input) {
-      --n_cur;
+  std::vector<llama_token> prompt_tokens;
+
+  // Process images if any are provided
+  if (!_image_paths.empty()) {
+    const auto* mtmd_ctx = _sess->get_mtmd_ctx();
+
+    if (mtmd_ctx != nullptr) {
+      // Process the images and get the tokens
+      n_cur = processImage(
+          mtmd_ctx,
+          ctx,
+          _sess,
+          _image_paths,
+          _params,
+          prompt_tokens
+      );
+
+      if (n_cur <= 0) {
+        SetError("Failed to process images");
+        _sess->get_mutex().unlock();
+        return;
+      }
+
+      fprintf(stdout, "[DEBUG] Image processing successful, n_cur=%zu, tokens=%zu\n",
+              n_cur, _sess->tokens_ptr()->size());
+
+      n_input = _sess->tokens_ptr()->size();
+      if (n_cur == n_input) {
+        --n_cur;
+      }
+      n_input -= n_cur;
+      llama_kv_self_seq_rm(ctx, 0, n_cur, -1);
+    } else {
+      SetError("Multimodal context not initialized");
+      _sess->get_mutex().unlock();
+      return;
+    }
+  } else {
+    // Text-only path
+    prompt_tokens = ::common_tokenize(ctx, _params.prompt, add_bos);
+    n_input = prompt_tokens.size();
+
+    if (_sess->tokens_ptr()->size() > 0) {
+      n_cur = common_part(*(_sess->tokens_ptr()), prompt_tokens);
+      if (n_cur == n_input) {
+        --n_cur;
+      }
+      n_input -= n_cur;
+      llama_kv_self_seq_rm(ctx, 0, n_cur, -1);
     }
-    n_input -= n_cur;
-    llama_kv_cache_seq_rm(ctx, 0, n_cur, -1);
+    // Set the tokens
+    _sess->set_tokens(std::move(prompt_tokens));
   }
-  _sess->set_tokens(std::move(prompt_tokens));
 
   const int max_len = _params.n_predict < 0 ? 0 : _params.n_predict;
   _sess->tokens_ptr()->reserve(_sess->tokens_ptr()->size() + max_len);
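Both branches above apply the same reuse rule: when the cached session already covers the entire prompt, n_cur is pulled back by one so at least one token still goes through llama_decode, since logits are only produced for evaluated tokens. Worked through with hypothetical numbers:

#include <cstdio>

int main() {
  size_t n_input = 4;  // prompt length
  size_t n_cur = 4;    // common_part() result: full cache hit
  if (n_cur == n_input) {
    --n_cur;           // re-decode the last token to obtain fresh logits
  }
  n_input -= n_cur;    // 1 token left to evaluate
  // llama_kv_self_seq_rm(ctx, 0, n_cur, -1) would then drop cache cells >= n_cur.
  printf("decode %zu token(s) starting at position %zu\n", n_input, n_cur);
  return 0;
}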
@@ -99,8 +440,8 @@ void LlamaCompletionWorker::Execute() {
       const int n_left = n_cur - n_keep - 1;
       const int n_discard = n_left / 2;
 
-      llama_kv_cache_seq_rm(ctx, 0, n_keep + 1, n_keep + n_discard + 1);
-      llama_kv_cache_seq_add(ctx, 0, n_keep + 1 + n_discard, n_cur, -n_discard);
+      llama_kv_self_seq_rm(ctx, 0, n_keep + 1, n_keep + n_discard + 1);
+      llama_kv_self_seq_add(ctx, 0, n_keep + 1 + n_discard, n_cur, -n_discard);
 
       // shift the tokens
       embd->insert(embd->begin() + n_keep + 1,
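The renamed llama_kv_self_* calls implement the usual context shift: keep the first n_keep+1 tokens, drop half of what follows, and slide the survivors left so generation can continue past the window. The arithmetic, as a standalone sketch with assumed sizes:

#include <cstdio>

int main() {
  int n_cur = 4096, n_keep = 255;
  int n_left = n_cur - n_keep - 1;  // 3840 tokens past the keep zone
  int n_discard = n_left / 2;       // 1920 tokens dropped
  // llama_kv_self_seq_rm(ctx, 0, n_keep + 1, n_keep + n_discard + 1);
  // llama_kv_self_seq_add(ctx, 0, n_keep + 1 + n_discard, n_cur, -n_discard);
  printf("new n_cur = %d\n", n_cur - n_discard);  // 2176
  return 0;
}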
@@ -110,12 +451,18 @@ void LlamaCompletionWorker::Execute() {
       n_cur -= n_discard;
       _result.truncated = true;
     }
-    int ret = llama_decode(
-        ctx, llama_batch_get_one(embd->data() + n_cur, n_input));
-    if (ret < 0) {
-      SetError("Failed to decode token, code: " + std::to_string(ret));
-      break;
+
+    // For multimodal input, n_past might already be set
+    // Only decode text tokens if we have any input left
+    if (n_input > 0) {
+      int ret = llama_decode(
+          ctx, llama_batch_get_one(embd->data() + n_cur, n_input));
+      if (ret < 0) {
+        SetError("Failed to decode token, code: " + std::to_string(ret));
+        break;
+      }
     }
+
     // sample the next token
     const llama_token new_token_id =
         common_sampler_sample(sampling.get(), ctx, -1);
package/src/LlamaCompletionWorker.h
@@ -1,5 +1,11 @@
+#pragma once
+
 #include "common.hpp"
+#include <atomic>
 #include <functional>
+#include <napi.h>
+#include "tools/mtmd/mtmd.h"
+#include "tools/mtmd/clip.h"
 
 struct CompletionResult {
   std::string text = "";
@@ -14,28 +20,42 @@ class LlamaCompletionWorker : public Napi::AsyncWorker,
 public:
   LlamaCompletionWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
                         Napi::Function callback, common_params params,
-                        std::vector<std::string> stop_words = {},
-                        int32_t chat_format = 0);
+                        std::vector<std::string> stop_words,
+                        int32_t chat_format,
+                        std::vector<std::string> image_paths = {});
 
   ~LlamaCompletionWorker();
 
-  inline void Stop() { _stop = true; }
+  Napi::Promise GetPromise() { return Napi::Promise::Deferred::Promise(); }
+
+  void OnComplete(std::function<void()> cb) {
+    _onComplete = cb;
+  }
 
-  inline void onComplete(std::function<void()> cb) { _onComplete = cb; }
+  void SetStop() {
+    _stop = true;
+  }
 
 protected:
-  void Execute();
-  void OnOK();
-  void OnError(const Napi::Error &err);
+  void Execute() override;
+  void OnOK() override;
+  void OnError(const Napi::Error &err) override;
 
 private:
   LlamaSessionPtr _sess;
   common_params _params;
   std::vector<std::string> _stop_words;
   int32_t _chat_format;
-  Napi::ThreadSafeFunction _tsfn;
+  std::vector<std::string> _image_paths;
+  std::function<void()> _onComplete;
   bool _has_callback = false;
   bool _stop = false;
-  std::function<void()> _onComplete;
-  CompletionResult _result;
+  Napi::ThreadSafeFunction _tsfn;
+  struct {
+    size_t tokens_evaluated = 0;
+    size_t tokens_predicted = 0;
+    bool truncated = false;
+    bool context_full = false;
+    std::string text;
+  } _result;
 };