@fugood/llama.node 0.3.17 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193)
  1. package/CMakeLists.txt +3 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +39 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +366 -19
  24. package/src/LlamaCompletionWorker.h +30 -10
  25. package/src/LlamaContext.cpp +213 -5
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
  29. package/src/llama.cpp/.github/workflows/build.yml +41 -762
  30. package/src/llama.cpp/.github/workflows/docker.yml +5 -2
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +12 -12
  33. package/src/llama.cpp/CMakeLists.txt +5 -17
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +31 -3
  37. package/src/llama.cpp/common/arg.cpp +48 -29
  38. package/src/llama.cpp/common/chat.cpp +128 -106
  39. package/src/llama.cpp/common/chat.h +2 -0
  40. package/src/llama.cpp/common/common.cpp +37 -1
  41. package/src/llama.cpp/common/common.h +18 -9
  42. package/src/llama.cpp/common/llguidance.cpp +1 -0
  43. package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
  44. package/src/llama.cpp/common/minja/minja.hpp +69 -36
  45. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  46. package/src/llama.cpp/common/regex-partial.h +56 -0
  47. package/src/llama.cpp/common/sampling.cpp +57 -50
  48. package/src/llama.cpp/examples/CMakeLists.txt +2 -23
  49. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
  50. package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
  51. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  52. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  53. package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
  54. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  55. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  56. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  57. package/src/llama.cpp/ggml/include/ggml.h +10 -7
  58. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  60. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  61. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
  62. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
  63. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
  64. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
  65. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
  66. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  67. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  68. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  69. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
  70. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
  71. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
  72. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  73. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
  74. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
  75. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  76. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  77. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
  78. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  79. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
  80. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
  81. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
  82. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  83. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  84. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  85. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  86. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
  87. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  88. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
  89. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  90. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  91. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  92. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
  93. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
  94. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
  95. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
  96. package/src/llama.cpp/ggml/src/ggml.c +29 -20
  97. package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
  98. package/src/llama.cpp/include/llama.h +52 -11
  99. package/src/llama.cpp/requirements/requirements-all.txt +3 -3
  100. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  101. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  102. package/src/llama.cpp/src/llama-adapter.cpp +6 -0
  103. package/src/llama.cpp/src/llama-arch.cpp +3 -0
  104. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  105. package/src/llama.cpp/src/llama-batch.h +2 -1
  106. package/src/llama.cpp/src/llama-chat.cpp +17 -7
  107. package/src/llama.cpp/src/llama-chat.h +1 -0
  108. package/src/llama.cpp/src/llama-context.cpp +389 -501
  109. package/src/llama.cpp/src/llama-context.h +44 -32
  110. package/src/llama.cpp/src/llama-cparams.h +1 -0
  111. package/src/llama.cpp/src/llama-graph.cpp +20 -38
  112. package/src/llama.cpp/src/llama-graph.h +12 -8
  113. package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
  114. package/src/llama.cpp/src/llama-kv-cache.h +271 -85
  115. package/src/llama.cpp/src/llama-memory.h +11 -1
  116. package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
  117. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  118. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  119. package/src/llama.cpp/src/llama-model.cpp +316 -69
  120. package/src/llama.cpp/src/llama-model.h +8 -1
  121. package/src/llama.cpp/src/llama-quant.cpp +15 -13
  122. package/src/llama.cpp/src/llama-sampling.cpp +18 -6
  123. package/src/llama.cpp/src/llama-vocab.cpp +42 -4
  124. package/src/llama.cpp/src/llama-vocab.h +6 -0
  125. package/src/llama.cpp/src/llama.cpp +14 -0
  126. package/src/llama.cpp/tests/CMakeLists.txt +10 -2
  127. package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
  128. package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
  129. package/src/llama.cpp/tests/test-chat.cpp +3 -1
  130. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  131. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  132. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  133. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  134. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  135. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
  136. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  137. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
  138. package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
  139. package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
  140. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
  141. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
  142. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  143. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
  144. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  145. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
  146. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  147. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
  148. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
  149. package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
  150. package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
  151. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  152. package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
  153. package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
  154. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  155. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  156. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  157. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  158. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  159. package/src/llama.cpp/examples/llava/clip.h +0 -135
  160. package/src/llama.cpp/examples/llava/llava.cpp +0 -586
  161. package/src/llama.cpp/examples/llava/llava.h +0 -49
  162. package/src/llama.cpp/examples/llava/mtmd.h +0 -168
  163. package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
  164. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  165. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  166. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  167. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  168. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  169. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  170. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  171. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  172. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  173. /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
  174. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  175. /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
  176. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  177. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  178. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  179. /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
  180. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  181. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  182. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  183. /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
  184. /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
  185. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  186. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  187. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  188. /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
  189. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  190. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  191. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  192. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
  193. /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
package/src/llama.cpp/tools/mtmd/clip.h (new file)
@@ -0,0 +1,99 @@
+ #pragma once
+
+ #include "ggml.h"
+ #include <stddef.h>
+ #include <stdint.h>
+
+ struct clip_ctx;
+
+ struct clip_image_size {
+ int width;
+ int height;
+ };
+
+ struct clip_image_f32;
+ struct clip_image_u8_batch;
+ struct clip_image_f32_batch;
+
+ struct clip_context_params {
+ bool use_gpu;
+ enum ggml_log_level verbosity;
+ };
+
+ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);
+
+ void clip_free(struct clip_ctx * ctx);
+
+ size_t clip_embd_nbytes(const struct clip_ctx * ctx);
+ size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
+
+ int32_t clip_get_image_size (const struct clip_ctx * ctx);
+ int32_t clip_get_patch_size (const struct clip_ctx * ctx);
+ int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
+
+ // TODO: should be enum, not string
+ const char * clip_patch_merge_type(const struct clip_ctx * ctx);
+
+ const int32_t * clip_image_grid(const struct clip_ctx * ctx);
+ size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
+
+ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+
+ // for M-RoPE, this will be the number of token positions in X and Y directions
+ // for other models, X will be the total number of tokens and Y will be 1
+ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+
+ // this should be equal to the embedding dimension of the text model
+ int clip_n_mmproj_embd(const struct clip_ctx * ctx);
+
+ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
+ void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
+ struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
+
+ struct clip_image_size * clip_image_size_init(void);
+ struct clip_image_u8 * clip_image_u8_init (void);
+ struct clip_image_f32 * clip_image_f32_init(void);
+ struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava
+
+ // nx, ny are the output image dimensions
+ unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
+
+ void clip_image_size_free (struct clip_image_size * img_size);
+ void clip_image_u8_free (struct clip_image_u8 * img);
+ void clip_image_f32_free(struct clip_image_f32 * img);
+ void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
+ void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
+
+ // use for accessing underlay data of clip_image_f32_batch
+ size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
+ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
+ size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
+ struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
+
+ /**
+ * Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
+ * The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
+ */
+ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
+
+ bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
+
+ /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
+ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
+
+ /** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
+ bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
+
+ struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
+
+ bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
+ bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
+
+ int clip_is_minicpmv(const struct clip_ctx * ctx);
+ bool clip_is_glm(const struct clip_ctx * ctx);
+ bool clip_is_qwen2vl(const struct clip_ctx * ctx);
+ bool clip_is_llava(const struct clip_ctx * ctx);
+ bool clip_is_gemma3(const struct clip_ctx * ctx);
+
+ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
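The header above only declares the C API exported by clip.cpp; the diff never shows it being driven end to end. Below is a minimal, hedged usage sketch that chains the declarations in the order they are normally used (init, load, preprocess, encode, free). The file names "mmproj.gguf" and "image.jpg" and the thread count are placeholders, not part of the package, and the buffer sizing via clip_embd_nbytes assumes a model whose per-image embedding size is fixed.

// Hedged sketch: only uses functions declared in clip.h above.
#include "clip.h"
#include <cstdio>
#include <vector>

int main() {
    clip_context_params cparams;
    cparams.use_gpu   = false;                 // CPU-only for the sketch
    cparams.verbosity = GGML_LOG_LEVEL_INFO;

    clip_ctx * ctx = clip_init("mmproj.gguf", cparams);   // placeholder projector path
    if (!ctx) return 1;

    clip_image_u8 * img = clip_image_u8_init();
    if (!clip_image_load_from_file("image.jpg", img)) return 1;   // placeholder image path

    // preprocess into one or more f32 slices, then encode the first slice
    clip_image_f32_batch * batch = clip_image_f32_batch_init();
    if (!clip_image_preprocess(ctx, img, batch)) return 1;

    std::vector<float> embd(clip_embd_nbytes(ctx) / sizeof(float));
    clip_image_f32 * slice = clip_image_f32_get_img(batch, 0);
    if (!clip_image_encode(ctx, /*n_threads=*/ 4, slice, embd.data())) return 1;

    printf("encoded %d tokens x %d dims\n",
           clip_n_output_tokens(ctx, slice), clip_n_mmproj_embd(ctx));

    clip_image_f32_batch_free(batch);
    clip_image_u8_free(img);
    clip_free(ctx);
    return 0;
}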
package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp (moved from examples/llava)
@@ -63,7 +63,7 @@ static void sigint_handler(int signo) {
  #endif
 
  struct mtmd_cli_context {
- mtmd_context_ptr ctx_vision;
+ mtmd::context_ptr ctx_vision;
  common_init_result llama_init;
 
  llama_model * model;
@@ -72,6 +72,8 @@ struct mtmd_cli_context {
  llama_batch batch;
  int n_batch;
 
+ mtmd::bitmaps bitmaps;
+
  // note: we know that gemma3 template is "linear", meaning each turn is completely separated to another
  // so here we don't need to keep track of chat history
  common_chat_templates_ptr tmpls;
@@ -90,10 +92,15 @@ struct mtmd_cli_context {
  batch = llama_batch_init(params.n_batch, 0, 1);
  n_batch = params.n_batch;
 
+ if (!model || !lctx) {
+ exit(1);
+ }
+
  if (!llama_model_chat_template(model, nullptr) && params.chat_template.empty()) {
  LOG_ERR("Model does not have chat template.\n");
  LOG_ERR(" For old llava models, you may need to use '--chat-template vicuna'\n");
  LOG_ERR(" For MobileVLM models, use '--chat-template deepseek'\n");
+ LOG_ERR(" For Mistral Small 3.1, use '--chat-template mistral-v7'\n");
  exit(1);
  }
 
@@ -112,12 +119,12 @@
 
  void init_vision_context(common_params & params) {
  const char * clip_path = params.mmproj.path.c_str();
- ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{
- /* use_gpu */ params.mmproj_use_gpu,
- /* timings */ true,
- /* n_threads */ params.cpuparams.n_threads,
- /* verbosity */ params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO,
- }));
+ mtmd_context_params mparams = mtmd_context_params_default();
+ mparams.use_gpu = params.mmproj_use_gpu;
+ mparams.print_timings = true;
+ mparams.n_threads = params.cpuparams.n_threads;
+ mparams.verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
+ ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
  if (!ctx_vision.get()) {
  LOG_ERR("Failed to load vision model from %s\n", clip_path);
  exit(1);
@@ -134,13 +141,22 @@ struct mtmd_cli_context {
  antiprompt_tokens.begin()
  );
  }
+
+ bool load_image(const std::string & fname) {
+ mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str()));
+ if (!bmp.ptr) {
+ return false;
+ }
+ bitmaps.entries.push_back(std::move(bmp));
+ return true;
+ }
  };
 
  static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int n_predict) {
  llama_tokens generated_tokens;
  for (int i = 0; i < n_predict; i++) {
  if (i > n_predict || !g_is_generating || g_is_interrupted) {
- printf("\n");
+ LOG("\n");
  break;
  }
 
@@ -149,15 +165,15 @@ static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int
  common_sampler_accept(smpl, token_id, true);
 
  if (llama_vocab_is_eog(ctx.vocab, token_id) || ctx.check_antiprompt(generated_tokens)) {
- printf("\n");
+ LOG("\n");
  break; // end of generation
  }
 
- printf("%s", common_token_to_piece(ctx.lctx, token_id).c_str());
+ LOG("%s", common_token_to_piece(ctx.lctx, token_id).c_str());
  fflush(stdout);
 
  if (g_is_interrupted) {
- printf("\n");
+ LOG("\n");
  break;
  }
 
@@ -172,9 +188,7 @@ static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int
  return 0;
  }
 
- static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vector<std::string> & images_fname, bool add_bos = false) {
- std::vector<mtmd_bitmap> bitmaps;
-
+ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool add_bos = false) {
  common_chat_templates_inputs tmpl_inputs;
  tmpl_inputs.messages = {msg};
  tmpl_inputs.add_generation_prompt = true;
@@ -182,35 +196,43 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vect
  auto formatted_chat = common_chat_templates_apply(ctx.tmpls.get(), tmpl_inputs);
  LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());
 
- for (auto & fname : images_fname) {
- mtmd_bitmap bitmap;
- if (mtmd_helper_bitmap_init_from_file(fname.c_str(), bitmap)) {
- LOG_ERR("Unable to load image %s\n", fname.c_str());
- return 2; // image not found
- }
- bitmaps.push_back(std::move(bitmap));
- }
-
  mtmd_input_text text;
- text.text = formatted_chat.prompt;
+ text.text = formatted_chat.prompt.c_str();
  text.add_special = add_bos;
  text.parse_special = true;
- mtmd_input_chunks chunks;
 
  if (g_is_interrupted) return 0;
 
- int32_t res = mtmd_tokenize(ctx.ctx_vision.get(), chunks, text, bitmaps);
+ mtmd::input_chunks chunks(mtmd_input_chunks_init());
+ auto bitmaps_c_ptr = ctx.bitmaps.c_ptr();
+ int32_t res = mtmd_tokenize(ctx.ctx_vision.get(),
+ chunks.ptr.get(), // output
+ &text, // text
+ bitmaps_c_ptr.data(),
+ bitmaps_c_ptr.size());
  if (res != 0) {
  LOG_ERR("Unable to tokenize prompt, res = %d\n", res);
  return 1;
  }
 
- if (mtmd_helper_eval(ctx.ctx_vision.get(), ctx.lctx, chunks, ctx.n_past, 0, ctx.n_batch)) {
+ ctx.bitmaps.entries.clear();
+
+ llama_pos new_n_past;
+ if (mtmd_helper_eval_chunks(ctx.ctx_vision.get(),
+ ctx.lctx, // lctx
+ chunks.ptr.get(), // chunks
+ ctx.n_past, // n_past
+ 0, // seq_id
+ ctx.n_batch, // n_batch
+ true, // logits_last
+ &new_n_past)) {
  LOG_ERR("Unable to eval prompt\n");
  return 1;
  }
 
- ctx.n_past += mtmd_helper_get_n_pos(chunks);
+ ctx.n_past = new_n_past;
+
+ LOG("\n");
 
  return 0;
  }
@@ -234,14 +256,14 @@ int main(int argc, char ** argv) {
  }
 
  mtmd_cli_context ctx(params);
- printf("%s: %s\n", __func__, params.model.path.c_str());
+ LOG("%s: loading model: %s\n", __func__, params.model.path.c_str());
 
  bool is_single_turn = !params.prompt.empty() && !params.image.empty();
 
  struct common_sampler * smpl = common_sampler_init(ctx.model, params.sampling);
  int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
 
- // ctrl+C handling
+ // Ctrl+C handling
  {
  #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
  struct sigaction sigint_action;
@@ -267,7 +289,12 @@ int main(int argc, char ** argv) {
  common_chat_msg msg;
  msg.role = "user";
  msg.content = params.prompt;
- if (eval_message(ctx, msg, params.image, true)) {
+ for (const auto & image : params.image) {
+ if (!ctx.load_image(image)) {
+ return 1; // error is already printed by libmtmd
+ }
+ }
+ if (eval_message(ctx, msg, true)) {
  return 1;
  }
  if (!g_is_interrupted && generate_response(ctx, smpl, n_predict)) {
@@ -282,7 +309,6 @@ int main(int argc, char ** argv) {
  LOG("\n");
 
  bool is_first_msg = true;
- std::vector<std::string> images_fname;
  std::string content;
 
  while (!g_is_interrupted) {
@@ -307,10 +333,17 @@ int main(int argc, char ** argv) {
  continue;
  }
  g_is_generating = true;
- if (line.find("/image") == 0) {
+ if (line == "/image" || line.find("/image ") == 0) {
+ if (line.size() < 8) {
+ LOG_ERR("ERR: Missing image filename\n");
+ continue;
+ }
  std::string image = line.substr(7);
- images_fname.push_back(string_strip(image));
- content += "<__image__>";
+ if (ctx.load_image(image)) {
+ LOG("Image %s loaded\n", image.c_str());
+ content += "<__image__>";
+ }
+ // else, error is already printed by libmtmd
  continue;
  } else {
  content += line;
@@ -318,21 +351,14 @@ int main(int argc, char ** argv) {
  common_chat_msg msg;
  msg.role = "user";
  msg.content = content;
- int ret = eval_message(ctx, msg, images_fname, is_first_msg);
- if (g_is_interrupted) break;
- if (ret == 2) {
- // non-fatal error
- images_fname.clear();
- content.clear();
- continue;
- }
+ int ret = eval_message(ctx, msg, is_first_msg);
  if (ret) {
  return 1;
  }
+ if (g_is_interrupted) break;
  if (generate_response(ctx, smpl, n_predict)) {
  return 1;
  }
- images_fname.clear();
  content.clear();
  is_first_msg = false;
  }
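Taken together, these hunks migrate mtmd-cli.cpp from the old mtmd_helper_eval() flow to the new C-style mtmd API: bitmaps are loaded up front through mtmd_helper_bitmap_init_from_file, mtmd_tokenize now writes into an mtmd_input_chunks object and receives a raw array of bitmap pointers, and mtmd_helper_eval_chunks both decodes the chunks and reports the resulting n_past. A condensed sketch of that sequence follows, assuming ctx_vision, lctx and n_batch are already set up as in the surrounding code; the function name eval_one_turn and its parameters are illustrative, not from the package.

// Hedged sketch of the new mtmd call sequence shown in the hunks above.
#include "mtmd.h"
#include "llama.h"
#include <string>
#include <utility>

static int eval_one_turn(mtmd::context_ptr & ctx_vision, llama_context * lctx,
                         const std::string & prompt, const char * image_path,
                         int n_batch, llama_pos & n_past) {
    mtmd::bitmaps bitmaps;
    mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(image_path));
    if (!bmp.ptr) return 1;                       // image load failed
    bitmaps.entries.push_back(std::move(bmp));

    mtmd_input_text text;
    text.text          = prompt.c_str();          // formatted chat prompt containing <__image__>
    text.add_special   = true;                    // add BOS on the first turn
    text.parse_special = true;

    mtmd::input_chunks chunks(mtmd_input_chunks_init());
    auto bitmaps_c_ptr = bitmaps.c_ptr();         // raw mtmd_bitmap* view for the C API
    if (mtmd_tokenize(ctx_vision.get(), chunks.ptr.get(), &text,
                      bitmaps_c_ptr.data(), bitmaps_c_ptr.size()) != 0) {
        return 1;
    }

    llama_pos new_n_past = n_past;
    if (mtmd_helper_eval_chunks(ctx_vision.get(), lctx, chunks.ptr.get(),
                                n_past, /*seq_id=*/ 0, n_batch,
                                /*logits_last=*/ true, &new_n_past) != 0) {
        return 1;
    }
    n_past = new_n_past;                          // positions consumed by text + image chunks
    return 0;
}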
package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp (new file)
@@ -0,0 +1,310 @@
+ #include "mtmd.h"
+ #include "llama.h"
+
+ #include <algorithm>
+ #include <cinttypes>
+ #include <vector>
+
+ #define LOG_INF(...) fprintf(stdout, __VA_ARGS__)
+ #define LOG_ERR(...) fprintf(stderr, __VA_ARGS__)
+
+ size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) {
+ size_t n_tokens = 0;
+ for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
+ auto chunk = mtmd_input_chunks_get(chunks, i);
+ auto chunk_type = mtmd_input_chunk_get_type(chunk);
+ if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+ size_t n_tokens_text;
+ mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text);
+ n_tokens += n_tokens_text;
+ } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+ auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk);
+ n_tokens += mtmd_image_tokens_get_n_tokens(tokens_image);
+ } else {
+ GGML_ASSERT(false && "chunk type not supported");
+ }
+ }
+ return n_tokens;
+ }
+
+ llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
+ llama_pos n_pos = 0;
+ for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
+ auto chunk = mtmd_input_chunks_get(chunks, i);
+ auto chunk_type = mtmd_input_chunk_get_type(chunk);
+ if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+ size_t n_tokens_text;
+ mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text);
+ n_pos += n_tokens_text;
+ } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+ auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk);
+ n_pos += mtmd_image_tokens_get_n_pos(tokens_image);
+ } else {
+ GGML_ASSERT(false && "chunk type not supported");
+ }
+ }
+ return n_pos;
+ }
+
+ // helper struct to make working with embd batch easier
+ // note: this will be removed after llama_batch_ext refactoring
+ struct decode_embd_batch {
+ int n_pos_per_embd;
+ int n_mmproj_embd;
+ std::vector<llama_pos> pos;
+ std::vector<llama_pos> pos_view; // used by mrope
+ std::vector<int32_t> n_seq_id;
+ std::vector<llama_seq_id> seq_id_0;
+ std::vector<llama_seq_id *> seq_ids;
+ std::vector<int8_t> logits;
+ llama_batch batch;
+ decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
+ pos .resize(n_tokens * n_pos_per_embd);
+ n_seq_id.resize(n_tokens);
+ seq_ids .resize(n_tokens + 1);
+ logits .resize(n_tokens);
+ seq_id_0.resize(1);
+ seq_ids [n_tokens] = nullptr;
+ batch = {
+ /*n_tokens =*/ n_tokens,
+ /*tokens =*/ nullptr,
+ /*embd =*/ embd,
+ /*pos =*/ pos.data(),
+ /*n_seq_id =*/ n_seq_id.data(),
+ /*seq_id =*/ seq_ids.data(),
+ /*logits =*/ logits.data(),
+ };
+ }
+
+ void set_position_normal(llama_pos pos_0, llama_seq_id seq_id) {
+ seq_id_0[0] = seq_id;
+ for (int i = 0; i < batch.n_tokens; i++) {
+ batch.pos [i] = pos_0 + i;
+ batch.n_seq_id[i] = 1;
+ batch.seq_id [i] = seq_id_0.data();
+ batch.logits [i] = false;
+ }
+ }
+
+ void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
+ GGML_ASSERT(n_pos_per_embd == 4);
+ seq_id_0[0] = seq_id;
+ for (int y = 0; y < ny; y++) {
+ for (int x = 0; x < nx; x++) {
+ int i = y * nx + x;
+ pos[i ] = pos_0;
+ pos[i + batch.n_tokens ] = pos_0 + y;
+ pos[i + batch.n_tokens * 2] = pos_0 + x;
+ pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
+ }
+ }
+ for (int i = 0; i < batch.n_tokens; i++) {
+ batch.n_seq_id[i] = 1;
+ batch.seq_id [i] = seq_id_0.data();
+ batch.logits [i] = false;
+ }
+ }
+
+ llama_batch get_view(int offset, int n_tokens) {
+ llama_pos * pos_ptr;
+ pos_view.clear();
+ pos_view.reserve(n_tokens * n_pos_per_embd);
+ if (n_pos_per_embd > 1) {
+ // mrope
+ // for example, with layout of src: 1234...1234...1234...1234...
+ // offset 2 will give us dst: 34...34...34...34...
+ for (int i = 0; i < n_pos_per_embd; i++) {
+ // assume n_tokens is less than or equal to batch.n_tokens
+ // batch.n_tokens is number of **total** tokens
+ // n_tokens is number of viewed token
+ size_t src_idx = i * batch.n_tokens + offset;
+ pos_view.insert(pos_view.end(),
+ pos.data() + src_idx,
+ pos.data() + src_idx + n_tokens);
+ }
+ pos_ptr = pos_view.data();
+ } else {
+ // normal
+ pos_ptr = pos.data() + offset;
+ }
+ return {
+ /*n_tokens =*/ n_tokens,
+ /*tokens =*/ nullptr,
+ /*embd =*/ batch.embd + offset * n_mmproj_embd,
+ /*pos =*/ pos_ptr,
+ /*n_seq_id =*/ batch.n_seq_id + offset,
+ /*seq_id =*/ batch.seq_id + offset,
+ /*logits =*/ batch.logits + offset,
+ };
+ }
+ };
+
+ // Helper function for decoding an image whose embeddings have already been calculated
+ int32_t mtmd_helper_decode_image_chunk(
+ mtmd_context * ctx,
+ struct llama_context * lctx,
+ const mtmd_input_chunk * chunk,
+ float * encoded_embd,
+ llama_pos n_past,
+ llama_seq_id seq_id,
+ int32_t n_batch,
+ llama_pos * new_n_past) {
+ if (mtmd_input_chunk_get_type(chunk) != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+ LOG_ERR("failed to decode image chunk: input chunk not of image type\n");
+ return -1;
+ }
+ const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
+ if (!image_tokens) {
+ LOG_ERR("failed to decode image chunk: image tokens are null\n");
+ return -1;
+ }
+
+ const llama_model * model = llama_get_model(lctx);
+ int n_mmproj_embd = llama_model_n_embd(model);
+ int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
+
+ int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
+ int32_t i_batch = 0;
+ int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
+ decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
+
+ const int nx = mtmd_image_tokens_get_nx(image_tokens);
+ const int ny = mtmd_image_tokens_get_ny(image_tokens);
+
+ if (mtmd_decode_use_mrope(ctx)) {
+ batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
+ } else {
+ batch_embd.set_position_normal(n_past, seq_id);
+ }
+
+ if (mtmd_decode_use_non_causal(ctx)) {
+ llama_set_causal_attn(lctx, false);
+ // TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image
+ }
+
+ while (i_batch < n_img_batches) { // split into batches
+ int pos_offset = i_batch*n_batch;
+ int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
+ llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
+
+ LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
+
+ int64_t t1 = ggml_time_ms();
+ int32_t ret = llama_decode(lctx, batch_embd_view);
+ if (ret != 0) {
+ LOG_ERR("failed to decode image\n");
+ llama_set_causal_attn(lctx, true); // restore causal attn
+ return ret;
+ }
+
+ LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1);
+
+ i_batch++;
+ }
+
+ n_past += mtmd_image_tokens_get_n_pos(image_tokens);
+ *new_n_past = n_past;
+
+ if (mtmd_decode_use_non_causal(ctx)) {
+ llama_set_causal_attn(lctx, true);
+ }
+ return 0;
+ }
+
+ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
+ struct llama_context * lctx,
+ const mtmd_input_chunk * chunk,
+ llama_pos n_past,
+ llama_seq_id seq_id,
+ int32_t n_batch,
+ bool logits_last,
+ llama_pos * new_n_past) {
+ int32_t ret;
+ llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
+ auto chunk_type = mtmd_input_chunk_get_type(chunk);
+
+ if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+ size_t n_tokens;
+ const auto tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
+ // LOG_INF("decoding text chunk, n_tokens = %zu\n", n_tokens);
+ size_t i = 0;
+ while (i < n_tokens) { // split into batches
+ text_batch.n_tokens = 0; // clear the batch
+ for (; i < n_tokens && text_batch.n_tokens < n_batch; i++) {
+ text_batch.n_tokens++;
+ text_batch.token [i] = tokens[i];
+ text_batch.pos [i] = n_past++;
+ text_batch.n_seq_id[i] = 1;
+ text_batch.seq_id [i][0] = seq_id;
+ text_batch.logits [i] = false;
+ }
+ bool is_last_token = (i == n_tokens);
+ if (logits_last && is_last_token) {
+ text_batch.logits[text_batch.n_tokens - 1] = true;
+ }
+ ret = llama_decode(lctx, text_batch);
+ if (ret != 0) {
+ LOG_ERR("failed to decode text\n");
+ llama_batch_free(text_batch);
+ return ret;
+ }
+ *new_n_past += text_batch.n_tokens;
+ }
+
+ } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+ const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
+ int64_t t0 = ggml_time_ms();
+
+ LOG_INF("encoding image or slice...\n");
+
+ ret = mtmd_encode(ctx, image_tokens);
+ if (ret != 0) {
+ LOG_ERR("failed to encode image\n");
+ llama_batch_free(text_batch);
+ return ret;
+ }
+
+ LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
+
+ float * embd = mtmd_get_output_embd(ctx);
+ ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past);
+ if (ret != 0) {
+ LOG_ERR("failed to decode image\n");
+ llama_batch_free(text_batch);
+ return ret;
+ }
+ } else {
+ GGML_ABORT("chunk type not supported");
+ }
+
+ return 0;
+ }
+
+ int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
+ struct llama_context * lctx,
+ const mtmd_input_chunks * chunks,
+ llama_pos n_past,
+ llama_seq_id seq_id,
+ int32_t n_batch,
+ bool logits_last,
+ llama_pos * new_n_past) {
+ size_t n_chunks = mtmd_input_chunks_size(chunks);
+ if (n_chunks == 0) {
+ LOG_ERR("no chunks to eval\n");
+ return 0;
+ }
+
+ for (size_t i = 0; i < n_chunks; i++) {
+ bool chunk_logits_last = (i == n_chunks - 1) && logits_last;
+ auto chunk = mtmd_input_chunks_get(chunks, i);
+
+ int32_t res = mtmd_helper_eval_chunk_single(ctx, lctx, chunk, n_past, seq_id, n_batch, chunk_logits_last, &n_past);
+ if (res != 0) {
+ LOG_ERR("failed to eval chunk %zu\n", i);
+ return res;
+ }
+ *new_n_past = n_past;
+ }
+
+ return 0;
+ }
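The least obvious piece of the helper above is the M-RoPE position layout: set_position_mrope packs the four position dimensions as four consecutive blocks of n_tokens entries, and get_view slices the same offset out of each block when an image spans several decode batches. Below is a small standalone illustration of that packing, using a toy 3x2 grid and pos_0 = 10; it mirrors the logic above but is not code from the package.

// Toy illustration of the M-RoPE position packing used by decode_embd_batch above.
// Four blocks of n_tokens entries: [shared base | row (y) | column (x) | unused].
#include <cstdio>
#include <vector>

int main() {
    const int nx = 3, ny = 2, n_tokens = nx * ny;  // toy 3x2 image token grid
    const int pos_0 = 10;                          // starting position in the sequence
    std::vector<int> pos(n_tokens * 4);

    for (int y = 0; y < ny; y++) {
        for (int x = 0; x < nx; x++) {
            int i = y * nx + x;
            pos[i               ] = pos_0;         // dim 0: same base position for every image token
            pos[i + n_tokens    ] = pos_0 + y;     // dim 1: row index
            pos[i + n_tokens * 2] = pos_0 + x;     // dim 2: column index
            pos[i + n_tokens * 3] = 0;             // dim 3: unused
        }
    }

    // get_view(offset, n) takes n entries starting at `offset` from each of the four blocks,
    // matching the comment above: src "1234...1234..." with offset 2 yields "34...34...".
    for (int d = 0; d < 4; d++) {
        printf("dim %d:", d);
        for (int i = 0; i < n_tokens; i++) printf(" %d", pos[d * n_tokens + i]);
        printf("\n");
    }
    return 0;
}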