@fugood/llama.node 0.3.17 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193)
  1. package/CMakeLists.txt +3 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +39 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +366 -19
  24. package/src/LlamaCompletionWorker.h +30 -10
  25. package/src/LlamaContext.cpp +213 -5
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
  29. package/src/llama.cpp/.github/workflows/build.yml +41 -762
  30. package/src/llama.cpp/.github/workflows/docker.yml +5 -2
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +12 -12
  33. package/src/llama.cpp/CMakeLists.txt +5 -17
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +31 -3
  37. package/src/llama.cpp/common/arg.cpp +48 -29
  38. package/src/llama.cpp/common/chat.cpp +128 -106
  39. package/src/llama.cpp/common/chat.h +2 -0
  40. package/src/llama.cpp/common/common.cpp +37 -1
  41. package/src/llama.cpp/common/common.h +18 -9
  42. package/src/llama.cpp/common/llguidance.cpp +1 -0
  43. package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
  44. package/src/llama.cpp/common/minja/minja.hpp +69 -36
  45. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  46. package/src/llama.cpp/common/regex-partial.h +56 -0
  47. package/src/llama.cpp/common/sampling.cpp +57 -50
  48. package/src/llama.cpp/examples/CMakeLists.txt +2 -23
  49. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
  50. package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
  51. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  52. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  53. package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
  54. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  55. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  56. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  57. package/src/llama.cpp/ggml/include/ggml.h +10 -7
  58. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  60. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  61. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
  62. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
  63. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
  64. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
  65. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
  66. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  67. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  68. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  69. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
  70. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
  71. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
  72. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  73. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
  74. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
  75. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  76. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  77. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
  78. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  79. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
  80. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
  81. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
  82. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  83. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  84. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  85. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  86. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
  87. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  88. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
  89. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  90. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  91. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  92. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
  93. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
  94. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
  95. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
  96. package/src/llama.cpp/ggml/src/ggml.c +29 -20
  97. package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
  98. package/src/llama.cpp/include/llama.h +52 -11
  99. package/src/llama.cpp/requirements/requirements-all.txt +3 -3
  100. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  101. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  102. package/src/llama.cpp/src/llama-adapter.cpp +6 -0
  103. package/src/llama.cpp/src/llama-arch.cpp +3 -0
  104. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  105. package/src/llama.cpp/src/llama-batch.h +2 -1
  106. package/src/llama.cpp/src/llama-chat.cpp +17 -7
  107. package/src/llama.cpp/src/llama-chat.h +1 -0
  108. package/src/llama.cpp/src/llama-context.cpp +389 -501
  109. package/src/llama.cpp/src/llama-context.h +44 -32
  110. package/src/llama.cpp/src/llama-cparams.h +1 -0
  111. package/src/llama.cpp/src/llama-graph.cpp +20 -38
  112. package/src/llama.cpp/src/llama-graph.h +12 -8
  113. package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
  114. package/src/llama.cpp/src/llama-kv-cache.h +271 -85
  115. package/src/llama.cpp/src/llama-memory.h +11 -1
  116. package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
  117. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  118. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  119. package/src/llama.cpp/src/llama-model.cpp +316 -69
  120. package/src/llama.cpp/src/llama-model.h +8 -1
  121. package/src/llama.cpp/src/llama-quant.cpp +15 -13
  122. package/src/llama.cpp/src/llama-sampling.cpp +18 -6
  123. package/src/llama.cpp/src/llama-vocab.cpp +42 -4
  124. package/src/llama.cpp/src/llama-vocab.h +6 -0
  125. package/src/llama.cpp/src/llama.cpp +14 -0
  126. package/src/llama.cpp/tests/CMakeLists.txt +10 -2
  127. package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
  128. package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
  129. package/src/llama.cpp/tests/test-chat.cpp +3 -1
  130. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  131. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  132. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  133. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  134. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  135. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
  136. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  137. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
  138. package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
  139. package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
  140. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
  141. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
  142. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  143. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
  144. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  145. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
  146. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  147. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
  148. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
  149. package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
  150. package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
  151. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  152. package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
  153. package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
  154. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  155. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  156. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  157. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  158. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  159. package/src/llama.cpp/examples/llava/clip.h +0 -135
  160. package/src/llama.cpp/examples/llava/llava.cpp +0 -586
  161. package/src/llama.cpp/examples/llava/llava.h +0 -49
  162. package/src/llama.cpp/examples/llava/mtmd.h +0 -168
  163. package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
  164. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  165. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  166. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  167. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  168. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  169. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  170. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  171. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  172. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  173. /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
  174. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  175. /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
  176. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  177. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  178. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  179. /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
  180. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  181. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  182. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  183. /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
  184. /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
  185. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  186. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  187. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  188. /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
  189. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  190. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  191. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  192. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
  193. /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
package/src/llama.cpp/tools/mtmd/mtmd.h (new file)
@@ -0,0 +1,331 @@
+ #ifndef MTMD_H
+ #define MTMD_H
+
+ #include "ggml.h"
+ #include "llama.h"
+ #include "clip.h"
+
+ #include <stddef.h>
+ #include <stdint.h>
+ #include <stdbool.h>
+
+ #ifdef __cplusplus
+ #include <string>
+ #include <vector>
+ #include <cinttypes>
+ #include <memory>
+ #endif
+
+ /**
+  * libmtmd: A library for multimodal support in llama.cpp.
+  *
+  * WARNING: This API is experimental and subject to many BREAKING CHANGES.
+  * Issues related to API usage may receive lower priority support.
+  *
+  * For the usage, see an example in mtmd-cli.cpp
+  */
+
+ #ifdef LLAMA_SHARED
+ #    if defined(_WIN32) && !defined(__MINGW32__)
+ #        ifdef LLAMA_BUILD
+ #            define MTMD_API __declspec(dllexport)
+ #        else
+ #            define MTMD_API __declspec(dllimport)
+ #        endif
+ #    else
+ #        define MTMD_API __attribute__ ((visibility ("default")))
+ #    endif
+ #else
+ #    define MTMD_API
+ #endif
+
+ #define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ enum mtmd_input_chunk_type {
+     MTMD_INPUT_CHUNK_TYPE_TEXT,
+     MTMD_INPUT_CHUNK_TYPE_IMAGE,
+ };
+
+ // opaque types
+ struct mtmd_context;
+ struct mtmd_bitmap;
+ struct mtmd_image_tokens;
+ struct mtmd_input_chunk;
+ struct mtmd_input_chunks;
+
+ struct mtmd_input_text {
+     const char * text;
+     bool add_special;
+     bool parse_special;
+ };
+
+ //
+ // C API
+ //
+
+ typedef struct mtmd_context      mtmd_context;
+ typedef struct mtmd_bitmap       mtmd_bitmap;
+ typedef struct mtmd_image_tokens mtmd_image_tokens;
+ typedef struct mtmd_input_chunk  mtmd_input_chunk;
+ typedef struct mtmd_input_chunks mtmd_input_chunks;
+ typedef struct mtmd_input_text   mtmd_input_text;
+
+ struct mtmd_context_params {
+     bool use_gpu;
+     bool print_timings;
+     int n_threads;
+     enum ggml_log_level verbosity;
+     const char * image_marker;
+ };
+
+ MTMD_API struct mtmd_context_params mtmd_context_params_default(void);
+
+ // initialize the mtmd context
+ // return nullptr on failure
+ MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
+                                             const struct llama_model * text_model,
+                                             const struct mtmd_context_params ctx_params);
+
+ MTMD_API void mtmd_free(mtmd_context * ctx);
+
+ // whether we need to set non-causal mask before llama_decode
+ MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
+
+ // whether the current model use M-RoPE for llama_decode
+ MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
+
+
+ // mtmd_bitmap
+ //
+ // length of data must be nx * ny * 3
+ // the data is in RGBRGBRGB... format
+ MTMD_API mtmd_bitmap *         mtmd_bitmap_init    (uint32_t nx,
+                                                     uint32_t ny,
+                                                     const unsigned char * data);
+ MTMD_API uint32_t              mtmd_bitmap_get_nx  (const mtmd_bitmap * bitmap);
+ MTMD_API uint32_t              mtmd_bitmap_get_ny  (const mtmd_bitmap * bitmap);
+ MTMD_API const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap);
+ MTMD_API void                  mtmd_bitmap_free    (mtmd_bitmap * bitmap);
+ // bitmap ID is optional, but useful for KV cache tracking
+ // these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()
+ MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
+ MTMD_API void         mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id);
+
+
+ // mtmd_input_chunks
+ //
+ // this is simply a list of mtmd_input_chunk
+ // the elements can only be populated via mtmd_tokenize()
+ MTMD_API mtmd_input_chunks *      mtmd_input_chunks_init(void);
+ MTMD_API size_t                   mtmd_input_chunks_size(const mtmd_input_chunks * chunks);
+ MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get (const mtmd_input_chunks * chunks, size_t idx);
+ MTMD_API void                     mtmd_input_chunks_free(mtmd_input_chunks * chunks);
+
+ // mtmd_input_chunk
+ //
+ // the instance will be constructed via mtmd_tokenize()
+ // it will be freed along with mtmd_input_chunks
+ MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type        (const mtmd_input_chunk * chunk);
+ MTMD_API const llama_token *        mtmd_input_chunk_get_tokens_text (const mtmd_input_chunk * chunk, size_t * n_tokens_output);
+ MTMD_API const mtmd_image_tokens *  mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk);
+
+ // in case you want to use custom logic to handle the chunk (i.e. KV cache management)
+ // you can move the chunk ownership to your own code by copying it
+ // remember to free the chunk when you are done with it
+ MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk);
+ MTMD_API void               mtmd_input_chunk_free(mtmd_input_chunk * chunk);
+
+
+ // mtmd_image_tokens
+ //
+ // the instance will be constructed via mtmd_tokenize()
+ // it will be freed along with mtmd_input_chunk
+ MTMD_API size_t       mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
+ MTMD_API size_t       mtmd_image_tokens_get_nx      (const mtmd_image_tokens * image_tokens);
+ MTMD_API size_t       mtmd_image_tokens_get_ny      (const mtmd_image_tokens * image_tokens);
+ MTMD_API const char * mtmd_image_tokens_get_id      (const mtmd_image_tokens * image_tokens);
+ // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
+ MTMD_API llama_pos    mtmd_image_tokens_get_n_pos   (const mtmd_image_tokens * image_tokens);
+
+ // tokenize an input text prompt and an image
+ // the prompt must have the input image marker (default: "<__image__>") in it
+ // the marker will be replaced with the image tokens
+ // for example:
+ //   "here is an image: <__image__>\ndescribe it in detail."
+ //   this will gives 3 chunks:
+ //   1. "here is an image: <start_of_image>"
+ //   2. (image tokens)
+ //   3. "<end_of_image>\ndescribe it in detail."
+ // number of bitmaps must be equal to the number of image markers in the prompt
+ // this function is thread-safe (shared ctx)
+ // return values:
+ //   0 on success
+ //   1 on number of images not matching the number of markers
+ //   2 on image preprocessing error
+ MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
+                                mtmd_input_chunks * output,
+                                const mtmd_input_text * text,
+                                const mtmd_bitmap ** bitmaps,
+                                size_t n_bitmaps);
+
+ // returns 0 on success
+ MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
+                              const mtmd_image_tokens * image_tokens);
+
+ // get output embeddings from the last encode pass
+ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
+
+ /////////////////////////////////////////
+
+ //
+ // Helper functions (can be implemented based on other functions)
+ //
+ // Please note that these helpers are not guaranteed to be stable.
+ // BREAKING CHANGES are expected.
+ //
+
+ // helper function to construct a mtmd_bitmap from a file
+ // returns nullptr on failure
+ // this function is thread-safe
+ MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname);
+
+ // helper function to construct a mtmd_bitmap from a buffer containing a file
+ // the file content must be an image in format supported by stb_image (jpg, png, bmp, gif, etc.)
+ // returns nullptr on failure
+ // this function is thread-safe
+ MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len);
+
+ // helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
+ MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
+
+ // helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
+ // normally, n_pos is equal to n_tokens, but for M-RoPE it is different
+ MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
+
+ // helper function that automatically:
+ // 1. run llama_decode() on text chunks
+ // 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
+ // if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
+ // otherwise, returns 0 on success
+ // this function is NOT thread-safe
+ MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
+                                          struct llama_context * lctx,
+                                          const mtmd_input_chunks * chunks,
+                                          llama_pos n_past,
+                                          llama_seq_id seq_id,
+                                          int32_t n_batch,
+                                          bool logits_last,
+                                          llama_pos * new_n_past);
+
+ // works like mtmd_helper_eval_chunks(), but only for a single chunk
+ // this function is NOT thread-safe
+ MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
+                                                struct llama_context * lctx,
+                                                const mtmd_input_chunk * chunk,
+                                                llama_pos n_past,
+                                                llama_seq_id seq_id,
+                                                int32_t n_batch,
+                                                bool logits_last,
+                                                llama_pos * new_n_past);
+
+ // helper function to decode an image whose embeddings have already been calculated
+ // this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
+ // ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
+ MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
+                                                 struct llama_context * lctx,
+                                                 const mtmd_input_chunk * chunk,
+                                                 float * encoded_embd,
+                                                 llama_pos n_past,
+                                                 llama_seq_id seq_id,
+                                                 int32_t n_batch,
+                                                 llama_pos * new_n_past);
+
+ /////////////////////////////////////////
+
+ // test function, to be used in test-mtmd-c-api.c
+ MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
+
+ #ifdef __cplusplus
+ } // extern "C"
+ #endif
+
+ //
+ // C++ wrappers
+ //
+
+ #ifdef __cplusplus
+
+ namespace mtmd {
+
+ struct mtmd_context_deleter {
+     void operator()(mtmd_context * val) { mtmd_free(val); }
+ };
+ using context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
+
+ struct mtmd_bitmap_deleter {
+     void operator()(mtmd_bitmap * val) { mtmd_bitmap_free(val); }
+ };
+ using bitmap_ptr = std::unique_ptr<mtmd_bitmap, mtmd_bitmap_deleter>;
+
+ struct mtmd_input_chunks_deleter {
+     void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); }
+ };
+ using input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
+
+ struct mtmd_input_chunk_deleter {
+     void operator()(mtmd_input_chunk * val) { mtmd_input_chunk_free(val); }
+ };
+ using input_chunk_ptr = std::unique_ptr<mtmd_input_chunk, mtmd_input_chunk_deleter>;
+
+ struct bitmap {
+     bitmap_ptr ptr;
+     bitmap() : ptr(nullptr) {}
+     bitmap(mtmd_bitmap * bitmap) : ptr(bitmap) {}
+     bitmap(bitmap && other) noexcept : ptr(std::move(other.ptr)) {}
+     bitmap(uint32_t nx, uint32_t ny, const unsigned char * data) {
+         ptr.reset(mtmd_bitmap_init(nx, ny, data));
+     }
+     ~bitmap() = default;
+     uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); }
+     uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); }
+     const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); }
+     std::string id() { return mtmd_bitmap_get_id(ptr.get()); }
+     void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); }
+ };
+
+ struct bitmaps {
+     std::vector<bitmap> entries;
+     ~bitmaps() = default;
+     // return list of pointers to mtmd_bitmap
+     // example:
+     //   auto bitmaps_c_ptr = bitmaps.c_ptr();
+     //   int32_t res = mtmd_tokenize(... bitmaps_c_ptr.data(), bitmaps_c_ptr.size());
+     std::vector<const mtmd_bitmap *> c_ptr() {
+         std::vector<const mtmd_bitmap *> res(entries.size());
+         for (size_t i = 0; i < entries.size(); i++) {
+             res[i] = entries[i].ptr.get();
+         }
+         return res;
+     }
+ };
+
+ struct input_chunks {
+     input_chunks_ptr ptr;
+     input_chunks() = default;
+     input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {}
+     ~input_chunks() = default;
+     size_t size() { return mtmd_input_chunks_size(ptr.get()); }
+     const mtmd_input_chunk * operator[](size_t idx) {
+         return mtmd_input_chunks_get(ptr.get(), idx);
+     }
+ };
+
+ } // namespace mtmd
+
+ #endif
+
+ #endif
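
The doc comments in this header describe the intended call sequence: tokenize a prompt containing the image marker together with the bitmaps, then let the helpers run encode/decode. As a rough consumer-side sketch (not part of the package: the CLI-style paths, the batch size of 512, and the minimal error handling are assumptions; mtmd-cli.cpp is the real example):

    // sketch: multimodal prompt evaluation via the mtmd C API
    #include "llama.h"
    #include "mtmd.h"

    #include <cstdio>

    int main(int argc, char ** argv) {
        if (argc < 4) {
            fprintf(stderr, "usage: %s <model.gguf> <mmproj.gguf> <image>\n", argv[0]);
            return 1;
        }

        llama_model * model = llama_model_load_from_file(argv[1], llama_model_default_params());
        llama_context * lctx = llama_init_from_model(model, llama_context_default_params());

        mtmd_context * mctx = mtmd_init_from_file(argv[2], model, mtmd_context_params_default());
        if (!mctx) {
            fprintf(stderr, "failed to load mmproj\n");
            return 1;
        }

        // one bitmap per image marker in the prompt
        mtmd_input_text text;
        text.text          = "describe this image: " MTMD_DEFAULT_IMAGE_MARKER;
        text.add_special   = true;
        text.parse_special = true;

        mtmd_bitmap * bmp = mtmd_helper_bitmap_init_from_file(argv[3]);
        const mtmd_bitmap * bitmaps[] = { bmp };

        mtmd_input_chunks * chunks = mtmd_input_chunks_init();
        if (!bmp || mtmd_tokenize(mctx, chunks, &text, bitmaps, 1) != 0) {
            fprintf(stderr, "tokenization failed\n");
            return 1;
        }

        // decodes text chunks, encodes + decodes image chunks, tracks n_past
        llama_pos new_n_past = 0;
        int32_t res = mtmd_helper_eval_chunks(mctx, lctx, chunks,
                                              /*n_past*/ 0, /*seq_id*/ 0, /*n_batch*/ 512,
                                              /*logits_last*/ true, &new_n_past);
        // ...sampling of the response would continue from new_n_past here...

        mtmd_input_chunks_free(chunks);
        mtmd_bitmap_free(bmp);
        mtmd_free(mctx);
        llama_free(lctx);
        llama_model_free(model);
        return res == 0 ? 0 : 1;
    }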
package/src/llama.cpp/tools/perplexity/perplexity.cpp
@@ -1554,7 +1554,10 @@ static void multiple_choice_score(llama_context * ctx, const common_params & params) {
      if (int(batch_indeces.size()) != num_answers) {
          batch_indeces.resize(num_answers);
      }
-     for (int s = 0; s < num_answers; ++s) batch_indeces[s] = s0 + s;
+
+     for (int s = 0; s < num_answers; ++s) {
+         batch_indeces[s] = s0 + s;
+     }

      for (size_t i = 0; i < cur_task.common_prefix; ++i) {
          //llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
@@ -1970,7 +1973,6 @@ int main(int argc, char ** argv) {
      common_params params;

      params.n_ctx = 512;
-     params.logits_all = true;
      params.escape = false;

      if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
package/src/llama.cpp/tools/quantize/quantize.cpp
@@ -57,6 +57,12 @@ static const std::vector<quant_option> QUANT_OPTIONS = {
      { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
  };

+ // Quantization types. Changes to this struct must be replicated in llama-quantize.cpp
+ struct tensor_quantization {
+     std::string name;
+     ggml_type quant = GGML_TYPE_COUNT;
+ };
+
  static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE       = "quantize.imatrix.file";
  static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET    = "quantize.imatrix.dataset";
  static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES  = "quantize.imatrix.entries_count";
@@ -244,56 +250,10 @@ static ggml_type parse_ggml_type(const char * arg) {
              return type;
          }
      }
-     fprintf(stderr, "%s: invalid ggml_type '%s'\n", __func__, arg);
+     fprintf(stderr, "\n%s: invalid ggml_type '%s'\n\n", __func__, arg);
      return GGML_TYPE_COUNT;
  }

- // Allowed tensors for arbitrary quantization with --tensor-type option
- static const std::vector<std::string> ALLOWED_TENSOR_TYPE = {
-     "attn_k",
-     "attn_kv_a_mqa",
-     "attn_kv_b",
-     "attn_o",
-     "attn_output",
-     "attn_q",
-     "attn_q_a",
-     "attn_q_b",
-     "attn_qkv",
-     "attn_v",
-     "channel_mix_key",
-     "channel_mix_receptance",
-     "channel_mix_value",
-     "cls",
-     "cls.output",
-     "cross_attn_k",
-     "cross_attn_o",
-     "cross_attn_q",
-     "cross_attn_v",
-     "ffn_act",
-     "ffn_down",
-     "ffn_down_exps",
-     "ffn_down_shexp",
-     "ffn_gate",
-     "ffn_gate_exps",
-     "ffn_gate_shexp",
-     "ffn_up",
-     "ffn_up_exps",
-     "ffn_up_shexp",
-     "ssm_in",
-     "ssm_out",
-     "time_mix_gate",
-     "time_mix_key",
-     "time_mix_output",
-     "time_mix_receptance",
-     "time_mix_value",
- };
-
- // changes to this struct must be replicated in llama-quant.cpp
- struct tensor_quantization {
-     std::string name;
-     ggml_type quant = GGML_TYPE_COUNT;
- };
-
  static bool parse_tensor_type(const char * data, std::vector<tensor_quantization> & tensor_type) {
      const char * sep = strchr(data, '=');
      if (sep == nullptr) {
@@ -306,7 +266,6 @@ static bool parse_tensor_type(const char * data, std::vector<tensor_quantization> & tensor_type) {
          printf("\n%s: missing tensor name\n\n", __func__);
          return false;
      }
-
      if (const size_t qt_len = strlen(sep); qt_len == 1) {
          printf("\n%s: missing quantization type\n\n", __func__);
          return false;
@@ -315,37 +274,15 @@ static bool parse_tensor_type(const char * data, std::vector<tensor_quantization> & tensor_type) {
      std::string tn(data, tn_len);
      std::transform(tn.begin(), tn.end(), tn.begin(), tolower);
      sep++;
-     const std::string qt(sep);
-
-     bool found = false;
-     for (const auto & allowed : ALLOWED_TENSOR_TYPE) {
-         std::string tensor;
-         tensor = tn.rfind('.') != std::string::npos ? tn.substr(tn.rfind('.') + 1) : tn;
-         // handle special case of cls.output
-         std::string cls_output = "cls.output";
-         if (tn.find(cls_output) != std::string::npos) {
-             tensor = "cls.output";
-         }
-         // check if an allowed tensor exists and it's at the end of the kv string
-         if (tensor == allowed) {
-             found = true;
-             break;
-         }
-     }
-     if (!found) {
-         printf("\n%s: invalid tensor name '%s'\n\n", __func__, tn.c_str());
-         return false;
-     }
-
-     if (parse_ggml_type(qt.c_str()) == GGML_TYPE_COUNT) {
-         printf("\n%s: invalid quantization type '%s'\n\n", __func__, qt.c_str());
-         return false;
-     }
-
      tensor_quantization tqz;
      tqz.name = tn;
-     tqz.quant = parse_ggml_type(qt.c_str());
+     tqz.quant = parse_ggml_type(sep);
      tensor_type.emplace_back(std::move(tqz));
+     if (tqz.quant == GGML_TYPE_COUNT) {
+         printf("\n%s: invalid quantization type '%s'\n\n", __func__, sep);
+         return false;
+     }
+
      return true;
  }

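Net effect of the quantize.cpp hunks above: the ALLOWED_TENSOR_TYPE whitelist is gone, so --tensor-type no longer pre-validates tensor names and only the quantization type itself is checked via parse_ggml_type() (name matching is presumably deferred to llama-quant.cpp, which also changes in this release). A self-contained sketch of the simplified split-and-validate flow, with parse_type() as a stub standing in for the real parse_ggml_type():

    #include <algorithm>
    #include <cstdio>
    #include <cstring>
    #include <string>
    #include <vector>

    // stub for parse_ggml_type(); the real name-to-type table lives in quantize.cpp
    static int parse_type(const char * arg) {
        if (strcmp(arg, "q4_k") == 0) return 1;
        if (strcmp(arg, "q8_0") == 0) return 2;
        return -1; // plays the role of GGML_TYPE_COUNT
    }

    struct override_entry {
        std::string name;
        int type = -1;
    };

    // mirrors parse_tensor_type(): split on '=', lowercase the name, validate only the type
    static bool parse_override(const char * data, std::vector<override_entry> & out) {
        const char * sep = strchr(data, '=');
        if (sep == nullptr || sep == data || sep[1] == '\0') {
            fprintf(stderr, "expected <tensor>=<type>, got '%s'\n", data);
            return false;
        }
        std::string name(data, sep - data);
        std::transform(name.begin(), name.end(), name.begin(), ::tolower);
        override_entry e;
        e.name = name;
        e.type = parse_type(sep + 1);
        if (e.type < 0) {
            fprintf(stderr, "invalid quantization type '%s'\n", sep + 1);
            return false;
        }
        out.push_back(std::move(e));
        return true;
    }

    int main() {
        std::vector<override_entry> overrides;
        // e.g. `llama-quantize --tensor-type attn_q=q4_k --tensor-type ffn_down=q8_0 ...`
        bool ok = parse_override("attn_q=q4_k", overrides) &&
                  parse_override("ffn_down=q8_0", overrides);
        printf("parsed %zu overrides, ok=%d\n", overrides.size(), ok);
        return ok ? 0 : 1;
    }
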
package/src/llama.cpp/tools/rpc/rpc-server.cpp
@@ -2,24 +2,6 @@
  #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
  #endif

- #include "ggml-cpu.h"
-
- #ifdef GGML_USE_CUDA
- #include "ggml-cuda.h"
- #endif
-
- #ifdef GGML_USE_METAL
- #include "ggml-metal.h"
- #endif
-
- #ifdef GGML_USE_VULKAN
- #include "ggml-vulkan.h"
- #endif
-
- #ifdef GGML_USE_SYCL
- #include "ggml-sycl.h"
- #endif
-
  #include "ggml-rpc.h"
  #ifdef _WIN32
  #    define NOMINMAX
@@ -154,6 +136,7 @@ struct rpc_server_params {
      size_t backend_mem = 0;
      bool use_cache     = false;
      int n_threads      = std::max(1U, std::thread::hardware_concurrency()/2);
+     std::string device;
  };

  static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
@@ -161,6 +144,7 @@ static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
      fprintf(stderr, "options:\n");
      fprintf(stderr, "  -h, --help            show this help message and exit\n");
      fprintf(stderr, "  -t, --threads         number of threads for the CPU backend (default: %d)\n", params.n_threads);
+     fprintf(stderr, "  -d DEV, --device      device to use\n");
      fprintf(stderr, "  -H HOST, --host HOST  host to bind to (default: %s)\n", params.host.c_str());
      fprintf(stderr, "  -p PORT, --port PORT  port to bind to (default: %d)\n", params.port);
      fprintf(stderr, "  -m MEM, --mem MEM     backend memory size (in MB)\n");
@@ -186,6 +170,22 @@ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & params) {
              fprintf(stderr, "error: invalid number of threads: %d\n", params.n_threads);
              return false;
          }
+     } else if (arg == "-d" || arg == "--device") {
+         if (++i >= argc) {
+             return false;
+         }
+         params.device = argv[i];
+         if (ggml_backend_dev_by_name(params.device.c_str()) == nullptr) {
+             fprintf(stderr, "error: unknown device: %s\n", params.device.c_str());
+             fprintf(stderr, "available devices:\n");
+             for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+                 auto * dev = ggml_backend_dev_get(i);
+                 size_t free, total;
+                 ggml_backend_dev_memory(dev, &free, &total);
+                 printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+             }
+             return false;
+         }
      } else if (arg == "-p" || arg == "--port") {
          if (++i >= argc) {
              return false;
@@ -214,66 +214,55 @@ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & params) {
  }

  static ggml_backend_t create_backend(const rpc_server_params & params) {
-     ggml_backend_t backend = NULL;
- #ifdef GGML_USE_CUDA
-     fprintf(stderr, "%s: using CUDA backend\n", __func__);
-     backend = ggml_backend_cuda_init(0); // init device 0
-     if (!backend) {
-         fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-     }
- #elif GGML_USE_METAL
-     fprintf(stderr, "%s: using Metal backend\n", __func__);
-     backend = ggml_backend_metal_init();
-     if (!backend) {
-         fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+     ggml_backend_t backend = nullptr;
+
+     if (!params.device.empty()) {
+         ggml_backend_dev_t dev = ggml_backend_dev_by_name(params.device.c_str());
+         if (dev) {
+             backend = ggml_backend_dev_init(dev, nullptr);
+             if (!backend) {
+                 fprintf(stderr, "Failed to create backend for device %s\n", params.device.c_str());
+                 return nullptr;
+             }
+         }
      }
- #elif GGML_USE_VULKAN
-     fprintf(stderr, "%s: using Vulkan backend\n", __func__);
-     backend = ggml_backend_vk_init(0); // init device 0
+
+     // try to initialize a GPU backend first
      if (!backend) {
-         fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__);
+         backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr);
      }
- #elif GGML_USE_SYCL
-     fprintf(stderr, "%s: using SYCL backend\n", __func__);
-     backend = ggml_backend_sycl_init(0); // init device 0
+
+     // if there aren't GPU backends fallback to CPU backend
      if (!backend) {
-         fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
+         backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
      }
- #endif

-     // if there aren't GPU Backends fallback to CPU backend
-     if (!backend) {
-         fprintf(stderr, "%s: using CPU backend\n", __func__);
-         backend = ggml_backend_cpu_init();
-         ggml_backend_cpu_set_n_threads(backend, params.n_threads);
+     if (backend) {
+         fprintf(stderr, "%s: using %s backend\n", __func__, ggml_backend_name(backend));
+
+         // set the number of threads
+         ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+         ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
+         if (reg) {
+             auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+             if (ggml_backend_set_n_threads_fn) {
+                 ggml_backend_set_n_threads_fn(backend, params.n_threads);
+             }
+         }
      }
+
      return backend;
  }

- static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
- #ifdef GGML_USE_CUDA
-     ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
- #elif GGML_USE_VULKAN
-     ggml_backend_vk_get_device_memory(0, free_mem, total_mem);
- #elif GGML_USE_SYCL
-     ggml_backend_sycl_get_device_memory(0, free_mem, total_mem);
- #else
- #ifdef _WIN32
-     MEMORYSTATUSEX status;
-     status.dwLength = sizeof(status);
-     GlobalMemoryStatusEx(&status);
-     *total_mem = status.ullTotalPhys;
-     *free_mem = status.ullAvailPhys;
- #else
-     long pages = sysconf(_SC_PHYS_PAGES);
-     long page_size = sysconf(_SC_PAGE_SIZE);
-     *total_mem = pages * page_size;
-     *free_mem = *total_mem;
- #endif
- #endif
+ static void get_backend_memory(ggml_backend_t backend, size_t * free_mem, size_t * total_mem) {
+     ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+     GGML_ASSERT(dev != nullptr);
+     ggml_backend_dev_memory(dev, free_mem, total_mem);
  }

  int main(int argc, char * argv[]) {
+     ggml_backend_load_all();
+
      rpc_server_params params;
      if (!rpc_server_params_parse(argc, argv, params)) {
          fprintf(stderr, "Invalid parameters\n");
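
The create_backend() rewrite above swaps the compile-time #ifdef ladder for the runtime device registry: an explicit --device is honored first, then any GPU device, then CPU, and the thread count is applied through the generic ggml_backend_set_n_threads proc address rather than a CPU-specific call. A standalone sketch of that selection order and the matching device-memory query (assuming a ggml build where ggml_backend_load_all() can discover backends):

    #include "ggml-backend.h"

    #include <cstdio>

    int main() {
        ggml_backend_load_all(); // discover available backends (CPU, CUDA, Vulkan, ...)

        // prefer any GPU device, fall back to CPU -- same order as create_backend()
        ggml_backend_t backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr);
        if (!backend) {
            backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
        }
        if (!backend) {
            fprintf(stderr, "no usable backend\n");
            return 1;
        }
        printf("using %s backend\n", ggml_backend_name(backend));

        // one generic call replaces the old per-backend get_device_memory functions
        size_t free_mem = 0, total_mem = 0;
        ggml_backend_dev_t dev = ggml_backend_get_device(backend);
        if (dev) {
            ggml_backend_dev_memory(dev, &free_mem, &total_mem);
            printf("%zu MiB free / %zu MiB total\n", free_mem / (1024 * 1024), total_mem / (1024 * 1024));
        }

        ggml_backend_free(backend);
        return 0;
    }
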
package/src/llama.cpp/tools/rpc/rpc-server.cpp (continued)
@@ -301,7 +290,7 @@ int main(int argc, char * argv[]) {
          free_mem = params.backend_mem;
          total_mem = params.backend_mem;
      } else {
-         get_backend_memory(&free_mem, &total_mem);
+         get_backend_memory(backend, &free_mem, &total_mem);
      }
      const char * cache_dir = nullptr;
      std::string cache_dir_str;
@@ -313,14 +302,21 @@ int main(int argc, char * argv[]) {
          }
          cache_dir = cache_dir_str.c_str();
      }
-     printf("Starting RPC server v%d.%d.%d\n",
-            RPC_PROTO_MAJOR_VERSION,
-            RPC_PROTO_MINOR_VERSION,
-            RPC_PROTO_PATCH_VERSION);
-     printf("  endpoint       : %s\n", endpoint.c_str());
-     printf("  local cache    : %s\n", cache_dir ? cache_dir : "n/a");
-     printf("  backend memory : %zu MB\n", free_mem / (1024 * 1024));
-     ggml_backend_rpc_start_server(backend, endpoint.c_str(), cache_dir, free_mem, total_mem);
+
+     ggml_backend_reg_t reg = ggml_backend_reg_by_name("RPC");
+     if (!reg) {
+         fprintf(stderr, "Failed to find RPC backend\n");
+         return 1;
+     }
+
+     auto start_server_fn = (decltype(ggml_backend_rpc_start_server)*) ggml_backend_reg_get_proc_address(reg, "ggml_backend_rpc_start_server");
+     if (!start_server_fn) {
+         fprintf(stderr, "Failed to obtain RPC backend start server function\n");
+         return 1;
+     }
+
+     start_server_fn(backend, endpoint.c_str(), cache_dir, free_mem, total_mem);
+
      ggml_backend_free(backend);
      return 0;
  }
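
The server entry point follows the same registry pattern: since the RPC backend may now be a dynamically loaded module, ggml_backend_rpc_start_server() is resolved via ggml_backend_reg_get_proc_address() instead of a direct link-time call, with decltype keeping the pointer type tied to the declaration in ggml-rpc.h. A minimal sketch of just the symbol resolution (assuming an RPC-enabled build, so that the "RPC" registry entry exists):

    #include "ggml-backend.h"
    #include "ggml-rpc.h"

    #include <cstdio>

    int main() {
        ggml_backend_load_all();

        // look up the RPC backend's registry entry at runtime
        ggml_backend_reg_t reg = ggml_backend_reg_by_name("RPC");
        if (!reg) {
            fprintf(stderr, "RPC backend not available in this build\n");
            return 1;
        }

        // decltype keeps the function pointer type in sync with the header declaration
        auto start_server_fn = (decltype(ggml_backend_rpc_start_server) *)
            ggml_backend_reg_get_proc_address(reg, "ggml_backend_rpc_start_server");
        if (!start_server_fn) {
            fprintf(stderr, "symbol not exported by the RPC backend\n");
            return 1;
        }
        printf("resolved ggml_backend_rpc_start_server\n");
        return 0;
    }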