@fugood/llama.node 0.3.16 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. package/CMakeLists.txt +3 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +5 -0
  19. package/package.json +1 -1
  20. package/src/LlamaCompletionWorker.cpp +8 -0
  21. package/src/LlamaCompletionWorker.h +1 -0
  22. package/src/LlamaContext.cpp +3 -2
  23. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +70 -27
  25. package/src/llama.cpp/.github/workflows/docker.yml +6 -6
  26. package/src/llama.cpp/.github/workflows/server.yml +7 -11
  27. package/src/llama.cpp/CMakeLists.txt +23 -1
  28. package/src/llama.cpp/common/CMakeLists.txt +6 -3
  29. package/src/llama.cpp/common/arg.cpp +809 -105
  30. package/src/llama.cpp/common/arg.h +9 -0
  31. package/src/llama.cpp/common/chat.cpp +1 -1
  32. package/src/llama.cpp/common/common.cpp +31 -521
  33. package/src/llama.cpp/common/common.h +17 -36
  34. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  35. package/src/llama.cpp/common/llguidance.cpp +30 -47
  36. package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
  37. package/src/llama.cpp/common/minja/minja.hpp +119 -93
  38. package/src/llama.cpp/common/sampling.cpp +3 -0
  39. package/src/llama.cpp/docs/build.md +122 -7
  40. package/src/llama.cpp/examples/CMakeLists.txt +0 -9
  41. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
  43. package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
  44. package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
  45. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
  46. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
  48. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
  50. package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
  51. package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
  52. package/src/llama.cpp/examples/llava/clip.h +39 -22
  53. package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +64 -52
  55. package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
  56. package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
  57. package/src/llama.cpp/examples/llava/mtmd.h +168 -0
  58. package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
  59. package/src/llama.cpp/examples/main/main.cpp +16 -5
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
  64. package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
  65. package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
  66. package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
  67. package/src/llama.cpp/examples/run/run.cpp +14 -28
  68. package/src/llama.cpp/examples/server/httplib.h +313 -247
  69. package/src/llama.cpp/examples/server/server.cpp +238 -139
  70. package/src/llama.cpp/examples/server/utils.hpp +51 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  74. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  75. package/src/llama.cpp/examples/tts/tts.cpp +6 -9
  76. package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
  77. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  78. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  79. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  80. package/src/llama.cpp/ggml/include/ggml.h +66 -99
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  82. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  83. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  84. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  85. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  87. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  88. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  89. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  90. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
  91. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  93. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
  99. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
  101. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
  102. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
  103. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  106. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  107. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  108. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  109. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
  110. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  111. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  112. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  114. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  115. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
  116. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  117. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
  118. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
  119. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
  120. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  124. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  130. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
  131. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  133. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  134. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  135. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  136. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  137. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
  139. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
  140. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
  141. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
  142. package/src/llama.cpp/ggml/src/ggml.c +141 -245
  143. package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
  144. package/src/llama.cpp/include/llama.h +30 -11
  145. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  146. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  147. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  148. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  149. package/src/llama.cpp/requirements/requirements-all.txt +2 -0
  150. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  151. package/src/llama.cpp/src/CMakeLists.txt +3 -2
  152. package/src/llama.cpp/src/llama-adapter.cpp +37 -1
  153. package/src/llama.cpp/src/llama-arch.cpp +160 -17
  154. package/src/llama.cpp/src/llama-arch.h +16 -0
  155. package/src/llama.cpp/src/llama-chat.cpp +82 -17
  156. package/src/llama.cpp/src/llama-chat.h +6 -2
  157. package/src/llama.cpp/src/llama-context.cpp +108 -92
  158. package/src/llama.cpp/src/llama-context.h +1 -2
  159. package/src/llama.cpp/src/llama-graph.cpp +189 -119
  160. package/src/llama.cpp/src/llama-graph.h +26 -6
  161. package/src/llama.cpp/src/llama-hparams.h +13 -0
  162. package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
  163. package/src/llama.cpp/src/llama-kv-cache.h +41 -115
  164. package/src/llama.cpp/src/llama-memory.h +1 -1
  165. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  166. package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
  167. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  168. package/src/llama.cpp/src/llama-model.cpp +1760 -534
  169. package/src/llama.cpp/src/llama-model.h +13 -1
  170. package/src/llama.cpp/src/llama-quant.cpp +29 -8
  171. package/src/llama.cpp/src/llama-sampling.cpp +7 -1
  172. package/src/llama.cpp/src/llama-vocab.cpp +44 -6
  173. package/src/llama.cpp/src/llama.cpp +1 -1
  174. package/src/llama.cpp/tests/CMakeLists.txt +43 -30
  175. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  176. package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
  177. package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
  178. package/src/llama.cpp/tests/test-chat.cpp +12 -2
  179. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  180. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  181. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  182. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  183. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  184. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  185. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  186. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  187. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  188. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  189. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  190. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  191. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  192. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  193. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  194. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  195. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  196. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  197. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  198. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  199. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  200. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  201. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  202. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
package/src/llama.cpp/examples/llava/mtmd.h (new file)
@@ -0,0 +1,168 @@
+ #ifndef MTMD_H
+ #define MTMD_H
+
+ #include "ggml.h"
+ #include "llama.h"
+ #include "clip.h"
+
+ #include <vector>
+ #include <cinttypes>
+ #include <memory>
+
+ #ifdef LLAMA_SHARED
+ #    if defined(_WIN32) && !defined(__MINGW32__)
+ #        ifdef LLAMA_BUILD
+ #            define MTMD_API __declspec(dllexport)
+ #        else
+ #            define MTMD_API __declspec(dllimport)
+ #        endif
+ #    else
+ #        define MTMD_API __attribute__ ((visibility ("default")))
+ #    endif
+ #else
+ #    define MTMD_API
+ #endif
+
+ #ifdef __cplusplus
+
+ enum mtmd_input_chunk_type {
+     MTMD_INPUT_CHUNK_TYPE_TEXT,
+     MTMD_INPUT_CHUNK_TYPE_IMAGE,
+ };
+
+ struct mtmd_context;
+ struct mtmd_image_tokens;
+
+ // represents raw image data, layout is RGBRGBRGB...
+ // length of data must be nx * ny * 3
+ struct mtmd_bitmap {
+     uint32_t nx;
+     uint32_t ny;
+     std::vector<unsigned char> data;
+     std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
+ };
+
+ struct mtmd_image_tokens_deleter {
+     void operator()(mtmd_image_tokens * val); // forward declaration
+ };
+ using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;
+
+ struct mtmd_input_chunk {
+     mtmd_input_chunk_type type;
+     std::vector<llama_token> tokens_text;
+     mtmd_image_tokens_ptr tokens_image;
+ };
+
+ using mtmd_input_chunks = std::vector<mtmd_input_chunk>;
+
+ struct mtmd_context_params {
+     bool use_gpu = true;
+     bool print_timings = true;
+     int n_threads = 4;
+     enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO;
+     const char * image_marker = "<__image__>";
+ };
+
+ struct mtmd_input_text {
+     std::string text;
+     bool add_special;
+     bool parse_special;
+ };
+
+ // initialize the mtmd context
+ // return nullptr on failure
+ MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
+                                             const llama_model * text_model,
+                                             const mtmd_context_params ctx_params);
+
+ MTMD_API void mtmd_free(mtmd_context * ctx);
+
+ // tokenize an input text prompt and an image
+ // the prompt must have the input image marker (default: "<__image__>") in it
+ // the marker will be replaced with the image tokens
+ // for example:
+ // "here is an image: <__image__>\ndescribe it in detail."
+ // this will gives 3 chunks:
+ // 1. "here is an image: <start_of_image>"
+ // 2. (image tokens)
+ // 3. "<end_of_image>\ndescribe it in detail."
+ // number of bitmaps must be equal to the number of image markers in the prompt
+ // this function is thread-safe (shared ctx)
+ // return values:
+ // 0 on success
+ // 1 on number of images not matching the number of markers
+ // 2 on image preprocessing error
+ MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
+                                std::vector<mtmd_input_chunk> & output,
+                                const mtmd_input_text & text,
+                                const std::vector<mtmd_bitmap> & bitmaps);
+
+ // access mtmd_image_tokens
+ MTMD_API size_t      mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
+ MTMD_API size_t      mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
+ MTMD_API size_t      mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
+ MTMD_API std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens);
+ MTMD_API llama_pos   mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens); // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
+ MTMD_API void        mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);
+
+ // returns 0 on success
+ MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
+                              const mtmd_image_tokens * image_tokens);
+
+ // get output embeddings from the last encode pass
+ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
+
+ // whether we need to set non-causal mask before llama_decode
+ MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
+
+ // whether the current model use M-RoPE for llama_decode
+ MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
+
+
+
+ //
+ // helper functions (can be implemented based on other functions)
+ //
+
+ // helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
+ MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks);
+
+ // helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
+ MTMD_API llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks);
+
+ // helper function that automatically:
+ // 1. run llama_decode() on text chunks
+ // 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
+ // if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
+ // otherwise, returns 0 on success
+ MTMD_API int32_t mtmd_helper_eval(mtmd_context * ctx,
+                                   llama_context * lctx,
+                                   mtmd_input_chunks & chunks,
+                                   llama_pos pos0,
+                                   llama_seq_id seq_id,
+                                   int32_t n_batch);
+
+ // helper function to construct a mtmd_bitmap from a file
+ // returns 0 on success
+ // this function is thread-safe
+ MTMD_API int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & output);
+
+ // helper function to construct a mtmd_bitmap from a buffer
+ // the buffer must be an image in format supported by stb_image (jpg, png, bmp, gif, etc.)
+ // returns 0 on success
+ // this function is thread-safe
+ MTMD_API int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output);
+
+ // convenient unique_ptr wrappers
+ struct mtmd_context_deleter {
+     void operator()(mtmd_context * val) { mtmd_free(val); }
+ };
+ using mtmd_context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
+
+ #else
+
+ static_assert(false && "C header is not yet supported by this library");
+
+ #endif
+
+ #endif
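The comments in mtmd.h above already describe the intended flow: tokenize a prompt containing the image marker together with one bitmap per marker into chunks, then let the helper encode the image chunks and decode everything. The following is only a minimal sketch assembled from those declarations; the mmproj path, image path, prompt, and n_batch are placeholder values and error handling is abbreviated.

```cpp
// Hedged usage sketch of the new mtmd API; file names and n_batch are placeholders.
#include "llama.h"
#include "mtmd.h"

#include <cstdio>
#include <vector>

static int describe_image(const llama_model * model, llama_context * lctx) {
    mtmd_context_params mparams;       // defaults: use_gpu=true, 4 threads, "<__image__>" marker
    mtmd_context_ptr ctx(mtmd_init_from_file("mmproj.gguf", model, mparams));
    if (!ctx) {
        return 1;                      // nullptr on failure, per the header comment
    }

    mtmd_bitmap bitmap;
    if (mtmd_helper_bitmap_init_from_file("image.jpg", bitmap) != 0) {
        return 1;
    }

    mtmd_input_text text;
    text.text          = "here is an image: <__image__>\ndescribe it in detail.";
    text.add_special   = true;
    text.parse_special = true;

    mtmd_input_chunks chunks;
    if (mtmd_tokenize(ctx.get(), chunks, text, { bitmap }) != 0) {
        return 1;                      // 1 = marker/bitmap count mismatch, 2 = preprocessing error
    }

    // decodes text chunks and encodes + decodes image chunks in one pass
    if (mtmd_helper_eval(ctx.get(), lctx, chunks, /*pos0=*/0, /*seq_id=*/0, /*n_batch=*/512) != 0) {
        return 1;
    }

    printf("evaluated %zu tokens\n", mtmd_helper_get_n_tokens(chunks));
    return 0;   // sampling from lctx would then proceed as in any other llama.cpp example
}
```

For a complete driver of this API, see the new examples/llava/mtmd-cli.cpp listed in the files above.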
package/src/llama.cpp/examples/llava/qwen2vl-test.cpp (renamed from qwen2vl-cli.cpp)
@@ -23,7 +23,12 @@
  #include <algorithm>
  #include <iostream>
  #include <fstream>
+ #include <limits>
+ #include <cassert>
+ #include <cmath>
 
+ // THIS FILE IS ONLY USED FOR TESTING THE QWEN2VL MODEL
+ // IT IS NOT A PRODUCTION CODE
 
  static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
                                       int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
@@ -89,20 +94,12 @@ static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct lla
 
  static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past, int * st_pos_id) {
      int N = (int) tokens.size();
-     std::vector<llama_pos> pos;
      for (int i = 0; i < N; i += n_batch) {
          int n_eval = (int) tokens.size() - i;
          if (n_eval > n_batch) {
              n_eval = n_batch;
          }
          auto batch = llama_batch_get_one(&tokens[i], n_eval);
-         // TODO: add mrope pos ids somewhere else
-         pos.resize(batch.n_tokens * 4);
-         std::fill(pos.begin(), pos.end(), 0);
-         for (int j = 0; j < batch.n_tokens * 3; j ++) {
-             pos[j] = *st_pos_id + (j % batch.n_tokens);
-         }
-         batch.pos = pos.data();
 
          if (llama_decode(ctx_llama, batch)) {
              LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
@@ -314,7 +311,7 @@ static struct llama_model * llava_init(common_params * params) {
 
      llama_model_params model_params = common_model_params_to_llama(*params);
 
-     llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
+     llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
      if (model == NULL) {
          LOG_ERR("%s: unable to load model\n" , __func__);
          return NULL;
@@ -323,14 +320,14 @@
  }
 
  static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
-     const char * clip_path = params->mmproj.c_str();
+     const char * clip_path = params->mmproj.path.c_str();
 
      auto prompt = params->prompt;
      if (prompt.empty()) {
          prompt = "describe the image in detail.";
      }
 
-     auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+     auto ctx_clip = clip_model_load(clip_path, GGML_LOG_LEVEL_INFO);
 
      llama_context_params ctx_params = common_context_params_to_llama(*params);
      ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
@@ -367,14 +364,14 @@ static void debug_test_mrope_2d() {
      // 1. Initialize backend
      ggml_backend_t backend = NULL;
      std::string backend_name = "";
- #ifdef GGML_USE_CUDA
-     fprintf(stderr, "%s: using CUDA backend\n", __func__);
-     backend = ggml_backend_cuda_init(0); // init device 0
-     backend_name = "cuda";
-     if (!backend) {
-         fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-     }
- #endif
+     // #ifdef GGML_USE_CUDA
+     // fprintf(stderr, "%s: using CUDA backend\n", __func__);
+     // backend = ggml_backend_cuda_init(0); // init device 0
+     // backend_name = "cuda";
+     // if (!backend) {
+     //     fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+     // }
+     // #endif
      // if there aren't GPU Backends fallback to CPU backend
      if (!backend) {
          backend = ggml_backend_cpu_init();
@@ -483,28 +480,82 @@ static void debug_test_mrope_2d() {
      ggml_backend_free(backend);
  }
 
- static void debug_dump_img_embed(struct llava_context * ctx_llava) {
-     int n_embd = llama_model_n_embd(llama_get_model(ctx_llava->ctx_llama));
-     int ne = n_embd * 4;
-     float vals[56 * 56 * 3];
+ enum model_output_type {
+     conv3d,
+     patch_embed,
+     patch_win_attn_scatter,
+     first_attn_layer,
+     last_attn_layer,
+     attn_softmax,
+     final_layer,
+ };
+
+ static void debug_dump_img_embed(struct llava_context * ctx_llava, model_output_type output_type) {
+     constexpr int ih = 140;
+     constexpr int iw = 196;
+     // constexpr int ih = 56;
+     // constexpr int iw = 56;
+     // int n_embd = llama_model_n_embd(llama_get_model(ctx_llava->ctx_llama));
+     int n_embd = 1280;
+     int merge = 1;
+     if (output_type == model_output_type::final_layer) {
+         n_embd = 2048;
+         merge = 2;
+     }
+     else if (output_type == model_output_type::attn_softmax) {
+         merge = 1;
+         n_embd = (ih/14/merge) * (iw/14/merge) * 16;
+     }
+
+     int ne = (ih/14/merge) * (iw/14/merge) * n_embd;
+     float vals[iw * ih * 3];
      // float embd[ne];
      std::vector<float> embd;
      embd.resize(ne);
 
-     for (int i = 0; i < 56*56; i++)
+     for (int i = 0; i < iw*ih; i++)
      {
          for (int c = 0; c < 3; c++)
-             vals[i * 3 + c] = (float)(i % (56 * 56)) / (56*56);
+             vals[i * 3 + c] = (float)i / (iw*ih);
      }
 
-     clip_encode_float_image(ctx_llava->ctx_clip, 16, vals, 56, 56, embd.data());
+     clip_encode_float_image(ctx_llava->ctx_clip, 8, vals, ih, iw, embd.data());
+
+     std::string file_postfix = "";
+     switch (output_type)
+     {
+     case model_output_type::conv3d:
+         file_postfix = "conv3d";
+         break;
+     case model_output_type::patch_embed:
+         file_postfix = "patch_embed";
+         break;
+     case model_output_type::patch_win_attn_scatter:
+         file_postfix = "scatter";
+         break;
+     case model_output_type::first_attn_layer:
+         file_postfix = "first_attn";
+         break;
+     case model_output_type::last_attn_layer:
+         file_postfix = "last_attn";
+         break;
+     case model_output_type::attn_softmax:
+         file_postfix = "attn_softmax";
+         break;
+     case model_output_type::final_layer:
+         file_postfix = "final";
+         break;
+     default:
+         break;
+     }
+     auto output_path = "img_embed_" + file_postfix + ".bin";
 
-     std::ofstream outFile("img_embed.bin", std::ios::binary);
+     std::ofstream outFile(output_path, std::ios::binary);
      if (outFile.is_open()) {
          outFile.write(reinterpret_cast<const char*>(embd.data()), ne * sizeof(float));
 
          outFile.close();
-         std::cout << "Data successfully written to mrope.bin" << std::endl;
+         std::cout << "Data successfully written to ::[ " << output_path << std::endl;
      } else {
          std::cerr << "Error opening file!" << std::endl;
      }
@@ -524,7 +575,7 @@ int main(int argc, char ** argv) {
 
      common_init();
 
-     if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
+     if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
          print_usage(argc, argv);
          return 1;
      }
@@ -551,8 +602,9 @@
      } else if (params.image[0].empty()) {
          auto ctx_llava = llava_init_context(&params, model);
 
-         debug_test_mrope_2d();
-         debug_dump_img_embed(ctx_llava);
+         // debug_test_mrope_2d();
+         debug_dump_img_embed(ctx_llava, model_output_type::final_layer);
+         // debug_dump_img_embed(ctx_llava, model_output_type::last_attn_layer);
 
          llama_perf_context_print(ctx_llava->ctx_llama);
          ctx_llava->model = NULL;
package/src/llama.cpp/examples/main/main.cpp
@@ -865,9 +865,22 @@ int main(int argc, char ** argv) {
              console::set_display(console::reset);
              display = true;
 
-             // Add tokens to embd only if the input buffer is non-empty
-             // Entering a empty line lets the user pass control back
-             if (buffer.length() > 1) {
+             if (buffer.empty()) { // Ctrl+D on empty line exits
+                 LOG("EOF by user\n");
+                 break;
+             }
+
+             if (buffer.back() == '\n') {
+                 // Implement #587:
+                 // If the user wants the text to end in a newline,
+                 // this should be accomplished by explicitly adding a newline by using \ followed by return,
+                 // then returning control by pressing return again.
+                 buffer.pop_back();
+             }
+
+             if (buffer.empty()) { // Enter key on empty line lets the user pass control back
+                 LOG_DBG("empty line, passing control back\n");
+             } else { // Add tokens to embd only if the input buffer is non-empty
                  // append input suffix if any
                  if (!params.input_suffix.empty() && !params.conversation_mode) {
                      LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
@@ -915,8 +928,6 @@
 
                  n_remain -= line_inp.size();
                  LOG_DBG("n_remain: %d\n", n_remain);
-             } else {
-                 LOG_DBG("empty line, passing control back\n");
              }
 
              input_echo = false; // do not echo this again
package/src/llama.cpp/examples/parallel/parallel.cpp
@@ -106,6 +106,8 @@ int main(int argc, char ** argv) {
 
      common_params params;
 
+     params.n_predict = 128;
+
      if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
          return 1;
      }
@@ -405,7 +407,7 @@ int main(int argc, char ** argv) {
          params.prompt_file = "used built-in defaults";
      }
      LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
-     LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
+     LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.path.c_str());
 
      LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
      LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
package/src/llama.cpp/examples/passkey/passkey.cpp
@@ -64,7 +64,7 @@ int main(int argc, char ** argv) {
 
      llama_model_params model_params = common_model_params_to_llama(params);
 
-     llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
 
      if (model == NULL) {
          LOG_ERR("%s: unable to load model\n" , __func__);
package/src/llama.cpp/examples/perplexity/perplexity.cpp
@@ -851,7 +851,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
 
      LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__);
 
-     LOG("\ntask\tacc_norm\n");
+     LOG("\ntask\tacc_norm\t95%% confidence interval\n");
 
      double acc = 0.0f;
 
@@ -985,8 +985,22 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
              acc += 1.0;
          }
 
-         // Print the accumulated accuracy mean x 100
-         LOG("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
+         double freq = acc / double(i + 1);
+
+         const double za = 1.95996398454;
+
+         // // Wald normal approx
+         // double conf =za*sqrt(freq*(1-freq)/double(i + 1));
+         // LOG("%zu\t%.8lf +/- %.8lf\n", i + 1, freq*100.0, conf*100.0);
+
+         // Wilson score interval, more accurate
+         double z = za * za / double(i + 1);
+         double cnf = z * sqrt(double(i + 1) * (4.0 * freq * (1 - freq) + z)) / (za + za);
+         double a = (freq + z * 0.5 - cnf) / (1.0 + z);
+         double b = (freq + z * 0.5 + cnf) / (1.0 + z);
+
+         // Print the accumulated accuracy mean x 100 and confidence interval
+         LOG("%zu\t%3.8lf%%\t[%3.4lf%%, %3.4lf%%]\n", i + 1, freq * 100.0, a * 100.0, b * 100.0);
      }
 
      i0 = i1 - 1;
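For readers following the hellaswag change above: with running accuracy \(\hat p = \text{acc}/n\) over \(n = i+1\) tasks and \(z \approx 1.96\), the bracketed bounds `a` and `b` printed by the new code are the standard Wilson score interval,

\[
[a,\ b] \;=\; \frac{\hat p + \frac{z^2}{2n} \;\pm\; z\sqrt{\frac{\hat p(1-\hat p)}{n} + \frac{z^2}{4n^2}}}{1 + \frac{z^2}{n}},
\]

which the `cnf`/`a`/`b` expressions evaluate in an algebraically equivalent form; the commented-out lines are the simpler Wald approximation it replaces.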
package/src/llama.cpp/examples/quantize/quantize.cpp
@@ -9,6 +9,7 @@
  #include <fstream>
  #include <cmath>
  #include <cctype>
+ #include <algorithm>
 
  struct quant_option {
      std::string name;
@@ -16,7 +17,7 @@ struct quant_option {
      std::string desc;
  };
 
- static const std::vector<struct quant_option> QUANT_OPTIONS = {
+ static const std::vector<quant_option> QUANT_OPTIONS = {
      { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
      { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
      { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },
@@ -105,7 +106,8 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
  //
  [[noreturn]]
  static void usage(const char * executable) {
-     printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+     printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type]\n", executable);
+     printf(" [--token-embedding-type] [--tensor-type] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
      printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
      printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
      printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
@@ -114,6 +116,8 @@ static void usage(const char * executable) {
      printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
      printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
      printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
+     printf(" --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n");
+     printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n");
      printf(" --keep-split: will generate quantized model in the same shards as input\n");
      printf(" --override-kv KEY=TYPE:VALUE\n");
      printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
@@ -244,6 +248,107 @@ static ggml_type parse_ggml_type(const char * arg) {
      return GGML_TYPE_COUNT;
  }
 
+ // Allowed tensors for arbitrary quantization with --tensor-type option
+ static const std::vector<std::string> ALLOWED_TENSOR_TYPE = {
+     "attn_k",
+     "attn_kv_a_mqa",
+     "attn_kv_b",
+     "attn_o",
+     "attn_output",
+     "attn_q",
+     "attn_q_a",
+     "attn_q_b",
+     "attn_qkv",
+     "attn_v",
+     "channel_mix_key",
+     "channel_mix_receptance",
+     "channel_mix_value",
+     "cls",
+     "cls.output",
+     "cross_attn_k",
+     "cross_attn_o",
+     "cross_attn_q",
+     "cross_attn_v",
+     "ffn_act",
+     "ffn_down",
+     "ffn_down_exps",
+     "ffn_down_shexp",
+     "ffn_gate",
+     "ffn_gate_exps",
+     "ffn_gate_shexp",
+     "ffn_up",
+     "ffn_up_exps",
+     "ffn_up_shexp",
+     "ssm_in",
+     "ssm_out",
+     "time_mix_gate",
+     "time_mix_key",
+     "time_mix_output",
+     "time_mix_receptance",
+     "time_mix_value",
+ };
+
+ // changes to this struct must be replicated in llama-quant.cpp
+ struct tensor_quantization {
+     std::string name;
+     ggml_type quant = GGML_TYPE_COUNT;
+ };
+
+ static bool parse_tensor_type(const char * data, std::vector<tensor_quantization> & tensor_type) {
+     const char * sep = strchr(data, '=');
+     if (sep == nullptr) {
+         printf("\n%s: malformed tensor type '%s'\n\n", __func__, data);
+         return false;
+     }
+
+     const size_t tn_len = sep - data;
+     if (tn_len == 0) {
+         printf("\n%s: missing tensor name\n\n", __func__);
+         return false;
+     }
+
+     if (const size_t qt_len = strlen(sep); qt_len == 1) {
+         printf("\n%s: missing quantization type\n\n", __func__);
+         return false;
+     }
+
+     std::string tn(data, tn_len);
+     std::transform(tn.begin(), tn.end(), tn.begin(), tolower);
+     sep++;
+     const std::string qt(sep);
+
+     bool found = false;
+     for (const auto & allowed : ALLOWED_TENSOR_TYPE) {
+         std::string tensor;
+         tensor = tn.rfind('.') != std::string::npos ? tn.substr(tn.rfind('.') + 1) : tn;
+         // handle special case of cls.output
+         std::string cls_output = "cls.output";
+         if (tn.find(cls_output) != std::string::npos) {
+             tensor = "cls.output";
+         }
+         // check if an allowed tensor exists and it's at the end of the kv string
+         if (tensor == allowed) {
+             found = true;
+             break;
+         }
+     }
+     if (!found) {
+         printf("\n%s: invalid tensor name '%s'\n\n", __func__, tn.c_str());
+         return false;
+     }
+
+     if (parse_ggml_type(qt.c_str()) == GGML_TYPE_COUNT) {
+         printf("\n%s: invalid quantization type '%s'\n\n", __func__, qt.c_str());
+         return false;
+     }
+
+     tensor_quantization tqz;
+     tqz.name = tn;
+     tqz.quant = parse_ggml_type(qt.c_str());
+     tensor_type.emplace_back(std::move(tqz));
+     return true;
+ }
+
  int main(int argc, char ** argv) {
      if (argc < 3) {
          usage(argv[0]);
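A note on the name check in parse_tensor_type() above: only the component after the last '.' is compared against ALLOWED_TENSOR_TYPE (with "cls.output" special-cased), so both a bare short name such as attn_q=q8_0 (the example given in the usage text) and a longer name whose suffix matches are accepted by this parser, and the value after '=' must parse as a ggml type. A self-contained illustration of that suffix rule; the helper below is hypothetical and not code from quantize.cpp:

```cpp
// Hypothetical re-implementation of the suffix rule, for illustration only.
#include <cstdio>
#include <string>

static std::string short_name(const std::string & tn) {
    if (tn.find("cls.output") != std::string::npos) {
        return "cls.output";          // special case handled explicitly by the parser
    }
    const size_t dot = tn.rfind('.');
    return dot == std::string::npos ? tn : tn.substr(dot + 1);
}

int main() {
    printf("%s\n", short_name("attn_q").c_str());        // attn_q     -> on the allow-list
    printf("%s\n", short_name("blk.12.attn_q").c_str()); // attn_q     -> on the allow-list
    printf("%s\n", short_name("cls.output").c_str());    // cls.output -> on the allow-list
    printf("%s\n", short_name("attn_zzz").c_str());      // attn_zzz   -> rejected by the allow-list
}
```

The remaining quantize.cpp hunks below wire the parsed entries into main() and pass them to the quantization parameters.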
@@ -255,6 +360,7 @@ int main(int argc, char ** argv) {
      std::string imatrix_file;
      std::vector<std::string> included_weights, excluded_weights;
      std::vector<llama_model_kv_override> kv_overrides;
+     std::vector<tensor_quantization> tensor_types;
 
      for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
          if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@@ -277,6 +383,10 @@ int main(int argc, char ** argv) {
              } else {
                  usage(argv[0]);
              }
+         } else if (strcmp(argv[arg_idx], "--tensor-type") == 0) {
+             if (arg_idx == argc-1 || !parse_tensor_type(argv[++arg_idx], tensor_types)) {
+                 usage(argv[0]);
+             }
          } else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
              if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
                  usage(argv[0]);
@@ -361,6 +471,9 @@ int main(int argc, char ** argv) {
          kv_overrides.back().key[0] = 0;
          params.kv_overrides = &kv_overrides;
      }
+     if (!tensor_types.empty()) {
+         params.tensor_types = &tensor_types;
+     }
 
      llama_backend_init();
 
package/src/llama.cpp/examples/rpc/CMakeLists.txt
@@ -1,2 +1,4 @@
- add_executable(rpc-server rpc-server.cpp)
- target_link_libraries(rpc-server PRIVATE ggml llama)
+ set(TARGET rpc-server)
+ add_executable(${TARGET} rpc-server.cpp)
+ target_link_libraries(${TARGET} PRIVATE ggml)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)