@fugood/llama.node 0.3.17 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193)
  1. package/CMakeLists.txt +3 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +39 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +366 -19
  24. package/src/LlamaCompletionWorker.h +30 -10
  25. package/src/LlamaContext.cpp +213 -5
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
  29. package/src/llama.cpp/.github/workflows/build.yml +41 -762
  30. package/src/llama.cpp/.github/workflows/docker.yml +5 -2
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +12 -12
  33. package/src/llama.cpp/CMakeLists.txt +5 -17
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +31 -3
  37. package/src/llama.cpp/common/arg.cpp +48 -29
  38. package/src/llama.cpp/common/chat.cpp +128 -106
  39. package/src/llama.cpp/common/chat.h +2 -0
  40. package/src/llama.cpp/common/common.cpp +37 -1
  41. package/src/llama.cpp/common/common.h +18 -9
  42. package/src/llama.cpp/common/llguidance.cpp +1 -0
  43. package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
  44. package/src/llama.cpp/common/minja/minja.hpp +69 -36
  45. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  46. package/src/llama.cpp/common/regex-partial.h +56 -0
  47. package/src/llama.cpp/common/sampling.cpp +57 -50
  48. package/src/llama.cpp/examples/CMakeLists.txt +2 -23
  49. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
  50. package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
  51. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  52. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  53. package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
  54. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  55. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  56. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  57. package/src/llama.cpp/ggml/include/ggml.h +10 -7
  58. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  60. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  61. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
  62. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
  63. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
  64. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
  65. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
  66. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  67. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  68. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  69. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
  70. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
  71. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
  72. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  73. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
  74. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
  75. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  76. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  77. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
  78. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  79. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
  80. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
  81. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
  82. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  83. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  84. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  85. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  86. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
  87. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  88. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
  89. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  90. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  91. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  92. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
  93. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
  94. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
  95. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
  96. package/src/llama.cpp/ggml/src/ggml.c +29 -20
  97. package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
  98. package/src/llama.cpp/include/llama.h +52 -11
  99. package/src/llama.cpp/requirements/requirements-all.txt +3 -3
  100. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  101. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  102. package/src/llama.cpp/src/llama-adapter.cpp +6 -0
  103. package/src/llama.cpp/src/llama-arch.cpp +3 -0
  104. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  105. package/src/llama.cpp/src/llama-batch.h +2 -1
  106. package/src/llama.cpp/src/llama-chat.cpp +17 -7
  107. package/src/llama.cpp/src/llama-chat.h +1 -0
  108. package/src/llama.cpp/src/llama-context.cpp +389 -501
  109. package/src/llama.cpp/src/llama-context.h +44 -32
  110. package/src/llama.cpp/src/llama-cparams.h +1 -0
  111. package/src/llama.cpp/src/llama-graph.cpp +20 -38
  112. package/src/llama.cpp/src/llama-graph.h +12 -8
  113. package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
  114. package/src/llama.cpp/src/llama-kv-cache.h +271 -85
  115. package/src/llama.cpp/src/llama-memory.h +11 -1
  116. package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
  117. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  118. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  119. package/src/llama.cpp/src/llama-model.cpp +316 -69
  120. package/src/llama.cpp/src/llama-model.h +8 -1
  121. package/src/llama.cpp/src/llama-quant.cpp +15 -13
  122. package/src/llama.cpp/src/llama-sampling.cpp +18 -6
  123. package/src/llama.cpp/src/llama-vocab.cpp +42 -4
  124. package/src/llama.cpp/src/llama-vocab.h +6 -0
  125. package/src/llama.cpp/src/llama.cpp +14 -0
  126. package/src/llama.cpp/tests/CMakeLists.txt +10 -2
  127. package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
  128. package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
  129. package/src/llama.cpp/tests/test-chat.cpp +3 -1
  130. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  131. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  132. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  133. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  134. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  135. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
  136. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  137. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
  138. package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
  139. package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
  140. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
  141. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
  142. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  143. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
  144. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  145. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
  146. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  147. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
  148. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
  149. package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
  150. package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
  151. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  152. package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
  153. package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
  154. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  155. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  156. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  157. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  158. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  159. package/src/llama.cpp/examples/llava/clip.h +0 -135
  160. package/src/llama.cpp/examples/llava/llava.cpp +0 -586
  161. package/src/llama.cpp/examples/llava/llava.h +0 -49
  162. package/src/llama.cpp/examples/llava/mtmd.h +0 -168
  163. package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
  164. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  165. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  166. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  167. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  168. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  169. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  170. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  171. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  172. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  173. /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
  174. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  175. /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
  176. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  177. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  178. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  179. /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
  180. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  181. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  182. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  183. /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
  184. /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
  185. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  186. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  187. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  188. /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
  189. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  190. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  191. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  192. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
  193. /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
package/src/llama.cpp/examples/llava/llava.cpp
@@ -1,586 +0,0 @@
- #include "clip.h"
- #include "llava.h"
-
- #include "llama.h"
-
- #include <algorithm>
- #include <cerrno>
- #include <cstdio>
- #include <cstdlib>
- #include <cstring>
- #include <limits>
- #include <vector>
- #include <memory>
-
- #if defined(LLAVA_LOG_OFF)
- # define LOG_INF(...)
- # define LOG_WRN(...)
- # define LOG_ERR(...)
- # define LOG_DBG(...)
- #else // defined(LLAVA_LOG_OFF)
- # define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
- # define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
- # define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
- # define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
- #endif // defined(LLAVA_LOG_OFF)
-
- // RGB uint8 image
- struct clip_image_u8 {
- int nx;
- int ny;
-
- std::vector<uint8_t> buf;
- };
-
- // RGB float32 image (NHWC)
- // Memory layout: RGBRGBRGB...
- struct clip_image_f32 {
- int nx;
- int ny;
-
- std::vector<float> buf;
- };
-
- struct clip_image_grid_shape {
- int first;
- int second;
- };
-
- // convenience cpp wrapper
- struct clip_image_f32_batch_deleter {
- void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); }
- };
- typedef std::unique_ptr<clip_image_f32_batch, clip_image_f32_batch_deleter> clip_image_f32_batch_ptr;
-
- struct clip_image_size_deleter {
- void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); }
- };
- typedef std::unique_ptr<clip_image_size, clip_image_size_deleter> clip_image_size_ptr;
-
- /**
- * Selects the best resolution from a list of possible resolutions based on the original size.
- *
- * @param original_size The original size of the image in the format (width, height).
- * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
- * @return The best fit resolution in the format (width, height).
- */
- static std::pair<int, int> select_best_resolution(const std::pair<int, int>& original_size, const std::vector<std::pair<int, int>>& possible_resolutions) {
- int original_width = original_size.first;
- int original_height = original_size.second;
-
- std::pair<int, int> best_fit;
- int max_effective_resolution = 0;
- int min_wasted_resolution = std::numeric_limits<int>::max();
-
- for (const auto& resolution : possible_resolutions) {
- int width = resolution.first;
- int height = resolution.second;
- float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
- int downscaled_width = static_cast<int>(original_width * scale);
- int downscaled_height = static_cast<int>(original_height * scale);
- int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
- int wasted_resolution = (width * height) - effective_resolution;
- // LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
- if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
- max_effective_resolution = effective_resolution;
- min_wasted_resolution = wasted_resolution;
- best_fit = resolution;
- }
- }
-
- return best_fit;
- }
-
- /**
- * @brief Get the anyres image grid shape object
- *
- * @param image_size
- * @param grid_pinpoints
- * @param image_patch_size
- * @return <int, int>
- */
- static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<int, int> & image_size, const std::vector<std::pair<int, int>> & grid_pinpoints, int image_patch_size) {
- /**
- Conversion from gguf flat array to vector:
- std::vector<std::pair<int, int>> possible_resolutions;
- for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
- possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
- }
- */
- auto best_resolution = select_best_resolution(image_size, grid_pinpoints);
- return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size};
- }
-
- // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
- static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) {
- struct {
- struct ggml_context * ctx;
- } model;
-
- const int32_t image_size = clip_get_image_size(ctx_clip);
- const int32_t patch_size = clip_get_patch_size(ctx_clip);
-
- int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)
-
- int num_patches_width = grid_shape.first; // grid 1-4
- int num_patches_height = grid_shape.second; // grid 1-4
-
- const size_t num_images = num_patches_width * num_patches_height + 1;
-
- // TODO: size calculation is not calculated - it's only tens of MB
- size_t ctx_size = 0;
-
- {
- ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features
- ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32);
- }
-
- struct ggml_init_params params {
- /*.mem_size =*/ ctx_size,
- /*.mem_buffer =*/ NULL,
- /*.no_alloc =*/ false, // NOTE: this should be false when using the legacy API
- };
-
- // Python reference code for full unpad:
- /*
- base_image_feature = image_feature[0]
- image_feature = image_feature[1:]
- image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
- image_feature = image_feature.flatten(1, 2).flatten(2, 3)
- image_feature = unpad_image(image_feature, image_sizes[image_idx])
- image_feature = torch.cat((
- image_feature,
- self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1)
- ), dim=-1)
- image_feature = image_feature.flatten(1, 2).transpose(0, 1)
- image_feature = torch.cat((base_image_feature, image_feature), dim=0)
- */
- // We now have two options: unpad or no unpad. Unpad removes tokens for faster llm eval.
- // In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D tensors are not supported in ggml yet.
- // Without unpad we have to split the sub-image embeddings into patches of 24 features each and permute them.
- // Once all images are processed to prepended the base_image_features without any changes.
-
- // Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (676x676 scaling))
- /*
- image_feature = image_feature.view(2, 2, 24, 24, 4096)
- image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
- image_feature = image_feature.view(2, 24, 2, 24, 4096)
- image_feature = image_feature.flatten(0, 3)
-
- // Reshape to 4D tensor by merging the last two dimensions
- image_feature = image_feature.view(2, 2, 24, 24*4096)
- image_feature = image_feature.permute(0, 2, 1, 3).contiguous()
- image_feature = image_feature.view(-1, 4096)
- */
-
- model.ctx = ggml_init(params);
-
- struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4
- // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
- // fill it with the image embeddings, ignoring the base
- for (size_t i = 1; i < num_images; i++) {
- size_t offset = (i-1) * clip_embd_nbytes(ctx_clip);
- memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip));
- }
-
- struct ggml_cgraph * gf = ggml_new_graph(model.ctx);
- size_t size_ele = ggml_type_size(GGML_TYPE_F32);
-
- struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features,
- num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
- num_patches_per_side,
- num_patches_width,
- num_patches_height,
- size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
- size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side,
- size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side * num_patches_width, 0);
- // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,false);
- struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3));
- /**
- At the end of each row we have to add the row_end embeddings, which are the same as the newline embeddings
- image_feature = torch.cat((
- image_feature,
- self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)
- ), dim=-1)
- *
- */
-
- // ggml_tensor_printf(permuted_cont,"permuted_cont",__LINE__,false,false);
- struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0);
- // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
- ggml_build_forward_expand(gf, flatten);
- ggml_graph_compute_with_ctx(model.ctx, gf, 1);
- struct ggml_tensor* result = ggml_graph_node(gf, -1);
-
- memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
- // append without newline tokens (default behavior in llava_arch when not using unpad ):
- memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
- *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input));
-
- // Debug: Test single segments
- // Current findings: sending base image, sending a segment embedding all works similar to python
- // However, permuted embeddings do not work yet (stride issue?)
- // memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as context
- // memcpy(image_embd_out, (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip)); // main image as context
- // *n_img_pos_out=576;
-
- ggml_free(model.ctx);
- return true;
- }
-
- static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size) {
- int width = image->nx;
- int height = image->ny;
- int num_patches = (height / patch_size) * (width / patch_size);
- clip_image_f32 * patch = clip_image_f32_init();
- patch->nx = patch_size * num_patches;
- patch->ny = patch_size;
- patch->buf.resize(3 * patch->nx * patch->ny);
-
- int patch_index = 0;
-
- for (int i = 0; i < height; i += patch_size) {
- for (int j = 0; j < width; j += patch_size) {
- for (int pi = 0; pi < patch_size; ++pi) {
- for (int pj = 0; pj < patch_size; ++pj) {
- int input_index = ((i + pi) * width + (j + pj)) * 3;
- int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3;
- patch->buf[output_index] = image->buf[input_index];
- patch->buf[output_index+1] = image->buf[input_index+1];
- patch->buf[output_index+2] = image->buf[input_index+2];
- }
- }
- patch_index++;
- }
- }
- return patch;
- }
-
- static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
- // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
- clip_image_f32_batch_ptr img_res_v(clip_image_f32_batch_init());
- if (!clip_image_preprocess(ctx_clip, img, img_res_v.get())) {
- LOG_ERR("%s: unable to preprocess image\n", __func__);
- return false;
- }
-
- const int64_t t_img_enc_start_us = ggml_time_us();
-
- const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
-
- const size_t n_imgs = clip_image_f32_batch_n_images(img_res_v.get());
-
- if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) {
- std::vector<float *> image_embd_v;
- image_embd_v.resize(n_imgs);
- clip_image_size load_image_size;
-
- for (size_t i = 0; i < n_imgs; i++) {
- const int64_t t_img_enc_step_start_us = ggml_time_us();
- int nx = clip_image_f32_batch_nx(img_res_v.get(), i);
- int ny = clip_image_f32_batch_ny(img_res_v.get(), i);
- image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, nx, ny));
- int patch_size = 14;
- load_image_size.width = nx;
- load_image_size.height = ny;
- clip_add_load_image_size(ctx_clip, &load_image_size);
-
- bool encoded = false;
- clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
- if (clip_is_qwen2vl(ctx_clip)) {
- encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]);
- }
- else {
- encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(img_res, patch_size), image_embd_v[i]);
- }
-
- if (!encoded) {
- LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs);
- return false;
- }
- const int64_t t_img_enc_steop_batch_us = ggml_time_us();
- LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)n_imgs, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
- }
- const int64_t t_img_enc_batch_us = ggml_time_us();
- LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
-
- int n_img_pos_out = 0;
- for (size_t i = 0; i < image_embd_v.size(); i++) {
- int nx = clip_image_f32_batch_nx(img_res_v.get(), i);
- int ny = clip_image_f32_batch_ny(img_res_v.get(), i);
- clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
- std::memcpy(
- image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
- image_embd_v[i],
- clip_embd_nbytes_by_img(ctx_clip, nx, ny));
- n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res);
- }
- *n_img_pos = n_img_pos_out;
- for (size_t i = 0; i < image_embd_v.size(); i++) {
- free(image_embd_v[i]);
- }
- image_embd_v.clear();
- load_image_size.width = img->nx;
- load_image_size.height = img->ny;
- clip_add_load_image_size(ctx_clip, &load_image_size);
- LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size.width, load_image_size.height);
- }
- else if (clip_is_glm(ctx_clip)){
- struct clip_image_size * load_image_size = clip_image_size_init();
- load_image_size->width = clip_image_f32_batch_nx(img_res_v.get(), 0);
- load_image_size->height = clip_image_f32_batch_ny(img_res_v.get(), 0);
- clip_add_load_image_size(ctx_clip, load_image_size);
-
- clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
- bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd);
- int pos = int(load_image_size->width/clip_get_patch_size(ctx_clip)/2);
- *n_img_pos = (pos * pos + 2);
- if (!encoded){
- LOG_ERR("Unable to encode image \n");
- return false;
- }
- }
- else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
- // flat / default llava-1.5 type embedding
- clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
- *n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
- bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
- if (!encoded) {
- LOG_ERR("Unable to encode image\n");
-
- return false;
- }
- }
- else {
- // spatial_unpad llava-1.6 type embedding
- // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
- std::vector<float *> image_embd_v;
- image_embd_v.resize(n_imgs);
- for (size_t i = 0; i < n_imgs; i++) {
- clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
- image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
- const bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
- if (!encoded) {
- LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs);
- return false;
- }
- }
- const int64_t t_img_enc_batch_us = ggml_time_us();
- LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
-
- const int32_t * image_grid = clip_image_grid(ctx_clip);
- const size_t num_gridpoints = get_clip_image_grid_size(ctx_clip);
-
- std::vector<std::pair<int, int>> grid_pinpoints;
- for (size_t i = 0; i < num_gridpoints; i += 2) {
- grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
- }
-
- const int32_t image_size = clip_get_image_size(ctx_clip);
-
- struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
-
- int n_img_pos_out;
- clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0);
- clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input);
- *n_img_pos = n_img_pos_out;
-
- for (size_t i = 0; i < image_embd_v.size(); i++) {
- free(image_embd_v[i]);
- }
- image_embd_v.clear();
-
- // debug image/segment/normalization content:
- // clip_image_u8 * tmp = clip_image_u8_init();
- // clip_image_convert_f32_to_u8(*image_feature, *tmp);
- // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
- }
-
- LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
-
- const int64_t t_img_enc_end_us = ggml_time_us();
- float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
-
- LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
-
- return true;
- }
-
- bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
- // make sure that the correct mmproj was used, i.e., compare apples to apples
- int n_llama_embd = llama_model_n_embd(llama_get_model(ctx_llama));
- auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
- if (n_image_embd != n_llama_embd) {
- LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
- return false;
- }
- return true;
- }
-
- bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
- // Granite vision uses up to 10 patches + base patch
- int num_max_patches = 11;
- if (clip_is_minicpmv(ctx_clip)) {
- num_max_patches = 10;
- }
- if (clip_is_glm(ctx_clip)) {
- num_max_patches = 1;
- }
- float * image_embd;
- if (clip_is_qwen2vl(ctx_clip)) {
- // qwen2vl don't split image into chunks, so `num_max_patches` is not needed.
- image_embd = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img->nx, img->ny));
- } else {
- image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
- }
- if (!image_embd) {
- LOG_ERR("Unable to allocate memory for image embeddings\n");
- return false;
- }
-
- int n_img_pos;
- if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
- LOG_ERR("%s: cannot encode image, aborting\n", __func__);
- free(image_embd);
- return false;
- }
- *image_embd_out = image_embd;
- *n_img_pos_out = n_img_pos;
-
- return true;
- }
-
- struct llava_embd_batch {
- std::vector<llama_pos> pos;
- std::vector<int32_t> n_seq_id;
- std::vector<llama_seq_id> seq_id_0;
- std::vector<llama_seq_id *> seq_ids;
- std::vector<int8_t> logits;
- llama_batch batch;
- llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
- pos .resize(n_tokens);
- n_seq_id.resize(n_tokens);
- seq_ids .resize(n_tokens + 1);
- logits .resize(n_tokens);
- seq_id_0.resize(1);
- seq_id_0[0] = seq_id;
- seq_ids [n_tokens] = nullptr;
- batch = {
- /*n_tokens =*/ n_tokens,
- /*tokens =*/ nullptr,
- /*embd =*/ embd,
- /*pos =*/ pos.data(),
- /*n_seq_id =*/ n_seq_id.data(),
- /*seq_id =*/ seq_ids.data(),
- /*logits =*/ logits.data(),
- };
- for (int i = 0; i < n_tokens; i++) {
- batch.pos [i] = pos_0 + i;
- batch.n_seq_id[i] = 1;
- batch.seq_id [i] = seq_id_0.data();
- batch.logits [i] = false;
- }
- }
- };
-
- bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
- int n_embd = llama_model_n_embd(llama_get_model(ctx_llama));
-
- for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
- int n_eval = image_embed->n_image_pos - i;
- if (n_eval > n_batch) {
- n_eval = n_batch;
- }
- float * embd = image_embed->embed+i*n_embd;
- llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
- if (llama_decode(ctx_llama, llava_batch.batch)) {
- LOG_ERR("%s : failed to eval\n", __func__);
- return false;
- }
- *n_past += n_eval;
- }
- return true;
- }
-
- struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
- clip_image_u8 * img = clip_image_u8_init();
- if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
- clip_image_u8_free(img);
- LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__);
- return NULL;
- }
-
- float* image_embed = NULL;
- int n_image_pos = 0;
- bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
- if (!image_embed_result) {
- clip_image_u8_free(img);
- LOG_ERR("%s: couldn't embed the image\n", __func__);
- return NULL;
- }
-
- clip_image_u8_free(img);
- auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed));
- result->embed = image_embed;
- result->n_image_pos = n_image_pos;
- return result;
- }
-
- static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
- auto file = fopen(path, "rb");
- if (file == NULL) {
- LOG_ERR("%s: can't read file %s\n", __func__, path);
- return false;
- }
-
- fseek(file, 0, SEEK_END);
- auto fileSize = ftell(file);
- fseek(file, 0, SEEK_SET);
-
- auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
- if (buffer == NULL) {
- LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
- perror("Memory allocation error");
- fclose(file);
- return false;
- }
- errno = 0;
- size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer
- if (ferror(file)) {
- LOG_ERR("read error: %s", strerror(errno));
- free(buffer);
- fclose(file);
- return false;
- }
- if (ret != (size_t) fileSize) {
- LOG_ERR("unexpectedly reached end of file");
- free(buffer);
- fclose(file);
- return false;
- }
- fclose(file); // Close the file
-
- *bytesOut = buffer;
- *sizeOut = fileSize;
- return true;
- }
-
- struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
- unsigned char* image_bytes;
- long image_bytes_length;
- auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
- if (!loaded) {
- LOG_ERR("%s: failed to load %s\n", __func__, image_path);
- return NULL;
- }
-
- llava_image_embed *embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
- free(image_bytes);
-
- return embed;
- }
-
- void llava_image_embed_free(struct llava_image_embed * embed) {
- free(embed->embed);
- free(embed);
- }
package/src/llama.cpp/examples/llava/llava.h
@@ -1,49 +0,0 @@
- #ifndef LLAVA_H
- #define LLAVA_H
-
- #include "ggml.h"
-
- #ifdef LLAMA_SHARED
- # if defined(_WIN32) && !defined(__MINGW32__)
- # ifdef LLAMA_BUILD
- # define LLAVA_API __declspec(dllexport)
- # else
- # define LLAVA_API __declspec(dllimport)
- # endif
- # else
- # define LLAVA_API __attribute__ ((visibility ("default")))
- # endif
- #else
- # define LLAVA_API
- #endif
-
- #ifdef __cplusplus
- extern "C" {
- #endif
-
- struct clip_ctx;
- struct llava_image_embed {
- float * embed;
- int n_image_pos;
- };
-
- /** sanity check for clip <-> llava embed size match */
- LLAVA_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip);
-
- LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
-
- /** build an image embed from image file bytes */
- LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
- /** build an image embed from a path to an image filename */
- LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
- /** free an embedding made with llava_image_embed_make_* */
- LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
-
- /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
- LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
-
- #ifdef __cplusplus
- }
- #endif
-
- #endif
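
Note: the two hunks above remove the old examples/llava single-image embedding API, which this release replaces with the new tools/mtmd sources listed in the file table (mtmd.h, mtmd-helper.cpp, mtmd-cli.cpp). For reference only, the sketch below shows how the removed functions declared in llava.h fit together; it is not part of the package diff. It assumes a clip_ctx and llama_context loaded elsewhere, and the wrapper name eval_image is purely illustrative.

// Illustrative sketch of the removed llava.h API (not shipped in 0.4.1).
#include "llava.h"
#include "llama.h"

static bool eval_image(struct clip_ctx * ctx_clip, struct llama_context * ctx_llama,
                       const char * image_path, int n_threads, int n_batch, int * n_past) {
    // sanity check: the mmproj embedding dim must match the LLM embedding dim
    if (!llava_validate_embed_size(ctx_llama, ctx_clip)) {
        return false;
    }
    // encode the image file into a sequence of embedding vectors
    struct llava_image_embed * embed =
        llava_image_embed_make_with_filename(ctx_clip, n_threads, image_path);
    if (embed == NULL) {
        return false;
    }
    // write the embeddings into the llama context, advancing *n_past
    bool ok = llava_eval_image_embed(ctx_llama, embed, n_batch, n_past);
    llava_image_embed_free(embed);
    return ok;
}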