@fugood/llama.node 0.0.1-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204)
  1. package/CMakeLists.txt +85 -0
  2. package/README.md +56 -0
  3. package/bin/darwin/arm64/llama-node.node +0 -0
  4. package/bin/darwin/x64/llama-node.node +0 -0
  5. package/bin/linux/arm64/llama-node.node +0 -0
  6. package/bin/linux/x64/llama-node.node +0 -0
  7. package/bin/win32/arm64/llama-node.node +0 -0
  8. package/bin/win32/arm64/node.lib +0 -0
  9. package/bin/win32/x64/llama-node.node +0 -0
  10. package/bin/win32/x64/node.lib +0 -0
  11. package/lib/binding.js +13 -0
  12. package/lib/binding.ts +57 -0
  13. package/lib/index.js +24 -0
  14. package/lib/index.ts +13 -0
  15. package/package.json +65 -0
  16. package/src/addons.cpp +506 -0
  17. package/src/llama.cpp/CMakeLists.txt +1320 -0
  18. package/src/llama.cpp/build.zig +172 -0
  19. package/src/llama.cpp/cmake/FindSIMD.cmake +100 -0
  20. package/src/llama.cpp/common/CMakeLists.txt +87 -0
  21. package/src/llama.cpp/common/base64.hpp +392 -0
  22. package/src/llama.cpp/common/common.cpp +2949 -0
  23. package/src/llama.cpp/common/common.h +324 -0
  24. package/src/llama.cpp/common/console.cpp +501 -0
  25. package/src/llama.cpp/common/console.h +19 -0
  26. package/src/llama.cpp/common/grammar-parser.cpp +440 -0
  27. package/src/llama.cpp/common/grammar-parser.h +29 -0
  28. package/src/llama.cpp/common/json-schema-to-grammar.cpp +764 -0
  29. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
  30. package/src/llama.cpp/common/json.hpp +24766 -0
  31. package/src/llama.cpp/common/log.h +724 -0
  32. package/src/llama.cpp/common/ngram-cache.cpp +282 -0
  33. package/src/llama.cpp/common/ngram-cache.h +94 -0
  34. package/src/llama.cpp/common/sampling.cpp +353 -0
  35. package/src/llama.cpp/common/sampling.h +147 -0
  36. package/src/llama.cpp/common/stb_image.h +8396 -0
  37. package/src/llama.cpp/common/train.cpp +1513 -0
  38. package/src/llama.cpp/common/train.h +233 -0
  39. package/src/llama.cpp/examples/CMakeLists.txt +52 -0
  40. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +5 -0
  41. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1640 -0
  42. package/src/llama.cpp/examples/batched/CMakeLists.txt +5 -0
  43. package/src/llama.cpp/examples/batched/batched.cpp +262 -0
  44. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +5 -0
  45. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +261 -0
  46. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +5 -0
  47. package/src/llama.cpp/examples/beam-search/beam-search.cpp +188 -0
  48. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +6 -0
  49. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +275 -0
  50. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +5 -0
  51. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +936 -0
  52. package/src/llama.cpp/examples/embedding/CMakeLists.txt +5 -0
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +211 -0
  54. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +9 -0
  55. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +195 -0
  56. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +5 -0
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +462 -0
  58. package/src/llama.cpp/examples/finetune/CMakeLists.txt +5 -0
  59. package/src/llama.cpp/examples/finetune/finetune.cpp +1861 -0
  60. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +5 -0
  61. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +132 -0
  62. package/src/llama.cpp/examples/gguf/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/gguf/gguf.cpp +256 -0
  64. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +5 -0
  65. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +553 -0
  66. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +5 -0
  67. package/src/llama.cpp/examples/gritlm/gritlm.cpp +215 -0
  68. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +5 -0
  69. package/src/llama.cpp/examples/imatrix/imatrix.cpp +655 -0
  70. package/src/llama.cpp/examples/infill/CMakeLists.txt +5 -0
  71. package/src/llama.cpp/examples/infill/infill.cpp +767 -0
  72. package/src/llama.cpp/examples/jeopardy/questions.txt +100 -0
  73. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +5 -0
  74. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +1286 -0
  75. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/CMakeLists.txt +50 -0
  76. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/llama-android.cpp +443 -0
  77. package/src/llama.cpp/examples/llava/CMakeLists.txt +37 -0
  78. package/src/llama.cpp/examples/llava/clip.cpp +2027 -0
  79. package/src/llama.cpp/examples/llava/clip.h +85 -0
  80. package/src/llama.cpp/examples/llava/llava-cli.cpp +309 -0
  81. package/src/llama.cpp/examples/llava/llava.cpp +426 -0
  82. package/src/llama.cpp/examples/llava/llava.h +50 -0
  83. package/src/llama.cpp/examples/llava/requirements.txt +3 -0
  84. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +5 -0
  85. package/src/llama.cpp/examples/lookahead/lookahead.cpp +485 -0
  86. package/src/llama.cpp/examples/lookup/CMakeLists.txt +23 -0
  87. package/src/llama.cpp/examples/lookup/lookup-create.cpp +41 -0
  88. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +47 -0
  89. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +160 -0
  90. package/src/llama.cpp/examples/lookup/lookup.cpp +258 -0
  91. package/src/llama.cpp/examples/main/CMakeLists.txt +5 -0
  92. package/src/llama.cpp/examples/main/main.cpp +957 -0
  93. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +33 -0
  94. package/src/llama.cpp/examples/parallel/CMakeLists.txt +5 -0
  95. package/src/llama.cpp/examples/parallel/parallel.cpp +427 -0
  96. package/src/llama.cpp/examples/passkey/CMakeLists.txt +5 -0
  97. package/src/llama.cpp/examples/passkey/passkey.cpp +302 -0
  98. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +5 -0
  99. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1943 -0
  100. package/src/llama.cpp/examples/quantize/CMakeLists.txt +6 -0
  101. package/src/llama.cpp/examples/quantize/quantize.cpp +423 -0
  102. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +6 -0
  103. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +424 -0
  104. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/retrieval/retrieval.cpp +350 -0
  106. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +5 -0
  107. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +246 -0
  108. package/src/llama.cpp/examples/server/CMakeLists.txt +40 -0
  109. package/src/llama.cpp/examples/server/bench/requirements.txt +2 -0
  110. package/src/llama.cpp/examples/server/httplib.h +9465 -0
  111. package/src/llama.cpp/examples/server/server.cpp +3826 -0
  112. package/src/llama.cpp/examples/server/tests/requirements.txt +6 -0
  113. package/src/llama.cpp/examples/server/utils.hpp +653 -0
  114. package/src/llama.cpp/examples/simple/CMakeLists.txt +5 -0
  115. package/src/llama.cpp/examples/simple/simple.cpp +183 -0
  116. package/src/llama.cpp/examples/speculative/CMakeLists.txt +5 -0
  117. package/src/llama.cpp/examples/speculative/speculative.cpp +614 -0
  118. package/src/llama.cpp/examples/sycl/CMakeLists.txt +9 -0
  119. package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +13 -0
  120. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +5 -0
  121. package/src/llama.cpp/examples/tokenize/tokenize.cpp +42 -0
  122. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +5 -0
  123. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +1252 -0
  124. package/src/llama.cpp/ggml-alloc.c +985 -0
  125. package/src/llama.cpp/ggml-alloc.h +76 -0
  126. package/src/llama.cpp/ggml-backend-impl.h +141 -0
  127. package/src/llama.cpp/ggml-backend.c +2099 -0
  128. package/src/llama.cpp/ggml-backend.h +233 -0
  129. package/src/llama.cpp/ggml-common.h +1853 -0
  130. package/src/llama.cpp/ggml-cuda.h +43 -0
  131. package/src/llama.cpp/ggml-impl.h +265 -0
  132. package/src/llama.cpp/ggml-kompute.cpp +2006 -0
  133. package/src/llama.cpp/ggml-kompute.h +46 -0
  134. package/src/llama.cpp/ggml-metal.h +66 -0
  135. package/src/llama.cpp/ggml-mpi.c +216 -0
  136. package/src/llama.cpp/ggml-mpi.h +39 -0
  137. package/src/llama.cpp/ggml-opencl.cpp +2301 -0
  138. package/src/llama.cpp/ggml-opencl.h +36 -0
  139. package/src/llama.cpp/ggml-quants.c +12678 -0
  140. package/src/llama.cpp/ggml-quants.h +133 -0
  141. package/src/llama.cpp/ggml-sycl.cpp +17882 -0
  142. package/src/llama.cpp/ggml-sycl.h +49 -0
  143. package/src/llama.cpp/ggml-vulkan-shaders.hpp +69849 -0
  144. package/src/llama.cpp/ggml-vulkan.cpp +6442 -0
  145. package/src/llama.cpp/ggml-vulkan.h +29 -0
  146. package/src/llama.cpp/ggml.c +21819 -0
  147. package/src/llama.cpp/ggml.h +2403 -0
  148. package/src/llama.cpp/llama.cpp +17468 -0
  149. package/src/llama.cpp/llama.h +1117 -0
  150. package/src/llama.cpp/pocs/CMakeLists.txt +12 -0
  151. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +9 -0
  152. package/src/llama.cpp/pocs/vdot/q8dot.cpp +172 -0
  153. package/src/llama.cpp/pocs/vdot/vdot.cpp +310 -0
  154. package/src/llama.cpp/prompts/LLM-questions.txt +49 -0
  155. package/src/llama.cpp/prompts/alpaca.txt +1 -0
  156. package/src/llama.cpp/prompts/assistant.txt +31 -0
  157. package/src/llama.cpp/prompts/chat-with-baichuan.txt +4 -0
  158. package/src/llama.cpp/prompts/chat-with-bob.txt +7 -0
  159. package/src/llama.cpp/prompts/chat-with-qwen.txt +1 -0
  160. package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +7 -0
  161. package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +7 -0
  162. package/src/llama.cpp/prompts/chat.txt +28 -0
  163. package/src/llama.cpp/prompts/dan-modified.txt +1 -0
  164. package/src/llama.cpp/prompts/dan.txt +1 -0
  165. package/src/llama.cpp/prompts/mnemonics.txt +93 -0
  166. package/src/llama.cpp/prompts/parallel-questions.txt +43 -0
  167. package/src/llama.cpp/prompts/reason-act.txt +18 -0
  168. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +3 -0
  169. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +1 -0
  170. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +2 -0
  171. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +2 -0
  172. package/src/llama.cpp/requirements/requirements-convert.txt +5 -0
  173. package/src/llama.cpp/requirements.txt +12 -0
  174. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +24 -0
  175. package/src/llama.cpp/scripts/xxd.cmake +16 -0
  176. package/src/llama.cpp/sgemm.cpp +999 -0
  177. package/src/llama.cpp/sgemm.h +12 -0
  178. package/src/llama.cpp/tests/CMakeLists.txt +78 -0
  179. package/src/llama.cpp/tests/get-model.cpp +21 -0
  180. package/src/llama.cpp/tests/get-model.h +2 -0
  181. package/src/llama.cpp/tests/test-autorelease.cpp +24 -0
  182. package/src/llama.cpp/tests/test-backend-ops.cpp +2266 -0
  183. package/src/llama.cpp/tests/test-c.c +7 -0
  184. package/src/llama.cpp/tests/test-chat-template.cpp +107 -0
  185. package/src/llama.cpp/tests/test-double-float.cpp +57 -0
  186. package/src/llama.cpp/tests/test-grad0.cpp +1606 -0
  187. package/src/llama.cpp/tests/test-grammar-integration.cpp +243 -0
  188. package/src/llama.cpp/tests/test-grammar-parser.cpp +250 -0
  189. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +899 -0
  190. package/src/llama.cpp/tests/test-llama-grammar.cpp +402 -0
  191. package/src/llama.cpp/tests/test-model-load-cancel.cpp +27 -0
  192. package/src/llama.cpp/tests/test-opt.cpp +181 -0
  193. package/src/llama.cpp/tests/test-quantize-fns.cpp +185 -0
  194. package/src/llama.cpp/tests/test-quantize-perf.cpp +363 -0
  195. package/src/llama.cpp/tests/test-rope.cpp +221 -0
  196. package/src/llama.cpp/tests/test-sampling.cpp +301 -0
  197. package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +187 -0
  198. package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +190 -0
  199. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +123 -0
  200. package/src/llama.cpp/tests/test-tokenizer-1-llama.cpp +111 -0
  201. package/src/llama.cpp/unicode-data.cpp +1651 -0
  202. package/src/llama.cpp/unicode-data.h +16 -0
  203. package/src/llama.cpp/unicode.cpp +277 -0
  204. package/src/llama.cpp/unicode.h +28 -0
package/src/llama.cpp/examples/llava/llava.cpp
@@ -0,0 +1,426 @@
+ #include "clip.h"
+ #include "common.h"
+ #include "llama.h"
+ #include "llava.h"
+ #include "base64.hpp"
+
+ #include <cstdio>
+ #include <cstdlib>
+ #include <vector>
+ #include <numeric>
+
+ // RGB uint8 image
+ struct clip_image_u8 {
+     int nx;
+     int ny;
+
+     std::vector<uint8_t> buf;
+ };
+
+ // RGB float32 image (NHWC)
+ // Memory layout: RGBRGBRGB...
+ struct clip_image_f32 {
+     int nx;
+     int ny;
+
+     std::vector<float> buf;
+ };
+
+ struct clip_image_grid_shape {
+     int first;
+     int second;
+ };
+
+ /**
+  * Selects the best resolution from a list of possible resolutions based on the original size.
+  *
+  * @param original_size The original size of the image in the format (width, height).
+  * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
+  * @return The best fit resolution in the format (width, height).
+  */
+ static std::pair<int, int> select_best_resolution(const std::pair<int, int>& original_size, const std::vector<std::pair<int, int>>& possible_resolutions) {
+     int original_width = original_size.first;
+     int original_height = original_size.second;
+
+     std::pair<int, int> best_fit;
+     int max_effective_resolution = 0;
+     int min_wasted_resolution = std::numeric_limits<int>::max();
+
+     for (const auto& resolution : possible_resolutions) {
+         int width = resolution.first;
+         int height = resolution.second;
+         float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
+         int downscaled_width = static_cast<int>(original_width * scale);
+         int downscaled_height = static_cast<int>(original_height * scale);
+         int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
+         int wasted_resolution = (width * height) - effective_resolution;
+         // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+         if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
+             max_effective_resolution = effective_resolution;
+             min_wasted_resolution = wasted_resolution;
+             best_fit = resolution;
+         }
+     }
+
+     return best_fit;
+ }
+
+ /**
+  * @brief Get the anyres image grid shape object
+  *
+  * @param image_size
+  * @param grid_pinpoints
+  * @param image_patch_size
+  * @return <int, int>
+  */
+ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<int, int> & image_size, const std::vector<std::pair<int, int>> & grid_pinpoints, int image_patch_size) {
+     /**
+         Conversion from gguf flat array to vector:
+         std::vector<std::pair<int, int>> possible_resolutions;
+         for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
+             possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
+         }
+      */
+     auto best_resolution = select_best_resolution(image_size, grid_pinpoints);
+     return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size};
+ }
+
+ // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
+ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
+     struct {
+         struct ggml_tensor * newline;
+         struct ggml_context * ctx;
+     } model;
+
+     const int32_t image_size = clip_image_size(ctx_clip);
+     const int32_t patch_size = clip_patch_size(ctx_clip);
+
+     int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)
+
+     int num_patches_width = grid_shape.first;   // grid 1-4
+     int num_patches_height = grid_shape.second; // grid 1-4
+
+     const size_t num_images = num_patches_width * num_patches_height + 1;
+
+     // TODO: the context size is not computed precisely - it's only tens of MB
+     size_t ctx_size = 0;
+
+     {
+         ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features
+         ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32);
+     }
+
+     struct ggml_init_params params {
+         /*.mem_size   =*/ ctx_size,
+         /*.mem_buffer =*/ NULL,
+         /*.no_alloc   =*/ false, // NOTE: this should be false when using the legacy API
+     };
+
+     // Python reference code for full unpad:
+     /*
+         base_image_feature = image_feature[0]
+         image_feature = image_feature[1:]
+         image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
+         image_feature = image_feature.flatten(1, 2).flatten(2, 3)
+         image_feature = unpad_image(image_feature, image_sizes[image_idx])
+         image_feature = torch.cat((
+             image_feature,
+             self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1)
+         ), dim=-1)
+         image_feature = image_feature.flatten(1, 2).transpose(0, 1)
+         image_feature = torch.cat((base_image_feature, image_feature), dim=0)
+     */
+     // We now have two options: unpad or no unpad. Unpad removes tokens for faster llm eval.
+     // In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D tensors are not supported in ggml yet.
+     // Without unpad we have to split the sub-image embeddings into patches of 24 features each and permute them.
+     // Once all images are processed, the base_image_features are prepended without any changes.
+
+     // Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (676x676 scaling))
+     /*
+         image_feature = image_feature.view(2, 2, 24, 24, 4096)
+         image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
+         image_feature = image_feature.view(2, 24, 2, 24, 4096)
+         image_feature = image_feature.flatten(0, 3)
+
+         // Reshape to 4D tensor by merging the last two dimensions
+         image_feature = image_feature.view(2, 2, 24, 24*4096)
+         image_feature = image_feature.permute(0, 2, 1, 3).contiguous()
+         image_feature = image_feature.view(-1, 4096)
+     */
+
+     model.ctx = ggml_init(params);
+
+     ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
+     model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
+     if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
+         if (newline_tmp->buffer == NULL) {
+             LOG_TEE("newline_tmp tensor buffer is NULL\n");
+         }
+         ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
+     } else {
+         model.newline->data = newline_tmp->data;
+         if (model.newline->data == NULL) {
+             LOG_TEE("newline_tmp tensor data is NULL\n");
+         }
+     }
+
+     struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
+     // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
+     // fill it with the image embeddings, ignoring the base
+     for (size_t i = 1; i < num_images; i++) {
+         size_t offset = (i-1) * clip_embd_nbytes(ctx_clip);
+         memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip));
+     }
+
+     struct ggml_cgraph * gf = ggml_new_graph(model.ctx);
+     size_t size_ele = ggml_type_size(GGML_TYPE_F32);
+
+     struct ggml_tensor * image_features_patchview = ggml_view_4d(model.ctx, image_features,
+                                                                  num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
+                                                                  num_patches_per_side,
+                                                                  num_patches_width,
+                                                                  num_patches_height,
+                                                                  size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
+                                                                  size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side,
+                                                                  size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side * num_patches_width, 0);
+     // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,false);
+     struct ggml_tensor * permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3));
+     /**
+      At the end of each row we have to add the row_end embeddings, which are the same as the newline embeddings:
+         image_feature = torch.cat((
+             image_feature,
+             self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)
+         ), dim=-1)
+      *
+      */
+
+     // ggml_tensor_printf(permuted_cont,"permuted_cont",__LINE__,false,false);
+     struct ggml_tensor * flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0);
+     // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
+     ggml_build_forward_expand(gf, flatten);
+     ggml_graph_compute_with_ctx(model.ctx, gf, 1);
+     struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
+
+     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
+     // append without newline tokens (default behavior in llava_arch when not using unpad):
+     memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
+     *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));
+
+     // Debug: Test single segments
+     // Current findings: sending the base image or a single segment embedding works similarly to python
+     // However, permuted embeddings do not work yet (stride issue?)
+     // memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as context
+     // memcpy(image_embd_out, (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip)); // main image as context
+     // *n_img_pos_out=576;
+
+     ggml_free(model.ctx);
+     return true;
+ }
+
+
+ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
+     // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different from the python implementation, which is N x 3 x 336 x 336
+     clip_image_f32_batch img_res_v;
+     img_res_v.size = 0;
+     img_res_v.data = nullptr;
+     if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
+         LOG_TEE("%s: unable to preprocess image\n", __func__);
+         delete[] img_res_v.data;
+         return false;
+     }
+
+     const int64_t t_img_enc_start_us = ggml_time_us();
+
+     const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
+
+     if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
+         // flat / default llava-1.5 type embedding
+         *n_img_pos = clip_n_patches(ctx_clip);
+         bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
+         delete[] img_res_v.data;
+         if (!encoded) {
+             LOG_TEE("Unable to encode image\n");
+
+             return false;
+         }
+     } else {
+         // spatial_unpad llava-1.6 type embedding
+         // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
+         std::vector<float *> image_embd_v;
+         image_embd_v.resize(img_res_v.size);
+         for (size_t i = 0; i < img_res_v.size; i++) {
+             image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
+             const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
+             if (!encoded) {
+                 LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                 return false;
+             }
+         }
+         const int64_t t_img_enc_batch_us = ggml_time_us();
+         LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+
+         const int32_t * image_grid = clip_image_grid(ctx_clip);
+
+         std::vector<std::pair<int, int>> grid_pinpoints;
+         for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) {
+             grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
+         }
+
+         // free all img_res_v - not needed anymore
+         delete[] img_res_v.data;
+         img_res_v.size = 0;
+         img_res_v.data = nullptr;
+
+         const int32_t image_size = clip_image_size(ctx_clip);
+
+         struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
+
+         int n_img_pos_out;
+         clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
+         *n_img_pos = n_img_pos_out;
+
+         for (size_t i = 0; i < image_embd_v.size(); i++) {
+             free(image_embd_v[i]);
+         }
+         image_embd_v.clear();
+
+         // debug image/segment/normalization content:
+         // clip_image_u8 * tmp = clip_image_u8_init();
+         // clip_image_convert_f32_to_u8(*image_feature, *tmp);
+         // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
+     }
+
+     LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
+
+     const int64_t t_img_enc_end_us = ggml_time_us();
+     float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
+
+     LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
+
+     return true;
+ }
+
+ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
+     // make sure that the correct mmproj was used, i.e., compare apples to apples
+     int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
+     auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
+     if (n_image_embd != n_llama_embd) {
+         LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
+         return false;
+     }
+     return true;
+ }
+
+ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
+     float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
+     if (!image_embd) {
+         LOG_TEE("Unable to allocate memory for image embeddings\n");
+         return false;
+     }
+
+     int n_img_pos;
+     if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
+         LOG_TEE("%s: cannot encode image, aborting\n", __func__);
+         free(image_embd);
+         return false;
+     }
+     *image_embd_out = image_embd;
+     *n_img_pos_out = n_img_pos;
+
+     return true;
+ }
+
+ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
+     int n_embd = llama_n_embd(llama_get_model(ctx_llama));
+
+     for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
+         int n_eval = image_embed->n_image_pos - i;
+         if (n_eval > n_batch) {
+             n_eval = n_batch;
+         }
+         llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
+         if (llama_decode(ctx_llama, batch)) {
+             LOG_TEE("%s : failed to eval\n", __func__);
+             return false;
+         }
+         *n_past += n_eval;
+     }
+     return true;
+ }
+
+ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
+     clip_image_u8 * img = clip_image_u8_init();
+     if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
+         clip_image_u8_free(img);
+         LOG_TEE("%s: can't load image from bytes, is it a valid image?\n", __func__);
+         return NULL;
+     }
+
+     float * image_embed = NULL;
+     int n_image_pos = 0;
+     bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
+     if (!image_embed_result) {
+         clip_image_u8_free(img);
+         LOG_TEE("%s: couldn't embed the image\n", __func__);
+         return NULL;
+     }
+
+     clip_image_u8_free(img);
+     auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed));
+     result->embed = image_embed;
+     result->n_image_pos = n_image_pos;
+     return result;
+ }
+
+ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
+     auto file = fopen(path, "rb");
+     if (file == NULL) {
+         LOG_TEE("%s: can't read file %s\n", __func__, path);
+         return false;
+     }
+
+     fseek(file, 0, SEEK_END);
+     auto fileSize = ftell(file);
+     fseek(file, 0, SEEK_SET);
+
+     auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
+     if (buffer == NULL) {
+         LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
+         perror("Memory allocation error");
+         fclose(file);
+         return false;
+     }
+     errno = 0;
+     size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer
+     if (ferror(file)) {
+         die_fmt("read error: %s", strerror(errno));
+     }
+     if (ret != (size_t) fileSize) {
+         die("unexpectedly reached end of file");
+     }
+     fclose(file); // Close the file
+
+     *bytesOut = buffer;
+     *sizeOut = fileSize;
+     return true;
+ }
+
+ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
+     unsigned char* image_bytes;
+     long image_bytes_length;
+     auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
+     if (!loaded) {
+         LOG_TEE("%s: failed to load %s\n", __func__, image_path);
+         return NULL;
+     }
+
+     llava_image_embed * embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
+     free(image_bytes);
+
+     return embed;
+ }
+
+ void llava_image_embed_free(struct llava_image_embed * embed) {
+     free(embed->embed);
+     free(embed);
+ }
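
As a reading aid for the hunk above: a minimal standalone sketch of the anyres token arithmetic in clip_llava_handle_patches. The 336/14 constants are the CLIP ViT-L/336 values quoted in the code comments; the 2x2 grid, main(), and printf are illustrative assumptions, not code shipped in the package.

// anyres token arithmetic sketch (illustrative, not part of the package):
// constants follow the CLIP ViT-L/336 values quoted in the comments above
#include <cstdio>

int main() {
    const int image_size = 336;                     // clip_image_size()
    const int patch_size = 14;                      // clip_patch_size()
    const int per_side   = image_size / patch_size; // 24
    const int per_image  = per_side * per_side;     // 576 embeddings per (sub-)image

    // assumed example: best-fit resolution 672x672 -> 2x2 grid, plus the base image
    const int num_patches_width  = 2;
    const int num_patches_height = 2;
    const int num_images = num_patches_width * num_patches_height + 1;

    // clip_llava_handle_patches writes the base image first, then the grid
    // patches, so n_img_pos_out = num_images * 576 = 2880 here
    printf("n_img_pos = %d\n", num_images * per_image);
    return 0;
}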
package/src/llama.cpp/examples/llava/llava.h
@@ -0,0 +1,50 @@
+ #ifndef LLAVA_H
+ #define LLAVA_H
+
+ #include "ggml.h"
+
+ #ifdef LLAMA_SHARED
+ #    if defined(_WIN32) && !defined(__MINGW32__)
+ #        ifdef LLAMA_BUILD
+ #            define LLAVA_API __declspec(dllexport)
+ #        else
+ #            define LLAVA_API __declspec(dllimport)
+ #        endif
+ #    else
+ #        define LLAVA_API __attribute__ ((visibility ("default")))
+ #    endif
+ #else
+ #    define LLAVA_API
+ #endif
+
+ struct clip_ctx;
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ struct llava_image_embed {
+     float * embed;
+     int n_image_pos;
+ };
+
+ /** sanity check for clip <-> llava embed size match */
+ LLAVA_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip);
+
+ LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
+
+ /** build an image embed from image file bytes */
+ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
+ /** build an image embed from a path to an image filename */
+ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
+ /** free an embedding made with llava_image_embed_make_* */
+ LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
+
+ /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
+ LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
+
+ #ifdef __cplusplus
+ }
+ #endif
+
+ #endif
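
The header above declares the whole public LLaVA embedding API. A hedged usage sketch of how these entry points chain together; describe_image is a hypothetical helper, and ctx_clip / ctx_llama are assumed to have been created elsewhere (e.g. via clip_model_load() from clip.h and llama_new_context_with_model() from llama.h in this vendored tree).

// usage sketch (not part of the package): chaining the API declared above
#include "llava.h"

static bool describe_image(struct clip_ctx * ctx_clip, struct llama_context * ctx_llama,
                           const char * image_path, int n_threads, int n_batch) {
    // refuse mismatched mmproj/LLM pairs up front
    if (!llava_validate_embed_size(ctx_llama, ctx_clip)) {
        return false;
    }
    struct llava_image_embed * embed =
        llava_image_embed_make_with_filename(ctx_clip, n_threads, image_path);
    if (!embed) {
        return false;
    }
    int n_past = 0;
    // decode the image embedding into the context; n_past ends up pointing
    // at the first position after the image
    bool ok = llava_eval_image_embed(ctx_llama, embed, n_batch, &n_past);
    llava_image_embed_free(embed);
    // ...text prompt tokens would be evaluated next, starting at n_past...
    return ok;
}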
package/src/llama.cpp/examples/llava/requirements.txt
@@ -0,0 +1,3 @@
+ -r ../../requirements/requirements-convert.txt
+ pillow~=10.2.0
+ torch~=2.1.1
package/src/llama.cpp/examples/lookahead/CMakeLists.txt
@@ -0,0 +1,5 @@
+ set(TARGET lookahead)
+ add_executable(${TARGET} lookahead.cpp)
+ install(TARGETS ${TARGET} RUNTIME)
+ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)