@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281)
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
@@ -1,574 +0,0 @@
1
- #include "clip.h"
2
- #include "llava.h"
3
-
4
- #include "llama.h"
5
-
6
- #include <algorithm>
7
- #include <cerrno>
8
- #include <cstdio>
9
- #include <cstdlib>
10
- #include <cstring>
11
- #include <limits>
12
- #include <vector>
13
-
14
- #if defined(LLAVA_LOG_OFF)
15
- # define LOG_INF(...)
16
- # define LOG_WRN(...)
17
- # define LOG_ERR(...)
18
- # define LOG_DBG(...)
19
- #else // defined(LLAVA_LOG_OFF)
20
- # define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
21
- # define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
22
- # define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
23
- # define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
24
- #endif // defined(LLAVA_LOG_OFF)
25
-
26
- // RGB uint8 image
27
- struct clip_image_u8 {
28
- int nx;
29
- int ny;
30
-
31
- std::vector<uint8_t> buf;
32
- };
33
-
34
- // RGB float32 image (NHWC)
35
- // Memory layout: RGBRGBRGB...
36
- struct clip_image_f32 {
37
- int nx;
38
- int ny;
39
-
40
- std::vector<float> buf;
41
- };
42
-
43
- struct clip_image_grid_shape {
44
- int first;
45
- int second;
46
- };
47
-
48
- /**
49
- * Selects the best resolution from a list of possible resolutions based on the original size.
50
- *
51
- * @param original_size The original size of the image in the format (width, height).
52
- * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
53
- * @return The best fit resolution in the format (width, height).
54
- */
55
- static std::pair<int, int> select_best_resolution(const std::pair<int, int>& original_size, const std::vector<std::pair<int, int>>& possible_resolutions) {
56
- int original_width = original_size.first;
57
- int original_height = original_size.second;
58
-
59
- std::pair<int, int> best_fit;
60
- int max_effective_resolution = 0;
61
- int min_wasted_resolution = std::numeric_limits<int>::max();
62
-
63
- for (const auto& resolution : possible_resolutions) {
64
- int width = resolution.first;
65
- int height = resolution.second;
66
- float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
67
- int downscaled_width = static_cast<int>(original_width * scale);
68
- int downscaled_height = static_cast<int>(original_height * scale);
69
- int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
70
- int wasted_resolution = (width * height) - effective_resolution;
71
- // LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
72
- if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
73
- max_effective_resolution = effective_resolution;
74
- min_wasted_resolution = wasted_resolution;
75
- best_fit = resolution;
76
- }
77
- }
78
-
79
- return best_fit;
80
- }
81
-
82
- /**
83
- * @brief Get the anyres image grid shape object
84
- *
85
- * @param image_size
86
- * @param grid_pinpoints
87
- * @param image_patch_size
88
- * @return <int, int>
89
- */
90
- static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<int, int> & image_size, const std::vector<std::pair<int, int>> & grid_pinpoints, int image_patch_size) {
91
- /**
92
- Conversion from gguf flat array to vector:
93
- std::vector<std::pair<int, int>> possible_resolutions;
94
- for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
95
- possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
96
- }
97
- */
98
- auto best_resolution = select_best_resolution(image_size, grid_pinpoints);
99
- return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size};
100
- }
101
-
102
- // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
103
- static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
104
- struct {
105
- struct ggml_context * ctx;
106
- } model;
107
-
108
- const int32_t image_size = clip_image_size(ctx_clip);
109
- const int32_t patch_size = clip_patch_size(ctx_clip);
110
-
111
- int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)
112
-
113
- int num_patches_width = grid_shape.first; // grid 1-4
114
- int num_patches_height = grid_shape.second; // grid 1-4
115
-
116
- const size_t num_images = num_patches_width * num_patches_height + 1;
117
-
118
- // TODO: size calculation is not calculated - it's only tens of MB
119
- size_t ctx_size = 0;
120
-
121
- {
122
- ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features
123
- ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32);
124
- }
125
-
126
- struct ggml_init_params params {
127
- /*.mem_size =*/ ctx_size,
128
- /*.mem_buffer =*/ NULL,
129
- /*.no_alloc =*/ false, // NOTE: this should be false when using the legacy API
130
- };
131
-
132
- // Python reference code for full unpad:
133
- /*
134
- base_image_feature = image_feature[0]
135
- image_feature = image_feature[1:]
136
- image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
137
- image_feature = image_feature.flatten(1, 2).flatten(2, 3)
138
- image_feature = unpad_image(image_feature, image_sizes[image_idx])
139
- image_feature = torch.cat((
140
- image_feature,
141
- self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1)
142
- ), dim=-1)
143
- image_feature = image_feature.flatten(1, 2).transpose(0, 1)
144
- image_feature = torch.cat((base_image_feature, image_feature), dim=0)
145
- */
146
- // We now have two options: unpad or no unpad. Unpad removes tokens for faster llm eval.
147
- // In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D tensors are not supported in ggml yet.
148
- // Without unpad we have to split the sub-image embeddings into patches of 24 features each and permute them.
149
- // Once all images are processed to prepended the base_image_features without any changes.
150
-
151
- // Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (676x676 scaling))
152
- /*
153
- image_feature = image_feature.view(2, 2, 24, 24, 4096)
154
- image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
155
- image_feature = image_feature.view(2, 24, 2, 24, 4096)
156
- image_feature = image_feature.flatten(0, 3)
157
-
158
- // Reshape to 4D tensor by merging the last two dimensions
159
- image_feature = image_feature.view(2, 2, 24, 24*4096)
160
- image_feature = image_feature.permute(0, 2, 1, 3).contiguous()
161
- image_feature = image_feature.view(-1, 4096)
162
- */
163
-
164
- model.ctx = ggml_init(params);
165
-
166
- struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
167
- // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
168
- // fill it with the image embeddings, ignoring the base
169
- for (size_t i = 1; i < num_images; i++) {
170
- size_t offset = (i-1) * clip_embd_nbytes(ctx_clip);
171
- memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip));
172
- }
173
-
174
- struct ggml_cgraph * gf = ggml_new_graph(model.ctx);
175
- size_t size_ele = ggml_type_size(GGML_TYPE_F32);
176
-
177
- struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features,
178
- num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
179
- num_patches_per_side,
180
- num_patches_width,
181
- num_patches_height,
182
- size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
183
- size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side,
184
- size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side * num_patches_width, 0);
185
- // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,false);
186
- struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3));
187
- /**
188
- At the end of each row we have to add the row_end embeddings, which are the same as the newline embeddings
189
- image_feature = torch.cat((
190
- image_feature,
191
- self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)
192
- ), dim=-1)
193
- *
194
- */
195
-
196
- // ggml_tensor_printf(permuted_cont,"permuted_cont",__LINE__,false,false);
197
- struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0);
198
- // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
199
- ggml_build_forward_expand(gf, flatten);
200
- ggml_graph_compute_with_ctx(model.ctx, gf, 1);
201
- struct ggml_tensor* result = ggml_graph_node(gf, -1);
202
-
203
- memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
204
- // append without newline tokens (default behavior in llava_arch when not using unpad ):
205
- memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
206
- *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));
207
-
208
- // Debug: Test single segments
209
- // Current findings: sending base image, sending a segment embedding all works similar to python
210
- // However, permuted embeddings do not work yet (stride issue?)
211
- // memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as context
212
- // memcpy(image_embd_out, (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip)); // main image as context
213
- // *n_img_pos_out=576;
214
-
215
- ggml_free(model.ctx);
216
- return true;
217
- }
218
-
219
- static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size) {
220
- int width = image->nx;
221
- int height = image->ny;
222
- int num_patches = (height / patch_size) * (width / patch_size);
223
- clip_image_f32 * patch = clip_image_f32_init();
224
- patch->nx = patch_size * num_patches;
225
- patch->ny = patch_size;
226
- patch->buf.resize(3 * patch->nx * patch->ny);
227
-
228
- int patch_index = 0;
229
-
230
- for (int i = 0; i < height; i += patch_size) {
231
- for (int j = 0; j < width; j += patch_size) {
232
- for (int pi = 0; pi < patch_size; ++pi) {
233
- for (int pj = 0; pj < patch_size; ++pj) {
234
- int input_index = ((i + pi) * width + (j + pj)) * 3;
235
- int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3;
236
- patch->buf[output_index] = image->buf[input_index];
237
- patch->buf[output_index+1] = image->buf[input_index+1];
238
- patch->buf[output_index+2] = image->buf[input_index+2];
239
- }
240
- }
241
- patch_index++;
242
- }
243
- }
244
- return patch;
245
- }
246
-
247
- static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
248
- // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
249
- clip_image_f32_batch img_res_v;
250
- img_res_v.size = 0;
251
- img_res_v.data = nullptr;
252
- if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
253
- LOG_ERR("%s: unable to preprocess image\n", __func__);
254
- delete[] img_res_v.data;
255
- return false;
256
- }
257
-
258
- const int64_t t_img_enc_start_us = ggml_time_us();
259
-
260
- const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
261
-
262
- if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) {
263
- std::vector<float *> image_embd_v;
264
- image_embd_v.resize(img_res_v.size);
265
- struct clip_image_size * load_image_size = clip_image_size_init();
266
-
267
- for (size_t i = 0; i < img_res_v.size; i++) {
268
- const int64_t t_img_enc_step_start_us = ggml_time_us();
269
- image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
270
- int patch_size=14;
271
- load_image_size->width = img_res_v.data[i].nx;
272
- load_image_size->height = img_res_v.data[i].ny;
273
- clip_add_load_image_size(ctx_clip, load_image_size);
274
-
275
- bool encoded = false;
276
- if (clip_is_qwen2vl(ctx_clip)) {
277
- encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
278
- }
279
- else {
280
- encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
281
- }
282
-
283
- if (!encoded) {
284
- LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
285
- return false;
286
- }
287
- const int64_t t_img_enc_steop_batch_us = ggml_time_us();
288
- LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
289
- }
290
- const int64_t t_img_enc_batch_us = ggml_time_us();
291
- LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
292
-
293
- int n_img_pos_out = 0;
294
- for (size_t i = 0; i < image_embd_v.size(); i++) {
295
- std::memcpy(
296
- image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
297
- image_embd_v[i],
298
- clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
299
- n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]);
300
- }
301
- *n_img_pos = n_img_pos_out;
302
- for (size_t i = 0; i < image_embd_v.size(); i++) {
303
- free(image_embd_v[i]);
304
- }
305
- image_embd_v.clear();
306
- load_image_size->width = img->nx;
307
- load_image_size->height = img->ny;
308
- clip_add_load_image_size(ctx_clip, load_image_size);
309
- LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
310
- delete[] img_res_v.data;
311
- img_res_v.size = 0;
312
- img_res_v.data = nullptr;
313
- }
314
- else if (clip_is_glm(ctx_clip)){
315
- struct clip_image_size * load_image_size = clip_image_size_init();
316
- load_image_size->width = img_res_v.data[0].nx;
317
- load_image_size->height = img_res_v.data[0].ny;
318
- clip_add_load_image_size(ctx_clip, load_image_size);
319
-
320
- bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);
321
- int pos = int(load_image_size->width/clip_patch_size(ctx_clip)/2);
322
- *n_img_pos = (pos * pos + 2);
323
- if (!encoded){
324
- LOG_ERR("Unable to encode image \n");
325
- return false;
326
- }
327
- }
328
- else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
329
- // flat / default llava-1.5 type embedding
330
- *n_img_pos = clip_n_patches(ctx_clip);
331
- bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
332
- delete[] img_res_v.data;
333
- if (!encoded) {
334
- LOG_ERR("Unable to encode image\n");
335
-
336
- return false;
337
- }
338
- }
339
- else {
340
- // spatial_unpad llava-1.6 type embedding
341
- // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
342
- std::vector<float *> image_embd_v;
343
- image_embd_v.resize(img_res_v.size);
344
- for (size_t i = 0; i < img_res_v.size; i++) {
345
- image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
346
- const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
347
- if (!encoded) {
348
- LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
349
- return false;
350
- }
351
- }
352
- const int64_t t_img_enc_batch_us = ggml_time_us();
353
- LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
354
-
355
- const int32_t * image_grid = clip_image_grid(ctx_clip);
356
- const size_t num_gridpoints = get_clip_image_grid_size(ctx_clip);
357
-
358
- std::vector<std::pair<int, int>> grid_pinpoints;
359
- for (size_t i = 0; i < num_gridpoints; i += 2) {
360
- grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
361
- }
362
-
363
- // free all img_res_v - not needed anymore
364
- delete[] img_res_v.data;
365
- img_res_v.size = 0;
366
- img_res_v.data = nullptr;
367
-
368
- const int32_t image_size = clip_image_size(ctx_clip);
369
-
370
- struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
371
-
372
- int n_img_pos_out;
373
- clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
374
- *n_img_pos = n_img_pos_out;
375
-
376
- for (size_t i = 0; i < image_embd_v.size(); i++) {
377
- free(image_embd_v[i]);
378
- }
379
- image_embd_v.clear();
380
-
381
- // debug image/segment/normalization content:
382
- // clip_image_u8 * tmp = clip_image_u8_init();
383
- // clip_image_convert_f32_to_u8(*image_feature, *tmp);
384
- // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
385
- }
386
-
387
- LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
388
-
389
- const int64_t t_img_enc_end_us = ggml_time_us();
390
- float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
391
-
392
- LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
393
-
394
- return true;
395
- }
396
-
397
- bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
398
- // make sure that the correct mmproj was used, i.e., compare apples to apples
399
- int n_llama_embd = llama_model_n_embd(llama_get_model(ctx_llama));
400
- auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
401
- if (n_image_embd != n_llama_embd) {
402
- LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
403
- return false;
404
- }
405
- return true;
406
- }
407
-
408
- bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
409
- // Granite vision uses up to 10 patches + base patch
410
- int num_max_patches = 11;
411
- if (clip_is_minicpmv(ctx_clip)) {
412
- num_max_patches = 10;
413
- }
414
- if (clip_is_glm(ctx_clip)) {
415
- num_max_patches = 1;
416
- }
417
- float * image_embd;
418
- if (clip_is_qwen2vl(ctx_clip)) {
419
- // qwen2vl don't split image into chunks, so `num_max_patches` is not needed.
420
- image_embd = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img->nx, img->ny));
421
- } else {
422
- image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
423
- }
424
- if (!image_embd) {
425
- LOG_ERR("Unable to allocate memory for image embeddings\n");
426
- return false;
427
- }
428
-
429
- int n_img_pos;
430
- if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
431
- LOG_ERR("%s: cannot encode image, aborting\n", __func__);
432
- free(image_embd);
433
- return false;
434
- }
435
- *image_embd_out = image_embd;
436
- *n_img_pos_out = n_img_pos;
437
-
438
- return true;
439
- }
440
-
441
- struct llava_embd_batch {
442
- std::vector<llama_pos> pos;
443
- std::vector<int32_t> n_seq_id;
444
- std::vector<llama_seq_id> seq_id_0;
445
- std::vector<llama_seq_id *> seq_ids;
446
- std::vector<int8_t> logits;
447
- llama_batch batch;
448
- llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
449
- pos .resize(n_tokens);
450
- n_seq_id.resize(n_tokens);
451
- seq_ids .resize(n_tokens + 1);
452
- logits .resize(n_tokens);
453
- seq_id_0.resize(1);
454
- seq_id_0[0] = seq_id;
455
- seq_ids [n_tokens] = nullptr;
456
- batch = {
457
- /*n_tokens =*/ n_tokens,
458
- /*tokens =*/ nullptr,
459
- /*embd =*/ embd,
460
- /*pos =*/ pos.data(),
461
- /*n_seq_id =*/ n_seq_id.data(),
462
- /*seq_id =*/ seq_ids.data(),
463
- /*logits =*/ logits.data(),
464
- };
465
- for (int i = 0; i < n_tokens; i++) {
466
- batch.pos [i] = pos_0 + i;
467
- batch.n_seq_id[i] = 1;
468
- batch.seq_id [i] = seq_id_0.data();
469
- batch.logits [i] = false;
470
- }
471
- }
472
- };
473
-
474
- bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
475
- int n_embd = llama_model_n_embd(llama_get_model(ctx_llama));
476
-
477
- for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
478
- int n_eval = image_embed->n_image_pos - i;
479
- if (n_eval > n_batch) {
480
- n_eval = n_batch;
481
- }
482
- float * embd = image_embed->embed+i*n_embd;
483
- llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
484
- if (llama_decode(ctx_llama, llava_batch.batch)) {
485
- LOG_ERR("%s : failed to eval\n", __func__);
486
- return false;
487
- }
488
- *n_past += n_eval;
489
- }
490
- return true;
491
- }
492
-
493
- struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
494
- clip_image_u8 * img = clip_image_u8_init();
495
- if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
496
- clip_image_u8_free(img);
497
- LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__);
498
- return NULL;
499
- }
500
-
501
- float* image_embed = NULL;
502
- int n_image_pos = 0;
503
- bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
504
- if (!image_embed_result) {
505
- clip_image_u8_free(img);
506
- LOG_ERR("%s: couldn't embed the image\n", __func__);
507
- return NULL;
508
- }
509
-
510
- clip_image_u8_free(img);
511
- auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed));
512
- result->embed = image_embed;
513
- result->n_image_pos = n_image_pos;
514
- return result;
515
- }
516
-
517
- static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
518
- auto file = fopen(path, "rb");
519
- if (file == NULL) {
520
- LOG_ERR("%s: can't read file %s\n", __func__, path);
521
- return false;
522
- }
523
-
524
- fseek(file, 0, SEEK_END);
525
- auto fileSize = ftell(file);
526
- fseek(file, 0, SEEK_SET);
527
-
528
- auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
529
- if (buffer == NULL) {
530
- LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
531
- perror("Memory allocation error");
532
- fclose(file);
533
- return false;
534
- }
535
- errno = 0;
536
- size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer
537
- if (ferror(file)) {
538
- LOG_ERR("read error: %s", strerror(errno));
539
- free(buffer);
540
- fclose(file);
541
- return false;
542
- }
543
- if (ret != (size_t) fileSize) {
544
- LOG_ERR("unexpectedly reached end of file");
545
- free(buffer);
546
- fclose(file);
547
- return false;
548
- }
549
- fclose(file); // Close the file
550
-
551
- *bytesOut = buffer;
552
- *sizeOut = fileSize;
553
- return true;
554
- }
555
-
556
- struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
557
- unsigned char* image_bytes;
558
- long image_bytes_length;
559
- auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
560
- if (!loaded) {
561
- LOG_ERR("%s: failed to load %s\n", __func__, image_path);
562
- return NULL;
563
- }
564
-
565
- llava_image_embed *embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
566
- free(image_bytes);
567
-
568
- return embed;
569
- }
570
-
571
- void llava_image_embed_free(struct llava_image_embed * embed) {
572
- free(embed->embed);
573
- free(embed);
574
- }
@@ -1,49 +0,0 @@
1
- #ifndef LLAVA_H
2
- #define LLAVA_H
3
-
4
- #include "ggml.h"
5
-
6
- #ifdef LLAMA_SHARED
7
- # if defined(_WIN32) && !defined(__MINGW32__)
8
- # ifdef LLAMA_BUILD
9
- # define LLAVA_API __declspec(dllexport)
10
- # else
11
- # define LLAVA_API __declspec(dllimport)
12
- # endif
13
- # else
14
- # define LLAVA_API __attribute__ ((visibility ("default")))
15
- # endif
16
- #else
17
- # define LLAVA_API
18
- #endif
19
-
20
- #ifdef __cplusplus
21
- extern "C" {
22
- #endif
23
-
24
- struct clip_ctx;
25
- struct llava_image_embed {
26
- float * embed;
27
- int n_image_pos;
28
- };
29
-
30
- /** sanity check for clip <-> llava embed size match */
31
- LLAVA_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip);
32
-
33
- LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
34
-
35
- /** build an image embed from image file bytes */
36
- LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
37
- /** build an image embed from a path to an image filename */
38
- LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
39
- /** free an embedding made with llava_image_embed_make_* */
40
- LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
41
-
42
- /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
43
- LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
44
-
45
- #ifdef __cplusplus
46
- }
47
- #endif
48
-
49
- #endif