@fugood/llama.node 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (286) hide show
  1. package/CMakeLists.txt +7 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/DetokenizeWorker.cpp +1 -1
  19. package/src/EmbeddingWorker.cpp +17 -7
  20. package/src/EmbeddingWorker.h +2 -1
  21. package/src/LlamaCompletionWorker.cpp +8 -8
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +89 -27
  24. package/src/LlamaContext.h +2 -0
  25. package/src/TokenizeWorker.cpp +1 -1
  26. package/src/common.hpp +4 -4
  27. package/src/llama.cpp/.github/workflows/build.yml +240 -168
  28. package/src/llama.cpp/.github/workflows/docker.yml +8 -8
  29. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  31. package/src/llama.cpp/CMakeLists.txt +14 -6
  32. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/cmake/common.cmake +33 -0
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  36. package/src/llama.cpp/common/CMakeLists.txt +6 -4
  37. package/src/llama.cpp/common/arg.cpp +986 -770
  38. package/src/llama.cpp/common/arg.h +22 -22
  39. package/src/llama.cpp/common/common.cpp +212 -351
  40. package/src/llama.cpp/common/common.h +204 -117
  41. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  42. package/src/llama.cpp/common/log.cpp +50 -50
  43. package/src/llama.cpp/common/log.h +18 -18
  44. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  45. package/src/llama.cpp/common/ngram-cache.h +19 -19
  46. package/src/llama.cpp/common/sampling.cpp +163 -121
  47. package/src/llama.cpp/common/sampling.h +41 -20
  48. package/src/llama.cpp/common/speculative.cpp +274 -0
  49. package/src/llama.cpp/common/speculative.h +28 -0
  50. package/src/llama.cpp/docs/build.md +134 -161
  51. package/src/llama.cpp/examples/CMakeLists.txt +33 -14
  52. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/batched/batched.cpp +19 -18
  54. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  56. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  58. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  60. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  61. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  63. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  64. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  65. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  66. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  67. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  69. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  71. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  72. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  73. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  75. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  77. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
  79. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  80. package/src/llama.cpp/examples/infill/infill.cpp +41 -87
  81. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
  83. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
  84. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  85. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  86. package/src/llama.cpp/examples/llava/clip.cpp +263 -66
  87. package/src/llama.cpp/examples/llava/clip.h +8 -2
  88. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  89. package/src/llama.cpp/examples/llava/llava.cpp +83 -22
  90. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  91. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  92. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  94. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  95. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  96. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  97. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
  98. package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
  99. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  100. package/src/llama.cpp/examples/main/main.cpp +73 -114
  101. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  102. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  104. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  105. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  106. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  108. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  110. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  111. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  112. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  113. package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
  114. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  115. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  116. package/src/llama.cpp/examples/run/run.cpp +911 -0
  117. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  118. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
  119. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
  120. package/src/llama.cpp/examples/server/server.cpp +2073 -1339
  121. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  122. package/src/llama.cpp/examples/server/utils.hpp +354 -277
  123. package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
  124. package/src/llama.cpp/examples/simple/simple.cpp +130 -94
  125. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  126. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
  127. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
  129. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  130. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  131. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
  133. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  134. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  135. package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
  136. package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
  137. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  138. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  139. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  140. package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
  141. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  142. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  143. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  144. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  145. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  146. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  147. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  148. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  149. package/src/llama.cpp/ggml/include/ggml.h +159 -417
  150. package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
  151. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
  152. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
  153. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
  154. package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
  155. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  156. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
  157. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
  158. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  159. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  160. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
  161. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  162. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  163. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  164. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  165. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  169. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  170. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
  171. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  172. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  173. package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  174. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  175. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  176. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  177. package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
  178. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  179. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  180. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  181. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
  182. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  183. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  184. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  185. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  186. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  187. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
  188. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
  189. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
  190. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
  192. package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
  193. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  194. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
  195. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
  196. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  197. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
  198. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  199. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  200. package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
  201. package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
  202. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  203. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  204. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
  205. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
  208. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
  209. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  210. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  211. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  212. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
  213. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  214. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  215. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  216. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
  217. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  218. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  219. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
  220. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
  221. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  222. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  223. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  224. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  225. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  226. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  227. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  228. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  229. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  230. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  231. package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
  232. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
  233. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
  234. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
  235. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  236. package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
  237. package/src/llama.cpp/include/llama-cpp.h +25 -0
  238. package/src/llama.cpp/include/llama.h +93 -52
  239. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  242. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  243. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  244. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  245. package/src/llama.cpp/src/CMakeLists.txt +4 -8
  246. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  247. package/src/llama.cpp/src/llama-grammar.h +2 -5
  248. package/src/llama.cpp/src/llama-sampling.cpp +779 -194
  249. package/src/llama.cpp/src/llama-sampling.h +21 -2
  250. package/src/llama.cpp/src/llama-vocab.cpp +55 -10
  251. package/src/llama.cpp/src/llama-vocab.h +35 -11
  252. package/src/llama.cpp/src/llama.cpp +4317 -2979
  253. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  254. package/src/llama.cpp/src/unicode.cpp +62 -51
  255. package/src/llama.cpp/src/unicode.h +9 -10
  256. package/src/llama.cpp/tests/CMakeLists.txt +48 -38
  257. package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
  258. package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
  259. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  260. package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
  261. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  262. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  263. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  264. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  265. package/src/llama.cpp/tests/test-log.cpp +2 -2
  266. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  267. package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
  268. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  269. package/src/llama.cpp/tests/test-rope.cpp +62 -20
  270. package/src/llama.cpp/tests/test-sampling.cpp +163 -138
  271. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  272. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  273. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  274. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  275. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  276. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  277. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  278. package/src/llama.cpp/common/train.cpp +0 -1515
  279. package/src/llama.cpp/common/train.h +0 -233
  280. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  281. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  282. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
  283. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
  284. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  285. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  286. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
@@ -0,0 +1,581 @@
1
+ #include "arg.h"
2
+ #include "base64.hpp"
3
+ #include "log.h"
4
+ #include "common.h"
5
+ #include "sampling.h"
6
+ #include "clip.h"
7
+ #include "llava.h"
8
+ #include "llama.h"
9
+ #include "ggml.h"
10
+
11
+ #ifdef GGML_USE_CUDA
12
+ #include "ggml-cuda.h"
13
+ #endif
14
+ #ifdef NDEBUG
15
+ #include "ggml-alloc.h"
16
+ #include "ggml-backend.h"
17
+ #endif
18
+
19
+ #include <cstdio>
20
+ #include <cstdlib>
21
+ #include <cstring>
22
+ #include <vector>
23
+ #include <algorithm>
24
+ #include <iostream>
25
+ #include <fstream>
26
+
27
+
28
+ static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
29
+ int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
30
+ int n_embd = llama_n_embd(llama_get_model(ctx_llama));
31
+ const int patch_size = 14 * 2;
32
+ const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0);
33
+ const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0);
34
+ auto img_tokens = image_embed->n_image_pos;
35
+ // llama_pos mrope_pos[img_tokens * 4];
36
+ std::vector<llama_pos> mrope_pos;
37
+ mrope_pos.resize(img_tokens * 4);
38
+
39
+ for (int y = 0; y < ph; y++)
40
+ {
41
+ for (int x = 0; x < pw; x++)
42
+ {
43
+ int i = y * pw + x;
44
+ mrope_pos[i] = *st_pos_id;
45
+ mrope_pos[i + img_tokens] = *st_pos_id + y;
46
+ mrope_pos[i + img_tokens * 2] = *st_pos_id + x;
47
+ mrope_pos[i + img_tokens * 3] = 0;
48
+ }
49
+ }
50
+ *st_pos_id += std::max(pw, ph);
51
+
52
+ int processed = 0;
53
+ std::vector<llama_pos> batch_mrope_pos;
54
+ batch_mrope_pos.resize(img_tokens * 4);
55
+
56
+ for (int i = 0; i < img_tokens; i += n_batch) {
57
+ int n_eval = img_tokens - i;
58
+ if (n_eval > n_batch) {
59
+ n_eval = n_batch;
60
+ }
61
+
62
+ // llama_pos batch_mrope_pos[n_eval * 4];
63
+ std::fill(batch_mrope_pos.begin(), batch_mrope_pos.end(), 0);
64
+ memcpy(batch_mrope_pos.data(), &mrope_pos[processed], n_eval * sizeof(llama_pos));
65
+ memcpy(&batch_mrope_pos[n_eval * 1], &mrope_pos[img_tokens * 1 + processed], n_eval * sizeof(llama_pos));
66
+ memcpy(&batch_mrope_pos[n_eval * 2], &mrope_pos[img_tokens * 2 + processed], n_eval * sizeof(llama_pos));
67
+ memcpy(&batch_mrope_pos[n_eval * 3], &mrope_pos[img_tokens * 3 + processed], n_eval * sizeof(llama_pos));
68
+
69
+ llama_batch batch = {
70
+ int32_t(n_eval), // n_tokens
71
+ nullptr, // token
72
+ (image_embed->embed+i*n_embd), // embed
73
+ batch_mrope_pos.data(), // pos
74
+ nullptr, // n_seq_id
75
+ nullptr, // seq_id
76
+ nullptr, // logits
77
+ };
78
+
79
+ if (llama_decode(ctx_llama, batch)) {
80
+ LOG_ERR("%s : failed to eval\n", __func__);
81
+ return false;
82
+ }
83
+ *n_past += n_eval;
84
+ processed += n_eval;
85
+ }
86
+ return true;
87
+ }
88
+
89
+
90
+ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past, int * st_pos_id) {
91
+ int N = (int) tokens.size();
92
+ std::vector<llama_pos> pos;
93
+ for (int i = 0; i < N; i += n_batch) {
94
+ int n_eval = (int) tokens.size() - i;
95
+ if (n_eval > n_batch) {
96
+ n_eval = n_batch;
97
+ }
98
+ auto batch = llama_batch_get_one(&tokens[i], n_eval);
99
+ // TODO: add mrope pos ids somewhere else
100
+ pos.resize(batch.n_tokens * 4);
101
+ std::fill(pos.begin(), pos.end(), 0);
102
+ for (int j = 0; j < batch.n_tokens * 3; j ++) {
103
+ pos[j] = *st_pos_id + (j % batch.n_tokens);
104
+ }
105
+ batch.pos = pos.data();
106
+
107
+ if (llama_decode(ctx_llama, batch)) {
108
+ LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
109
+ return false;
110
+ }
111
+ *n_past += n_eval;
112
+ *st_pos_id += n_eval;
113
+ }
114
+ return true;
115
+ }
116
+
117
+ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past, int * st_pos_id) {
118
+ std::vector<llama_token> tokens;
119
+ tokens.push_back(id);
120
+ return eval_tokens(ctx_llama, tokens, 1, n_past, st_pos_id);
121
+ }
122
+
123
+ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, int * st_pos_id, bool add_bos){
124
+ std::string str2 = str;
125
+ std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
126
+ eval_tokens(ctx_llama, embd_inp, n_batch, n_past, st_pos_id);
127
+ return true;
128
+ }
129
+
130
+ static const char * sample(struct common_sampler * smpl,
131
+ struct llama_context * ctx_llama,
132
+ int * n_past, int * st_pos_id) {
133
+ const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
134
+ common_sampler_accept(smpl, id, true);
135
+ static std::string ret;
136
+ if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
137
+ ret = "</s>";
138
+ } else {
139
+ ret = common_token_to_piece(ctx_llama, id);
140
+ }
141
+ eval_id(ctx_llama, id, n_past, st_pos_id);
142
+ return ret.c_str();
143
+ }
144
+
// Delimiters of an inline base64 image inside a prompt:
//   <img src="data:image/jpeg;base64,<base64 bytes>">
static const char * IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
static const char * IMG_BASE64_TAG_END   = "\">";
147
+
148
+ static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
149
+ begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
150
+ end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
151
+ }
152
+
153
+ static bool prompt_contains_image(const std::string& prompt) {
154
+ size_t begin, end;
155
+ find_image_tag_in_prompt(prompt, begin, end);
156
+ return (begin != std::string::npos);
157
+ }
158
+
159
+ // replaces the base64 image tag in the prompt with `replacement`
160
+ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
161
+ size_t img_base64_str_start, img_base64_str_end;
162
+ find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
163
+ if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
164
+ LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
165
+ return NULL;
166
+ }
167
+
168
+ auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
169
+ auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
170
+ auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count );
171
+
172
+ auto required_bytes = base64::required_encode_size(base64_str.size());
173
+ auto img_bytes = std::vector<unsigned char>(required_bytes);
174
+ base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
175
+
176
+ auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
177
+ if (!embed) {
178
+ LOG_ERR("%s: could not load image from base64 string.\n", __func__);
179
+ return NULL;
180
+ }
181
+
182
+ return embed;
183
+ }
184
+
185
+ static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
186
+ size_t begin, end;
187
+ find_image_tag_in_prompt(prompt, begin, end);
188
+ if (begin == std::string::npos || end == std::string::npos) {
189
+ return prompt;
190
+ }
191
+ auto pre = prompt.substr(0, begin);
192
+ auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
193
+ return pre + replacement + post;
194
+ }
195
+
// Bundles the handles this tool works with. All members default to null.
struct llava_context {
    struct clip_ctx      * ctx_clip  = nullptr;  // image encoder context
    struct llama_context * ctx_llama = nullptr;  // language model context
    struct llama_model   * model     = nullptr;  // language model
};
201
+
202
+ static void print_usage(int, char ** argv) {
203
+ LOG("\n example usage:\n");
204
+ LOG("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
205
+ LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
206
+ }
207
+
208
+ static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) {
209
+
210
+ // load and preprocess the image
211
+ llava_image_embed * embed = NULL;
212
+ auto prompt = params->prompt;
213
+ if (prompt_contains_image(prompt)) {
214
+ if (!params->image.empty()) {
215
+ LOG_INF("using base64 encoded image instead of command line image path\n");
216
+ }
217
+ embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
218
+ if (!embed) {
219
+ LOG_ERR("%s: can't load image from prompt\n", __func__);
220
+ return NULL;
221
+ }
222
+ params->prompt = remove_image_from_prompt(prompt);
223
+ } else {
224
+ embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
225
+ if (!embed) {
226
+ fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
227
+ return NULL;
228
+ }
229
+ }
230
+
231
+ return embed;
232
+ }
233
+
234
+ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) {
235
+ int n_past = 0;
236
+ int cur_pos_id = 0;
237
+
238
+ const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
239
+
240
+ std::string system_prompt, user_prompt;
241
+ size_t image_pos = prompt.find("<|vision_start|>");
242
+ if (image_pos != std::string::npos) {
243
+ // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
244
+ system_prompt = prompt.substr(0, image_pos);
245
+ user_prompt = prompt.substr(image_pos + std::string("<|vision_pad|>").length());
246
+ LOG_INF("system_prompt: %s\n", system_prompt.c_str());
247
+ if (params->verbose_prompt) {
248
+ auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
249
+ for (int i = 0; i < (int) tmp.size(); i++) {
250
+ LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
251
+ }
252
+ }
253
+ LOG_INF("user_prompt: %s\n", user_prompt.c_str());
254
+ if (params->verbose_prompt) {
255
+ auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
256
+ for (int i = 0; i < (int) tmp.size(); i++) {
257
+ LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
258
+ }
259
+ }
260
+ } else {
261
+ // llava-1.5 native mode
262
+ system_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|>";
263
+ user_prompt = "<|vision_end|>" + prompt + "<|im_end|>\n<|im_start|>assistant\n";
264
+ if (params->verbose_prompt) {
265
+ auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
266
+ for (int i = 0; i < (int) tmp.size(); i++) {
267
+ LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
268
+ }
269
+ }
270
+ }
271
+
272
+ eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, true);
273
+ if (image_embed != nullptr) {
274
+ auto image_size = clip_get_load_image_size(ctx_llava->ctx_clip);
275
+ qwen2vl_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past, &cur_pos_id, image_size);
276
+ }
277
+ eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, false);
278
+
279
+ // generate the response
280
+
281
+ LOG("\n");
282
+
283
+ struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
284
+ if (!smpl) {
285
+ LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
286
+ exit(1);
287
+ }
288
+
289
+ std::string response = "";
290
+ for (int i = 0; i < max_tgt_len; i++) {
291
+ const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past, &cur_pos_id);
292
+ response += tmp;
293
+ if (strcmp(tmp, "</s>") == 0) break;
294
+ if (strstr(tmp, "###")) break; // Yi-VL behavior
295
+ LOG("%s", tmp);
296
+ if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
297
+ if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
298
+ if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
299
+
300
+ fflush(stdout);
301
+ }
302
+
303
+ common_sampler_free(smpl);
304
+ LOG("\n");
305
+ }
306
+
307
+ static struct llama_model * llava_init(common_params * params) {
308
+ llama_backend_init();
309
+ llama_numa_init(params->numa);
310
+
311
+ llama_model_params model_params = common_model_params_to_llama(*params);
312
+
313
+ llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
314
+ if (model == NULL) {
315
+ LOG_ERR("%s: unable to load model\n" , __func__);
316
+ return NULL;
317
+ }
318
+ return model;
319
+ }
320
+
321
+ static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
322
+ const char * clip_path = params->mmproj.c_str();
323
+
324
+ auto prompt = params->prompt;
325
+ if (prompt.empty()) {
326
+ prompt = "describe the image in detail.";
327
+ }
328
+
329
+ auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
330
+
331
+
332
+ llama_context_params ctx_params = common_context_params_to_llama(*params);
333
+ ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
334
+
335
+ llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
336
+
337
+ if (ctx_llama == NULL) {
338
+ LOG_ERR("%s: failed to create the llama_context\n" , __func__);
339
+ return NULL;
340
+ }
341
+
342
+ auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
343
+
344
+ ctx_llava->ctx_llama = ctx_llama;
345
+ ctx_llava->ctx_clip = ctx_clip;
346
+ ctx_llava->model = model;
347
+ return ctx_llava;
348
+ }
349
+
350
+ static void llava_free(struct llava_context * ctx_llava) {
351
+ if (ctx_llava->ctx_clip) {
352
+ clip_free(ctx_llava->ctx_clip);
353
+ ctx_llava->ctx_clip = NULL;
354
+ }
355
+
356
+ llama_free(ctx_llava->ctx_llama);
357
+ llama_free_model(ctx_llava->model);
358
+ llama_backend_free();
359
+ }
360
+
361
+ #ifndef NDEBUG
362
+
363
+ static void debug_test_mrope_2d() {
364
+ // 1. Initialize backend
365
+ ggml_backend_t backend = NULL;
366
+ std::string backend_name = "";
367
+ #ifdef GGML_USE_CUDA
368
+ fprintf(stderr, "%s: using CUDA backend\n", __func__);
369
+ backend = ggml_backend_cuda_init(0); // init device 0
370
+ backend_name = "cuda";
371
+ if (!backend) {
372
+ fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
373
+ }
374
+ #endif
375
+ // if there aren't GPU Backends fallback to CPU backend
376
+ if (!backend) {
377
+ backend = ggml_backend_cpu_init();
378
+ backend_name = "cpu";
379
+ }
380
+
381
+ // Calculate the size needed to allocate
382
+ size_t ctx_size = 0;
383
+ ctx_size += 2 * ggml_tensor_overhead(); // tensors
384
+ // no need to allocate anything else!
385
+
386
+ // 2. Allocate `ggml_context` to store tensor data
387
+ struct ggml_init_params params = {
388
+ /*.mem_size =*/ ctx_size,
389
+ /*.mem_buffer =*/ NULL,
390
+ /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors()
391
+ };
392
+ struct ggml_context * ctx = ggml_init(params);
393
+
394
+ struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 12, 30);
395
+ ggml_set_name(inp_raw, "inp_raw");
396
+ ggml_set_input(inp_raw);
397
+
398
+ struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 30 * 4);
399
+ ggml_set_name(pos, "pos");
400
+ ggml_set_input(pos);
401
+
402
+ std::vector<float> dummy_q;
403
+ dummy_q.resize(128 * 12 * 30);
404
+ std::fill(dummy_q.begin(), dummy_q.end(), 0.1);
405
+ // memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));
406
+
407
+ std::vector<int> pos_id;
408
+ pos_id.resize(30 * 4);
409
+ for (int i = 0; i < 30; i ++) {
410
+ pos_id[i] = i;
411
+ pos_id[i + 30] = i + 10;
412
+ pos_id[i + 60] = i + 20;
413
+ pos_id[i + 90] = i + 30;
414
+ }
415
+ int sections[4] = {32, 32, 0, 0};
416
+
417
+ // 4. Allocate a `ggml_backend_buffer` to store all tensors
418
+ ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
419
+
420
+ // 5. Copy tensor data from main memory (RAM) to backend buffer
421
+ ggml_backend_tensor_set(inp_raw, dummy_q.data(), 0, ggml_nbytes(inp_raw));
422
+ ggml_backend_tensor_set(pos, pos_id.data(), 0, ggml_nbytes(pos));
423
+
424
+ // 6. Create a `ggml_cgraph` for mul_mat operation
425
+ struct ggml_cgraph * gf = NULL;
426
+ struct ggml_context * ctx_cgraph = NULL;
427
+
428
+ // create a temporally context to build the graph
429
+ struct ggml_init_params params0 = {
430
+ /*.mem_size =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
431
+ /*.mem_buffer =*/ NULL,
432
+ /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
433
+ };
434
+ ctx_cgraph = ggml_init(params0);
435
+ gf = ggml_new_graph(ctx_cgraph);
436
+
437
+ struct ggml_tensor * result0 = ggml_rope_multi(
438
+ ctx_cgraph, inp_raw, pos, nullptr,
439
+ 128/2, sections, LLAMA_ROPE_TYPE_VISION, 32768, 1000000, 1,
440
+ 0, 1, 32, 1);
441
+
442
+ // Add "result" tensor and all of its dependencies to the cgraph
443
+ ggml_build_forward_expand(gf, result0);
444
+
445
+ // 7. Create a `ggml_gallocr` for cgraph computation
446
+ ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
447
+ ggml_gallocr_alloc_graph(allocr, gf);
448
+
449
+ // 9. Run the computation
450
+ int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading
451
+ if (ggml_backend_is_cpu(backend)) {
452
+ ggml_backend_cpu_set_n_threads(backend, n_threads);
453
+ }
454
+ ggml_backend_graph_compute(backend, gf);
455
+
456
+ // 10. Retrieve results (output tensors)
457
+ // in this example, output tensor is always the last tensor in the graph
458
+ struct ggml_tensor * result = result0;
459
+ // struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
460
+ float * result_data = (float *)malloc(ggml_nbytes(result));
461
+ // because the tensor data is stored in device buffer, we need to copy it back to RAM
462
+ ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result));
463
+ const std::string bin_file = "mrope_2d_" + backend_name +".bin";
464
+ std::ofstream outFile(bin_file, std::ios::binary);
465
+
466
+ if (outFile.is_open()) {
467
+ outFile.write(reinterpret_cast<const char*>(result_data), ggml_nbytes(result));
468
+ outFile.close();
469
+ std::cout << "Data successfully written to " + bin_file << std::endl;
470
+ } else {
471
+ std::cerr << "Error opening file!" << std::endl;
472
+ }
473
+
474
+ free(result_data);
475
+ // 11. Free memory and exit
476
+ ggml_free(ctx_cgraph);
477
+ ggml_gallocr_free(allocr);
478
+ ggml_free(ctx);
479
+ ggml_backend_buffer_free(buffer);
480
+ ggml_backend_free(backend);
481
+ }
482
+
483
+ static void debug_dump_img_embed(struct llava_context * ctx_llava) {
484
+ int n_embd = llama_n_embd(llama_get_model(ctx_llava->ctx_llama));
485
+ int ne = n_embd * 4;
486
+ float vals[56 * 56 * 3];
487
+ // float embd[ne];
488
+ std::vector<float> embd;
489
+ embd.resize(ne);
490
+
491
+ for (int i = 0; i < 56*56; i++)
492
+ {
493
+ for (int c = 0; c < 3; c++)
494
+ vals[i * 3 + c] = (float)(i % (56 * 56)) / (56*56);
495
+ }
496
+
497
+ clip_encode_float_image(ctx_llava->ctx_clip, 16, vals, 56, 56, embd.data());
498
+
499
+ std::ofstream outFile("img_embed.bin", std::ios::binary);
500
+ if (outFile.is_open()) {
501
+ outFile.write(reinterpret_cast<const char*>(embd.data()), ne * sizeof(float));
502
+
503
+ outFile.close();
504
+ std::cout << "Data successfully written to mrope.bin" << std::endl;
505
+ } else {
506
+ std::cerr << "Error opening file!" << std::endl;
507
+ }
508
+ }
509
+
510
+ #endif
511
+
512
+
513
+ int main(int argc, char ** argv) {
514
+ ggml_time_init();
515
+
516
+ common_params params;
517
+
518
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
519
+ return 1;
520
+ }
521
+
522
+ common_init();
523
+
524
+ if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
525
+ print_usage(argc, argv);
526
+ return 1;
527
+ }
528
+
529
+ auto * model = llava_init(&params);
530
+ if (model == NULL) {
531
+ fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
532
+ return 1;
533
+ }
534
+
535
+ if (prompt_contains_image(params.prompt)) {
536
+ auto * ctx_llava = llava_init_context(&params, model);
537
+
538
+ auto * image_embed = load_image(ctx_llava, &params, "");
539
+
540
+ // process the prompt
541
+ process_prompt(ctx_llava, image_embed, &params, params.prompt);
542
+
543
+ llama_perf_context_print(ctx_llava->ctx_llama);
544
+ llava_image_embed_free(image_embed);
545
+ ctx_llava->model = NULL;
546
+ llava_free(ctx_llava);
547
+ #ifndef NDEBUG
548
+ } else if (params.image[0].empty()) {
549
+ auto ctx_llava = llava_init_context(&params, model);
550
+
551
+ debug_test_mrope_2d();
552
+ debug_dump_img_embed(ctx_llava);
553
+
554
+ llama_perf_context_print(ctx_llava->ctx_llama);
555
+ ctx_llava->model = NULL;
556
+ llava_free(ctx_llava);
557
+ #endif
558
+ } else {
559
+ for (auto & image : params.image) {
560
+ auto * ctx_llava = llava_init_context(&params, model);
561
+
562
+ auto * image_embed = load_image(ctx_llava, &params, image);
563
+ if (!image_embed) {
564
+ LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
565
+ return 1;
566
+ }
567
+
568
+ // process the prompt
569
+ process_prompt(ctx_llava, image_embed, &params, params.prompt);
570
+
571
+ llama_perf_context_print(ctx_llava->ctx_llama);
572
+ llava_image_embed_free(image_embed);
573
+ ctx_llava->model = NULL;
574
+ llava_free(ctx_llava);
575
+ }
576
+ }
577
+
578
+ llama_free_model(model);
579
+
580
+ return 0;
581
+ }
@@ -2,4 +2,4 @@ set(TARGET llama-lookahead)
2
2
  add_executable(${TARGET} lookahead.cpp)
3
3
  install(TARGETS ${TARGET} RUNTIME)
4
4
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
5
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
5
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)