@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281)
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
package/src/llama.cpp/tools/mtmd/clip.h (new file)
@@ -0,0 +1,99 @@
+ #pragma once
+
+ #include "ggml.h"
+ #include <stddef.h>
+ #include <stdint.h>
+
+ struct clip_ctx;
+
+ struct clip_image_size {
+     int width;
+     int height;
+ };
+
+ struct clip_image_f32;
+ struct clip_image_u8_batch;
+ struct clip_image_f32_batch;
+
+ struct clip_context_params {
+     bool use_gpu;
+     enum ggml_log_level verbosity;
+ };
+
+ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);
+
+ void clip_free(struct clip_ctx * ctx);
+
+ size_t clip_embd_nbytes(const struct clip_ctx * ctx);
+ size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
+
+ int32_t clip_get_image_size (const struct clip_ctx * ctx);
+ int32_t clip_get_patch_size (const struct clip_ctx * ctx);
+ int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
+
+ // TODO: should be enum, not string
+ const char * clip_patch_merge_type(const struct clip_ctx * ctx);
+
+ const int32_t * clip_image_grid(const struct clip_ctx * ctx);
+ size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
+
+ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+
+ // for M-RoPE, this will be the number of token positions in X and Y directions
+ // for other models, X will be the total number of tokens and Y will be 1
+ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+
+ // this should be equal to the embedding dimension of the text model
+ int clip_n_mmproj_embd(const struct clip_ctx * ctx);
+
+ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
+ void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
+ struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
+
+ struct clip_image_size * clip_image_size_init(void);
+ struct clip_image_u8 * clip_image_u8_init (void);
+ struct clip_image_f32 * clip_image_f32_init(void);
+ struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava
+
+ // nx, ny are the output image dimensions
+ unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
+
+ void clip_image_size_free (struct clip_image_size * img_size);
+ void clip_image_u8_free (struct clip_image_u8 * img);
+ void clip_image_f32_free(struct clip_image_f32 * img);
+ void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
+ void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
+
+ // use for accessing underlay data of clip_image_f32_batch
+ size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
+ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
+ size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
+ struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
+
+ /**
+  * Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
+  * The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
+  */
+ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
+
+ bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
+
+ /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
+ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
+
+ /** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
+ bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs);
+
+ struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
+
+ bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
+ bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
+
+ int clip_is_minicpmv(const struct clip_ctx * ctx);
+ bool clip_is_glm(const struct clip_ctx * ctx);
+ bool clip_is_qwen2vl(const struct clip_ctx * ctx);
+ bool clip_is_llava(const struct clip_ctx * ctx);
+ bool clip_is_gemma3(const struct clip_ctx * ctx);
+
+ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
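
The header above is a plain C API, so a short usage sketch may make the rest of the mtmd-related diff easier to follow. The sketch is not part of the package: the paths "mmproj.gguf" and "photo.jpg", the thread count, and the buffer sizing (one clip_embd_nbytes() worth of floats per preprocessed slice) are illustrative assumptions, and only functions declared above are used.

    /* Minimal sketch (not from the diff): load a CLIP/mmproj model, preprocess one image, encode it. */
    #include "clip.h"
    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    static int encode_one_image(void) {
        // placeholder paths; verbosity enum comes from ggml.h, which clip.h already includes
        struct clip_context_params cparams = { /*use_gpu =*/ true, /*verbosity =*/ GGML_LOG_LEVEL_INFO };
        struct clip_ctx * ctx = clip_init("mmproj.gguf", cparams);
        if (!ctx) {
            return 1;
        }

        // load the image and preprocess it; preprocessing may split it into several f32 slices
        struct clip_image_u8        * img   = clip_image_u8_init();
        struct clip_image_f32_batch * batch = clip_image_f32_batch_init();
        if (!clip_image_load_from_file("photo.jpg", img) ||
            !clip_image_preprocess(ctx, img, batch)) {
            clip_image_f32_batch_free(batch);
            clip_image_u8_free(img);
            clip_free(ctx);
            return 1;
        }

        // encode every preprocessed slice into one flat buffer
        // (assumption: clip_embd_nbytes() bytes per slice is enough)
        size_t  n_slices = clip_image_f32_batch_n_images(batch);
        float * embd     = malloc(n_slices * clip_embd_nbytes(ctx));
        bool    ok       = clip_image_batch_encode(ctx, /*n_threads =*/ 4, batch, embd);
        printf("encode %s, n_slices = %zu, mmproj embd dim = %d\n",
               ok ? "ok" : "failed", n_slices, clip_n_mmproj_embd(ctx));

        free(embd);
        clip_image_f32_batch_free(batch);
        clip_image_u8_free(img);
        clip_free(ctx);
        return ok ? 0 : 1;
    }
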
package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp (new file)
@@ -0,0 +1,22 @@
+ #include <cstdio>
+ #include <string>
+
+ int main(int argc, char** argv) {
+     std::string filename = "main";
+     if (argc >= 1) {
+         filename = argv[0];
+     }
+
+     // Get only the program name from the full path
+     size_t pos = filename.find_last_of("/\\");
+     if (pos != std::string::npos) {
+         filename = filename.substr(pos+1);
+     }
+
+     fprintf(stdout, "\n");
+     fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
+     fprintf(stdout, "Please use 'llama-mtmd-cli' instead.\n");
+     fprintf(stdout, "\n");
+
+     return EXIT_FAILURE;
+ }
package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp (new file)
@@ -0,0 +1,370 @@
+ #include "arg.h"
+ #include "log.h"
+ #include "common.h"
+ #include "sampling.h"
+ #include "llama.h"
+ #include "ggml.h"
+ #include "console.h"
+ #include "chat.h"
+ #include "mtmd.h"
+
+ #include <vector>
+ #include <limits.h>
+ #include <cinttypes>
+
+ #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+ #include <signal.h>
+ #include <unistd.h>
+ #elif defined (_WIN32)
+ #define WIN32_LEAN_AND_MEAN
+ #ifndef NOMINMAX
+ #define NOMINMAX
+ #endif
+ #include <windows.h>
+ #include <signal.h>
+ #endif
+
+ // volatile, because of signal being an interrupt
+ static volatile bool g_is_generating = false;
+ static volatile bool g_is_interrupted = false;
+
+ /**
+  * Please note that this is NOT a production-ready stuff.
+  * It is a playground for trying multimodal support in llama.cpp.
+  * For contributors: please keep this code simple and easy to understand.
+  */
+
+ static void show_additional_info(int /*argc*/, char ** argv) {
+     LOG(
+         "Experimental CLI for multimodal\n\n"
+         "Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> -p <prompt>\n\n"
+         " -m and --mmproj are required\n"
+         " -hf user/repo can replace both -m and --mmproj in most cases\n"
+         " --image and -p are optional, if NOT provided, the CLI will run in chat mode\n"
+         " to disable using GPU for mmproj model, add --no-mmproj-offload\n",
+         argv[0]
+     );
+ }
+
+ #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
+ static void sigint_handler(int signo) {
+     if (signo == SIGINT) {
+         if (g_is_generating) {
+             g_is_generating = false;
+         } else {
+             console::cleanup();
+             if (g_is_interrupted) {
+                 _exit(1);
+             }
+             g_is_interrupted = true;
+         }
+     }
+ }
+ #endif
+
+ struct mtmd_cli_context {
+     mtmd::context_ptr ctx_vision;
+     common_init_result llama_init;
+
+     llama_model * model;
+     llama_context * lctx;
+     const llama_vocab * vocab;
+     llama_batch batch;
+     int n_batch;
+
+     mtmd::bitmaps bitmaps;
+
+     // note: we know that gemma3 template is "linear", meaning each turn is completely separated to another
+     // so here we don't need to keep track of chat history
+     common_chat_templates_ptr tmpls;
+
+     // support for legacy templates (models not having EOT token)
+     llama_tokens antiprompt_tokens;
+
+     int n_threads = 1;
+     llama_pos n_past = 0;
+
+     mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) {
+         model = llama_init.model.get();
+         lctx = llama_init.context.get();
+         vocab = llama_model_get_vocab(model);
+         n_threads = params.cpuparams.n_threads;
+         batch = llama_batch_init(params.n_batch, 0, 1);
+         n_batch = params.n_batch;
+
+         if (!model || !lctx) {
+             exit(1);
+         }
+
+         if (!llama_model_chat_template(model, nullptr) && params.chat_template.empty()) {
+             LOG_ERR("Model does not have chat template.\n");
+             LOG_ERR(" For old llava models, you may need to use '--chat-template vicuna'\n");
+             LOG_ERR(" For MobileVLM models, use '--chat-template deepseek'\n");
+             LOG_ERR(" For Mistral Small 3.1, use '--chat-template mistral-v7'\n");
+             exit(1);
+         }
+
+         tmpls = common_chat_templates_init(model, params.chat_template);
+         LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja).c_str());
+
+         init_vision_context(params);
+
+         // load antiprompt tokens for legacy templates
+         if (params.chat_template == "vicuna") {
+             antiprompt_tokens = common_tokenize(lctx, "ASSISTANT:", false, true);
+         } else if (params.chat_template == "deepseek") {
+             antiprompt_tokens = common_tokenize(lctx, "###", false, true);
+         }
+     }
+
+     void init_vision_context(common_params & params) {
+         const char * clip_path = params.mmproj.path.c_str();
+         mtmd_context_params mparams = mtmd_context_params_default();
+         mparams.use_gpu = params.mmproj_use_gpu;
+         mparams.print_timings = true;
+         mparams.n_threads = params.cpuparams.n_threads;
+         mparams.verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
+         ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
+         if (!ctx_vision.get()) {
+             LOG_ERR("Failed to load vision model from %s\n", clip_path);
+             exit(1);
+         }
+     }
+
+     bool check_antiprompt(const llama_tokens & generated_tokens) {
+         if (antiprompt_tokens.empty() || generated_tokens.size() < antiprompt_tokens.size()) {
+             return false;
+         }
+         return std::equal(
+             generated_tokens.end() - antiprompt_tokens.size(),
+             generated_tokens.end(),
+             antiprompt_tokens.begin()
+         );
+     }
+
+     bool load_image(const std::string & fname) {
+         mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str()));
+         if (!bmp.ptr) {
+             return false;
+         }
+         bitmaps.entries.push_back(std::move(bmp));
+         return true;
+     }
+ };
+
+ static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int n_predict) {
+     llama_tokens generated_tokens;
+     for (int i = 0; i < n_predict; i++) {
+         if (i > n_predict || !g_is_generating || g_is_interrupted) {
+             LOG("\n");
+             break;
+         }
+
+         llama_token token_id = common_sampler_sample(smpl, ctx.lctx, -1);
+         generated_tokens.push_back(token_id);
+         common_sampler_accept(smpl, token_id, true);
+
+         if (llama_vocab_is_eog(ctx.vocab, token_id) || ctx.check_antiprompt(generated_tokens)) {
+             LOG("\n");
+             break; // end of generation
+         }
+
+         LOG("%s", common_token_to_piece(ctx.lctx, token_id).c_str());
+         fflush(stdout);
+
+         if (g_is_interrupted) {
+             LOG("\n");
+             break;
+         }
+
+         // eval the token
+         common_batch_clear(ctx.batch);
+         common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true);
+         if (llama_decode(ctx.lctx, ctx.batch)) {
+             LOG_ERR("failed to decode token\n");
+             return 1;
+         }
+     }
+     return 0;
+ }
+
+ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool add_bos = false) {
+     common_chat_templates_inputs tmpl_inputs;
+     tmpl_inputs.messages = {msg};
+     tmpl_inputs.add_generation_prompt = true;
+     tmpl_inputs.use_jinja = false; // jinja is buggy here
+     auto formatted_chat = common_chat_templates_apply(ctx.tmpls.get(), tmpl_inputs);
+     LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());
+
+     mtmd_input_text text;
+     text.text = formatted_chat.prompt.c_str();
+     text.add_special = add_bos;
+     text.parse_special = true;
+
+     if (g_is_interrupted) return 0;
+
+     mtmd::input_chunks chunks(mtmd_input_chunks_init());
+     auto bitmaps_c_ptr = ctx.bitmaps.c_ptr();
+     int32_t res = mtmd_tokenize(ctx.ctx_vision.get(),
+                                 chunks.ptr.get(), // output
+                                 &text, // text
+                                 bitmaps_c_ptr.data(),
+                                 bitmaps_c_ptr.size());
+     if (res != 0) {
+         LOG_ERR("Unable to tokenize prompt, res = %d\n", res);
+         return 1;
+     }
+
+     ctx.bitmaps.entries.clear();
+
+     llama_pos new_n_past;
+     if (mtmd_helper_eval_chunks(ctx.ctx_vision.get(),
+                                 ctx.lctx, // lctx
+                                 chunks.ptr.get(), // chunks
+                                 ctx.n_past, // n_past
+                                 0, // seq_id
+                                 ctx.n_batch, // n_batch
+                                 true, // logits_last
+                                 &new_n_past)) {
+         LOG_ERR("Unable to eval prompt\n");
+         return 1;
+     }
+
+     ctx.n_past = new_n_past;
+
+     LOG("\n");
+
+     return 0;
+ }
+
+ int main(int argc, char ** argv) {
+     ggml_time_init();
+
+     common_params params;
+     params.sampling.temp = 0.2; // lower temp by default for better quality
+
+     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
+         return 1;
+     }
+
+     common_init();
+
+     if (params.mmproj.path.empty()) {
+         show_additional_info(argc, argv);
+         LOG_ERR("ERR: Missing --mmproj argument\n");
+         return 1;
+     }
+
+     mtmd_cli_context ctx(params);
+     LOG("%s: loading model: %s\n", __func__, params.model.path.c_str());
+
+     bool is_single_turn = !params.prompt.empty() && !params.image.empty();
+
+     struct common_sampler * smpl = common_sampler_init(ctx.model, params.sampling);
+     int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
+
+     // Ctrl+C handling
+     {
+ #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+         struct sigaction sigint_action;
+         sigint_action.sa_handler = sigint_handler;
+         sigemptyset (&sigint_action.sa_mask);
+         sigint_action.sa_flags = 0;
+         sigaction(SIGINT, &sigint_action, NULL);
+ #elif defined (_WIN32)
+         auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+             return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
+         };
+         SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+ #endif
+     }
+
+     if (g_is_interrupted) return 130;
+
+     if (is_single_turn) {
+         g_is_generating = true;
+         if (params.prompt.find("<__image__>") == std::string::npos) {
+             params.prompt += " <__image__>";
+         }
+         common_chat_msg msg;
+         msg.role = "user";
+         msg.content = params.prompt;
+         for (const auto & image : params.image) {
+             if (!ctx.load_image(image)) {
+                 return 1; // error is already printed by libmtmd
+             }
+         }
+         if (eval_message(ctx, msg, true)) {
+             return 1;
+         }
+         if (!g_is_interrupted && generate_response(ctx, smpl, n_predict)) {
+             return 1;
+         }
+
+     } else {
+         LOG("\n Running in chat mode, available commands:");
+         LOG("\n   /image <path>    load an image");
+         LOG("\n   /clear           clear the chat history");
+         LOG("\n   /quit or /exit   exit the program");
+         LOG("\n");
+
+         bool is_first_msg = true;
+         std::string content;
+
+         while (!g_is_interrupted) {
+             g_is_generating = false;
+             LOG("\n> ");
+             console::set_display(console::user_input);
+             std::string line;
+             console::readline(line, false);
+             if (g_is_interrupted) break;
+             console::set_display(console::reset);
+             line = string_strip(line);
+             if (line.empty()) {
+                 continue;
+             }
+             if (line == "/quit" || line == "/exit") {
+                 break;
+             }
+             if (line == "/clear") {
+                 ctx.n_past = 0;
+                 llama_kv_self_seq_rm(ctx.lctx, 0, 1, -1); // keep BOS
+                 LOG("Chat history cleared\n\n");
+                 continue;
+             }
+             g_is_generating = true;
+             if (line == "/image" || line.find("/image ") == 0) {
+                 if (line.size() < 8) {
+                     LOG_ERR("ERR: Missing image filename\n");
+                     continue;
+                 }
+                 std::string image = line.substr(7);
+                 if (ctx.load_image(image)) {
+                     LOG("Image %s loaded\n", image.c_str());
+                     content += "<__image__>";
+                 }
+                 // else, error is already printed by libmtmd
+                 continue;
+             } else {
+                 content += line;
+             }
+             common_chat_msg msg;
+             msg.role = "user";
+             msg.content = content;
+             int ret = eval_message(ctx, msg, is_first_msg);
+             if (ret) {
+                 return 1;
+             }
+             if (g_is_interrupted) break;
+             if (generate_response(ctx, smpl, n_predict)) {
+                 return 1;
+             }
+             content.clear();
+             is_first_msg = false;
+         }
+     }
+     if (g_is_interrupted) LOG("\nInterrupted by user\n");
+     LOG("\n\n");
+     llama_perf_context_print(ctx.lctx);
+     return g_is_interrupted ? 130 : 0;
+ }
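
A usage note, not part of the diff: per the show_additional_info() string above, a single-turn run of the new CLI looks like

    llama-mtmd-cli -m model.gguf --mmproj mmproj.gguf --image photo.jpg -p "describe this image"

where the model, mmproj and image paths are placeholders. Omitting --image and -p starts the interactive chat mode, which accepts the /image <path>, /clear, /quit and /exit commands shown above.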