@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281)
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
package/src/llama.cpp/common/arg.cpp
@@ -1,12 +1,24 @@
+#include "gguf.h" // for reading GGUF splits
 #include "arg.h"
 
+#include "common.h"
 #include "log.h"
 #include "sampling.h"
 #include "chat.h"
 
+// fix problem with std::min and std::max
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#   define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
 #include <algorithm>
 #include <climits>
 #include <cstdarg>
+#include <filesystem>
 #include <fstream>
 #include <regex>
 #include <set>
@@ -14,10 +26,42 @@
 #include <thread>
 #include <vector>
 
+//#define LLAMA_USE_CURL
+
+#if defined(LLAMA_USE_CURL)
+#include <curl/curl.h>
+#include <curl/easy.h>
+#include <future>
+#endif
+
 #include "json-schema-to-grammar.h"
 
 using json = nlohmann::ordered_json;
 
+std::initializer_list<enum llama_example> mmproj_examples = {
+    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_SERVER,
+};
+
+static std::string read_file(const std::string & fname) {
+    std::ifstream file(fname);
+    if (!file) {
+        throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
+    }
+    std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+    file.close();
+    return content;
+}
+
+static void write_file(const std::string & fname, const std::string & content) {
+    std::ofstream file(fname);
+    if (!file) {
+        throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
+    }
+    file << content;
+    file.close();
+}
+
 common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
     this->examples = std::move(examples);
     return *this;
@@ -126,47 +170,635 @@ std::string common_arg::to_string() {
 }
 
 //
-// utils
+// downloader
 //
 
-static void common_params_handle_model_default(
-        std::string & model,
-        const std::string & model_url,
-        std::string & hf_repo,
-        std::string & hf_file,
-        const std::string & hf_token,
-        const std::string & model_default) {
-    if (!hf_repo.empty()) {
-        // short-hand to avoid specifying --hf-file -> default it to --model
-        if (hf_file.empty()) {
-            if (model.empty()) {
-                auto auto_detected = common_get_hf_file(hf_repo, hf_token);
-                if (auto_detected.first.empty() || auto_detected.second.empty()) {
-                    exit(1); // built without CURL, error message already printed
+struct common_hf_file_res {
+    std::string repo; // repo name with ":tag" removed
+    std::string ggufFile;
+    std::string mmprojFile;
+};
+
+#ifdef LLAMA_USE_CURL
+
+bool common_has_curl() {
+    return true;
+}
+
+#ifdef __linux__
+#include <linux/limits.h>
+#elif defined(_WIN32)
+#   if !defined(PATH_MAX)
+#       define PATH_MAX MAX_PATH
+#   endif
+#elif defined(_AIX)
+#include <sys/limits.h>
+#else
+#include <sys/syslimits.h>
+#endif
+#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+
+//
+// CURL utils
+//
+
+using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
+
+// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
+struct curl_slist_ptr {
+    struct curl_slist * ptr = nullptr;
+    ~curl_slist_ptr() {
+        if (ptr) {
+            curl_slist_free_all(ptr);
+        }
+    }
+};
+
+#define CURL_MAX_RETRY 3
+#define CURL_RETRY_DELAY_SECONDS 2
+
+static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds, const char * method_name) {
+    int remaining_attempts = max_attempts;
+
+    while (remaining_attempts > 0) {
+        LOG_INF("%s: %s %s (attempt %d of %d)...\n", __func__ , method_name, url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
+
+        CURLcode res = curl_easy_perform(curl);
+        if (res == CURLE_OK) {
+            return true;
+        }
+
+        int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
+        LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
+
+        remaining_attempts--;
+        if (remaining_attempts == 0) break;
+        std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
+    }
+
+    LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
+
+    return false;
+}
+
+// download one single file from remote URL to local path
+static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
+    // Initialize libcurl
+    curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
+    if (!curl) {
+        LOG_ERR("%s: error initializing libcurl\n", __func__);
+        return false;
+    }
+
+    // Set the URL, allow to follow http redirection
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
+
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+    // Check if hf-token or bearer-token was specified
+    if (!bearer_token.empty()) {
+        std::string auth_header = "Authorization: Bearer " + bearer_token;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+    }
+    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+#if defined(_WIN32)
+    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
+    // operating system. Currently implemented under MS-Windows.
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+
+    // Check if the file already exists locally
+    auto file_exists = std::filesystem::exists(path);
+
+    // If the file exists, check its JSON metadata companion file.
+    std::string metadata_path = path + ".json";
+    nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
+    std::string etag;
+    std::string last_modified;
+
+    if (file_exists) {
+        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
+        std::ifstream metadata_in(metadata_path);
+        if (metadata_in.good()) {
+            try {
+                metadata_in >> metadata;
+                LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
+                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
+                    etag = metadata.at("etag");
                 }
-                hf_repo = auto_detected.first;
-                hf_file = auto_detected.second;
-            } else {
-                hf_file = model;
+                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
+                    last_modified = metadata.at("lastModified");
+                }
+            } catch (const nlohmann::json::exception & e) {
+                LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
             }
         }
-        // make sure model path is present (for caching purposes)
-        if (model.empty()) {
-            // this is to avoid different repo having same file name, or same file name in different subdirs
-            std::string filename = hf_repo + "_" + hf_file;
-            // to make sure we don't have any slashes in the filename
-            string_replace_all(filename, "/", "_");
-            model = fs_get_cache_file(filename);
+        // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
+    } else {
+        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
+    }
+
+    // Send a HEAD request to retrieve the etag and last-modified headers
+    struct common_load_model_from_url_headers {
+        std::string etag;
+        std::string last_modified;
+    };
+
+    common_load_model_from_url_headers headers;
+    bool head_request_ok = false;
+    bool should_download = !file_exists; // by default, we should download if the file does not exist
+
+    // get ETag to see if the remote file has changed
+    {
+        typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
+        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
+            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
+
+            static std::regex header_regex("([^:]+): (.*)\r\n");
+            static std::regex etag_regex("ETag", std::regex_constants::icase);
+            static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
+
+            std::string header(buffer, n_items);
+            std::smatch match;
+            if (std::regex_match(header, match, header_regex)) {
+                const std::string & key = match[1];
+                const std::string & value = match[2];
+                if (std::regex_match(key, match, etag_regex)) {
+                    headers->etag = value;
+                } else if (std::regex_match(key, match, last_modified_regex)) {
+                    headers->last_modified = value;
+                }
+            }
+            return n_items;
+        };
+
+        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
+        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
+        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
+        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
+
+        // we only allow retrying once for HEAD requests
+        // this is for the use case of using running offline (no internet), retrying can be annoying
+        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
+        if (!was_perform_successful) {
+            head_request_ok = false;
         }
-    } else if (!model_url.empty()) {
-        if (model.empty()) {
-            auto f = string_split<std::string>(model_url, '#').front();
-            f = string_split<std::string>(f, '?').front();
-            model = fs_get_cache_file(string_split<std::string>(f, '/').back());
+
+        long http_code = 0;
+        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
+        if (http_code == 200) {
+            head_request_ok = true;
+        } else {
+            LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
+            head_request_ok = false;
+        }
+    }
+
+    // if head_request_ok is false, we don't have the etag or last-modified headers
+    // we leave should_download as-is, which is true if the file does not exist
+    if (head_request_ok) {
+        // check if ETag or Last-Modified headers are different
+        // if it is, we need to download the file again
+        if (!etag.empty() && etag != headers.etag) {
+            LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
+            should_download = true;
+        } else if (!last_modified.empty() && last_modified != headers.last_modified) {
+            LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
+            should_download = true;
         }
-    } else if (model.empty()) {
-        model = model_default;
     }
+
+    if (should_download) {
+        std::string path_temporary = path + ".downloadInProgress";
+        if (file_exists) {
+            LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
+            if (remove(path.c_str()) != 0) {
+                LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
+                return false;
+            }
+        }
+
+        // Set the output file
+
+        struct FILE_deleter {
+            void operator()(FILE * f) const {
+                fclose(f);
+            }
+        };
+
+        std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
+        if (!outfile) {
+            LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
+            return false;
+        }
+
+        typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
+        auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
+            return fwrite(data, size, nmemb, (FILE *)fd);
+        };
+        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
+        curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+        curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
+
+        // display download progress
+        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
+
+        // helper function to hide password in URL
+        auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
+            std::size_t protocol_pos = url.find("://");
+            if (protocol_pos == std::string::npos) {
+                return url; // Malformed URL
+            }
+
+            std::size_t at_pos = url.find('@', protocol_pos + 3);
+            if (at_pos == std::string::npos) {
+                return url; // No password in URL
+            }
+
+            return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
+        };
+
+        // start the download
+        LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
+            llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
+        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS, "GET");
+        if (!was_perform_successful) {
+            return false;
+        }
+
+        long http_code = 0;
+        curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
+        if (http_code < 200 || http_code >= 400) {
+            LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
+            return false;
+        }
+
+        // Causes file to be closed explicitly here before we rename it.
+        outfile.reset();
+
+        // Write the updated JSON metadata file.
+        metadata.update({
+            {"url", url},
+            {"etag", headers.etag},
+            {"lastModified", headers.last_modified}
+        });
+        write_file(metadata_path, metadata.dump(4));
+        LOG_DBG("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
+
+        if (rename(path_temporary.c_str(), path.c_str()) != 0) {
+            LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
+            return false;
+        }
+    } else {
+        LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
+    }
+
+    return true;
+}
+
+// download multiple files from remote URLs to local paths
+// the input is a vector of pairs <url, path>
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token) {
+    // Prepare download in parallel
+    std::vector<std::future<bool>> futures_download;
+    for (auto const & item : urls) {
+        futures_download.push_back(std::async(std::launch::async, [bearer_token](const std::pair<std::string, std::string> & it) -> bool {
+            return common_download_file_single(it.first, it.second, bearer_token);
+        }, item));
+    }
+
+    // Wait for all downloads to complete
+    for (auto & f : futures_download) {
+        if (!f.get()) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+static bool common_download_model(
+        const common_params_model & model,
+        const std::string & bearer_token) {
+    // Basic validation of the model.url
+    if (model.url.empty()) {
+        LOG_ERR("%s: invalid model url\n", __func__);
+        return false;
+    }
+
+    if (!common_download_file_single(model.url, model.path, bearer_token)) {
+        return false;
+    }
+
+    // check for additional GGUFs split to download
+    int n_split = 0;
+    {
+        struct gguf_init_params gguf_params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ NULL,
+        };
+        auto * ctx_gguf = gguf_init_from_file(model.path.c_str(), gguf_params);
+        if (!ctx_gguf) {
+            LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, model.path.c_str());
+            return false;
+        }
+
+        auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
+        if (key_n_split >= 0) {
+            n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
+        }
+
+        gguf_free(ctx_gguf);
+    }
+
+    if (n_split > 1) {
+        char split_prefix[PATH_MAX] = {0};
+        char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+
+        // Verify the first split file format
+        // and extract split URL and PATH prefixes
+        {
+            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), model.path.c_str(), 0, n_split)) {
+                LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, model.path.c_str(), n_split);
+                return false;
+            }
+
+            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model.url.c_str(), 0, n_split)) {
+                LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model.url.c_str(), n_split);
+                return false;
+            }
+        }
+
+        std::vector<std::pair<std::string, std::string>> urls;
+        for (int idx = 1; idx < n_split; idx++) {
+            char split_path[PATH_MAX] = {0};
+            llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
+
+            char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+            llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split);
+
+            if (std::string(split_path) == model.path) {
+                continue; // skip the already downloaded file
+            }
+
+            urls.push_back({split_url, split_path});
+        }
+
+        // Download in parallel
+        common_download_file_multiple(urls, bearer_token);
+    }
+
+    return true;
+}
+
+std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
+    curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
+    std::vector<char> res_buffer;
+
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
+    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
+    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
+    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
+        auto data_vec = static_cast<std::vector<char> *>(data);
+        data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb);
+        return size * nmemb;
+    };
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer);
+#if defined(_WIN32)
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+    if (params.timeout > 0) {
+        curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout);
+    }
+    if (params.max_size > 0) {
+        curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
+    }
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+    for (const auto & header : params.headers) {
+        http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str());
+    }
+    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+    CURLcode res = curl_easy_perform(curl.get());
+
+    if (res != CURLE_OK) {
+        std::string error_msg = curl_easy_strerror(res);
+        throw std::runtime_error("error: cannot make GET request: " + error_msg);
+    }
+
+    long res_code;
+    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
+
+    return { res_code, std::move(res_buffer) };
+}
+
+/**
+ * Allow getting the HF file from the HF repo with tag (like ollama), for example:
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
+ * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
+ *
+ * Return pair of <repo, file> (with "repo" already having tag removed)
+ *
+ * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
+ */
+static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
+    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
+    std::string tag = parts.size() > 1 ? parts.back() : "latest";
+    std::string hf_repo = parts[0];
+    if (string_split<std::string>(hf_repo, '/').size() != 2) {
+        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
+    }
+
+    std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag;
+
+    // headers
+    std::vector<std::string> headers;
+    headers.push_back("Accept: application/json");
+    if (!bearer_token.empty()) {
+        headers.push_back("Authorization: Bearer " + bearer_token);
+    }
+    // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
+    // User-Agent header is already set in common_remote_get_content, no need to set it here
+
+    // we use "=" to avoid clashing with other component, while still being allowed on windows
+    std::string cached_response_fname = "manifest=" + hf_repo + "=" + tag + ".json";
+    string_replace_all(cached_response_fname, "/", "_");
+    std::string cached_response_path = fs_get_cache_file(cached_response_fname);
+
+    // make the request
+    common_remote_params params;
+    params.headers = headers;
+    long res_code = 0;
+    std::string res_str;
+    bool use_cache = false;
+    try {
+        auto res = common_remote_get_content(url, params);
+        res_code = res.first;
+        res_str = std::string(res.second.data(), res.second.size());
+    } catch (const std::exception & e) {
+        LOG_WRN("error: failed to get manifest: %s\n", e.what());
+        LOG_WRN("try reading from cache\n");
+        // try to read from cache
+        try {
+            res_str = read_file(cached_response_path);
+            res_code = 200;
+            use_cache = true;
+        } catch (const std::exception & e) {
+            throw std::runtime_error("error: failed to get manifest (check your internet connection)");
+        }
+    }
+    std::string ggufFile;
+    std::string mmprojFile;
+
+    if (res_code == 200 || res_code == 304) {
+        // extract ggufFile.rfilename in json, using regex
+        {
+            std::regex pattern("\"ggufFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
+            std::smatch match;
+            if (std::regex_search(res_str, match, pattern)) {
+                ggufFile = match[1].str();
+            }
+        }
+        // extract mmprojFile.rfilename in json, using regex
+        {
+            std::regex pattern("\"mmprojFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
+            std::smatch match;
+            if (std::regex_search(res_str, match, pattern)) {
+                mmprojFile = match[1].str();
+            }
+        }
+        if (!use_cache) {
+            // if not using cached response, update the cache file
+            write_file(cached_response_path, res_str);
+        }
+    } else if (res_code == 401) {
+        throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
+    } else {
+        throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
+    }
+
+    // check response
+    if (ggufFile.empty()) {
+        throw std::runtime_error("error: model does not have ggufFile");
+    }
+
+    return { hf_repo, ggufFile, mmprojFile };
+}
+
+#else
+
+bool common_has_curl() {
+    return false;
+}
+
+static bool common_download_file_single(const std::string &, const std::string &, const std::string &) {
+    LOG_ERR("error: built without CURL, cannot download model from internet\n");
+    return false;
+}
+
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &) {
+    LOG_ERR("error: built without CURL, cannot download model from the internet\n");
+    return false;
+}
+
+static bool common_download_model(
+        const common_params_model &,
+        const std::string &) {
+    LOG_ERR("error: built without CURL, cannot download model from the internet\n");
+    return false;
+}
+
+static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) {
+    LOG_ERR("error: built without CURL, cannot download model from the internet\n");
+    return {};
+}
+
+std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params &) {
+    if (!url.empty()) {
+        throw std::runtime_error("error: built without CURL, cannot download model from the internet");
+    }
+
+    return {};
+}
+
+#endif // LLAMA_USE_CURL
+
+//
+// utils
+//
+
+struct handle_model_result {
+    bool found_mmproj = false;
+    common_params_model mmproj;
+};
+
+static handle_model_result common_params_handle_model(
+        struct common_params_model & model,
+        const std::string & bearer_token,
+        const std::string & model_path_default) {
+    handle_model_result result;
+    // handle pre-fill default model path and url based on hf_repo and hf_file
+    {
+        if (!model.hf_repo.empty()) {
+            // short-hand to avoid specifying --hf-file -> default it to --model
+            if (model.hf_file.empty()) {
+                if (model.path.empty()) {
+                    auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token);
+                    if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
+                        exit(1); // built without CURL, error message already printed
+                    }
+                    model.hf_repo = auto_detected.repo;
+                    model.hf_file = auto_detected.ggufFile;
+                    if (!auto_detected.mmprojFile.empty()) {
+                        result.found_mmproj = true;
+                        result.mmproj.hf_repo = model.hf_repo;
+                        result.mmproj.hf_file = auto_detected.mmprojFile;
+                    }
+                } else {
+                    model.hf_file = model.path;
+                }
+            }
+
+            std::string model_endpoint = get_model_endpoint();
+            model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
+            // make sure model path is present (for caching purposes)
+            if (model.path.empty()) {
+                // this is to avoid different repo having same file name, or same file name in different subdirs
+                std::string filename = model.hf_repo + "_" + model.hf_file;
+                // to make sure we don't have any slashes in the filename
+                string_replace_all(filename, "/", "_");
+                model.path = fs_get_cache_file(filename);
+            }
+
+        } else if (!model.url.empty()) {
+            if (model.path.empty()) {
+                auto f = string_split<std::string>(model.url, '#').front();
+                f = string_split<std::string>(f, '?').front();
+                model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
+            }
+
+        } else if (model.path.empty()) {
+            model.path = model_path_default;
+        }
+    }
+
+    // then, download it if needed
+    if (!model.url.empty()) {
+        bool ok = common_download_model(model, bearer_token);
+        if (!ok) {
+            LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
+            exit(1);
+        }
+    }
+
+    return result;
 }
 
 const std::vector<ggml_type> kv_cache_types = {
@@ -300,10 +932,25 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
     }
 
-    // TODO: refactor model params in a common struct
-    common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token, DEFAULT_MODEL_PATH);
-    common_params_handle_model_default(params.speculative.model, params.speculative.model_url, params.speculative.hf_repo, params.speculative.hf_file, params.hf_token, "");
-    common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file, params.hf_token, "");
+    // handle model and download
+    {
+        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
+        if (params.no_mmproj) {
+            params.mmproj = {};
+        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
+            // optionally, handle mmproj model when -hf is specified
+            params.mmproj = res.mmproj;
+        }
+        // only download mmproj if the current example is using it
+        for (auto & ex : mmproj_examples) {
+            if (ctx_arg.ex == ex) {
+                common_params_handle_model(params.mmproj, params.hf_token, "");
+                break;
+            }
+        }
+        common_params_handle_model(params.speculative.model, params.hf_token, "");
+        common_params_handle_model(params.vocoder.model, params.hf_token, "");
+    }
 
     if (params.escape) {
         string_process_escapes(params.prompt);
@@ -322,6 +969,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         params.kv_overrides.back().key[0] = 0;
     }
 
+    if (!params.tensor_buft_overrides.empty()) {
+        params.tensor_buft_overrides.push_back({nullptr, nullptr});
+    }
+
     if (params.reranking && params.embedding) {
         throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
     }
@@ -431,7 +1082,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
         "llama-embedding",
         "llama-eval-callback",
         "llama-export-lora",
-        "llama-gbnf-validator",
         "llama-gen-docs",
         "llama-gguf",
         "llama-gguf-hash",
@@ -439,20 +1089,18 @@
         "llama-gritlm",
         "llama-imatrix",
         "llama-infill",
-        "llama-llava-cli",
+        "llama-mtmd-cli",
         "llama-llava-clip-quantize-cli",
         "llama-lookahead",
         "llama-lookup",
         "llama-lookup-create",
         "llama-lookup-merge",
         "llama-lookup-stats",
-        "llama-minicpmv-cli",
         "llama-parallel",
         "llama-passkey",
         "llama-perplexity",
         "llama-q8dot",
         "llama-quantize",
-        "llama-quantize-stats",
         "llama-qwen2vl-cli",
         "llama-retrieval",
         "llama-run",
@@ -541,6 +1189,9 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
         fprintf(stderr, "%s\n", ex.what());
         ctx_arg.params = params_org;
         return false;
+    } catch (std::exception & ex) {
+        fprintf(stderr, "%s\n", ex.what());
+        exit(1); // for other exceptions, we exit with status code 1
     }
 
     return true;
@@ -632,7 +1283,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.use_color = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
     add_opt(common_arg(
         {"-t", "--threads"}, "N",
         string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
@@ -765,7 +1416,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"-n", "--predict", "--n-predict"}, "N",
         string_format(
-            ex == LLAMA_EXAMPLE_MAIN || ex == LLAMA_EXAMPLE_INFILL
+            ex == LLAMA_EXAMPLE_MAIN
                 ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
                 : "number of tokens to predict (default: %d, -1 = infinity)",
             params.n_predict),
@@ -841,13 +1492,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-f", "--file"}, "FNAME",
         "a file containing the prompt (default: none)",
         [](common_params & params, const std::string & value) {
-            std::ifstream file(value);
-            if (!file) {
-                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
-            }
+            params.prompt = read_file(value);
             // store the external file name in params
             params.prompt_file = value;
-            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
             if (!params.prompt.empty() && params.prompt.back() == '\n') {
                 params.prompt.pop_back();
             }
@@ -857,11 +1504,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-sysf", "--system-prompt-file"}, "FNAME",
         "a file containing the system prompt (default: none)",
         [](common_params & params, const std::string & value) {
-            std::ifstream file(value);
-            if (!file) {
-                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
-            }
-            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.system_prompt));
+            params.system_prompt = read_file(value);
             if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
                 params.system_prompt.pop_back();
             }
@@ -1012,7 +1655,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.input_prefix = value;
             params.enable_chat_template = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--in-suffix"}, "STRING",
         "string to suffix after user inputs with (default: empty)",
@@ -1020,7 +1663,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.input_suffix = value;
             params.enable_chat_template = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--no-warmup"},
         "skip warming up the model with an empty run",
@@ -1037,7 +1680,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.spm_infill = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--samplers"}, "SAMPLERS",
         string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
@@ -1285,23 +1928,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"--grammar-file"}, "FNAME",
         "file to read grammar from",
+        [](common_params & params, const std::string & value) {
+            params.sampling.grammar = read_file(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"-j", "--json-schema"}, "SCHEMA",
+        "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
+        [](common_params & params, const std::string & value) {
+            params.sampling.grammar = json_schema_to_grammar(json::parse(value));
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"-jf", "--json-schema-file"}, "FILE",
+        "File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
         [](common_params & params, const std::string & value) {
             std::ifstream file(value);
             if (!file) {
                 throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
             }
+            std::string schema;
             std::copy(
                 std::istreambuf_iterator<char>(file),
                 std::istreambuf_iterator<char>(),
-                std::back_inserter(params.sampling.grammar)
+                std::back_inserter(schema)
             );
-        }
-    ).set_sparam());
-    add_opt(common_arg(
-        {"-j", "--json-schema"}, "SCHEMA",
-        "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
-        [](common_params & params, const std::string & value) {
-            params.sampling.grammar = json_schema_to_grammar(json::parse(value));
+            params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -1445,13 +2097,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.cache_type_v = kv_cache_type_from_str(value);
  }
  ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
- add_opt(common_arg(
- {"--perplexity", "--all-logits"},
- string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
- [](common_params & params) {
- params.logits_all = true;
- }
- ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
  add_opt(common_arg(
  {"--hellaswag"},
  "compute HellaSwag score over random tasks from datafile supplied with -f",
@@ -1559,11 +2204,33 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
  add_opt(common_arg(
  {"--mmproj"}, "FILE",
- "path to a multimodal projector file for LLaVA. see examples/llava/README.md",
+ "path to a multimodal projector file. see tools/mtmd/README.md\n"
+ "note: if -hf is used, this argument can be omitted",
  [](common_params & params, const std::string & value) {
- params.mmproj = value;
+ params.mmproj.path = value;
  }
- ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+ ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
+ add_opt(common_arg(
+ {"--mmproj-url"}, "URL",
+ "URL to a multimodal projector file. see tools/mtmd/README.md",
+ [](common_params & params, const std::string & value) {
+ params.mmproj.url = value;
+ }
+ ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
+ add_opt(common_arg(
+ {"--no-mmproj"},
+ "explicitly disable multimodal projector, useful when using -hf",
+ [](common_params & params) {
+ params.no_mmproj = true;
+ }
+ ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
+ add_opt(common_arg(
+ {"--no-mmproj-offload"},
+ "do not offload multimodal projector to GPU",
+ [](common_params & params) {
+ params.mmproj_use_gpu = false;
+ }
+ ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
  add_opt(common_arg(
  {"--image"}, "FILE",
  "path to an image file. use with multimodal models. Specify multiple times for batching",
@@ -1647,6 +2314,41 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  exit(0);
  }
  ));
+ add_opt(common_arg(
+ {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
+ "override tensor buffer type", [](common_params & params, const std::string & value) {
+ /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
+ if (buft_list.empty()) {
+ // enumerate all the devices and add their buffer types to the list
+ for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+ auto * dev = ggml_backend_dev_get(i);
+ auto * buft = ggml_backend_dev_buffer_type(dev);
+ if (buft) {
+ buft_list[ggml_backend_buft_name(buft)] = buft;
+ }
+ }
+ }
+
+ for (const auto & override : string_split<std::string>(value, ',')) {
+ std::string::size_type pos = override.find('=');
+ if (pos == std::string::npos) {
+ throw std::invalid_argument("invalid value");
+ }
+ std::string tensor_name = override.substr(0, pos);
+ std::string buffer_type = override.substr(pos + 1);
+
+ if (buft_list.find(buffer_type) == buft_list.end()) {
+ printf("Available buffer types:\n");
+ for (const auto & it : buft_list) {
+ printf(" %s\n", ggml_backend_buft_name(it.second));
+ }
+ throw std::invalid_argument("unknown buffer type");
+ }
+ // FIXME: this leaks memory
+ params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
+ }
+ }
+ ));
  add_opt(common_arg(
  {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
  "number of layers to store in VRAM",
@@ -1735,6 +2437,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  }
  ));
+ add_opt(common_arg(
+ {"--no-op-offload"},
+ string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
+ [](common_params & params) {
+ params.no_op_offload = true;
+ }
+ ));
  add_opt(common_arg(
  {"--lora"}, "FNAME",
  "path to LoRA adapter (can be repeated to use multiple adapters)",
@@ -1790,51 +2499,52 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
  ),
  [](common_params & params, const std::string & value) {
- params.model = value;
+ params.model.path = value;
  }
  ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
  add_opt(common_arg(
  {"-mu", "--model-url"}, "MODEL_URL",
  "model download url (default: unused)",
  [](common_params & params, const std::string & value) {
- params.model_url = value;
+ params.model.url = value;
  }
  ).set_env("LLAMA_ARG_MODEL_URL"));
  add_opt(common_arg(
  {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
  "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
+ "mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n"
  "example: unsloth/phi-4-GGUF:q4_k_m\n"
  "(default: unused)",
  [](common_params & params, const std::string & value) {
- params.hf_repo = value;
+ params.model.hf_repo = value;
  }
  ).set_env("LLAMA_ARG_HF_REPO"));
  add_opt(common_arg(
  {"-hfd", "-hfrd", "--hf-repo-draft"}, "<user>/<model>[:quant]",
  "Same as --hf-repo, but for the draft model (default: unused)",
  [](common_params & params, const std::string & value) {
- params.speculative.hf_repo = value;
+ params.speculative.model.hf_repo = value;
  }
  ).set_env("LLAMA_ARG_HFD_REPO"));
  add_opt(common_arg(
  {"-hff", "--hf-file"}, "FILE",
  "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
  [](common_params & params, const std::string & value) {
- params.hf_file = value;
+ params.model.hf_file = value;
  }
  ).set_env("LLAMA_ARG_HF_FILE"));
  add_opt(common_arg(
  {"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
  "Hugging Face model repository for the vocoder model (default: unused)",
  [](common_params & params, const std::string & value) {
- params.vocoder.hf_repo = value;
+ params.vocoder.model.hf_repo = value;
  }
  ).set_env("LLAMA_ARG_HF_REPO_V"));
  add_opt(common_arg(
  {"-hffv", "--hf-file-v"}, "FILE",
  "Hugging Face model file for the vocoder model (default: unused)",
  [](common_params & params, const std::string & value) {
- params.vocoder.hf_file = value;
+ params.vocoder.model.hf_file = value;
  }
  ).set_env("LLAMA_ARG_HF_FILE_V"));
  add_opt(common_arg(
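The repeated params.hf_repo → params.model.hf_repo style changes in this hunk suggest the flat model fields were grouped into a small struct reused for the main, draft, and vocoder models. A hedged reconstruction of its shape, inferred only from the member accesses visible in this diff (the struct name and anything beyond these four fields are assumptions; the real definition lives in common/common.h):

    // Hedged reconstruction, inferred from params.model.path / .url / .hf_repo /
    // .hf_file plus params.speculative.model and params.vocoder.model above.
    #include <string>

    struct common_params_model {
        std::string path;      // local file path (-m / --model)
        std::string url;       // download URL (-mu / --model-url)
        std::string hf_repo;   // Hugging Face repo (-hf / --hf-repo)
        std::string hf_file;   // Hugging Face file (-hff / --hf-file)
    };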
@@ -1875,7 +2585,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params, int value) {
  params.n_junk = value;
  }
- ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
+ ).set_examples({LLAMA_EXAMPLE_PASSKEY, LLAMA_EXAMPLE_PARALLEL}));
  add_opt(common_arg(
  {"--pos"}, "N",
  string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
@@ -1925,13 +2635,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.i_chunk = value;
  }
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+ add_opt(common_arg(
+ {"--parse-special"},
+ string_format("parse special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
+ [](common_params & params) {
+ params.parse_special = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
  add_opt(common_arg(
  {"-pps"},
  string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
  [](common_params & params) {
  params.is_pp_shared = true;
  }
- ).set_examples({LLAMA_EXAMPLE_BENCH}));
+ ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
  add_opt(common_arg(
  {"-npp"}, "n0,n1,...",
  "number of prompt tokens",
@@ -1979,7 +2696,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
  add_opt(common_arg(
  {"--host"}, "HOST",
- string_format("ip address to listen (default: %s)", params.hostname.c_str()),
+ string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
  [](common_params & params, const std::string & value) {
  params.hostname = value;
  }
@@ -2074,7 +2791,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
  add_opt(common_arg(
  {"--cache-reuse"}, "N",
- string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse),
+ string_format(
+ "min chunk size to attempt reusing from the cache via KV shifting (default: %d)\n"
+ "[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse
+ ),
  [](common_params & params, int value) {
  params.n_cache_reuse = value;
  }
@@ -2147,7 +2867,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params, const std::string & value) {
  params.chat_template = value;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
  add_opt(common_arg(
  {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
  string_format(
@@ -2157,16 +2877,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
  ),
  [](common_params & params, const std::string & value) {
- std::ifstream file(value);
- if (!file) {
- throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
- }
- std::copy(
- std::istreambuf_iterator<char>(file),
- std::istreambuf_iterator<char>(),
- std::back_inserter(params.chat_template));
+ params.chat_template = read_file(value);
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+ add_opt(common_arg(
+ {"--no-prefill-assistant"},
+ string_format(
+ "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
+ "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
+ ),
+ [](common_params & params) {
+ params.prefill_assistant = false;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
  add_opt(common_arg(
  {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
  string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
@@ -2187,7 +2910,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params) {
  params.simple_io = true;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
+ ).set_examples({LLAMA_EXAMPLE_MAIN}));
  add_opt(common_arg(
  {"--positive-file"}, "FNAME",
  string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
@@ -2454,7 +3177,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  {"-md", "--model-draft"}, "FNAME",
  "draft model for speculative decoding (default: unused)",
  [](common_params & params, const std::string & value) {
- params.speculative.model = value;
+ params.speculative.model.path = value;
  }
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));

@@ -2462,7 +3185,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  {"-mv", "--model-vocoder"}, "FNAME",
  "vocoder model for audio generation (default: unused)",
  [](common_params & params, const std::string & value) {
- params.vocoder.model = value;
+ params.vocoder.model.path = value;
  }
  ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
  add_opt(common_arg(
@@ -2485,10 +3208,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  {"--tts-oute-default"},
  string_format("use default OuteTTS models (note: can download weights from the internet)"),
  [](common_params & params) {
- params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
- params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
- params.vocoder.hf_repo = "ggml-org/WavTokenizer";
- params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf";
+ params.model.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
+ params.model.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
+ params.vocoder.model.hf_repo = "ggml-org/WavTokenizer";
+ params.vocoder.model.hf_file = "WavTokenizer-Large-75-F16.gguf";
  }
  ).set_examples({LLAMA_EXAMPLE_TTS}));

@@ -2496,8 +3219,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  {"--embd-bge-small-en-default"},
  string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
  [](common_params & params) {
- params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
- params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
+ params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
+ params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
  params.pooling_type = LLAMA_POOLING_TYPE_NONE;
  params.embd_normalize = 2;
  params.n_ctx = 512;
@@ -2510,8 +3233,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  {"--embd-e5-small-en-default"},
  string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
  [](common_params & params) {
- params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
- params.hf_file = "e5-small-v2-q8_0.gguf";
+ params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
+ params.model.hf_file = "e5-small-v2-q8_0.gguf";
  params.pooling_type = LLAMA_POOLING_TYPE_NONE;
  params.embd_normalize = 2;
  params.n_ctx = 512;
@@ -2524,8 +3247,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  {"--embd-gte-small-default"},
  string_format("use default gte-small model (note: can download weights from the internet)"),
  [](common_params & params) {
- params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
- params.hf_file = "gte-small-q8_0.gguf";
+ params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
+ params.model.hf_file = "gte-small-q8_0.gguf";
  params.pooling_type = LLAMA_POOLING_TYPE_NONE;
  params.embd_normalize = 2;
  params.n_ctx = 512;
@@ -2538,8 +3261,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  {"--fim-qwen-1.5b-default"},
  string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
  [](common_params & params) {
- params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
- params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
+ params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
+ params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
  params.port = 8012;
  params.n_gpu_layers = 99;
  params.flash_attn = true;
@@ -2554,8 +3277,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  {"--fim-qwen-3b-default"},
  string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
  [](common_params & params) {
- params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
- params.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
+ params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
+ params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
  params.port = 8012;
  params.n_gpu_layers = 99;
  params.flash_attn = true;
@@ -2570,8 +3293,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  {"--fim-qwen-7b-default"},
  string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
  [](common_params & params) {
- params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
- params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+ params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+ params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
  params.port = 8012;
  params.n_gpu_layers = 99;
  params.flash_attn = true;
@@ -2586,10 +3309,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  {"--fim-qwen-7b-spec"},
  string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
  [](common_params & params) {
- params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
- params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
- params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
- params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+ params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+ params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+ params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+ params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
  params.speculative.n_gpu_layers = 99;
  params.port = 8012;
  params.n_gpu_layers = 99;
@@ -2605,10 +3328,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  {"--fim-qwen-14b-spec"},
  string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
  [](common_params & params) {
- params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
- params.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
- params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
- params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+ params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
+ params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
+ params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+ params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
  params.speculative.n_gpu_layers = 99;
  params.port = 8012;
  params.n_gpu_layers = 99;