@novastera-oss/llamarn 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (266)
  1. package/README.md +80 -14
  2. package/RNLlamaCpp.podspec +10 -3
  3. package/android/CMakeLists.txt +8 -0
  4. package/android/src/main/cpp/include/llama.h +62 -125
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  13. package/cpp/build-info.cpp +2 -2
  14. package/cpp/llama.cpp/README.md +11 -3
  15. package/cpp/llama.cpp/build-xcframework.sh +1 -0
  16. package/cpp/llama.cpp/common/CMakeLists.txt +8 -2
  17. package/cpp/llama.cpp/common/arg.cpp +153 -113
  18. package/cpp/llama.cpp/common/chat-parser.cpp +379 -0
  19. package/cpp/llama.cpp/common/chat-parser.h +117 -0
  20. package/cpp/llama.cpp/common/chat.cpp +847 -699
  21. package/cpp/llama.cpp/common/chat.h +73 -6
  22. package/cpp/llama.cpp/common/common.cpp +50 -82
  23. package/cpp/llama.cpp/common/common.h +21 -17
  24. package/cpp/llama.cpp/common/json-partial.cpp +255 -0
  25. package/cpp/llama.cpp/common/json-partial.h +37 -0
  26. package/cpp/llama.cpp/common/minja/chat-template.hpp +9 -5
  27. package/cpp/llama.cpp/common/minja/minja.hpp +69 -36
  28. package/cpp/llama.cpp/common/regex-partial.cpp +204 -0
  29. package/cpp/llama.cpp/common/regex-partial.h +56 -0
  30. package/cpp/llama.cpp/common/sampling.cpp +7 -8
  31. package/cpp/llama.cpp/convert_hf_to_gguf.py +453 -118
  32. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +120 -68
  33. package/cpp/llama.cpp/ggml/CMakeLists.txt +2 -1
  34. package/cpp/llama.cpp/ggml/cmake/common.cmake +25 -0
  35. package/cpp/llama.cpp/ggml/include/ggml-opt.h +49 -28
  36. package/cpp/llama.cpp/ggml/include/ggml.h +26 -7
  37. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +16 -10
  38. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +4 -1
  39. package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +2 -0
  41. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +604 -0
  42. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +42 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +54 -2
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +50 -51
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -2
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -9
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +779 -19
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +22 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +322 -100
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +117 -1
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +220 -49
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cu +40 -26
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -1
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +11 -1
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +15 -7
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +266 -64
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +49 -4
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +48 -4
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +2 -1
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +5 -1
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +2 -0
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +7 -6
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +1 -1
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +10 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
  70. package/cpp/llama.cpp/ggml/src/ggml-impl.h +1 -1
  71. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +4 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +99 -17
  73. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +200 -2
  74. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  75. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +6 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +972 -178
  79. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  84. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +373 -190
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -10
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +101 -5
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +31 -33
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +1 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +29 -2
  92. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +4 -5
  93. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  94. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +9 -1
  95. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +84 -72
  96. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  98. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -3
  99. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +324 -129
  100. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +1 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +31 -2
  102. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +95 -68
  103. package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +1 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +22 -0
  105. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -2
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -4
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +2 -3
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +69 -43
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +2 -14
  110. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -91
  111. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -181
  112. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +17 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  114. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +6 -152
  115. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
  116. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
  117. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +2 -118
  118. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +1 -1
  119. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +12 -1
  120. package/cpp/llama.cpp/ggml/src/ggml.c +107 -36
  121. package/cpp/llama.cpp/ggml/src/gguf.cpp +33 -33
  122. package/cpp/llama.cpp/gguf-py/gguf/constants.py +100 -15
  123. package/cpp/llama.cpp/gguf-py/gguf/gguf_reader.py +1 -1
  124. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +44 -12
  125. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_editor_gui.py +21 -10
  126. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +5 -2
  127. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +128 -31
  128. package/cpp/llama.cpp/gguf-py/gguf/utility.py +1 -1
  129. package/cpp/llama.cpp/gguf-py/pyproject.toml +1 -1
  130. package/cpp/llama.cpp/include/llama.h +62 -125
  131. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +1 -1
  132. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +1 -1
  133. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +1 -1
  134. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +1 -1
  135. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +1 -1
  136. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +1 -1
  137. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +1 -1
  138. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +1 -1
  139. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +1 -1
  140. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +1 -1
  141. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +1 -1
  142. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +1 -1
  143. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +1 -1
  144. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +1 -1
  145. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +1 -1
  146. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +1 -1
  147. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +1 -1
  148. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +1 -1
  149. package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
  150. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +1 -1
  151. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +1 -1
  152. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +1 -1
  153. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +1 -1
  154. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +1 -1
  155. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +1 -1
  156. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +1 -1
  157. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +1 -1
  158. package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +62 -0
  159. package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +85 -0
  160. package/cpp/llama.cpp/models/templates/README.md +2 -0
  161. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  162. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  163. package/cpp/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  164. package/cpp/llama.cpp/requirements/requirements-gguf_editor_gui.txt +1 -1
  165. package/cpp/llama.cpp/src/CMakeLists.txt +2 -0
  166. package/cpp/llama.cpp/src/llama-arch.cpp +6 -0
  167. package/cpp/llama.cpp/src/llama-arch.h +2 -0
  168. package/cpp/llama.cpp/src/llama-batch.cpp +3 -1
  169. package/cpp/llama.cpp/src/llama-context.cpp +340 -123
  170. package/cpp/llama.cpp/src/llama-context.h +30 -0
  171. package/cpp/llama.cpp/src/llama-cparams.cpp +4 -0
  172. package/cpp/llama.cpp/src/llama-cparams.h +2 -0
  173. package/cpp/llama.cpp/src/llama-grammar.cpp +12 -2
  174. package/cpp/llama.cpp/src/llama-graph.cpp +157 -247
  175. package/cpp/llama.cpp/src/llama-graph.h +52 -7
  176. package/cpp/llama.cpp/src/llama-hparams.cpp +17 -1
  177. package/cpp/llama.cpp/src/llama-hparams.h +37 -5
  178. package/cpp/llama.cpp/src/llama-kv-cache.cpp +742 -481
  179. package/cpp/llama.cpp/src/llama-kv-cache.h +196 -99
  180. package/cpp/llama.cpp/src/llama-kv-cells.h +379 -0
  181. package/cpp/llama.cpp/src/llama-memory.h +4 -3
  182. package/cpp/llama.cpp/src/llama-model-loader.cpp +22 -17
  183. package/cpp/llama.cpp/src/llama-model-saver.cpp +281 -0
  184. package/cpp/llama.cpp/src/llama-model-saver.h +37 -0
  185. package/cpp/llama.cpp/src/llama-model.cpp +529 -172
  186. package/cpp/llama.cpp/src/llama-model.h +6 -1
  187. package/cpp/llama.cpp/src/llama-quant.cpp +15 -13
  188. package/cpp/llama.cpp/src/llama-sampling.cpp +2 -2
  189. package/cpp/llama.cpp/src/llama-vocab.cpp +35 -8
  190. package/cpp/llama.cpp/src/llama-vocab.h +6 -0
  191. package/cpp/llama.cpp/src/llama.cpp +14 -0
  192. package/cpp/rn-completion.cpp +4 -2
  193. package/ios/include/chat.h +73 -6
  194. package/ios/include/common/minja/chat-template.hpp +9 -5
  195. package/ios/include/common/minja/minja.hpp +69 -36
  196. package/ios/include/common.h +21 -17
  197. package/ios/include/llama.h +62 -125
  198. package/ios/libs/llama.xcframework/Info.plist +19 -19
  199. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  200. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4617 -4487
  201. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  202. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +26 -7
  203. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +62 -125
  204. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  205. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  206. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  207. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3557 -3435
  208. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  209. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  210. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  211. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  212. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  213. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  214. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3559 -3437
  215. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +237 -0
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +26 -7
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +62 -125
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +237 -0
  219. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +26 -7
  220. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +62 -125
  221. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  222. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +237 -0
  223. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +26 -7
  224. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +62 -125
  225. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  226. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  227. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  228. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4616 -4487
  229. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  230. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +26 -7
  231. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +62 -125
  232. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  233. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  234. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4637 -4508
  235. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3556 -3435
  236. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  237. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  238. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  239. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  240. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  241. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4653 -4523
  242. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  243. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +26 -7
  244. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +62 -125
  245. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  246. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  247. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4674 -4544
  248. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3587 -3465
  249. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  250. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  251. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  252. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  253. package/package.json +1 -1
  254. package/cpp/llama.cpp/common/stb_image.h +0 -7988
  255. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
  256. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
  257. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
  258. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
  259. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
  260. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
  261. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
  262. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
  263. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
  264. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
  265. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
  266. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
@@ -39,7 +39,7 @@
39
39
  using json = nlohmann::ordered_json;
40
40
 
41
41
  std::initializer_list<enum llama_example> mmproj_examples = {
42
- LLAMA_EXAMPLE_LLAVA,
42
+ LLAMA_EXAMPLE_MTMD,
43
43
  LLAMA_EXAMPLE_SERVER,
44
44
  };
45
45
 
@@ -242,33 +242,7 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
242
242
  }
243
243
 
244
244
  // download one single file from remote URL to local path
245
- static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
246
- // Initialize libcurl
247
- curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
248
- curl_slist_ptr http_headers;
249
- if (!curl) {
250
- LOG_ERR("%s: error initializing libcurl\n", __func__);
251
- return false;
252
- }
253
-
254
- // Set the URL, allow to follow http redirection
255
- curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
256
- curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
257
-
258
- http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
259
- // Check if hf-token or bearer-token was specified
260
- if (!bearer_token.empty()) {
261
- std::string auth_header = "Authorization: Bearer " + bearer_token;
262
- http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
263
- }
264
- curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
265
-
266
- #if defined(_WIN32)
267
- // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
268
- // operating system. Currently implemented under MS-Windows.
269
- curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
270
- #endif
271
-
245
+ static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) {
272
246
  // Check if the file already exists locally
273
247
  auto file_exists = std::filesystem::exists(path);
274
248
 
@@ -279,6 +253,10 @@ static bool common_download_file_single(const std::string & url, const std::stri
279
253
  std::string last_modified;
280
254
 
281
255
  if (file_exists) {
256
+ if (offline) {
257
+ LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
258
+ return true; // skip verification/downloading
259
+ }
282
260
  // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
283
261
  std::ifstream metadata_in(metadata_path);
284
262
  if (metadata_in.good()) {
@@ -297,6 +275,10 @@ static bool common_download_file_single(const std::string & url, const std::stri
297
275
  }
298
276
  // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
299
277
  } else {
278
+ if (offline) {
279
+ LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
280
+ return false;
281
+ }
300
282
  LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
301
283
  }
302
284
 
@@ -310,50 +292,73 @@ static bool common_download_file_single(const std::string & url, const std::stri
310
292
  bool head_request_ok = false;
311
293
  bool should_download = !file_exists; // by default, we should download if the file does not exist
312
294
 
313
- // get ETag to see if the remote file has changed
314
- {
315
- typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
316
- auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
317
- common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
295
+ // Initialize libcurl
296
+ curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
297
+ curl_slist_ptr http_headers;
298
+ if (!curl) {
299
+ LOG_ERR("%s: error initializing libcurl\n", __func__);
300
+ return false;
301
+ }
318
302
 
319
- static std::regex header_regex("([^:]+): (.*)\r\n");
320
- static std::regex etag_regex("ETag", std::regex_constants::icase);
321
- static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
303
+ // Set the URL, allow to follow http redirection
304
+ curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
305
+ curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
322
306
 
323
- std::string header(buffer, n_items);
324
- std::smatch match;
325
- if (std::regex_match(header, match, header_regex)) {
326
- const std::string & key = match[1];
327
- const std::string & value = match[2];
328
- if (std::regex_match(key, match, etag_regex)) {
329
- headers->etag = value;
330
- } else if (std::regex_match(key, match, last_modified_regex)) {
331
- headers->last_modified = value;
332
- }
333
- }
334
- return n_items;
335
- };
307
+ http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
308
+ // Check if hf-token or bearer-token was specified
309
+ if (!bearer_token.empty()) {
310
+ std::string auth_header = "Authorization: Bearer " + bearer_token;
311
+ http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
312
+ }
313
+ curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
314
+
315
+ #if defined(_WIN32)
316
+ // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
317
+ // operating system. Currently implemented under MS-Windows.
318
+ curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
319
+ #endif
336
320
 
337
- curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
338
- curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
339
- curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
340
- curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
321
+ typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
322
+ auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
323
+ common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
341
324
 
342
- // we only allow retrying once for HEAD requests
343
- // this is for the use case of using running offline (no internet), retrying can be annoying
344
- bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
345
- if (!was_perform_successful) {
346
- head_request_ok = false;
347
- }
325
+ static std::regex header_regex("([^:]+): (.*)\r\n");
326
+ static std::regex etag_regex("ETag", std::regex_constants::icase);
327
+ static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
348
328
 
349
- long http_code = 0;
350
- curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
351
- if (http_code == 200) {
352
- head_request_ok = true;
353
- } else {
354
- LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
355
- head_request_ok = false;
329
+ std::string header(buffer, n_items);
330
+ std::smatch match;
331
+ if (std::regex_match(header, match, header_regex)) {
332
+ const std::string & key = match[1];
333
+ const std::string & value = match[2];
334
+ if (std::regex_match(key, match, etag_regex)) {
335
+ headers->etag = value;
336
+ } else if (std::regex_match(key, match, last_modified_regex)) {
337
+ headers->last_modified = value;
338
+ }
356
339
  }
340
+ return n_items;
341
+ };
342
+
343
+ curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
344
+ curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
345
+ curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
346
+ curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
347
+
348
+ // we only allow retrying once for HEAD requests
349
+ // this is for the use case of using running offline (no internet), retrying can be annoying
350
+ bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
351
+ if (!was_perform_successful) {
352
+ head_request_ok = false;
353
+ }
354
+
355
+ long http_code = 0;
356
+ curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
357
+ if (http_code == 200) {
358
+ head_request_ok = true;
359
+ } else {
360
+ LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
361
+ head_request_ok = false;
357
362
  }
358
363
 
359
364
  // if head_request_ok is false, we don't have the etag or last-modified headers
@@ -460,12 +465,12 @@ static bool common_download_file_single(const std::string & url, const std::stri
460
465
 
461
466
  // download multiple files from remote URLs to local paths
462
467
  // the input is a vector of pairs <url, path>
463
- static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token) {
468
+ static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
464
469
  // Prepare download in parallel
465
470
  std::vector<std::future<bool>> futures_download;
466
471
  for (auto const & item : urls) {
467
- futures_download.push_back(std::async(std::launch::async, [bearer_token](const std::pair<std::string, std::string> & it) -> bool {
468
- return common_download_file_single(it.first, it.second, bearer_token);
472
+ futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair<std::string, std::string> & it) -> bool {
473
+ return common_download_file_single(it.first, it.second, bearer_token, offline);
469
474
  }, item));
470
475
  }
471
476
 
@@ -481,14 +486,15 @@ static bool common_download_file_multiple(const std::vector<std::pair<std::strin
481
486
 
482
487
  static bool common_download_model(
483
488
  const common_params_model & model,
484
- const std::string & bearer_token) {
489
+ const std::string & bearer_token,
490
+ bool offline) {
485
491
  // Basic validation of the model.url
486
492
  if (model.url.empty()) {
487
493
  LOG_ERR("%s: invalid model url\n", __func__);
488
494
  return false;
489
495
  }
490
496
 
491
- if (!common_download_file_single(model.url, model.path, bearer_token)) {
497
+ if (!common_download_file_single(model.url, model.path, bearer_token, offline)) {
492
498
  return false;
493
499
  }
494
500
 
@@ -547,7 +553,7 @@ static bool common_download_model(
547
553
  }
548
554
 
549
555
  // Download in parallel
550
- common_download_file_multiple(urls, bearer_token);
556
+ common_download_file_multiple(urls, bearer_token, offline);
551
557
  }
552
558
 
553
559
  return true;
@@ -608,7 +614,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
608
614
  *
609
615
  * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
610
616
  */
611
- static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
617
+ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token, bool offline) {
612
618
  auto parts = string_split<std::string>(hf_repo_with_tag, ':');
613
619
  std::string tag = parts.size() > 1 ? parts.back() : "latest";
614
620
  std::string hf_repo = parts[0];
@@ -638,20 +644,25 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
638
644
  long res_code = 0;
639
645
  std::string res_str;
640
646
  bool use_cache = false;
641
- try {
642
- auto res = common_remote_get_content(url, params);
643
- res_code = res.first;
644
- res_str = std::string(res.second.data(), res.second.size());
645
- } catch (const std::exception & e) {
646
- LOG_WRN("error: failed to get manifest: %s\n", e.what());
647
- LOG_WRN("try reading from cache\n");
648
- // try to read from cache
647
+ if (!offline) {
649
648
  try {
649
+ auto res = common_remote_get_content(url, params);
650
+ res_code = res.first;
651
+ res_str = std::string(res.second.data(), res.second.size());
652
+ } catch (const std::exception & e) {
653
+ LOG_WRN("error: failed to get manifest at %s: %s\n", url.c_str(), e.what());
654
+ }
655
+ }
656
+ if (res_code == 0) {
657
+ if (std::filesystem::exists(cached_response_path)) {
658
+ LOG_WRN("trying to read manifest from cache: %s\n", cached_response_path.c_str());
650
659
  res_str = read_file(cached_response_path);
651
660
  res_code = 200;
652
661
  use_cache = true;
653
- } catch (const std::exception & e) {
654
- throw std::runtime_error("error: failed to get manifest (check your internet connection)");
662
+ } else {
663
+ throw std::runtime_error(
664
+ offline ? "error: failed to get manifest (offline mode)"
665
+ : "error: failed to get manifest (check your internet connection)");
655
666
  }
656
667
  }
657
668
  std::string ggufFile;
@@ -698,24 +709,25 @@ bool common_has_curl() {
698
709
  return false;
699
710
  }
700
711
 
701
- static bool common_download_file_single(const std::string &, const std::string &, const std::string &) {
712
+ static bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
702
713
  LOG_ERR("error: built without CURL, cannot download model from internet\n");
703
714
  return false;
704
715
  }
705
716
 
706
- static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &) {
717
+ static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
707
718
  LOG_ERR("error: built without CURL, cannot download model from the internet\n");
708
719
  return false;
709
720
  }
710
721
 
711
722
  static bool common_download_model(
712
723
  const common_params_model &,
713
- const std::string &) {
724
+ const std::string &,
725
+ bool) {
714
726
  LOG_ERR("error: built without CURL, cannot download model from the internet\n");
715
727
  return false;
716
728
  }
717
729
 
718
- static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) {
730
+ static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) {
719
731
  LOG_ERR("error: built without CURL, cannot download model from the internet\n");
720
732
  return {};
721
733
  }
@@ -742,7 +754,8 @@ struct handle_model_result {
742
754
  static handle_model_result common_params_handle_model(
743
755
  struct common_params_model & model,
744
756
  const std::string & bearer_token,
745
- const std::string & model_path_default) {
757
+ const std::string & model_path_default,
758
+ bool offline) {
746
759
  handle_model_result result;
747
760
  // handle pre-fill default model path and url based on hf_repo and hf_file
748
761
  {
@@ -750,7 +763,7 @@ static handle_model_result common_params_handle_model(
750
763
  // short-hand to avoid specifying --hf-file -> default it to --model
751
764
  if (model.hf_file.empty()) {
752
765
  if (model.path.empty()) {
753
- auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token);
766
+ auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
754
767
  if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
755
768
  exit(1); // built without CURL, error message already printed
756
769
  }
@@ -791,7 +804,7 @@ static handle_model_result common_params_handle_model(
791
804
 
792
805
  // then, download it if needed
793
806
  if (!model.url.empty()) {
794
- bool ok = common_download_model(model, bearer_token);
807
+ bool ok = common_download_model(model, bearer_token, offline);
795
808
  if (!ok) {
796
809
  LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
797
810
  exit(1);
@@ -934,7 +947,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
934
947
 
935
948
  // handle model and download
936
949
  {
937
- auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
950
+ auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
938
951
  if (params.no_mmproj) {
939
952
  params.mmproj = {};
940
953
  } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
@@ -944,12 +957,12 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
944
957
  // only download mmproj if the current example is using it
945
958
  for (auto & ex : mmproj_examples) {
946
959
  if (ctx_arg.ex == ex) {
947
- common_params_handle_model(params.mmproj, params.hf_token, "");
960
+ common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
948
961
  break;
949
962
  }
950
963
  }
951
- common_params_handle_model(params.speculative.model, params.hf_token, "");
952
- common_params_handle_model(params.vocoder.model, params.hf_token, "");
964
+ common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline);
965
+ common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline);
953
966
  }
954
967
 
955
968
  if (params.escape) {
@@ -1445,6 +1458,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1445
1458
  params.n_keep = value;
1446
1459
  }
1447
1460
  ));
1461
+ add_opt(common_arg(
1462
+ {"--swa-full"},
1463
+ string_format("use full-size SWA cache (default: %s)\n"
1464
+ "[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
1465
+ [](common_params & params) {
1466
+ params.swa_full = true;
1467
+ }
1468
+ ).set_env("LLAMA_ARG_SWA_FULL"));
1448
1469
  add_opt(common_arg(
1449
1470
  {"--no-context-shift"},
1450
1471
  string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -1670,7 +1691,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1670
1691
  [](common_params & params) {
1671
1692
  params.warmup = false;
1672
1693
  }
1673
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING}));
1694
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
1674
1695
  add_opt(common_arg(
1675
1696
  {"--spm-infill"},
1676
1697
  string_format(
@@ -2057,13 +2078,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2057
2078
  params.grp_attn_w = value;
2058
2079
  }
2059
2080
  ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
2060
- add_opt(common_arg(
2061
- {"-dkvc", "--dump-kv-cache"},
2062
- "verbose print of the KV cache",
2063
- [](common_params & params) {
2064
- params.dump_kv_cache = true;
2065
- }
2066
- ));
2067
2081
  add_opt(common_arg(
2068
2082
  {"-nkvo", "--no-kv-offload"},
2069
2083
  "disable KV offload",
@@ -2232,12 +2246,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2232
2246
  }
2233
2247
  ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
2234
2248
  add_opt(common_arg(
2235
- {"--image"}, "FILE",
2236
- "path to an image file. use with multimodal models. Specify multiple times for batching",
2249
+ {"--image", "--audio"}, "FILE",
2250
+ "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
2237
2251
  [](common_params & params, const std::string & value) {
2238
2252
  params.image.emplace_back(value);
2239
2253
  }
2240
- ).set_examples({LLAMA_EXAMPLE_LLAVA}));
2254
+ ).set_examples({LLAMA_EXAMPLE_MTMD}));
2241
2255
  if (llama_supports_rpc()) {
2242
2256
  add_opt(common_arg(
2243
2257
  {"--rpc"}, "SERVERS",
@@ -2585,7 +2599,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2585
2599
  [](common_params & params, int value) {
2586
2600
  params.n_junk = value;
2587
2601
  }
2588
- ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
2602
+ ).set_examples({LLAMA_EXAMPLE_PASSKEY, LLAMA_EXAMPLE_PARALLEL}));
2589
2603
  add_opt(common_arg(
2590
2604
  {"--pos"}, "N",
2591
2605
  string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
@@ -2648,7 +2662,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2648
2662
  [](common_params & params) {
2649
2663
  params.is_pp_shared = true;
2650
2664
  }
2651
- ).set_examples({LLAMA_EXAMPLE_BENCH}));
2665
+ ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
2652
2666
  add_opt(common_arg(
2653
2667
  {"-npp"}, "n0,n1,...",
2654
2668
  "number of prompt tokens",
@@ -2847,15 +2861,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2847
2861
  ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
2848
2862
  add_opt(common_arg(
2849
2863
  {"--reasoning-format"}, "FORMAT",
2850
- "reasoning format (default: deepseek; allowed values: deepseek, none)\n"
2851
- "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
2852
- "only supported for non-streamed responses",
2864
+ "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
2865
+ "- none: leaves thoughts unparsed in `message.content`\n"
2866
+ "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
2867
+ "(default: deepseek)",
2853
2868
  [](common_params & params, const std::string & value) {
2854
2869
  /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
2855
2870
  else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
2856
- else { std::invalid_argument("invalid value"); }
2871
+ else { throw std::invalid_argument("invalid value"); }
2857
2872
  }
2858
2873
  ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
2874
+ add_opt(common_arg(
2875
+ {"--reasoning-budget"}, "N",
2876
+ "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
2877
+ [](common_params & params, int value) {
2878
+ if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
2879
+ params.reasoning_budget = value;
2880
+ }
2881
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
2859
2882
  add_opt(common_arg(
2860
2883
  {"--chat-template"}, "JINJA_TEMPLATE",
2861
2884
  string_format(
@@ -2867,7 +2890,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2867
2890
  [](common_params & params, const std::string & value) {
2868
2891
  params.chat_template = value;
2869
2892
  }
2870
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
2893
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
2871
2894
  add_opt(common_arg(
2872
2895
  {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
2873
2896
  string_format(
@@ -2880,6 +2903,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2880
2903
  params.chat_template = read_file(value);
2881
2904
  }
2882
2905
  ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
2906
+ add_opt(common_arg(
2907
+ {"--no-prefill-assistant"},
2908
+ string_format(
2909
+ "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
2910
+ "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
2911
+ ),
2912
+ [](common_params & params) {
2913
+ params.prefill_assistant = false;
2914
+ }
2915
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
2883
2916
  add_opt(common_arg(
2884
2917
  {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
2885
2918
  string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
@@ -2944,7 +2977,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2944
2977
  [](common_params & params, const std::string & value) {
2945
2978
  /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
2946
2979
  else if (value == "md") { params.batched_bench_output_jsonl = false; }
2947
- else { std::invalid_argument("invalid value"); }
2980
+ else { throw std::invalid_argument("invalid value"); }
2948
2981
  }
2949
2982
  ).set_examples({LLAMA_EXAMPLE_BENCH}));
2950
2983
  add_opt(common_arg(
@@ -2976,6 +3009,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2976
3009
  common_log_set_verbosity_thold(INT_MAX);
2977
3010
  }
2978
3011
  ));
3012
+ add_opt(common_arg(
3013
+ {"--offline"},
3014
+ "Offline mode: forces use of cache, prevents network access",
3015
+ [](common_params & params) {
3016
+ params.offline = true;
3017
+ }
3018
+ ).set_env("LLAMA_OFFLINE"));
2979
3019
  add_opt(common_arg(
2980
3020
  {"-lv", "--verbosity", "--log-verbosity"}, "N",
2981
3021
  "Set the verbosity threshold. Messages with a higher verbosity will be ignored.",