@novastera-oss/llamarn 0.1.5-beta.3 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274)
  1. package/README.md +80 -14
  2. package/RNLlamaCpp.podspec +10 -3
  3. package/android/CMakeLists.txt +8 -0
  4. package/android/src/main/cpp/include/llama.h +62 -125
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  13. package/cpp/LlamaCppModel.cpp +148 -5
  14. package/cpp/LlamaCppModel.h +11 -2
  15. package/cpp/PureCppImpl.cpp +3 -3
  16. package/cpp/PureCppImpl.h +3 -0
  17. package/cpp/build-info.cpp +2 -2
  18. package/cpp/llama.cpp/README.md +11 -3
  19. package/cpp/llama.cpp/build-xcframework.sh +1 -0
  20. package/cpp/llama.cpp/common/CMakeLists.txt +8 -2
  21. package/cpp/llama.cpp/common/arg.cpp +153 -113
  22. package/cpp/llama.cpp/common/chat-parser.cpp +379 -0
  23. package/cpp/llama.cpp/common/chat-parser.h +117 -0
  24. package/cpp/llama.cpp/common/chat.cpp +847 -699
  25. package/cpp/llama.cpp/common/chat.h +73 -6
  26. package/cpp/llama.cpp/common/common.cpp +50 -82
  27. package/cpp/llama.cpp/common/common.h +21 -17
  28. package/cpp/llama.cpp/common/json-partial.cpp +255 -0
  29. package/cpp/llama.cpp/common/json-partial.h +37 -0
  30. package/cpp/llama.cpp/common/minja/chat-template.hpp +9 -5
  31. package/cpp/llama.cpp/common/minja/minja.hpp +69 -36
  32. package/cpp/llama.cpp/common/regex-partial.cpp +204 -0
  33. package/cpp/llama.cpp/common/regex-partial.h +56 -0
  34. package/cpp/llama.cpp/common/sampling.cpp +7 -8
  35. package/cpp/llama.cpp/convert_hf_to_gguf.py +453 -118
  36. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +120 -68
  37. package/cpp/llama.cpp/ggml/CMakeLists.txt +2 -1
  38. package/cpp/llama.cpp/ggml/cmake/common.cmake +25 -0
  39. package/cpp/llama.cpp/ggml/include/ggml-opt.h +49 -28
  40. package/cpp/llama.cpp/ggml/include/ggml.h +26 -7
  41. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +16 -10
  42. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +4 -1
  43. package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +2 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +604 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +42 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +54 -2
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +50 -51
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -2
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -9
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +779 -19
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +22 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +322 -100
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +117 -1
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +220 -49
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cu +40 -26
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -1
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +11 -1
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +15 -7
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +266 -64
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +49 -4
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +48 -4
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +2 -1
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +5 -1
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +2 -0
  70. package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +7 -6
  71. package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +1 -1
  72. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +10 -0
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
  74. package/cpp/llama.cpp/ggml/src/ggml-impl.h +1 -1
  75. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +4 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +99 -17
  77. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +200 -2
  78. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  79. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +6 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +972 -178
  83. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  84. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  86. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  88. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +373 -190
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  91. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -10
  92. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +101 -5
  93. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +31 -33
  94. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +1 -0
  95. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +29 -2
  96. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +4 -5
  97. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  98. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +9 -1
  99. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +84 -72
  100. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  102. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -3
  103. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +324 -129
  104. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +1 -0
  105. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +31 -2
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +95 -68
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +1 -0
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +22 -0
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -2
  110. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -4
  111. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +2 -3
  112. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +69 -43
  113. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +2 -14
  114. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -91
  115. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -181
  116. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +17 -0
  117. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  118. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +6 -152
  119. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
  120. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
  121. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +2 -118
  122. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +1 -1
  123. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +12 -1
  124. package/cpp/llama.cpp/ggml/src/ggml.c +107 -36
  125. package/cpp/llama.cpp/ggml/src/gguf.cpp +33 -33
  126. package/cpp/llama.cpp/gguf-py/gguf/constants.py +100 -15
  127. package/cpp/llama.cpp/gguf-py/gguf/gguf_reader.py +1 -1
  128. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +44 -12
  129. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_editor_gui.py +21 -10
  130. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +5 -2
  131. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +128 -31
  132. package/cpp/llama.cpp/gguf-py/gguf/utility.py +1 -1
  133. package/cpp/llama.cpp/gguf-py/pyproject.toml +1 -1
  134. package/cpp/llama.cpp/include/llama.h +62 -125
  135. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +1 -1
  136. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +1 -1
  137. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +1 -1
  138. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +1 -1
  139. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +1 -1
  140. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +1 -1
  141. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +1 -1
  142. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +1 -1
  143. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +1 -1
  144. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +1 -1
  145. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +1 -1
  146. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +1 -1
  147. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +1 -1
  148. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +1 -1
  149. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +1 -1
  150. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +1 -1
  151. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +1 -1
  152. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +1 -1
  153. package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
  154. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +1 -1
  155. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +1 -1
  156. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +1 -1
  157. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +1 -1
  158. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +1 -1
  159. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +1 -1
  160. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +1 -1
  161. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +1 -1
  162. package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +62 -0
  163. package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +85 -0
  164. package/cpp/llama.cpp/models/templates/README.md +2 -0
  165. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  166. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  167. package/cpp/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  168. package/cpp/llama.cpp/requirements/requirements-gguf_editor_gui.txt +1 -1
  169. package/cpp/llama.cpp/src/CMakeLists.txt +2 -0
  170. package/cpp/llama.cpp/src/llama-arch.cpp +6 -0
  171. package/cpp/llama.cpp/src/llama-arch.h +2 -0
  172. package/cpp/llama.cpp/src/llama-batch.cpp +3 -1
  173. package/cpp/llama.cpp/src/llama-context.cpp +340 -123
  174. package/cpp/llama.cpp/src/llama-context.h +30 -0
  175. package/cpp/llama.cpp/src/llama-cparams.cpp +4 -0
  176. package/cpp/llama.cpp/src/llama-cparams.h +2 -0
  177. package/cpp/llama.cpp/src/llama-grammar.cpp +12 -2
  178. package/cpp/llama.cpp/src/llama-graph.cpp +157 -247
  179. package/cpp/llama.cpp/src/llama-graph.h +52 -7
  180. package/cpp/llama.cpp/src/llama-hparams.cpp +17 -1
  181. package/cpp/llama.cpp/src/llama-hparams.h +37 -5
  182. package/cpp/llama.cpp/src/llama-kv-cache.cpp +742 -481
  183. package/cpp/llama.cpp/src/llama-kv-cache.h +196 -99
  184. package/cpp/llama.cpp/src/llama-kv-cells.h +379 -0
  185. package/cpp/llama.cpp/src/llama-memory.h +4 -3
  186. package/cpp/llama.cpp/src/llama-model-loader.cpp +22 -17
  187. package/cpp/llama.cpp/src/llama-model-saver.cpp +281 -0
  188. package/cpp/llama.cpp/src/llama-model-saver.h +37 -0
  189. package/cpp/llama.cpp/src/llama-model.cpp +529 -172
  190. package/cpp/llama.cpp/src/llama-model.h +6 -1
  191. package/cpp/llama.cpp/src/llama-quant.cpp +15 -13
  192. package/cpp/llama.cpp/src/llama-sampling.cpp +2 -2
  193. package/cpp/llama.cpp/src/llama-vocab.cpp +35 -8
  194. package/cpp/llama.cpp/src/llama-vocab.h +6 -0
  195. package/cpp/llama.cpp/src/llama.cpp +14 -0
  196. package/cpp/rn-completion.cpp +4 -2
  197. package/ios/include/chat.h +73 -6
  198. package/ios/include/common/minja/chat-template.hpp +9 -5
  199. package/ios/include/common/minja/minja.hpp +69 -36
  200. package/ios/include/common.h +21 -17
  201. package/ios/include/llama.h +62 -125
  202. package/ios/libs/llama.xcframework/Info.plist +19 -19
  203. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  204. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4617 -4487
  205. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  206. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +26 -7
  207. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +62 -125
  208. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  209. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  210. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  211. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3557 -3435
  212. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  213. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  214. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  215. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3559 -3437
  219. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +237 -0
  220. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +26 -7
  221. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +62 -125
  222. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +237 -0
  223. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +26 -7
  224. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +62 -125
  225. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  226. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +237 -0
  227. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +26 -7
  228. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +62 -125
  229. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  230. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  231. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  232. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4616 -4487
  233. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  234. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +26 -7
  235. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +62 -125
  236. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  237. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  238. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4637 -4508
  239. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3556 -3435
  240. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  241. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  242. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  243. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  244. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  245. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4653 -4523
  246. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  247. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +26 -7
  248. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +62 -125
  249. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  250. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  251. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4674 -4544
  252. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3587 -3465
  253. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  254. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  255. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  256. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  257. package/lib/module/NativeRNLlamaCpp.js.map +1 -1
  258. package/lib/typescript/src/NativeRNLlamaCpp.d.ts +3 -0
  259. package/lib/typescript/src/NativeRNLlamaCpp.d.ts.map +1 -1
  260. package/package.json +2 -1
  261. package/src/NativeRNLlamaCpp.ts +1 -0
  262. package/cpp/llama.cpp/common/stb_image.h +0 -7988
  263. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
  264. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
  265. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
  266. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
  267. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
  268. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
  269. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
  270. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
  271. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
  272. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
  273. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
  274. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
package/cpp/LlamaCppModel.h CHANGED
@@ -8,6 +8,9 @@
 #include <unordered_map>
 #include <functional>
 
+// Add ReactCommon includes for proper async handling
+#include <ReactCommon/CallInvoker.h>
+
 // Include all necessary common headers from llama.cpp
 #include "common.h"
 #include "sampling.h"
@@ -70,13 +73,14 @@ struct ToolCall {
  * - Uses common_token_to_piece for token->text conversion
  * - Leverages the llama.cpp chat template system
  */
-class LlamaCppModel : public jsi::HostObject {
+class LlamaCppModel : public jsi::HostObject, public std::enable_shared_from_this<LlamaCppModel> {
 public:
   /**
    * Constructor
    * @param rn_ctx A pointer to an initialized rn_llama_context
+   * @param jsInvoker CallInvoker for async operations (optional, for async completion)
    */
-  LlamaCppModel(rn_llama_context* rn_ctx);
+  LlamaCppModel(rn_llama_context* rn_ctx, std::shared_ptr<CallInvoker> jsInvoker = nullptr);
   virtual ~LlamaCppModel();
 
   /**
@@ -124,6 +128,8 @@ private:
    * JSI method implementations
    */
   jsi::Value completionJsi(jsi::Runtime& rt, const jsi::Value* args, size_t count);
+  jsi::Value completionAsyncJsi(jsi::Runtime& rt, const jsi::Value* args, size_t count);
+  jsi::Value stopCompletionJsi(jsi::Runtime& rt, const jsi::Value* args, size_t count);
   jsi::Value tokenizeJsi(jsi::Runtime& rt, const jsi::Value* args, size_t count);
   jsi::Value detokenizeJsi(jsi::Runtime& rt, const jsi::Value* args, size_t count);
   jsi::Value embeddingJsi(jsi::Runtime& rt, const jsi::Value* args, size_t count);
@@ -157,6 +163,9 @@ private:
   // Completion state
   bool should_stop_completion_;
   bool is_predicting_;
+
+  // Add CallInvoker for async operations
+  std::shared_ptr<CallInvoker> jsInvoker_;
 };
 
 } // namespace facebook::react
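The hunks above wire a React Native `CallInvoker` into the model host object so a completion can run off the JS thread and still resolve a JS Promise safely. As a hedged illustration of that pattern only (the helper below is hypothetical and is not this package's actual `completionAsyncJsi`), the usual shape is: create the Promise on the JS thread, do the work on a worker thread, then hop back through `CallInvoker::invokeAsync` to call `resolve`:

```cpp
#include <jsi/jsi.h>
#include <ReactCommon/CallInvoker.h>
#include <memory>
#include <string>
#include <thread>

using namespace facebook;

// Hypothetical sketch of the async-completion pattern enabled by storing a
// CallInvoker: the heavy work runs on a detached worker thread, and the
// Promise is resolved back on the JS thread via invokeAsync. `self` stands in
// for shared_from_this(), keeping the host object alive until completion.
jsi::Value completionAsyncSketch(jsi::Runtime& rt,
                                 std::shared_ptr<react::CallInvoker> jsInvoker,
                                 std::shared_ptr<void> self) {
    auto promiseCtor = rt.global().getPropertyAsFunction(rt, "Promise");
    auto executor = jsi::Function::createFromHostFunction(
        rt, jsi::PropNameID::forAscii(rt, "executor"), 2,
        [jsInvoker, self](jsi::Runtime& rt2, const jsi::Value&,
                          const jsi::Value* args, size_t) -> jsi::Value {
            // Keep `resolve` (and a stable Runtime pointer) alive beyond this frame.
            jsi::Runtime* runtimePtr = &rt2;
            auto resolve = std::make_shared<jsi::Function>(
                args[0].asObject(rt2).asFunction(rt2));
            std::thread([runtimePtr, jsInvoker, self, resolve]() {
                std::string result = "(generated text)"; // run the llama.cpp completion here
                jsInvoker->invokeAsync([runtimePtr, resolve, result]() {
                    // Back on the JS thread: safe to touch the runtime again.
                    resolve->call(*runtimePtr,
                                  jsi::String::createFromUtf8(*runtimePtr, result));
                });
            }).detach();
            return jsi::Value::undefined();
        });
    return promiseCtor.callAsConstructor(rt, std::move(executor));
}
```

Deriving from `std::enable_shared_from_this` in the hunk above fits this pattern: the background thread can hold a `shared_ptr` to the model so the host object cannot be destroyed mid-completion.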
package/cpp/PureCppImpl.cpp CHANGED
@@ -35,7 +35,7 @@ std::shared_ptr<TurboModule> PureCppImpl::create(std::shared_ptr<CallInvoker> js
 }
 
 PureCppImpl::PureCppImpl(std::shared_ptr<CallInvoker> jsInvoker)
-  : NativeRNLlamaCppCxxSpec(std::move(jsInvoker)) {
+  : NativeRNLlamaCppCxxSpec(jsInvoker), jsInvoker_(jsInvoker) {
 }
 
 double PureCppImpl::multiply(jsi::Runtime& rt, double a, double b) {
@@ -325,8 +325,8 @@ jsi::Value PureCppImpl::initLlama(jsi::Runtime &runtime, jsi::Object options) {
 }
 
 jsi::Object PureCppImpl::createModelObject(jsi::Runtime& runtime, rn_llama_context* rn_ctx) {
-  // Create a shared_ptr to a new LlamaCppModel instance
-  auto llamaModel = std::make_shared<LlamaCppModel>(rn_ctx);
+  // Create a shared_ptr to a new LlamaCppModel instance with CallInvoker
+  auto llamaModel = std::make_shared<LlamaCppModel>(rn_ctx, jsInvoker_);
 
   // Create a host object from the LlamaCppModel instance
   return jsi::Object::createFromHostObject(runtime, std::move(llamaModel));
package/cpp/PureCppImpl.h CHANGED
@@ -54,6 +54,9 @@ private:
 
   // Mutex for thread safety when accessing rn_ctx_ or other shared resources
   std::mutex mutex_;
+
+  // CallInvoker for async operations
+  std::shared_ptr<CallInvoker> jsInvoker_;
 };
 
 } // namespace facebook::react
package/cpp/build-info.cpp CHANGED
@@ -1,4 +1,4 @@
-int LLAMA_BUILD_NUMBER = 5349;
-char const *LLAMA_COMMIT = "9a390c48";
+int LLAMA_BUILD_NUMBER = 5541;
+char const *LLAMA_COMMIT = "07e4351c";
 char const *LLAMA_COMPILER = "unknown";
 char const *LLAMA_BUILD_TARGET = "unknown";
package/cpp/llama.cpp/README.md CHANGED
@@ -37,7 +37,7 @@ range of hardware - locally and in the cloud.
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2, AVX512 and AMX support for x86 architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
-- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
+- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA)
 - Vulkan and SYCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
 
@@ -237,7 +237,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [BLAS](docs/build.md#blas-build) | All |
 | [BLIS](docs/backend/BLIS.md) | All |
 | [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
-| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
+| [MUSA](docs/build.md#musa) | Moore Threads GPU |
 | [CUDA](docs/build.md#cuda) | Nvidia GPU |
 | [HIP](docs/build.md#hip) | AMD GPU |
 | [Vulkan](docs/build.md#vulkan) | GPU |
@@ -572,4 +572,12 @@ automatically. For example:
 $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
 ```
 
-## References
+## Dependencies
+
+- [yhirose/cpp-httplib](https://github.com/yhirose/cpp-httplib) - Single-header HTTP server, used by `llama-server` - MIT license
+- [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain
+- [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License
+- [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
+- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
+- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
+- [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
package/cpp/llama.cpp/build-xcframework.sh CHANGED
@@ -117,6 +117,7 @@ setup_framework_structure() {
     # Copy all required headers (common for all platforms)
     cp include/llama.h ${header_path}
     cp ggml/include/ggml.h ${header_path}
+    cp ggml/include/ggml-opt.h ${header_path}
     cp ggml/include/ggml-alloc.h ${header_path}
     cp ggml/include/ggml-backend.h ${header_path}
     cp ggml/include/ggml-metal.h ${header_path}
package/cpp/llama.cpp/common/CMakeLists.txt CHANGED
@@ -60,12 +60,16 @@ add_library(${TARGET} STATIC
     base64.hpp
     chat.cpp
     chat.h
+    chat-parser.cpp
+    chat-parser.h
     common.cpp
     common.h
     console.cpp
     console.h
     json-schema-to-grammar.cpp
     json.hpp
+    json-partial.h
+    json-partial.cpp
     llguidance.cpp
     log.cpp
     log.h
@@ -73,6 +77,8 @@ add_library(${TARGET} STATIC
     minja/minja.hpp
     ngram-cache.cpp
     ngram-cache.h
+    regex-partial.cpp
+    regex-partial.h
     sampling.cpp
     sampling.h
     speculative.cpp
@@ -119,8 +125,8 @@ if (LLAMA_LLGUIDANCE)
 
     ExternalProject_Add(llguidance_ext
         GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        # v0.7.19 (+ fancy-regex build fix):
-        GIT_TAG b59f98f85269892a7de3d3641ad155366f13daa6
+        # v0.7.20 (+ fix to build on GCC 15):
+        GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
        PREFIX ${CMAKE_BINARY_DIR}/llguidance
        SOURCE_DIR ${LLGUIDANCE_SRC}
        BUILD_IN_SOURCE TRUE
package/cpp/llama.cpp/common/arg.cpp CHANGED
@@ -39,7 +39,7 @@
 using json = nlohmann::ordered_json;
 
 std::initializer_list<enum llama_example> mmproj_examples = {
-    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_MTMD,
     LLAMA_EXAMPLE_SERVER,
 };
 
@@ -242,33 +242,7 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
 }
 
 // download one single file from remote URL to local path
-static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
-    // Initialize libcurl
-    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
-    curl_slist_ptr http_headers;
-    if (!curl) {
-        LOG_ERR("%s: error initializing libcurl\n", __func__);
-        return false;
-    }
-
-    // Set the URL, allow to follow http redirection
-    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
-    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
-
-    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
-    // Check if hf-token or bearer-token was specified
-    if (!bearer_token.empty()) {
-        std::string auth_header = "Authorization: Bearer " + bearer_token;
-        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
-    }
-    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
-
-#if defined(_WIN32)
-    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
-    // operating system. Currently implemented under MS-Windows.
-    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
-#endif
-
+static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) {
     // Check if the file already exists locally
     auto file_exists = std::filesystem::exists(path);
 
@@ -279,6 +253,10 @@ static bool common_download_file_single(const std::string & url, const std::stri
     std::string last_modified;
 
     if (file_exists) {
+        if (offline) {
+            LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
+            return true; // skip verification/downloading
+        }
         // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
         std::ifstream metadata_in(metadata_path);
         if (metadata_in.good()) {
@@ -297,6 +275,10 @@ static bool common_download_file_single(const std::string & url, const std::stri
         }
         // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
     } else {
+        if (offline) {
+            LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
+            return false;
+        }
         LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
     }
 
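The two `offline` insertions above amount to a small decision table for the local cache. A condensed, self-contained restatement of that control flow (the names here are illustrative, not the real function):

```cpp
#include <filesystem>
#include <string>

// Condensed restatement of the offline checks added above: with --offline,
// an existing cached file is trusted as-is (no ETag verification), a missing
// file is a hard error, and otherwise the normal verify-then-download path runs.
enum class FetchDecision { UseCache, Fail, VerifyAndMaybeDownload };

FetchDecision decide_fetch(const std::string & path, bool offline) {
    const bool file_exists = std::filesystem::exists(path);
    if (offline) {
        return file_exists ? FetchDecision::UseCache : FetchDecision::Fail;
    }
    return FetchDecision::VerifyAndMaybeDownload;
}
```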
@@ -310,50 +292,73 @@ static bool common_download_file_single(const std::string & url, const std::stri
     bool head_request_ok = false;
     bool should_download = !file_exists; // by default, we should download if the file does not exist
 
-    // get ETag to see if the remote file has changed
-    {
-        typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
-        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
+    // Initialize libcurl
+    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
+    if (!curl) {
+        LOG_ERR("%s: error initializing libcurl\n", __func__);
+        return false;
+    }
 
-            static std::regex header_regex("([^:]+): (.*)\r\n");
-            static std::regex etag_regex("ETag", std::regex_constants::icase);
-            static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
+    // Set the URL, allow to follow http redirection
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
 
-            std::string header(buffer, n_items);
-            std::smatch match;
-            if (std::regex_match(header, match, header_regex)) {
-                const std::string & key = match[1];
-                const std::string & value = match[2];
-                if (std::regex_match(key, match, etag_regex)) {
-                    headers->etag = value;
-                } else if (std::regex_match(key, match, last_modified_regex)) {
-                    headers->last_modified = value;
-                }
-            }
-            return n_items;
-        };
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+    // Check if hf-token or bearer-token was specified
+    if (!bearer_token.empty()) {
+        std::string auth_header = "Authorization: Bearer " + bearer_token;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+    }
+    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+#if defined(_WIN32)
+    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
+    // operating system. Currently implemented under MS-Windows.
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
 
-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
+    typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
+    auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
+        common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
 
-        // we only allow retrying once for HEAD requests
-        // this is for the use case of using running offline (no internet), retrying can be annoying
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
-        if (!was_perform_successful) {
-            head_request_ok = false;
-        }
+        static std::regex header_regex("([^:]+): (.*)\r\n");
+        static std::regex etag_regex("ETag", std::regex_constants::icase);
+        static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
 
-        long http_code = 0;
-        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-        if (http_code == 200) {
-            head_request_ok = true;
-        } else {
-            LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
-            head_request_ok = false;
+        std::string header(buffer, n_items);
+        std::smatch match;
+        if (std::regex_match(header, match, header_regex)) {
+            const std::string & key = match[1];
+            const std::string & value = match[2];
+            if (std::regex_match(key, match, etag_regex)) {
+                headers->etag = value;
+            } else if (std::regex_match(key, match, last_modified_regex)) {
+                headers->last_modified = value;
+            }
         }
+        return n_items;
+    };
+
+    curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
+    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
+    curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
+    curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
+
+    // we only allow retrying once for HEAD requests
+    // this is for the use case of using running offline (no internet), retrying can be annoying
+    bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
+    if (!was_perform_successful) {
+        head_request_ok = false;
+    }
+
+    long http_code = 0;
+    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
+    if (http_code == 200) {
+        head_request_ok = true;
+    } else {
+        LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
+        head_request_ok = false;
     }
 
     // if head_request_ok is false, we don't have the etag or last-modified headers
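Note the design choice in the hunk above: none of the libcurl setup runs until after the cache check, so the offline early-returns added earlier never create a curl handle or touch the network. The HEAD-request/ETag logic itself is unchanged; it is only lifted out of its former block scope.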
@@ -460,12 +465,12 @@ static bool common_download_file_single(const std::string & url, const std::stri
 
 // download multiple files from remote URLs to local paths
 // the input is a vector of pairs <url, path>
-static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token) {
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
     // Prepare download in parallel
     std::vector<std::future<bool>> futures_download;
     for (auto const & item : urls) {
-        futures_download.push_back(std::async(std::launch::async, [bearer_token](const std::pair<std::string, std::string> & it) -> bool {
-            return common_download_file_single(it.first, it.second, bearer_token);
+        futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair<std::string, std::string> & it) -> bool {
+            return common_download_file_single(it.first, it.second, bearer_token, offline);
         }, item));
     }
 
@@ -481,14 +486,15 @@ static bool common_download_file_multiple(const std::vector<std::pair<std::strin
 
 static bool common_download_model(
         const common_params_model & model,
-        const std::string & bearer_token) {
+        const std::string & bearer_token,
+        bool offline) {
     // Basic validation of the model.url
     if (model.url.empty()) {
         LOG_ERR("%s: invalid model url\n", __func__);
         return false;
     }
 
-    if (!common_download_file_single(model.url, model.path, bearer_token)) {
+    if (!common_download_file_single(model.url, model.path, bearer_token, offline)) {
         return false;
     }
 
@@ -547,7 +553,7 @@ static bool common_download_model(
         }
 
         // Download in parallel
-        common_download_file_multiple(urls, bearer_token);
+        common_download_file_multiple(urls, bearer_token, offline);
     }
 
     return true;
@@ -608,7 +614,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
  *
  * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
  */
-static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
+static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token, bool offline) {
     auto parts = string_split<std::string>(hf_repo_with_tag, ':');
     std::string tag = parts.size() > 1 ? parts.back() : "latest";
     std::string hf_repo = parts[0];
@@ -638,20 +644,25 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
     long res_code = 0;
     std::string res_str;
     bool use_cache = false;
-    try {
-        auto res = common_remote_get_content(url, params);
-        res_code = res.first;
-        res_str = std::string(res.second.data(), res.second.size());
-    } catch (const std::exception & e) {
-        LOG_WRN("error: failed to get manifest: %s\n", e.what());
-        LOG_WRN("try reading from cache\n");
-        // try to read from cache
+    if (!offline) {
         try {
+            auto res = common_remote_get_content(url, params);
+            res_code = res.first;
+            res_str = std::string(res.second.data(), res.second.size());
+        } catch (const std::exception & e) {
+            LOG_WRN("error: failed to get manifest at %s: %s\n", url.c_str(), e.what());
+        }
+    }
+    if (res_code == 0) {
+        if (std::filesystem::exists(cached_response_path)) {
+            LOG_WRN("trying to read manifest from cache: %s\n", cached_response_path.c_str());
             res_str = read_file(cached_response_path);
             res_code = 200;
             use_cache = true;
-        } catch (const std::exception & e) {
-            throw std::runtime_error("error: failed to get manifest (check your internet connection)");
+        } else {
+            throw std::runtime_error(
+                offline ? "error: failed to get manifest (offline mode)"
+                        : "error: failed to get manifest (check your internet connection)");
         }
     }
     std::string ggufFile;
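With this change, `common_get_hf_file` resolves the model manifest network-first with a cache fallback. A minimal self-contained sketch of the same shape, under assumed stand-in helpers (`fetch_remote` below is a placeholder, not a llama.cpp API):

```cpp
#include <filesystem>
#include <fstream>
#include <optional>
#include <sstream>
#include <stdexcept>
#include <string>

// Stand-in for the real HTTP fetch; returns std::nullopt on any failure.
static std::optional<std::string> fetch_remote(const std::string & /*url*/) {
    return std::nullopt; // pretend the network is unreachable
}

static std::string read_file(const std::string & path) {
    std::ifstream in(path);
    std::ostringstream ss;
    ss << in.rdbuf();
    return ss.str();
}

// Network-first with cache fallback, mirroring common_get_hf_file above:
// skip the network entirely when offline, fall back to the cached copy,
// and fail with a mode-specific message when neither source is available.
std::string get_manifest(const std::string & url, const std::string & cache_path, bool offline) {
    if (!offline) {
        if (auto res = fetch_remote(url)) {
            return *res; // fresh manifest
        }
    }
    if (std::filesystem::exists(cache_path)) {
        return read_file(cache_path); // cached manifest
    }
    throw std::runtime_error(offline ? "failed to get manifest (offline mode)"
                                     : "failed to get manifest (check your internet connection)");
}
```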
@@ -698,24 +709,25 @@ bool common_has_curl() {
     return false;
 }
 
-static bool common_download_file_single(const std::string &, const std::string &, const std::string &) {
+static bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
     LOG_ERR("error: built without CURL, cannot download model from internet\n");
     return false;
 }
 
-static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &) {
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return false;
 }
 
 static bool common_download_model(
         const common_params_model &,
-        const std::string &) {
+        const std::string &,
+        bool) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return false;
 }
 
-static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) {
+static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return {};
 }
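These stubs are the no-CURL side of a dual-definition pattern: every downloader helper exists twice behind a build-time guard, so threading the new `bool offline` parameter through has to touch both sides to keep the signatures identical. Schematically (a condensed, self-contained illustration, with the guard macro shown as `LLAMA_USE_CURL` by assumption and the real body elided):

```cpp
#include <cstdio>
#include <string>

// Condensed illustration of the dual-definition pattern used above: the
// curl-backed implementation and the no-CURL stub keep identical signatures,
// so a new parameter must be added to both branches of the guard.
#ifdef LLAMA_USE_CURL
static bool common_download_file_single(const std::string & url, const std::string & path,
                                        const std::string & bearer_token, bool offline) {
    // ... real libcurl implementation (see the hunks above) ...
    (void) url; (void) path; (void) bearer_token; (void) offline;
    return true;
}
#else
static bool common_download_file_single(const std::string &, const std::string &,
                                        const std::string &, bool) {
    std::fprintf(stderr, "error: built without CURL, cannot download model from internet\n");
    return false;
}
#endif
```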
@@ -742,7 +754,8 @@ struct handle_model_result {
 static handle_model_result common_params_handle_model(
         struct common_params_model & model,
         const std::string & bearer_token,
-        const std::string & model_path_default) {
+        const std::string & model_path_default,
+        bool offline) {
     handle_model_result result;
     // handle pre-fill default model path and url based on hf_repo and hf_file
     {
@@ -750,7 +763,7 @@ static handle_model_result common_params_handle_model(
         // short-hand to avoid specifying --hf-file -> default it to --model
         if (model.hf_file.empty()) {
             if (model.path.empty()) {
-                auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token);
+                auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
                 if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
                     exit(1); // built without CURL, error message already printed
                 }
@@ -791,7 +804,7 @@ static handle_model_result common_params_handle_model(
 
     // then, download it if needed
     if (!model.url.empty()) {
-        bool ok = common_download_model(model, bearer_token);
+        bool ok = common_download_model(model, bearer_token, offline);
         if (!ok) {
             LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
             exit(1);
@@ -934,7 +947,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
 
     // handle model and download
     {
-        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
+        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
         if (params.no_mmproj) {
             params.mmproj = {};
         } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
@@ -944,12 +957,12 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         // only download mmproj if the current example is using it
         for (auto & ex : mmproj_examples) {
             if (ctx_arg.ex == ex) {
-                common_params_handle_model(params.mmproj, params.hf_token, "");
+                common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
                 break;
             }
         }
-        common_params_handle_model(params.speculative.model, params.hf_token, "");
-        common_params_handle_model(params.vocoder.model, params.hf_token, "");
+        common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline);
+        common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline);
     }
 
     if (params.escape) {
@@ -1445,6 +1458,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_keep = value;
         }
     ));
+    add_opt(common_arg(
+        {"--swa-full"},
+        string_format("use full-size SWA cache (default: %s)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
+        [](common_params & params) {
+            params.swa_full = true;
+        }
+    ).set_env("LLAMA_ARG_SWA_FULL"));
     add_opt(common_arg(
         {"--no-context-shift"},
         string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -1670,7 +1691,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.warmup = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(
@@ -2057,13 +2078,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.grp_attn_w = value;
         }
     ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(common_arg(
-        {"-dkvc", "--dump-kv-cache"},
-        "verbose print of the KV cache",
-        [](common_params & params) {
-            params.dump_kv_cache = true;
-        }
-    ));
     add_opt(common_arg(
         {"-nkvo", "--no-kv-offload"},
         "disable KV offload",
@@ -2232,12 +2246,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
     add_opt(common_arg(
-        {"--image"}, "FILE",
-        "path to an image file. use with multimodal models. Specify multiple times for batching",
+        {"--image", "--audio"}, "FILE",
+        "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
         [](common_params & params, const std::string & value) {
             params.image.emplace_back(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+    ).set_examples({LLAMA_EXAMPLE_MTMD}));
     if (llama_supports_rpc()) {
         add_opt(common_arg(
             {"--rpc"}, "SERVERS",
@@ -2585,7 +2599,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.n_junk = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
+    ).set_examples({LLAMA_EXAMPLE_PASSKEY, LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"--pos"}, "N",
         string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
@@ -2648,7 +2662,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.is_pp_shared = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"-npp"}, "n0,n1,...",
         "number of prompt tokens",
@@ -2847,15 +2861,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
-        "reasoning format (default: deepseek; allowed values: deepseek, none)\n"
-        "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
-        "only supported for non-streamed responses",
+        "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
+        "- none: leaves thoughts unparsed in `message.content`\n"
+        "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
+        "(default: deepseek)",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
             else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
-            else { std::invalid_argument("invalid value"); }
+            else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
+    add_opt(common_arg(
+        {"--reasoning-budget"}, "N",
+        "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
+        [](common_params & params, int value) {
+            if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
+            params.reasoning_budget = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(
@@ -2867,7 +2890,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.chat_template = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
     add_opt(common_arg(
         {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
         string_format(
@@ -2880,6 +2903,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.chat_template = read_file(value);
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+    add_opt(common_arg(
+        {"--no-prefill-assistant"},
+        string_format(
+            "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
+            "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
+        ),
+        [](common_params & params) {
+            params.prefill_assistant = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
     add_opt(common_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
         string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
@@ -2944,7 +2977,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
             else if (value == "md") { params.batched_bench_output_jsonl = false; }
-            else { std::invalid_argument("invalid value"); }
+            else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_BENCH}));
     add_opt(common_arg(
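The `throw` additions in this hunk and in the `--reasoning-format` hunk above fix a real bug, not a style issue: `std::invalid_argument("invalid value")` on its own constructs an exception object and immediately discards it, so unrecognized values were silently accepted. In isolation:

```cpp
#include <stdexcept>
#include <string>

// Before: the exception object was constructed and immediately destroyed,
// so an unrecognized value fell through silently.
void validate_buggy(const std::string & value) {
    if (value != "jsonl" && value != "md") {
        std::invalid_argument("invalid value"); // no-op: never thrown
    }
}

// After: the value is actually rejected.
void validate_fixed(const std::string & value) {
    if (value != "jsonl" && value != "md") {
        throw std::invalid_argument("invalid value");
    }
}
```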
@@ -2976,6 +3009,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             common_log_set_verbosity_thold(INT_MAX);
         }
     ));
+    add_opt(common_arg(
+        {"--offline"},
+        "Offline mode: forces use of cache, prevents network access",
+        [](common_params & params) {
+            params.offline = true;
+        }
+    ).set_env("LLAMA_OFFLINE"));
     add_opt(common_arg(
         {"-lv", "--verbosity", "--log-verbosity"}, "N",
         "Set the verbosity threshold. Messages with a higher verbosity will be ignored.",