@novastera-oss/llamarn 0.4.1 → 0.4.3-beta4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (976)
  1. package/RNLlamaCpp.podspec +3 -0
  2. package/android/CMakeLists.txt +2 -0
  3. package/android/src/main/cpp/include/llama.h +44 -21
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  12. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  13. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  14. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  15. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  17. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  18. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  19. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  20. package/cpp/build-info.cpp +2 -2
  21. package/cpp/llama.cpp/CMakeLists.txt +12 -0
  22. package/cpp/llama.cpp/CODEOWNERS +116 -10
  23. package/cpp/llama.cpp/CONTRIBUTING.md +30 -3
  24. package/cpp/llama.cpp/README.md +13 -5
  25. package/cpp/llama.cpp/build-xcframework.sh +5 -0
  26. package/cpp/llama.cpp/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
  27. package/cpp/llama.cpp/common/CMakeLists.txt +12 -2
  28. package/cpp/llama.cpp/common/arg.cpp +303 -795
  29. package/cpp/llama.cpp/common/arg.h +2 -3
  30. package/cpp/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
  31. package/cpp/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
  32. package/cpp/llama.cpp/common/chat-parser.cpp +156 -15
  33. package/cpp/llama.cpp/common/chat-parser.h +13 -0
  34. package/cpp/llama.cpp/common/chat.cpp +1147 -88
  35. package/cpp/llama.cpp/common/chat.h +16 -3
  36. package/cpp/llama.cpp/common/common.cpp +70 -15
  37. package/cpp/llama.cpp/common/common.h +57 -19
  38. package/cpp/llama.cpp/common/download.cpp +1072 -0
  39. package/cpp/llama.cpp/common/download.h +55 -0
  40. package/cpp/llama.cpp/common/http.h +73 -0
  41. package/cpp/llama.cpp/common/json-partial.cpp +70 -2
  42. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +61 -22
  43. package/cpp/llama.cpp/common/json-schema-to-grammar.h +2 -0
  44. package/cpp/llama.cpp/common/log.cpp +59 -2
  45. package/cpp/llama.cpp/common/log.h +12 -4
  46. package/cpp/llama.cpp/common/sampling.cpp +84 -8
  47. package/cpp/llama.cpp/common/sampling.h +3 -1
  48. package/cpp/llama.cpp/common/speculative.cpp +1 -1
  49. package/cpp/llama.cpp/convert_hf_to_gguf.py +1608 -233
  50. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +6 -1
  51. package/cpp/llama.cpp/convert_lora_to_gguf.py +37 -5
  52. package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -28
  53. package/cpp/llama.cpp/ggml/include/ggml-backend.h +19 -1
  54. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +1 -1
  55. package/cpp/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
  56. package/cpp/llama.cpp/ggml/include/ggml-metal.h +1 -6
  57. package/cpp/llama.cpp/ggml/include/ggml-rpc.h +7 -9
  58. package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +2 -1
  59. package/cpp/llama.cpp/ggml/include/ggml.h +199 -6
  60. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +38 -0
  61. package/cpp/llama.cpp/ggml/src/ggml-alloc.c +299 -130
  62. package/cpp/llama.cpp/ggml/src/ggml-backend-impl.h +4 -4
  63. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +21 -5
  64. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +99 -2
  65. package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  66. package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +1 -0
  67. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +57 -45
  68. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +138 -47
  69. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +1584 -1773
  70. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +201 -317
  71. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +146 -187
  72. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +771 -713
  73. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +135 -77
  74. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
  75. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
  76. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +16 -17
  77. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +318 -145
  78. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  79. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +155 -60
  80. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +8 -8
  81. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
  82. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +14 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -9
  84. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +108 -64
  85. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +14 -4
  86. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +530 -87
  87. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +37 -45
  88. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +349 -127
  89. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +947 -1218
  90. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -4
  91. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +143 -29
  92. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +82 -76
  93. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  94. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  95. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +7 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +233 -28
  100. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +326 -66
  101. package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +12 -3
  102. package/cpp/llama.cpp/ggml/src/ggml-cuda/argsort.cu +102 -6
  103. package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +110 -76
  104. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +167 -38
  105. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +6 -11
  106. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +12 -0
  107. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
  108. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +245 -151
  109. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cuh +1 -5
  110. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +341 -289
  111. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
  112. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cuh +1233 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +6 -6
  115. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
  116. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +123 -220
  117. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +41 -39
  118. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +715 -45
  119. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +150 -0
  120. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cuh +1 -0
  121. package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +321 -24
  122. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +93 -351
  123. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +828 -1
  124. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cu +164 -0
  125. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cuh +5 -0
  126. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +3 -166
  127. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1 -1
  128. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cu +371 -78
  129. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cuh +3 -2
  130. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +279 -147
  131. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cuh +1 -1
  132. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +97 -85
  133. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad.cu +46 -23
  134. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +63 -54
  135. package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +12 -10
  136. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +192 -77
  137. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cuh +2 -0
  138. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +10 -9
  139. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +137 -75
  140. package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cu +39 -0
  141. package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cuh +7 -0
  142. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  143. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  144. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  145. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  146. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  147. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  148. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  149. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  150. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  151. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  152. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  153. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  154. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  155. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  156. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  157. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  158. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  159. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  160. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  161. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  162. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  163. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  164. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  165. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  166. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  167. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  168. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  169. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  170. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  171. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  172. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  173. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  174. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  175. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  176. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  177. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  178. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  179. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  180. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  181. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  182. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  183. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  184. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  185. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  186. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  187. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
  188. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  189. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  190. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  191. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  192. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  193. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  194. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  195. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  196. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  197. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  198. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  199. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  200. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  201. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  202. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  203. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  204. package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +336 -0
  205. package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +16 -0
  206. package/cpp/llama.cpp/ggml/src/ggml-cuda/tsembd.cu +3 -3
  207. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +105 -11
  208. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +36 -0
  209. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +87 -6
  210. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +28 -12
  211. package/cpp/llama.cpp/ggml/src/ggml-hexagon/CMakeLists.txt +68 -0
  212. package/cpp/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3807 -0
  213. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +40 -0
  214. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +442 -0
  215. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
  216. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  217. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +40 -0
  218. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +69 -0
  219. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +119 -0
  220. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +156 -0
  221. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +64 -0
  222. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  223. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-exp.c +93 -0
  224. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.c +60 -0
  225. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
  226. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.c +960 -0
  227. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +1032 -0
  228. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +829 -0
  229. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2223 -0
  230. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
  231. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +418 -0
  232. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
  233. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +255 -0
  234. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
  235. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  236. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.c +448 -0
  237. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +220 -0
  238. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +8 -13
  239. package/cpp/llama.cpp/ggml/src/ggml-impl.h +110 -12
  240. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +6 -5
  241. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  242. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  243. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  244. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.m +599 -0
  245. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1662 -0
  246. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +251 -0
  247. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +1527 -0
  248. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +244 -39
  249. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +3844 -0
  250. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +90 -0
  251. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.cpp +723 -0
  252. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +3453 -1907
  253. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +3 -1
  254. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +10 -0
  255. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1331 -109
  256. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +126 -0
  257. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +31 -4
  258. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +35 -7
  259. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +31 -4
  260. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  261. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  262. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
  263. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  264. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
  265. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
  266. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  267. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  268. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  269. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  270. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  271. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  272. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  273. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
  274. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
  275. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
  276. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +123 -10
  277. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
  278. package/cpp/llama.cpp/ggml/src/ggml-quants.c +1 -0
  279. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +341 -161
  280. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  281. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +6 -5
  282. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +74 -15
  283. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +50 -30
  284. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +10 -4
  285. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +166 -99
  286. package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  287. package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  288. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +72 -94
  289. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +67 -49
  290. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +21 -31
  291. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +252 -316
  292. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +6 -2
  293. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +9 -6
  294. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +359 -142
  295. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  296. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  297. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +80 -60
  298. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +201 -132
  299. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +230 -55
  300. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.hpp +2 -0
  301. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.cpp +97 -0
  302. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.hpp +24 -0
  303. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.cpp +72 -0
  304. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.hpp +8 -0
  305. package/cpp/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  306. package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  307. package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  308. package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.cpp +122 -0
  309. package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.hpp +20 -0
  310. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +50 -41
  311. package/cpp/llama.cpp/ggml/src/ggml-sycl/set.cpp +73 -0
  312. package/cpp/llama.cpp/ggml/src/ggml-sycl/set.hpp +5 -0
  313. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +45 -36
  314. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +330 -165
  315. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +4 -0
  316. package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
  317. package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  318. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +12 -6
  319. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +16 -12
  320. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
  321. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4184 -2159
  322. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  323. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
  324. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
  325. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  326. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
  327. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  328. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
  329. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
  330. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  331. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  332. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
  333. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
  334. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
  335. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
  336. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +53 -30
  337. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
  338. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
  339. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
  340. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +13 -6
  341. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  342. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
  343. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
  344. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
  345. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +138 -2
  346. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
  347. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
  348. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  349. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
  350. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
  351. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
  352. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
  353. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
  354. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
  355. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
  356. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
  357. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
  358. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
  359. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  360. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
  361. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
  362. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
  363. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
  364. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
  365. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
  366. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  367. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
  368. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
  369. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
  370. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -2
  371. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  372. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +52 -14
  373. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +50 -12
  374. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +61 -12
  375. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +54 -12
  376. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +5 -1
  377. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  378. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
  379. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
  380. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
  381. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
  382. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
  383. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
  384. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +10 -2
  385. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
  386. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
  387. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
  388. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
  389. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  390. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  391. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +15 -7
  392. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
  393. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
  394. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
  395. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
  396. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
  397. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +1 -1
  398. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +229 -0
  399. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +33 -0
  400. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +1 -1
  401. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +1 -1
  402. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +1 -1
  403. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +1 -1
  404. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +1 -1
  405. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +1 -1
  406. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +1 -1
  407. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
  408. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
  409. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +3 -5
  410. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +1 -1
  411. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +3 -5
  412. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +3 -5
  413. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +1 -1
  414. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  415. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +106 -634
  416. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +118 -9
  417. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +556 -0
  418. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +70 -0
  419. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +77 -214
  420. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +589 -0
  421. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
  422. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
  423. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  424. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
  425. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
  426. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
  427. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +25 -4
  428. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
  429. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +55 -5
  430. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
  431. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
  432. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
  433. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
  434. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +45 -3
  435. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
  436. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
  437. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
  438. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +227 -0
  439. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
  440. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +5 -52
  441. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +5 -35
  442. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +5 -35
  443. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +27 -0
  444. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +5 -41
  445. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  446. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
  447. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
  448. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
  449. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
  450. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
  451. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
  452. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
  453. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  454. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
  455. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
  456. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
  457. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +140 -0
  458. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  459. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
  460. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +1 -1
  461. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
  462. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
  463. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
  464. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
  465. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +171 -0
  466. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  467. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +79 -29
  468. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -12
  469. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +471 -196
  470. package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +8 -0
  471. package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1690 -383
  472. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
  473. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  474. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  475. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
  476. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +57 -10
  477. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  478. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  479. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +25 -912
  480. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
  481. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
  482. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
  483. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
  484. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  485. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  486. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
  487. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/{set_rows.wgsl → set_rows.tmpl.wgsl} +38 -8
  488. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  489. package/cpp/llama.cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
  490. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +96 -314
  491. package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  492. package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  493. package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
  494. package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
  495. package/cpp/llama.cpp/ggml/src/ggml.c +440 -17
  496. package/cpp/llama.cpp/ggml/src/gguf.cpp +104 -29
  497. package/cpp/llama.cpp/gguf-py/gguf/constants.py +363 -13
  498. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +64 -0
  499. package/cpp/llama.cpp/gguf-py/gguf/lazy.py +8 -3
  500. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +6 -0
  501. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +156 -18
  502. package/cpp/llama.cpp/gguf-py/gguf/utility.py +80 -0
  503. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +4 -4
  504. package/cpp/llama.cpp/include/llama.h +44 -21
  505. package/cpp/llama.cpp/media/llama1-icon-transparent.png +0 -0
  506. package/cpp/llama.cpp/media/llama1-icon-transparent.svg +77 -0
  507. package/cpp/llama.cpp/media/llama1-icon.png +0 -0
  508. package/cpp/llama.cpp/media/llama1-icon.svg +87 -0
  509. package/cpp/llama.cpp/requirements/requirements-all.txt +2 -0
  510. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -3
  511. package/cpp/llama.cpp/requirements/requirements-convert_legacy_llama.txt +3 -1
  512. package/cpp/llama.cpp/requirements/requirements-tool_bench.txt +1 -1
  513. package/cpp/llama.cpp/src/CMakeLists.txt +101 -0
  514. package/cpp/llama.cpp/src/llama-adapter.cpp +33 -0
  515. package/cpp/llama.cpp/src/llama-adapter.h +3 -0
  516. package/cpp/llama.cpp/src/llama-arch.cpp +344 -14
  517. package/cpp/llama.cpp/src/llama-arch.h +50 -0
  518. package/cpp/llama.cpp/src/llama-batch.cpp +63 -31
  519. package/cpp/llama.cpp/src/llama-batch.h +13 -2
  520. package/cpp/llama.cpp/src/llama-chat.cpp +85 -3
  521. package/cpp/llama.cpp/src/llama-chat.h +4 -0
  522. package/cpp/llama.cpp/src/llama-context.cpp +300 -45
  523. package/cpp/llama.cpp/src/llama-context.h +16 -6
  524. package/cpp/llama.cpp/src/llama-cparams.h +2 -1
  525. package/cpp/llama.cpp/src/llama-grammar.cpp +17 -9
  526. package/cpp/llama.cpp/src/llama-graph.cpp +226 -64
  527. package/cpp/llama.cpp/src/llama-graph.h +27 -5
  528. package/cpp/llama.cpp/src/llama-hparams.cpp +53 -2
  529. package/cpp/llama.cpp/src/llama-hparams.h +48 -8
  530. package/cpp/llama.cpp/src/llama-impl.cpp +3 -3
  531. package/cpp/llama.cpp/src/llama-impl.h +2 -0
  532. package/cpp/llama.cpp/src/llama-kv-cache-iswa.cpp +13 -3
  533. package/cpp/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
  534. package/cpp/llama.cpp/src/llama-kv-cache.cpp +120 -62
  535. package/cpp/llama.cpp/src/llama-kv-cache.h +13 -4
  536. package/cpp/llama.cpp/src/llama-kv-cells.h +44 -2
  537. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +19 -9
  538. package/cpp/llama.cpp/src/llama-memory-hybrid.h +2 -0
  539. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +38 -17
  540. package/cpp/llama.cpp/src/llama-memory-recurrent.h +5 -2
  541. package/cpp/llama.cpp/src/llama-memory.h +3 -0
  542. package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
  543. package/cpp/llama.cpp/src/llama-model.cpp +1070 -12614
  544. package/cpp/llama.cpp/src/llama-model.h +40 -4
  545. package/cpp/llama.cpp/src/llama-quant.cpp +14 -6
  546. package/cpp/llama.cpp/src/llama-sampling.cpp +243 -136
  547. package/cpp/llama.cpp/src/llama-vocab.cpp +43 -3
  548. package/cpp/llama.cpp/src/llama-vocab.h +43 -39
  549. package/cpp/llama.cpp/src/llama.cpp +69 -10
  550. package/cpp/llama.cpp/src/models/afmoe.cpp +187 -0
  551. package/cpp/llama.cpp/src/models/apertus.cpp +125 -0
  552. package/cpp/llama.cpp/src/models/arcee.cpp +135 -0
  553. package/cpp/llama.cpp/src/models/arctic.cpp +138 -0
  554. package/cpp/llama.cpp/src/models/arwkv7.cpp +86 -0
  555. package/cpp/llama.cpp/src/models/baichuan.cpp +122 -0
  556. package/cpp/llama.cpp/src/models/bailingmoe.cpp +144 -0
  557. package/cpp/llama.cpp/src/models/bailingmoe2.cpp +135 -0
  558. package/cpp/llama.cpp/src/models/bert.cpp +176 -0
  559. package/cpp/llama.cpp/src/models/bitnet.cpp +160 -0
  560. package/cpp/llama.cpp/src/models/bloom.cpp +101 -0
  561. package/cpp/llama.cpp/src/models/chameleon.cpp +178 -0
  562. package/cpp/llama.cpp/src/models/chatglm.cpp +132 -0
  563. package/cpp/llama.cpp/src/models/codeshell.cpp +111 -0
  564. package/cpp/llama.cpp/src/models/cogvlm.cpp +100 -0
  565. package/cpp/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
  566. package/cpp/llama.cpp/src/models/command-r.cpp +122 -0
  567. package/cpp/llama.cpp/src/models/dbrx.cpp +123 -0
  568. package/cpp/llama.cpp/src/models/deci.cpp +135 -0
  569. package/cpp/llama.cpp/src/models/deepseek.cpp +144 -0
  570. package/cpp/llama.cpp/src/models/deepseek2.cpp +237 -0
  571. package/cpp/llama.cpp/src/models/dots1.cpp +134 -0
  572. package/cpp/llama.cpp/src/models/dream.cpp +105 -0
  573. package/cpp/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
  574. package/cpp/llama.cpp/src/models/ernie4-5.cpp +110 -0
  575. package/cpp/llama.cpp/src/models/exaone.cpp +114 -0
  576. package/cpp/llama.cpp/src/models/exaone4.cpp +123 -0
  577. package/cpp/llama.cpp/src/models/falcon-h1.cpp +113 -0
  578. package/cpp/llama.cpp/src/models/falcon.cpp +120 -0
  579. package/cpp/llama.cpp/src/models/gemma-embedding.cpp +120 -0
  580. package/cpp/llama.cpp/src/models/gemma.cpp +112 -0
  581. package/cpp/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
  582. package/cpp/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
  583. package/cpp/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
  584. package/cpp/llama.cpp/src/models/glm4-moe.cpp +153 -0
  585. package/cpp/llama.cpp/src/models/glm4.cpp +127 -0
  586. package/cpp/llama.cpp/src/models/gpt2.cpp +105 -0
  587. package/cpp/llama.cpp/src/models/gptneox.cpp +144 -0
  588. package/cpp/llama.cpp/src/models/granite-hybrid.cpp +196 -0
  589. package/cpp/llama.cpp/src/models/granite.cpp +211 -0
  590. package/cpp/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
  591. package/cpp/llama.cpp/src/models/grok.cpp +159 -0
  592. package/cpp/llama.cpp/src/models/grovemoe.cpp +141 -0
  593. package/cpp/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
  594. package/cpp/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
  595. package/cpp/llama.cpp/src/models/internlm2.cpp +120 -0
  596. package/cpp/llama.cpp/src/models/jais.cpp +86 -0
  597. package/cpp/llama.cpp/src/models/jamba.cpp +106 -0
  598. package/cpp/llama.cpp/src/models/lfm2.cpp +173 -0
  599. package/cpp/llama.cpp/src/models/llada-moe.cpp +122 -0
  600. package/cpp/llama.cpp/src/models/llada.cpp +99 -0
  601. package/cpp/llama.cpp/src/models/llama-iswa.cpp +174 -0
  602. package/cpp/llama.cpp/src/models/llama.cpp +155 -0
  603. package/cpp/llama.cpp/src/models/mamba.cpp +55 -0
  604. package/cpp/llama.cpp/src/models/minicpm3.cpp +199 -0
  605. package/cpp/llama.cpp/src/models/minimax-m2.cpp +124 -0
  606. package/cpp/llama.cpp/src/models/models.h +485 -0
  607. package/cpp/llama.cpp/src/models/mpt.cpp +126 -0
  608. package/cpp/llama.cpp/src/models/nemotron-h.cpp +121 -0
  609. package/cpp/llama.cpp/src/models/nemotron.cpp +122 -0
  610. package/cpp/llama.cpp/src/models/neo-bert.cpp +104 -0
  611. package/cpp/llama.cpp/src/models/olmo.cpp +121 -0
  612. package/cpp/llama.cpp/src/models/olmo2.cpp +150 -0
  613. package/cpp/llama.cpp/src/models/olmoe.cpp +124 -0
  614. package/cpp/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
  615. package/cpp/llama.cpp/src/models/openelm.cpp +124 -0
  616. package/cpp/llama.cpp/src/models/orion.cpp +123 -0
  617. package/cpp/llama.cpp/src/models/pangu-embedded.cpp +121 -0
  618. package/cpp/llama.cpp/src/models/phi2.cpp +121 -0
  619. package/cpp/llama.cpp/src/models/phi3.cpp +152 -0
  620. package/cpp/llama.cpp/src/models/plamo.cpp +110 -0
  621. package/cpp/llama.cpp/src/models/plamo2.cpp +316 -0
  622. package/cpp/llama.cpp/src/models/plm.cpp +168 -0
  623. package/cpp/llama.cpp/src/models/qwen.cpp +108 -0
  624. package/cpp/llama.cpp/src/models/qwen2.cpp +117 -0
  625. package/cpp/llama.cpp/src/models/qwen2moe.cpp +151 -0
  626. package/cpp/llama.cpp/src/models/qwen2vl.cpp +117 -0
  627. package/cpp/llama.cpp/src/models/qwen3.cpp +117 -0
  628. package/cpp/llama.cpp/src/models/qwen3moe.cpp +124 -0
  629. package/cpp/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
  630. package/cpp/llama.cpp/src/models/qwen3vl.cpp +141 -0
  631. package/cpp/llama.cpp/src/models/refact.cpp +94 -0
  632. package/cpp/llama.cpp/src/models/rwkv6-base.cpp +162 -0
  633. package/cpp/llama.cpp/src/models/rwkv6.cpp +94 -0
  634. package/cpp/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
  635. package/cpp/llama.cpp/src/models/rwkv7-base.cpp +135 -0
  636. package/cpp/llama.cpp/src/models/rwkv7.cpp +90 -0
  637. package/cpp/llama.cpp/src/models/seed-oss.cpp +124 -0
  638. package/cpp/llama.cpp/src/models/smallthinker.cpp +120 -0
  639. package/cpp/llama.cpp/src/models/smollm3.cpp +128 -0
  640. package/cpp/llama.cpp/src/models/stablelm.cpp +146 -0
  641. package/cpp/llama.cpp/src/models/starcoder.cpp +100 -0
  642. package/cpp/llama.cpp/src/models/starcoder2.cpp +121 -0
  643. package/cpp/llama.cpp/src/models/t5-dec.cpp +166 -0
  644. package/cpp/llama.cpp/src/models/t5-enc.cpp +96 -0
  645. package/cpp/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
  646. package/cpp/llama.cpp/src/models/xverse.cpp +108 -0
  647. package/cpp/llama.cpp/src/unicode.cpp +77 -0
  648. package/cpp/llama.cpp/src/unicode.h +43 -0
  649. package/cpp/llama.cpp/vendor/cpp-httplib/CMakeLists.txt +94 -0
  650. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.cpp +9339 -0
  651. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +433 -8222
  652. package/cpp/llama.cpp/vendor/cpp-httplib/patch-boringssl.cmake +6 -0
  653. package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +4179 -1900
  654. package/cpp/llama.cpp/vendor/minja/chat-template.hpp +9 -2
  655. package/cpp/llama.cpp/vendor/minja/minja.hpp +101 -22
  656. package/ios/include/chat.h +16 -3
  657. package/ios/include/common/minja/chat-template.hpp +9 -2
  658. package/ios/include/common/minja/minja.hpp +101 -22
  659. package/ios/include/common.h +57 -19
  660. package/ios/include/json-schema-to-grammar.h +2 -0
  661. package/ios/include/llama.h +44 -21
  662. package/ios/include/log.h +12 -4
  663. package/ios/include/sampling.h +3 -1
  664. package/ios/libs/llama.xcframework/Info.plist +20 -20
  665. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  666. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6399 -5557
  667. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +19 -1
  668. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +1 -1
  669. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-metal.h +1 -6
  670. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +199 -6
  671. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +44 -21
  672. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  673. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  674. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6362 -5520
  675. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4813 -4241
  676. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +19 -1
  677. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +1 -1
  678. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +1 -6
  679. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +199 -6
  680. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +44 -21
  681. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  682. package/package.json +10 -4
  683. package/cpp/llama.cpp/ggml/src/ggml-cann/Doxyfile +0 -2579
  684. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -371
  685. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  686. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -379
  687. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  688. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -495
  689. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -486
  690. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  691. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  692. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  693. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  694. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  695. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  696. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  697. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  698. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  699. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  700. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  701. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  702. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  703. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  704. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  705. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  706. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  707. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  708. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  709. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  710. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  711. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  712. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  713. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  714. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  715. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  716. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  717. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  718. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  719. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  720. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  721. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  722. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  723. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  724. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  725. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  726. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  727. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  728. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  729. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  730. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  731. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  732. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  733. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  734. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  735. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  736. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  737. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  738. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  739. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  740. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  741. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  742. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  743. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  744. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  745. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  746. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  747. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  748. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  749. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  750. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  751. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  752. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  753. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  754. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  755. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  756. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  757. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  758. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  759. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  760. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  761. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  762. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  763. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  764. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  765. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  766. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  767. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  768. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  769. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  770. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  771. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  772. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  773. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  774. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  775. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  776. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +0 -6886
  777. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -154
  778. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
  779. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
  780. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
  781. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +0 -97
  782. package/cpp/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  783. package/cpp/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  784. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  785. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
  786. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
  787. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  788. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
  789. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
  790. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  791. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
  792. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
  793. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  794. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
  795. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
  796. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  797. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
  798. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
  799. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  800. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
  801. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
  802. package/cpp/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  803. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  804. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
  805. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
  806. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  807. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
  808. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
  809. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  810. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
  811. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
  812. package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
  813. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  814. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
  815. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
  816. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  817. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
  818. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
  819. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  820. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
  821. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
  822. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  823. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
  824. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
  825. package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +0 -171
  826. package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r-plus-tool_use.jinja +0 -202
  827. package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja +0 -156
  828. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +0 -124
  829. package/cpp/llama.cpp/models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja +0 -152
  830. package/cpp/llama.cpp/models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja +0 -152
  831. package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +0 -62
  832. package/cpp/llama.cpp/models/templates/Qwen-Qwen2.5-7B-Instruct.jinja +0 -54
  833. package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +0 -85
  834. package/cpp/llama.cpp/models/templates/README.md +0 -25
  835. package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja +0 -1
  836. package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja +0 -1
  837. package/cpp/llama.cpp/models/templates/fireworks-ai-llama-3-firefunction-v2.jinja +0 -57
  838. package/cpp/llama.cpp/models/templates/google-gemma-2-2b-it.jinja +0 -4
  839. package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +0 -59
  840. package/cpp/llama.cpp/models/templates/llama-cpp-deepseek-r1.jinja +0 -76
  841. package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +0 -34
  842. package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.1.jinja +0 -58
  843. package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.2.jinja +0 -287
  844. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja +0 -109
  845. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja +0 -93
  846. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja +0 -109
  847. package/cpp/llama.cpp/models/templates/microsoft-Phi-3.5-mini-instruct.jinja +0 -8
  848. package/cpp/llama.cpp/models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja +0 -87
  849. package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +0 -43
  850. package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +0 -331
  851. package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +0 -105
  852. package/cpp/llama.cpp/prompts/LLM-questions.txt +0 -49
  853. package/cpp/llama.cpp/prompts/alpaca.txt +0 -1
  854. package/cpp/llama.cpp/prompts/assistant.txt +0 -31
  855. package/cpp/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
  856. package/cpp/llama.cpp/prompts/chat-with-bob.txt +0 -7
  857. package/cpp/llama.cpp/prompts/chat-with-qwen.txt +0 -1
  858. package/cpp/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
  859. package/cpp/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
  860. package/cpp/llama.cpp/prompts/chat.txt +0 -28
  861. package/cpp/llama.cpp/prompts/dan-modified.txt +0 -1
  862. package/cpp/llama.cpp/prompts/dan.txt +0 -1
  863. package/cpp/llama.cpp/prompts/mnemonics.txt +0 -93
  864. package/cpp/llama.cpp/prompts/parallel-questions.txt +0 -43
  865. package/cpp/llama.cpp/prompts/reason-act.txt +0 -18
  866. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  867. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  868. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
  869. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4247
  870. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-alloc.h +0 -76
  871. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +0 -354
  872. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-blas.h +0 -25
  873. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +0 -145
  874. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-metal.h +0 -66
  875. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +0 -256
  876. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +0 -2492
  877. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/gguf.h +0 -202
  878. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -1391
  879. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Modules/module.modulemap +0 -17
  880. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Resources/Info.plist +0 -32
  881. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-alloc.h +0 -76
  882. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +0 -354
  883. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-blas.h +0 -25
  884. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +0 -145
  885. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-metal.h +0 -66
  886. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +0 -256
  887. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +0 -2492
  888. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/gguf.h +0 -202
  889. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -1391
  890. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Modules/module.modulemap +0 -17
  891. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Resources/Info.plist +0 -32
  892. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  893. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-alloc.h +0 -76
  894. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +0 -354
  895. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-blas.h +0 -25
  896. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +0 -145
  897. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-metal.h +0 -66
  898. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +0 -256
  899. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +0 -2492
  900. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/gguf.h +0 -202
  901. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -1391
  902. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Modules/module.modulemap +0 -17
  903. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Resources/Info.plist +0 -32
  904. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  905. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  906. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  907. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  908. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5561
  909. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
  910. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +0 -354
  911. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-blas.h +0 -25
  912. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
  913. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-metal.h +0 -66
  914. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +0 -256
  915. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +0 -2492
  916. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/gguf.h +0 -202
  917. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -1391
  918. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Info.plist +0 -35
  919. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Modules/module.modulemap +0 -17
  920. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  921. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  922. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  923. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
  924. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4246
  925. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
  926. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
  927. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
  928. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
  929. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
  930. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
  931. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
  932. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
  933. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
  934. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Info.plist +0 -35
  935. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
  936. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  937. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  938. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  939. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5558
  940. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
  941. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +0 -354
  942. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-blas.h +0 -25
  943. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
  944. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-metal.h +0 -66
  945. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +0 -256
  946. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +0 -2492
  947. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/gguf.h +0 -202
  948. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -1391
  949. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Info.plist +0 -32
  950. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Modules/module.modulemap +0 -17
  951. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  952. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  953. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  954. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5520
  955. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4243
  956. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
  957. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
  958. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
  959. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
  960. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
  961. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
  962. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
  963. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
  964. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
  965. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Info.plist +0 -32
  966. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
  967. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  968. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
  969. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
  970. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
  971. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
  972. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +0 -0
  973. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +0 -0
  974. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
  975. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
  976. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
@@ -2,10 +2,10 @@
2
2
 
3
3
  #include "chat.h"
4
4
  #include "common.h"
5
- #include "gguf.h" // for reading GGUF splits
6
5
  #include "json-schema-to-grammar.h"
7
6
  #include "log.h"
8
7
  #include "sampling.h"
8
+ #include "download.h"
9
9
 
10
10
  // fix problem with std::min and std::max
11
11
  #if defined(_WIN32)
@@ -22,26 +22,30 @@
22
22
  #include <algorithm>
23
23
  #include <climits>
24
24
  #include <cstdarg>
25
- #include <filesystem>
26
25
  #include <fstream>
27
26
  #include <list>
28
27
  #include <regex>
29
28
  #include <set>
30
29
  #include <string>
31
- #include <thread>
30
+ #include <thread> // for hardware_concurrency
32
31
  #include <vector>
33
32
 
34
- //#define LLAMA_USE_CURL
35
-
36
- #if defined(LLAMA_USE_CURL)
37
- #include <curl/curl.h>
38
- #include <curl/easy.h>
39
- #include <future>
33
+ #ifdef __linux__
34
+ #include <linux/limits.h>
35
+ #elif defined(_WIN32)
36
+ # if !defined(PATH_MAX)
37
+ # define PATH_MAX MAX_PATH
38
+ # endif
39
+ #elif defined(_AIX)
40
+ #include <sys/limits.h>
41
+ #else
42
+ #include <sys/syslimits.h>
40
43
  #endif
44
+ #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
41
45
 
42
46
  using json = nlohmann::ordered_json;
43
47
 
44
- std::initializer_list<enum llama_example> mmproj_examples = {
48
+ static std::initializer_list<enum llama_example> mmproj_examples = {
45
49
  LLAMA_EXAMPLE_MTMD,
46
50
  LLAMA_EXAMPLE_SERVER,
47
51
  };
@@ -56,22 +60,13 @@ static std::string read_file(const std::string & fname) {
56
60
  return content;
57
61
  }
58
62
 
59
- static void write_file(const std::string & fname, const std::string & content) {
60
- std::ofstream file(fname);
61
- if (!file) {
62
- throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
63
- }
64
- file << content;
65
- file.close();
66
- }
67
-
68
63
  common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
69
- this->examples = std::move(examples);
64
+ this->examples = examples;
70
65
  return *this;
71
66
  }
72
67
 
73
68
  common_arg & common_arg::set_excludes(std::initializer_list<enum llama_example> excludes) {
74
- this->excludes = std::move(excludes);
69
+ this->excludes = excludes;
75
70
  return *this;
76
71
  }
77
72
 
@@ -94,7 +89,7 @@ bool common_arg::is_exclude(enum llama_example ex) {
94
89
  return excludes.find(ex) != excludes.end();
95
90
  }
96
91
 
97
- bool common_arg::get_value_from_env(std::string & output) {
92
+ bool common_arg::get_value_from_env(std::string & output) const {
98
93
  if (env == nullptr) return false;
99
94
  char * value = std::getenv(env);
100
95
  if (value) {
@@ -104,7 +99,7 @@ bool common_arg::get_value_from_env(std::string & output) {
104
99
  return false;
105
100
  }
106
101
 
107
- bool common_arg::has_value_from_env() {
102
+ bool common_arg::has_value_from_env() const {
108
103
  return env != nullptr && std::getenv(env);
109
104
  }
110
105
 
@@ -172,579 +167,6 @@ std::string common_arg::to_string() {
172
167
  return ss.str();
173
168
  }
174
169
 
175
- //
176
- // downloader
177
- //
178
-
179
- struct common_hf_file_res {
180
- std::string repo; // repo name with ":tag" removed
181
- std::string ggufFile;
182
- std::string mmprojFile;
183
- };
184
-
185
- #ifdef LLAMA_USE_CURL
186
-
187
- bool common_has_curl() {
188
- return true;
189
- }
190
-
191
- #ifdef __linux__
192
- #include <linux/limits.h>
193
- #elif defined(_WIN32)
194
- # if !defined(PATH_MAX)
195
- # define PATH_MAX MAX_PATH
196
- # endif
197
- #elif defined(_AIX)
198
- #include <sys/limits.h>
199
- #else
200
- #include <sys/syslimits.h>
201
- #endif
202
- #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
203
-
204
- //
205
- // CURL utils
206
- //
207
-
208
- using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
209
-
210
- // cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
211
- struct curl_slist_ptr {
212
- struct curl_slist * ptr = nullptr;
213
- ~curl_slist_ptr() {
214
- if (ptr) {
215
- curl_slist_free_all(ptr);
216
- }
217
- }
218
- };
219
-
220
- #define CURL_MAX_RETRY 3
221
- #define CURL_RETRY_DELAY_SECONDS 2
222
-
223
- static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds, const char * method_name) {
224
- int remaining_attempts = max_attempts;
225
-
226
- while (remaining_attempts > 0) {
227
- LOG_INF("%s: %s %s (attempt %d of %d)...\n", __func__ , method_name, url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
228
-
229
- CURLcode res = curl_easy_perform(curl);
230
- if (res == CURLE_OK) {
231
- return true;
232
- }
233
-
234
- int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
235
- LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
236
-
237
- remaining_attempts--;
238
- if (remaining_attempts == 0) break;
239
- std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
240
- }
241
-
242
- LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
243
-
244
- return false;
245
- }
246
-
247
- // download one single file from remote URL to local path
248
- static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) {
249
- // Check if the file already exists locally
250
- auto file_exists = std::filesystem::exists(path);
251
-
252
- // If the file exists, check its JSON metadata companion file.
253
- std::string metadata_path = path + ".json";
254
- nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
255
- std::string etag;
256
- std::string last_modified;
257
-
258
- if (file_exists) {
259
- if (offline) {
260
- LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
261
- return true; // skip verification/downloading
262
- }
263
- // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
264
- std::ifstream metadata_in(metadata_path);
265
- if (metadata_in.good()) {
266
- try {
267
- metadata_in >> metadata;
268
- LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
269
- if (metadata.contains("etag") && metadata.at("etag").is_string()) {
270
- etag = metadata.at("etag");
271
- }
272
- if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
273
- last_modified = metadata.at("lastModified");
274
- }
275
- } catch (const nlohmann::json::exception & e) {
276
- LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
277
- }
278
- }
279
- // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
280
- } else {
281
- if (offline) {
282
- LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
283
- return false;
284
- }
285
- LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
286
- }
287
-
288
- // Send a HEAD request to retrieve the etag and last-modified headers
289
- struct common_load_model_from_url_headers {
290
- std::string etag;
291
- std::string last_modified;
292
- };
293
-
294
- common_load_model_from_url_headers headers;
295
- bool head_request_ok = false;
296
- bool should_download = !file_exists; // by default, we should download if the file does not exist
297
-
298
- // Initialize libcurl
299
- curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
300
- curl_slist_ptr http_headers;
301
- if (!curl) {
302
- LOG_ERR("%s: error initializing libcurl\n", __func__);
303
- return false;
304
- }
305
-
306
- // Set the URL, allow to follow http redirection
307
- curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
308
- curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
309
-
310
- http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
311
- // Check if hf-token or bearer-token was specified
312
- if (!bearer_token.empty()) {
313
- std::string auth_header = "Authorization: Bearer " + bearer_token;
314
- http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
315
- }
316
- curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
317
-
318
- #if defined(_WIN32)
319
- // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
320
- // operating system. Currently implemented under MS-Windows.
321
- curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
322
- #endif
323
-
324
- typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
325
- auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
326
- common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
327
-
328
- static std::regex header_regex("([^:]+): (.*)\r\n");
329
- static std::regex etag_regex("ETag", std::regex_constants::icase);
330
- static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
331
-
332
- std::string header(buffer, n_items);
333
- std::smatch match;
334
- if (std::regex_match(header, match, header_regex)) {
335
- const std::string & key = match[1];
336
- const std::string & value = match[2];
337
- if (std::regex_match(key, match, etag_regex)) {
338
- headers->etag = value;
339
- } else if (std::regex_match(key, match, last_modified_regex)) {
340
- headers->last_modified = value;
341
- }
342
- }
343
- return n_items;
344
- };
345
-
346
- curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
347
- curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
348
- curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
349
- curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
350
-
351
- // we only allow retrying once for HEAD requests
352
- // this is for the use case of using running offline (no internet), retrying can be annoying
353
- bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
354
- if (!was_perform_successful) {
355
- head_request_ok = false;
356
- }
357
-
358
- long http_code = 0;
359
- curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
360
- if (http_code == 200) {
361
- head_request_ok = true;
362
- } else {
363
- LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
364
- head_request_ok = false;
365
- }
366
-
367
- // if head_request_ok is false, we don't have the etag or last-modified headers
368
- // we leave should_download as-is, which is true if the file does not exist
369
- if (head_request_ok) {
370
- // check if ETag or Last-Modified headers are different
371
- // if it is, we need to download the file again
372
- if (!etag.empty() && etag != headers.etag) {
373
- LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
374
- should_download = true;
375
- } else if (!last_modified.empty() && last_modified != headers.last_modified) {
376
- LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
377
- should_download = true;
378
- }
379
- }
380
-
381
- if (should_download) {
382
- std::string path_temporary = path + ".downloadInProgress";
383
- if (file_exists) {
384
- LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
385
- if (remove(path.c_str()) != 0) {
386
- LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
387
- return false;
388
- }
389
- }
390
-
391
- // Set the output file
392
-
393
- struct FILE_deleter {
394
- void operator()(FILE * f) const {
395
- fclose(f);
396
- }
397
- };
398
-
399
- std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
400
- if (!outfile) {
401
- LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
402
- return false;
403
- }
404
-
405
- typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
406
- auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
407
- return fwrite(data, size, nmemb, (FILE *)fd);
408
- };
409
- curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
410
- curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
411
- curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
412
-
413
- // display download progress
414
- curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
415
-
416
- // helper function to hide password in URL
417
- auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
418
- std::size_t protocol_pos = url.find("://");
419
- if (protocol_pos == std::string::npos) {
420
- return url; // Malformed URL
421
- }
422
-
423
- std::size_t at_pos = url.find('@', protocol_pos + 3);
424
- if (at_pos == std::string::npos) {
425
- return url; // No password in URL
426
- }
427
-
428
- return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
429
- };
430
-
431
- // start the download
432
- LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
433
- llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
434
- bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS, "GET");
435
- if (!was_perform_successful) {
436
- return false;
437
- }
438
-
439
- long http_code = 0;
440
- curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
441
- if (http_code < 200 || http_code >= 400) {
442
- LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
443
- return false;
444
- }
445
-
446
- // Causes file to be closed explicitly here before we rename it.
447
- outfile.reset();
448
-
449
- // Write the updated JSON metadata file.
450
- metadata.update({
451
- {"url", url},
452
- {"etag", headers.etag},
453
- {"lastModified", headers.last_modified}
454
- });
455
- write_file(metadata_path, metadata.dump(4));
456
- LOG_DBG("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
457
-
458
- if (rename(path_temporary.c_str(), path.c_str()) != 0) {
459
- LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
460
- return false;
461
- }
462
- } else {
463
- LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
464
- }
465
-
466
- return true;
467
- }
468
-
469
- // download multiple files from remote URLs to local paths
470
- // the input is a vector of pairs <url, path>
471
- static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
472
- // Prepare download in parallel
473
- std::vector<std::future<bool>> futures_download;
474
- for (auto const & item : urls) {
475
- futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair<std::string, std::string> & it) -> bool {
476
- return common_download_file_single(it.first, it.second, bearer_token, offline);
477
- }, item));
478
- }
479
-
480
- // Wait for all downloads to complete
481
- for (auto & f : futures_download) {
482
- if (!f.get()) {
483
- return false;
484
- }
485
- }
486
-
487
- return true;
488
- }
489
-
490
- static bool common_download_model(
491
- const common_params_model & model,
492
- const std::string & bearer_token,
493
- bool offline) {
494
- // Basic validation of the model.url
495
- if (model.url.empty()) {
496
- LOG_ERR("%s: invalid model url\n", __func__);
497
- return false;
498
- }
499
-
500
- if (!common_download_file_single(model.url, model.path, bearer_token, offline)) {
501
- return false;
502
- }
503
-
504
- // check for additional GGUFs split to download
505
- int n_split = 0;
506
- {
507
- struct gguf_init_params gguf_params = {
508
- /*.no_alloc = */ true,
509
- /*.ctx = */ NULL,
510
- };
511
- auto * ctx_gguf = gguf_init_from_file(model.path.c_str(), gguf_params);
512
- if (!ctx_gguf) {
513
- LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, model.path.c_str());
514
- return false;
515
- }
516
-
517
- auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
518
- if (key_n_split >= 0) {
519
- n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
520
- }
521
-
522
- gguf_free(ctx_gguf);
523
- }
524
-
525
- if (n_split > 1) {
526
- char split_prefix[PATH_MAX] = {0};
527
- char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
528
-
529
- // Verify the first split file format
530
- // and extract split URL and PATH prefixes
531
- {
532
- if (!llama_split_prefix(split_prefix, sizeof(split_prefix), model.path.c_str(), 0, n_split)) {
533
- LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, model.path.c_str(), n_split);
534
- return false;
535
- }
536
-
537
- if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model.url.c_str(), 0, n_split)) {
538
- LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model.url.c_str(), n_split);
539
- return false;
540
- }
541
- }
542
-
543
- std::vector<std::pair<std::string, std::string>> urls;
544
- for (int idx = 1; idx < n_split; idx++) {
545
- char split_path[PATH_MAX] = {0};
546
- llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
547
-
548
- char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
549
- llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split);
550
-
551
- if (std::string(split_path) == model.path) {
552
- continue; // skip the already downloaded file
553
- }
554
-
555
- urls.push_back({split_url, split_path});
556
- }
557
-
558
- // Download in parallel
559
- common_download_file_multiple(urls, bearer_token, offline);
560
- }
561
-
562
- return true;
563
- }
564
-
565
- std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
566
- curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
567
- curl_slist_ptr http_headers;
568
- std::vector<char> res_buffer;
569
-
570
- curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
571
- curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
572
- curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
573
- typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
574
- auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
575
- auto data_vec = static_cast<std::vector<char> *>(data);
576
- data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb);
577
- return size * nmemb;
578
- };
579
- curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
580
- curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer);
581
- #if defined(_WIN32)
582
- curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
583
- #endif
584
- if (params.timeout > 0) {
585
- curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout);
586
- }
587
- if (params.max_size > 0) {
588
- curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
589
- }
590
- http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
591
- for (const auto & header : params.headers) {
592
- http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str());
593
- }
594
- curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
595
-
596
- CURLcode res = curl_easy_perform(curl.get());
597
-
598
- if (res != CURLE_OK) {
599
- std::string error_msg = curl_easy_strerror(res);
600
- throw std::runtime_error("error: cannot make GET request: " + error_msg);
601
- }
602
-
603
- long res_code;
604
- curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
605
-
606
- return { res_code, std::move(res_buffer) };
607
- }
608
-
609
- /**
610
- * Allow getting the HF file from the HF repo with tag (like ollama), for example:
611
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
612
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
613
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
614
- * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
615
- *
616
- * Return pair of <repo, file> (with "repo" already having tag removed)
617
- *
618
- * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
619
- */
620
- static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token, bool offline) {
621
- auto parts = string_split<std::string>(hf_repo_with_tag, ':');
622
- std::string tag = parts.size() > 1 ? parts.back() : "latest";
623
- std::string hf_repo = parts[0];
624
- if (string_split<std::string>(hf_repo, '/').size() != 2) {
625
- throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
626
- }
627
-
628
- std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag;
629
-
630
- // headers
631
- std::vector<std::string> headers;
632
- headers.push_back("Accept: application/json");
633
- if (!bearer_token.empty()) {
634
- headers.push_back("Authorization: Bearer " + bearer_token);
635
- }
636
- // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
637
- // User-Agent header is already set in common_remote_get_content, no need to set it here
638
-
639
- // we use "=" to avoid clashing with other component, while still being allowed on windows
640
- std::string cached_response_fname = "manifest=" + hf_repo + "=" + tag + ".json";
641
- string_replace_all(cached_response_fname, "/", "_");
642
- std::string cached_response_path = fs_get_cache_file(cached_response_fname);
643
-
644
- // make the request
645
- common_remote_params params;
646
- params.headers = headers;
647
- long res_code = 0;
648
- std::string res_str;
649
- bool use_cache = false;
650
- if (!offline) {
651
- try {
652
- auto res = common_remote_get_content(url, params);
653
- res_code = res.first;
654
- res_str = std::string(res.second.data(), res.second.size());
655
- } catch (const std::exception & e) {
656
- LOG_WRN("error: failed to get manifest at %s: %s\n", url.c_str(), e.what());
657
- }
658
- }
659
- if (res_code == 0) {
660
- if (std::filesystem::exists(cached_response_path)) {
661
- LOG_WRN("trying to read manifest from cache: %s\n", cached_response_path.c_str());
662
- res_str = read_file(cached_response_path);
663
- res_code = 200;
664
- use_cache = true;
665
- } else {
666
- throw std::runtime_error(
667
- offline ? "error: failed to get manifest (offline mode)"
668
- : "error: failed to get manifest (check your internet connection)");
669
- }
670
- }
671
- std::string ggufFile;
672
- std::string mmprojFile;
673
-
674
- if (res_code == 200 || res_code == 304) {
675
- // extract ggufFile.rfilename in json, using regex
676
- {
677
- std::regex pattern("\"ggufFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
678
- std::smatch match;
679
- if (std::regex_search(res_str, match, pattern)) {
680
- ggufFile = match[1].str();
681
- }
682
- }
683
- // extract mmprojFile.rfilename in json, using regex
684
- {
685
- std::regex pattern("\"mmprojFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
686
- std::smatch match;
687
- if (std::regex_search(res_str, match, pattern)) {
688
- mmprojFile = match[1].str();
689
- }
690
- }
691
- if (!use_cache) {
692
- // if not using cached response, update the cache file
693
- write_file(cached_response_path, res_str);
694
- }
695
- } else if (res_code == 401) {
696
- throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
697
- } else {
698
- throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
699
- }
700
-
701
- // check response
702
- if (ggufFile.empty()) {
703
- throw std::runtime_error("error: model does not have ggufFile");
704
- }
705
-
706
- return { hf_repo, ggufFile, mmprojFile };
707
- }
708
-
709
- #else
710
-
711
- bool common_has_curl() {
712
- return false;
713
- }
714
-
715
- static bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
716
- LOG_ERR("error: built without CURL, cannot download model from internet\n");
717
- return false;
718
- }
719
-
720
- static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
721
- LOG_ERR("error: built without CURL, cannot download model from the internet\n");
722
- return false;
723
- }
724
-
725
- static bool common_download_model(
726
- const common_params_model &,
727
- const std::string &,
728
- bool) {
729
- LOG_ERR("error: built without CURL, cannot download model from the internet\n");
730
- return false;
731
- }
732
-
733
- static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) {
734
- LOG_ERR("error: built without CURL, cannot download model from the internet\n");
735
- return {};
736
- }
737
-
738
- std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params &) {
739
- if (!url.empty()) {
740
- throw std::runtime_error("error: built without CURL, cannot download model from the internet");
741
- }
742
-
743
- return {};
744
- }
745
-
746
- #endif // LLAMA_USE_CURL
747
-
748
170
  //
749
171
  // utils
750
172
  //
@@ -795,7 +217,9 @@ static handle_model_result common_params_handle_model(
795
217
  handle_model_result result;
796
218
  // handle pre-fill default model path and url based on hf_repo and hf_file
797
219
  {
798
- if (!model.hf_repo.empty()) {
220
+ if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
221
+ model.path = common_docker_resolve_model(model.docker_repo);
222
+ } else if (!model.hf_repo.empty()) {
799
223
  // short-hand to avoid specifying --hf-file -> default it to --model
800
224
  if (model.hf_file.empty()) {
801
225
  if (model.path.empty()) {
@@ -884,8 +308,6 @@ static std::string get_all_kv_cache_types() {
884
308
  //
885
309
 
886
310
  static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
887
- std::string arg;
888
- const std::string arg_prefix = "--";
889
311
  common_params & params = ctx_arg.params;
890
312
 
891
313
  std::unordered_map<std::string, common_arg *> arg_to_options;
@@ -1184,7 +606,7 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
1184
606
  } else {
1185
607
  for (const auto & device : dev_names) {
1186
608
  auto * dev = ggml_backend_dev_by_name(device.c_str());
1187
- if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
609
+ if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
1188
610
  throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
1189
611
  }
1190
612
  devices.push_back(dev);
@@ -1194,7 +616,7 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
1194
616
  return devices;
1195
617
  }
1196
618
 
1197
- static void add_rpc_devices(std::string servers) {
619
+ static void add_rpc_devices(const std::string & servers) {
1198
620
  auto rpc_servers = string_split<std::string>(servers, ',');
1199
621
  if (rpc_servers.empty()) {
1200
622
  throw std::invalid_argument("no RPC servers specified");
@@ -1203,18 +625,14 @@ static void add_rpc_devices(std::string servers) {
1203
625
  if (!rpc_reg) {
1204
626
  throw std::invalid_argument("failed to find RPC backend");
1205
627
  }
1206
- typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
1207
- ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
1208
- if (!ggml_backend_rpc_add_device_fn) {
1209
- throw std::invalid_argument("failed to find RPC device add function");
628
+ typedef ggml_backend_reg_t (*ggml_backend_rpc_add_server_t)(const char * endpoint);
629
+ ggml_backend_rpc_add_server_t ggml_backend_rpc_add_server_fn = (ggml_backend_rpc_add_server_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_server");
630
+ if (!ggml_backend_rpc_add_server_fn) {
631
+ throw std::invalid_argument("failed to find RPC add server function");
1210
632
  }
1211
633
  for (const auto & server : rpc_servers) {
1212
- ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
1213
- if (dev) {
1214
- ggml_backend_device_register(dev);
1215
- } else {
1216
- throw std::invalid_argument("failed to register RPC device");
1217
- }
634
+ auto reg = ggml_backend_rpc_add_server_fn(server.c_str());
635
+ ggml_backend_register(reg);
1218
636
  }
1219
637
  }
1220
638
 
@@ -1263,6 +681,18 @@ static std::string list_builtin_chat_templates() {
1263
681
  return msg.str();
1264
682
  }
1265
683
 
684
+ static bool is_truthy(const std::string & value) {
685
+ return value == "on" || value == "enabled" || value == "1";
686
+ }
687
+
688
+ static bool is_falsey(const std::string & value) {
689
+ return value == "off" || value == "disabled" || value == "0";
690
+ }
691
+
692
+ static bool is_autoy(const std::string & value) {
693
+ return value == "auto" || value == "-1";
694
+ }
695
+
1266
696
  common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
1267
697
  // load dynamic backends
1268
698
  ggml_backend_load_all();
@@ -1310,6 +740,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1310
740
  exit(0);
1311
741
  }
1312
742
  ));
743
+ add_opt(common_arg(
744
+ {"-cl", "--cache-list"},
745
+ "show list of models in cache",
746
+ [](common_params &) {
747
+ printf("model cache directory: %s\n", fs_get_cache_directory().c_str());
748
+ auto models = common_list_cached_models();
749
+ printf("number of models in cache: %zu\n", models.size());
750
+ for (size_t i = 0; i < models.size(); i++) {
751
+ auto & model = models[i];
752
+ printf("%4d. %s\n", (int) i + 1, model.to_string().c_str());
753
+ }
754
+ exit(0);
755
+ }
756
+ ));
1313
757
  add_opt(common_arg(
1314
758
  {"--completion-bash"},
1315
759
  "print source-able bash completion script for llama.cpp",
@@ -1340,7 +784,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1340
784
  ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
1341
785
  add_opt(common_arg(
1342
786
  {"-t", "--threads"}, "N",
1343
- string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
787
+ string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads),
1344
788
  [](common_params & params, int value) {
1345
789
  params.cpuparams.n_threads = value;
1346
790
  if (params.cpuparams.n_threads <= 0) {
@@ -1508,13 +952,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1508
952
  }
1509
953
  ).set_env("LLAMA_ARG_SWA_FULL"));
1510
954
  add_opt(common_arg(
1511
- {"--swa-checkpoints"}, "N",
1512
- string_format("max number of SWA checkpoints per slot to create (default: %d)\n"
1513
- "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_swa_checkpoints),
955
+ {"--ctx-checkpoints", "--swa-checkpoints"}, "N",
956
+ string_format("max number of context checkpoints to create per slot (default: %d)\n"
957
+ "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
958
+ [](common_params & params, int value) {
959
+ params.n_ctx_checkpoints = value;
960
+ }
961
+ ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
962
+ add_opt(common_arg(
963
+ {"--cache-ram", "-cram"}, "N",
964
+ string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
965
+ "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
1514
966
  [](common_params & params, int value) {
1515
- params.n_swa_checkpoints = value;
967
+ params.cache_ram_mib = value;
1516
968
  }
1517
- ).set_env("LLAMA_ARG_SWA_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
969
+ ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER}));
1518
970
  add_opt(common_arg(
1519
971
  {"--kv-unified", "-kvu"},
1520
972
  string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
@@ -1544,13 +996,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1544
996
  params.n_chunks = value;
1545
997
  }
1546
998
  ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
1547
- add_opt(common_arg(
1548
- {"-fa", "--flash-attn"},
1549
- string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
1550
- [](common_params & params) {
1551
- params.flash_attn = true;
1552
- }
1553
- ).set_env("LLAMA_ARG_FLASH_ATTN"));
999
+ add_opt(common_arg({ "-fa", "--flash-attn" }, "[on|off|auto]",
1000
+ string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')",
1001
+ llama_flash_attn_type_name(params.flash_attn_type)),
1002
+ [](common_params & params, const std::string & value) {
1003
+ if (is_truthy(value)) {
1004
+ params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
1005
+ } else if (is_falsey(value)) {
1006
+ params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
1007
+ } else if (is_autoy(value)) {
1008
+ params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
1009
+ } else {
1010
+ throw std::runtime_error(
1011
+ string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
1012
+ }
1013
+ }).set_env("LLAMA_ARG_FLASH_ATTN"));
1554
1014
  add_opt(common_arg(
1555
1015
  {"-p", "--prompt"}, "PROMPT",
1556
1016
  "prompt to start generation with; for system message, use -sys",
@@ -1564,7 +1024,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1564
1024
  [](common_params & params, const std::string & value) {
1565
1025
  params.system_prompt = value;
1566
1026
  }
1567
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
1027
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
1568
1028
  add_opt(common_arg(
1569
1029
  {"--no-perf"},
1570
1030
  string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -1594,7 +1054,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1594
1054
  params.system_prompt.pop_back();
1595
1055
  }
1596
1056
  }
1597
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
1057
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
1598
1058
  add_opt(common_arg(
1599
1059
  {"--in-file"}, "FNAME",
1600
1060
  "an input file (repeat to specify multiple files)",
@@ -2156,6 +1616,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2156
1616
  params.no_extra_bufts = true;
2157
1617
  }
2158
1618
  ).set_env("LLAMA_ARG_NO_REPACK"));
1619
+ add_opt(common_arg(
1620
+ {"--no-host"},
1621
+ "bypass host buffer allowing extra buffers to be used",
1622
+ [](common_params & params) {
1623
+ params.no_host = true;
1624
+ }
1625
+ ).set_env("LLAMA_ARG_NO_HOST"));
2159
1626
  add_opt(common_arg(
2160
1627
  {"-ctk", "--cache-type-k"}, "TYPE",
2161
1628
  string_format(
@@ -2325,6 +1792,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2325
1792
  params.image.emplace_back(value);
2326
1793
  }
2327
1794
  ).set_examples({LLAMA_EXAMPLE_MTMD}));
1795
+ add_opt(common_arg(
1796
+ {"--image-min-tokens"}, "N",
1797
+ "minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
1798
+ [](common_params & params, int value) {
1799
+ params.image_min_tokens = value;
1800
+ }
1801
+ ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MIN_TOKENS"));
1802
+ add_opt(common_arg(
1803
+ {"--image-max-tokens"}, "N",
1804
+ "maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
1805
+ [](common_params & params, int value) {
1806
+ params.image_max_tokens = value;
1807
+ }
1808
+ ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
2328
1809
  if (llama_supports_rpc()) {
2329
1810
  add_opt(common_arg(
2330
1811
  {"--rpc"}, "SERVERS",
@@ -2376,24 +1857,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2376
1857
  {"--list-devices"},
2377
1858
  "print list of available devices and exit",
2378
1859
  [](common_params &) {
2379
- std::vector<ggml_backend_dev_t> rpc_devices;
2380
- std::vector<ggml_backend_dev_t> all_devices;
1860
+ std::vector<ggml_backend_dev_t> devices;
2381
1861
  for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
2382
1862
  auto * dev = ggml_backend_dev_get(i);
2383
- if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
2384
- ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
2385
- if (ggml_backend_reg_name(reg) == std::string("RPC")) {
2386
- rpc_devices.push_back(dev);
2387
- } else {
2388
- all_devices.push_back(dev);
2389
- }
1863
+ if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
1864
+ devices.push_back(dev);
2390
1865
  }
2391
1866
  }
2392
- // insert RPC devices in front
2393
- all_devices.insert(all_devices.begin(), rpc_devices.begin(), rpc_devices.end());
2394
1867
  printf("Available devices:\n");
2395
- for (size_t i = 0; i < all_devices.size(); ++i) {
2396
- auto * dev = all_devices[i];
1868
+ for (auto * dev : devices) {
2397
1869
  size_t free, total;
2398
1870
  ggml_backend_dev_memory(dev, &free, &total);
2399
1871
  printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
@@ -2417,7 +1889,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2417
1889
  {"--cpu-moe", "-cmoe"},
2418
1890
  "keep all Mixture of Experts (MoE) weights in the CPU",
2419
1891
  [](common_params & params) {
2420
- params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
1892
+ params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
2421
1893
  }
2422
1894
  ).set_env("LLAMA_ARG_CPU_MOE"));
2423
1895
  add_opt(common_arg(
@@ -2430,7 +1902,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2430
1902
  for (int i = 0; i < value; ++i) {
2431
1903
  // keep strings alive and avoid leaking memory by storing them in a static vector
2432
1904
  static std::list<std::string> buft_overrides;
2433
- buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
1905
+ buft_overrides.push_back(llm_ffn_exps_block_regex(i));
2434
1906
  params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
2435
1907
  }
2436
1908
  }
@@ -2439,7 +1911,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2439
1911
  {"--cpu-moe-draft", "-cmoed"},
2440
1912
  "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
2441
1913
  [](common_params & params) {
2442
- params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
1914
+ params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
2443
1915
  }
2444
1916
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
2445
1917
  add_opt(common_arg(
@@ -2451,14 +1923,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2451
1923
  }
2452
1924
  for (int i = 0; i < value; ++i) {
2453
1925
  static std::list<std::string> buft_overrides_draft;
2454
- buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
1926
+ buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i));
2455
1927
  params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
2456
1928
  }
2457
1929
  }
2458
1930
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
2459
1931
  add_opt(common_arg(
2460
1932
  {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
2461
- "number of layers to store in VRAM",
1933
+ string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
2462
1934
  [](common_params & params, int value) {
2463
1935
  params.n_gpu_layers = value;
2464
1936
  if (!llama_supports_gpu_offload()) {
@@ -2616,6 +2088,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2616
2088
  params.model.url = value;
2617
2089
  }
2618
2090
  ).set_env("LLAMA_ARG_MODEL_URL"));
2091
+ add_opt(common_arg(
2092
+ { "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
2093
+ "Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n"
2094
+ "example: gemma3\n"
2095
+ "(default: unused)",
2096
+ [](common_params & params, const std::string & value) {
2097
+ params.model.docker_repo = value;
2098
+ }
2099
+ ).set_env("LLAMA_ARG_DOCKER_REPO"));
2619
2100
  add_opt(common_arg(
2620
2101
  {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
2621
2102
  "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
@@ -2760,7 +2241,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2760
2241
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2761
2242
  add_opt(common_arg(
2762
2243
  {"--parse-special"},
2763
- string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
2244
+ string_format("parse special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
2764
2245
  [](common_params & params) {
2765
2246
  params.parse_special = true;
2766
2247
  }
@@ -2772,6 +2253,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2772
2253
  params.is_pp_shared = true;
2773
2254
  }
2774
2255
  ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
2256
+ add_opt(common_arg(
2257
+ {"-tgs"},
2258
+ string_format("is the text generation separated across the different sequences (default: %s)", params.is_tg_separate ? "true" : "false"),
2259
+ [](common_params & params) {
2260
+ params.is_tg_separate = true;
2261
+ }
2262
+ ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
2775
2263
  add_opt(common_arg(
2776
2264
  {"-npp"}, "n0,n1,...",
2777
2265
  "number of prompt tokens",
@@ -2805,7 +2293,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2805
2293
  ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
2806
2294
  add_opt(common_arg(
2807
2295
  {"--embd-output-format"}, "FORMAT",
2808
- "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix",
2296
+ "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
2809
2297
  [](common_params & params, const std::string & value) {
2810
2298
  params.embd_out = value;
2811
2299
  }
@@ -2915,7 +2403,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2915
2403
  add_opt(common_arg(
2916
2404
  {"--chat-template-kwargs"}, "STRING",
2917
2405
  string_format("sets additional params for the json template parser"),
2918
- [](common_params & params, const std::string & value) {
2406
+ [](common_params & params, const std::string & value) {
2919
2407
  auto parsed = json::parse(value);
2920
2408
  for (const auto & item : parsed.items()) {
2921
2409
  params.default_template_kwargs[item.key()] = item.value().dump();
@@ -2954,13 +2442,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2954
2442
  params.endpoint_metrics = true;
2955
2443
  }
2956
2444
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
2957
- add_opt(common_arg(
2958
- {"--slots"},
2959
- string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
2960
- [](common_params & params) {
2961
- params.endpoint_slots = true;
2962
- }
2963
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
2964
2445
  add_opt(common_arg(
2965
2446
  {"--props"},
2966
2447
  string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
@@ -2968,6 +2449,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2968
2449
  params.endpoint_props = true;
2969
2450
  }
2970
2451
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
2452
+ add_opt(common_arg(
2453
+ {"--slots"},
2454
+ string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
2455
+ [](common_params & params) {
2456
+ params.endpoint_slots = true;
2457
+ }
2458
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
2971
2459
  add_opt(common_arg(
2972
2460
  {"--no-slots"},
2973
2461
  "disables slots monitoring endpoint",
@@ -2992,12 +2480,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2992
2480
  [](common_params & params) {
2993
2481
  params.use_jinja = true;
2994
2482
  }
2995
- ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
2483
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
2996
2484
  add_opt(common_arg(
2997
2485
  {"--reasoning-format"}, "FORMAT",
2998
2486
  "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
2999
2487
  "- none: leaves thoughts unparsed in `message.content`\n"
3000
- "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
2488
+ "- deepseek: puts thoughts in `message.reasoning_content`\n"
2489
+ "- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
3001
2490
  "(default: auto)",
3002
2491
  [](common_params & params, const std::string & value) {
3003
2492
  params.reasoning_format = common_reasoning_format_from_name(value);
@@ -3127,10 +2616,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
3127
2616
  }
3128
2617
  ));
3129
2618
  add_opt(common_arg(
3130
- {"--log-colors"},
3131
- "Enable colored logging",
3132
- [](common_params &) {
3133
- common_log_set_colors(common_log_main(), true);
2619
+ {"--log-colors"}, "[on|off|auto]",
2620
+ "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
2621
+ "'auto' enables colors when output is to a terminal",
2622
+ [](common_params &, const std::string & value) {
2623
+ if (is_truthy(value)) {
2624
+ common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
2625
+ } else if (is_falsey(value)) {
2626
+ common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
2627
+ } else if (is_autoy(value)) {
2628
+ common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
2629
+ } else {
2630
+ throw std::invalid_argument(
2631
+ string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
2632
+ }
3134
2633
  }
3135
2634
  ).set_env("LLAMA_LOG_COLORS"));
3136
2635
  add_opt(common_arg(
@@ -3398,7 +2897,87 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
3398
2897
  }
3399
2898
  ).set_examples({LLAMA_EXAMPLE_TTS}));
3400
2899
 
3401
- // model-specific
2900
+ add_opt(common_arg(
2901
+ {"--diffusion-steps"}, "N",
2902
+ string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
2903
+ [](common_params & params, int value) { params.diffusion.steps = value; }
2904
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
2905
+ add_opt(common_arg(
2906
+ {"--diffusion-visual"},
2907
+ string_format("enable visual diffusion mode (show progressive generation) (default: %s)", params.diffusion.visual_mode ? "true" : "false"),
2908
+ [](common_params & params) { params.diffusion.visual_mode = true; }
2909
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
2910
+ add_opt(common_arg(
2911
+ {"--diffusion-eps"}, "F",
2912
+ string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
2913
+ [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
2914
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
2915
+ add_opt(common_arg(
2916
+ {"--diffusion-algorithm"}, "N",
2917
+ string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)", params.diffusion.algorithm),
2918
+ [](common_params & params, int value) { params.diffusion.algorithm = value; }
2919
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
2920
+ add_opt(common_arg(
2921
+ {"--diffusion-alg-temp"}, "F",
2922
+ string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
2923
+ [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
2924
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
2925
+ add_opt(common_arg(
2926
+ {"--diffusion-block-length"}, "N",
2927
+ string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
2928
+ [](common_params & params, int value) { params.diffusion.block_length = value; }
2929
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
2930
+ add_opt(common_arg(
2931
+ {"--diffusion-cfg-scale"}, "F",
2932
+ string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
2933
+ [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
2934
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
2935
+ add_opt(common_arg(
2936
+ {"--diffusion-add-gumbel-noise"}, "F",
2937
+ string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
2938
+ [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
2939
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
2940
+ add_opt(common_arg(
2941
+ { "-lr", "--learning-rate" }, "ALPHA",
2942
+ string_format("adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)", (double) params.lr.lr0),
2943
+ [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); }
2944
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
2945
+ add_opt(common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
2946
+ string_format("(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
2947
+ (double) params.lr.lr_min),
2948
+ [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); }
2949
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
2950
+ add_opt(common_arg(
2951
+ {"-decay-epochs", "--learning-rate-decay-epochs"}, "ALPHA",
2952
+ string_format("(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)", (double) params.lr.decay_epochs),
2953
+ [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); }
2954
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
2955
+ add_opt(common_arg(
2956
+ {"-wd", "--weight-decay"}, "WD",
2957
+ string_format("adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).", (double) params.lr.wd),
2958
+ [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); }
2959
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
2960
+ add_opt(common_arg(
2961
+ {"-val-split", "--val-split"}, "FRACTION",
2962
+ string_format("fraction of data to use as validation set for training (default: %.2g).", (double) params.val_split),
2963
+ [](common_params & params, const std::string & value) { params.val_split = std::stof(value); }
2964
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
2965
+ add_opt(common_arg(
2966
+ {"-epochs", "--epochs"}, "N",
2967
+ string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
2968
+ [](common_params & params, int epochs) { params.lr.epochs = epochs; }
2969
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
2970
+ add_opt(common_arg(
2971
+ {"-opt", "--optimizer"}, "sgd|adamw", "adamw or sgd",
2972
+ [](common_params & params, const std::string & name) {
2973
+ params.optimizer = common_opt_get_optimizer(name.c_str());
2974
+ if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
2975
+ throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
2976
+ }
2977
+ }
2978
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
2979
+
2980
+ // presets
3402
2981
  add_opt(common_arg(
3403
2982
  {"--tts-oute-default"},
3404
2983
  string_format("use default OuteTTS models (note: can download weights from the internet)"),
@@ -3411,42 +2990,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
3411
2990
  ).set_examples({LLAMA_EXAMPLE_TTS}));
3412
2991
 
3413
2992
  add_opt(common_arg(
3414
- {"--embd-bge-small-en-default"},
3415
- string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
3416
- [](common_params & params) {
3417
- params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
3418
- params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
3419
- params.pooling_type = LLAMA_POOLING_TYPE_NONE;
3420
- params.embd_normalize = 2;
3421
- params.n_ctx = 512;
3422
- params.verbose_prompt = true;
3423
- params.embedding = true;
3424
- }
3425
- ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
3426
-
3427
- add_opt(common_arg(
3428
- {"--embd-e5-small-en-default"},
3429
- string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
3430
- [](common_params & params) {
3431
- params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
3432
- params.model.hf_file = "e5-small-v2-q8_0.gguf";
3433
- params.pooling_type = LLAMA_POOLING_TYPE_NONE;
3434
- params.embd_normalize = 2;
3435
- params.n_ctx = 512;
3436
- params.verbose_prompt = true;
3437
- params.embedding = true;
3438
- }
3439
- ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
3440
-
3441
- add_opt(common_arg(
3442
- {"--embd-gte-small-default"},
3443
- string_format("use default gte-small model (note: can download weights from the internet)"),
2993
+ {"--embd-gemma-default"},
2994
+ string_format("use default EmbeddingGemma model (note: can download weights from the internet)"),
3444
2995
  [](common_params & params) {
3445
- params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
3446
- params.model.hf_file = "gte-small-q8_0.gguf";
3447
- params.pooling_type = LLAMA_POOLING_TYPE_NONE;
3448
- params.embd_normalize = 2;
3449
- params.n_ctx = 512;
2996
+ params.model.hf_repo = "ggml-org/embeddinggemma-300M-qat-q4_0-GGUF";
2997
+ params.model.hf_file = "embeddinggemma-300M-qat-Q4_0.gguf";
2998
+ params.port = 8011;
2999
+ params.n_ubatch = 2048;
3000
+ params.n_batch = 2048;
3001
+ params.n_parallel = 32;
3002
+ params.n_ctx = 2048*params.n_parallel;
3450
3003
  params.verbose_prompt = true;
3451
3004
  params.embedding = true;
3452
3005
  }
@@ -3459,8 +3012,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
3459
3012
  params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
3460
3013
  params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
3461
3014
  params.port = 8012;
3462
- params.n_gpu_layers = 99;
3463
- params.flash_attn = true;
3464
3015
  params.n_ubatch = 1024;
3465
3016
  params.n_batch = 1024;
3466
3017
  params.n_ctx = 0;
@@ -3475,8 +3026,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
3475
3026
  params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
3476
3027
  params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
3477
3028
  params.port = 8012;
3478
- params.n_gpu_layers = 99;
3479
- params.flash_attn = true;
3480
3029
  params.n_ubatch = 1024;
3481
3030
  params.n_batch = 1024;
3482
3031
  params.n_ctx = 0;
@@ -3491,8 +3040,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
3491
3040
  params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
3492
3041
  params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
3493
3042
  params.port = 8012;
3494
- params.n_gpu_layers = 99;
3495
- params.flash_attn = true;
3496
3043
  params.n_ubatch = 1024;
3497
3044
  params.n_batch = 1024;
3498
3045
  params.n_ctx = 0;
@@ -3508,10 +3055,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
3508
3055
  params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
3509
3056
  params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
3510
3057
  params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
3511
- params.speculative.n_gpu_layers = 99;
3512
3058
  params.port = 8012;
3513
- params.n_gpu_layers = 99;
3514
- params.flash_attn = true;
3515
3059
  params.n_ubatch = 1024;
3516
3060
  params.n_batch = 1024;
3517
3061
  params.n_ctx = 0;
@@ -3527,10 +3071,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
3527
3071
  params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
3528
3072
  params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
3529
3073
  params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
3530
- params.speculative.n_gpu_layers = 99;
3531
3074
  params.port = 8012;
3532
- params.n_gpu_layers = 99;
3533
- params.flash_attn = true;
3534
3075
  params.n_ubatch = 1024;
3535
3076
  params.n_batch = 1024;
3536
3077
  params.n_ctx = 0;
@@ -3545,8 +3086,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
3545
3086
  params.model.hf_repo = "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF";
3546
3087
  params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf";
3547
3088
  params.port = 8012;
3548
- params.n_gpu_layers = 99;
3549
- params.flash_attn = true;
3550
3089
  params.n_ubatch = 1024;
3551
3090
  params.n_batch = 1024;
3552
3091
  params.n_ctx = 0;
@@ -3555,96 +3094,65 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
3555
3094
  ).set_examples({LLAMA_EXAMPLE_SERVER}));
3556
3095
 
3557
3096
  add_opt(common_arg(
3558
- { "--diffusion-steps" }, "N",
3559
- string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
3560
- [](common_params & params, int value) { params.diffusion.steps = value; }
3561
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3562
- add_opt(common_arg(
3563
- { "--diffusion-visual" },
3564
- string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
3565
- params.diffusion.visual_mode ? "true" : "false"),
3566
- [](common_params & params) { params.diffusion.visual_mode = true; }
3567
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3097
+ {"--gpt-oss-20b-default"},
3098
+ string_format("use gpt-oss-20b (note: can download weights from the internet)"),
3099
+ [](common_params & params) {
3100
+ params.model.hf_repo = "ggml-org/gpt-oss-20b-GGUF";
3101
+ params.model.hf_file = "gpt-oss-20b-mxfp4.gguf";
3102
+ params.port = 8013;
3103
+ params.n_ubatch = 2048;
3104
+ params.n_batch = 32768;
3105
+ params.n_parallel = 2;
3106
+ params.n_ctx = 131072*params.n_parallel;
3107
+ params.sampling.temp = 1.0f;
3108
+ params.sampling.top_p = 1.0f;
3109
+ params.sampling.top_k = 0;
3110
+ params.sampling.min_p = 0.01f;
3111
+ params.use_jinja = true;
3112
+ //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
3113
+ }
3114
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
3568
3115
 
3569
3116
  add_opt(common_arg(
3570
- { "--diffusion-eps" }, "F",
3571
- string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
3572
- [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
3573
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3574
- add_opt(common_arg(
3575
- { "--diffusion-algorithm" }, "N",
3576
- string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
3577
- params.diffusion.algorithm),
3578
- [](common_params & params, int value) { params.diffusion.algorithm = value; }
3579
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3580
- add_opt(common_arg(
3581
- { "--diffusion-alg-temp" }, "F",
3582
- string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
3583
- [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
3584
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3117
+ {"--gpt-oss-120b-default"},
3118
+ string_format("use gpt-oss-120b (note: can download weights from the internet)"),
3119
+ [](common_params & params) {
3120
+ params.model.hf_repo = "ggml-org/gpt-oss-120b-GGUF";
3121
+ params.port = 8013;
3122
+ params.n_ubatch = 2048;
3123
+ params.n_batch = 32768;
3124
+ params.n_parallel = 2;
3125
+ params.n_ctx = 131072*params.n_parallel;
3126
+ params.sampling.temp = 1.0f;
3127
+ params.sampling.top_p = 1.0f;
3128
+ params.sampling.top_k = 0;
3129
+ params.sampling.min_p = 0.01f;
3130
+ params.use_jinja = true;
3131
+ //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
3132
+ }
3133
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
3585
3134
 
3586
3135
  add_opt(common_arg(
3587
- { "--diffusion-block-length" }, "N",
3588
- string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
3589
- [](common_params & params, int value) { params.diffusion.block_length = value; }
3590
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3591
- add_opt(common_arg(
3592
- { "--diffusion-cfg-scale" }, "F",
3593
- string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
3594
- [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
3595
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3596
- add_opt(common_arg(
3597
- { "--diffusion-add-gumbel-noise" }, "F",
3598
- string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
3599
- [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
3600
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3601
-
3136
+ {"--vision-gemma-4b-default"},
3137
+ string_format("use Gemma 3 4B QAT (note: can download weights from the internet)"),
3138
+ [](common_params & params) {
3139
+ params.model.hf_repo = "ggml-org/gemma-3-4b-it-qat-GGUF";
3140
+ params.port = 8014;
3141
+ params.n_ctx = 0;
3142
+ params.use_jinja = true;
3143
+ }
3144
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
3602
3145
 
3603
- add_opt(
3604
- common_arg({ "-lr", "--learning-rate" }, "ALPHA",
3605
- string_format(
3606
- "adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)",
3607
- (double) params.lr.lr0),
3608
- [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); })
3609
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3610
- add_opt(
3611
- common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
3612
- string_format(
3613
- "(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
3614
- (double) params.lr.lr_min),
3615
- [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); })
3616
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3617
- add_opt(
3618
- common_arg({ "-decay-epochs", "--learning-rate-decay-epochs" }, "ALPHA",
3619
- string_format(
3620
- "(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)",
3621
- (double) params.lr.decay_epochs),
3622
- [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); })
3623
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3624
- add_opt(common_arg(
3625
- { "-wd", "--weight-decay" }, "WD",
3626
- string_format(
3627
- "adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).",
3628
- (double) params.lr.wd),
3629
- [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); })
3630
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3631
- add_opt(common_arg({ "-val-split", "--val-split" }, "FRACTION",
3632
- string_format("fraction of data to use as validation set for training (default: %.2g).",
3633
- (double) params.val_split),
3634
- [](common_params & params, const std::string & value) { params.val_split = std::stof(value); })
3635
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3636
- add_opt(common_arg({ "-epochs", "--epochs" }, "N",
3637
- string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
3638
- [](common_params & params, int epochs) { params.lr.epochs = epochs; })
3639
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3640
- add_opt(common_arg({ "-opt", "--optimizer" }, "sgd|adamw", "adamw or sgd",
3641
- [](common_params & params, const std::string & name) {
3642
- params.optimizer = common_opt_get_optimizer(name.c_str());
3643
- if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
3644
- throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
3645
- }
3646
- })
3647
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3146
+ add_opt(common_arg(
3147
+ {"--vision-gemma-12b-default"},
3148
+ string_format("use Gemma 3 12B QAT (note: can download weights from the internet)"),
3149
+ [](common_params & params) {
3150
+ params.model.hf_repo = "ggml-org/gemma-3-12b-it-qat-GGUF";
3151
+ params.port = 8014;
3152
+ params.n_ctx = 0;
3153
+ params.use_jinja = true;
3154
+ }
3155
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
3648
3156
 
3649
3157
  return ctx_arg;
3650
3158
  }