@novastera-oss/llamarn 0.4.0 → 0.4.3-beta4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (979)
  1. package/RNLlamaCpp.podspec +4 -1
  2. package/android/CMakeLists.txt +13 -3
  3. package/android/src/main/cpp/include/llama.h +44 -21
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  12. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  13. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  14. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  15. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  17. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  18. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  19. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  20. package/cpp/LlamaCppModel.cpp +2 -10
  21. package/cpp/SystemUtils.cpp +3 -7
  22. package/cpp/build-info.cpp +2 -2
  23. package/cpp/llama.cpp/CMakeLists.txt +12 -0
  24. package/cpp/llama.cpp/CODEOWNERS +116 -10
  25. package/cpp/llama.cpp/CONTRIBUTING.md +30 -3
  26. package/cpp/llama.cpp/README.md +13 -5
  27. package/cpp/llama.cpp/build-xcframework.sh +5 -0
  28. package/cpp/llama.cpp/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
  29. package/cpp/llama.cpp/common/CMakeLists.txt +12 -2
  30. package/cpp/llama.cpp/common/arg.cpp +303 -795
  31. package/cpp/llama.cpp/common/arg.h +2 -3
  32. package/cpp/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
  33. package/cpp/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
  34. package/cpp/llama.cpp/common/chat-parser.cpp +156 -15
  35. package/cpp/llama.cpp/common/chat-parser.h +13 -0
  36. package/cpp/llama.cpp/common/chat.cpp +1147 -88
  37. package/cpp/llama.cpp/common/chat.h +16 -3
  38. package/cpp/llama.cpp/common/common.cpp +70 -15
  39. package/cpp/llama.cpp/common/common.h +57 -19
  40. package/cpp/llama.cpp/common/download.cpp +1072 -0
  41. package/cpp/llama.cpp/common/download.h +55 -0
  42. package/cpp/llama.cpp/common/http.h +73 -0
  43. package/cpp/llama.cpp/common/json-partial.cpp +70 -2
  44. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +61 -22
  45. package/cpp/llama.cpp/common/json-schema-to-grammar.h +2 -0
  46. package/cpp/llama.cpp/common/log.cpp +59 -2
  47. package/cpp/llama.cpp/common/log.h +12 -4
  48. package/cpp/llama.cpp/common/sampling.cpp +84 -8
  49. package/cpp/llama.cpp/common/sampling.h +3 -1
  50. package/cpp/llama.cpp/common/speculative.cpp +1 -1
  51. package/cpp/llama.cpp/convert_hf_to_gguf.py +1608 -233
  52. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +6 -1
  53. package/cpp/llama.cpp/convert_lora_to_gguf.py +37 -5
  54. package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -28
  55. package/cpp/llama.cpp/ggml/include/ggml-backend.h +19 -1
  56. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +1 -1
  57. package/cpp/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
  58. package/cpp/llama.cpp/ggml/include/ggml-metal.h +1 -6
  59. package/cpp/llama.cpp/ggml/include/ggml-rpc.h +7 -9
  60. package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +2 -1
  61. package/cpp/llama.cpp/ggml/include/ggml.h +199 -6
  62. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +38 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-alloc.c +299 -130
  64. package/cpp/llama.cpp/ggml/src/ggml-backend-impl.h +4 -4
  65. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +21 -5
  66. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +99 -2
  67. package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  68. package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +1 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +57 -45
  70. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +138 -47
  71. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +1584 -1773
  72. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +201 -317
  73. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +146 -187
  74. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +771 -713
  75. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +135 -77
  76. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
  77. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
  78. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +16 -17
  79. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +318 -145
  80. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +155 -60
  82. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +8 -8
  83. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
  84. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +14 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -9
  86. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +108 -64
  87. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +14 -4
  88. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +530 -87
  89. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +37 -45
  90. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +349 -127
  91. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +947 -1218
  92. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -4
  93. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +143 -29
  94. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +82 -76
  95. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +7 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +233 -28
  102. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +326 -66
  103. package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +12 -3
  104. package/cpp/llama.cpp/ggml/src/ggml-cuda/argsort.cu +102 -6
  105. package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +110 -76
  106. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +167 -38
  107. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +6 -11
  108. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +12 -0
  109. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
  110. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +245 -151
  111. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cuh +1 -5
  112. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +341 -289
  113. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cuh +1233 -0
  115. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
  116. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +6 -6
  117. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
  118. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +123 -220
  119. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +41 -39
  120. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +715 -45
  121. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +150 -0
  122. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cuh +1 -0
  123. package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +321 -24
  124. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +93 -351
  125. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +828 -1
  126. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cu +164 -0
  127. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cuh +5 -0
  128. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +3 -166
  129. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1 -1
  130. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cu +371 -78
  131. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cuh +3 -2
  132. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +279 -147
  133. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cuh +1 -1
  134. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +97 -85
  135. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad.cu +46 -23
  136. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +63 -54
  137. package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +12 -10
  138. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +192 -77
  139. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cuh +2 -0
  140. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +10 -9
  141. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +137 -75
  142. package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cu +39 -0
  143. package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cuh +7 -0
  144. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  145. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  146. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  147. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  148. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  149. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  150. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  151. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  152. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  153. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  154. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  155. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  156. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  157. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  158. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  159. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  160. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  161. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  162. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  163. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  164. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  165. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  166. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  167. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  168. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  169. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  170. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  171. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  172. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  173. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  174. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  175. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  176. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  177. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  178. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  179. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  180. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  181. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  182. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  183. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  184. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  185. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  186. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  187. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  188. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  189. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
  190. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  191. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  192. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  193. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  194. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  195. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  196. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  197. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  198. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  199. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  200. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  201. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  202. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  203. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  204. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  205. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  206. package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +336 -0
  207. package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +16 -0
  208. package/cpp/llama.cpp/ggml/src/ggml-cuda/tsembd.cu +3 -3
  209. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +105 -11
  210. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +36 -0
  211. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +87 -6
  212. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +28 -12
  213. package/cpp/llama.cpp/ggml/src/ggml-hexagon/CMakeLists.txt +68 -0
  214. package/cpp/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3807 -0
  215. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +40 -0
  216. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +442 -0
  217. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
  218. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  219. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +40 -0
  220. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +69 -0
  221. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +119 -0
  222. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +156 -0
  223. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +64 -0
  224. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  225. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-exp.c +93 -0
  226. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.c +60 -0
  227. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
  228. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.c +960 -0
  229. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +1032 -0
  230. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +829 -0
  231. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2223 -0
  232. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
  233. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +418 -0
  234. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
  235. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +255 -0
  236. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
  237. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  238. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.c +448 -0
  239. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +220 -0
  240. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +8 -13
  241. package/cpp/llama.cpp/ggml/src/ggml-impl.h +110 -12
  242. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +6 -5
  243. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  244. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  245. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  246. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.m +599 -0
  247. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1662 -0
  248. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +251 -0
  249. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +1527 -0
  250. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +244 -39
  251. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +3844 -0
  252. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +90 -0
  253. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.cpp +723 -0
  254. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +3453 -1907
  255. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +3 -1
  256. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +10 -0
  257. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1331 -109
  258. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +126 -0
  259. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +31 -4
  260. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +35 -7
  261. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +31 -4
  262. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  263. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  264. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
  265. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  266. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
  267. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
  268. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  269. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  270. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  271. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  272. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  273. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  274. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  275. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
  276. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
  277. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
  278. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +123 -10
  279. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
  280. package/cpp/llama.cpp/ggml/src/ggml-quants.c +1 -0
  281. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +341 -161
  282. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  283. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +6 -5
  284. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +74 -15
  285. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +50 -30
  286. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +10 -4
  287. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +166 -99
  288. package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  289. package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  290. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +72 -94
  291. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +67 -49
  292. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +21 -31
  293. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +252 -316
  294. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +6 -2
  295. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +9 -6
  296. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +359 -142
  297. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  298. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  299. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +80 -60
  300. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +201 -132
  301. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +230 -55
  302. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.hpp +2 -0
  303. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.cpp +97 -0
  304. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.hpp +24 -0
  305. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.cpp +72 -0
  306. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.hpp +8 -0
  307. package/cpp/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  308. package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  309. package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  310. package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.cpp +122 -0
  311. package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.hpp +20 -0
  312. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +50 -41
  313. package/cpp/llama.cpp/ggml/src/ggml-sycl/set.cpp +73 -0
  314. package/cpp/llama.cpp/ggml/src/ggml-sycl/set.hpp +5 -0
  315. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +45 -36
  316. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +330 -165
  317. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +4 -0
  318. package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
  319. package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  320. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +12 -6
  321. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +16 -12
  322. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
  323. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4184 -2159
  324. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  325. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
  326. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
  327. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  328. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
  329. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  330. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
  331. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
  332. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  333. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  334. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
  335. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
  336. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
  337. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
  338. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +53 -30
  339. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
  340. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
  341. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
  342. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +13 -6
  343. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  344. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
  345. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
  346. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
  347. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +138 -2
  348. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
  349. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
  350. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  351. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
  352. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
  353. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
  354. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
  355. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
  356. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
  357. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
  358. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
  359. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
  360. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
  361. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  362. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
  363. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
  364. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
  365. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
  366. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
  367. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
  368. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  369. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
  370. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
  371. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
  372. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -2
  373. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  374. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +52 -14
  375. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +50 -12
  376. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +61 -12
  377. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +54 -12
  378. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +5 -1
  379. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  380. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
  381. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
  382. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
  383. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
  384. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
  385. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
  386. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +10 -2
  387. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
  388. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
  389. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
  390. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
  391. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  392. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  393. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +15 -7
  394. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
  395. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
  396. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
  397. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
  398. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
  399. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +1 -1
  400. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +229 -0
  401. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +33 -0
  402. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +1 -1
  403. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +1 -1
  404. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +1 -1
  405. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +1 -1
  406. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +1 -1
  407. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +1 -1
  408. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +1 -1
  409. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
  410. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
  411. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +3 -5
  412. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +1 -1
  413. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +3 -5
  414. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +3 -5
  415. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +1 -1
  416. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  417. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +106 -634
  418. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +118 -9
  419. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +556 -0
  420. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +70 -0
  421. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +77 -214
  422. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +589 -0
  423. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
  424. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
  425. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  426. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
  427. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
  428. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
  429. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +25 -4
  430. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
  431. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +55 -5
  432. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
  433. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
  434. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
  435. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
  436. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +45 -3
  437. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
  438. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
  439. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
  440. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +227 -0
  441. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
  442. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +5 -52
  443. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +5 -35
  444. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +5 -35
  445. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +27 -0
  446. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +5 -41
  447. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  448. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
  449. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
  450. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
  451. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
  452. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
  453. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
  454. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
  455. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  456. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
  457. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
  458. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
  459. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +140 -0
  460. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  461. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
  462. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +1 -1
  463. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
  464. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
  465. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
  466. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
  467. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +171 -0
  468. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  469. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +79 -29
  470. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -12
  471. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +471 -196
  472. package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +8 -0
  473. package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1690 -383
  474. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
  475. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  476. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  477. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
  478. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +57 -10
  479. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  480. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  481. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +25 -912
  482. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
  483. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
  484. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
  485. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
  486. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  487. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  488. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
  489. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/{set_rows.wgsl → set_rows.tmpl.wgsl} +38 -8
  490. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  491. package/cpp/llama.cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
  492. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +96 -314
  493. package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  494. package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  495. package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
  496. package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
  497. package/cpp/llama.cpp/ggml/src/ggml.c +440 -17
  498. package/cpp/llama.cpp/ggml/src/gguf.cpp +104 -29
  499. package/cpp/llama.cpp/gguf-py/gguf/constants.py +363 -13
  500. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +64 -0
  501. package/cpp/llama.cpp/gguf-py/gguf/lazy.py +8 -3
  502. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +6 -0
  503. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +156 -18
  504. package/cpp/llama.cpp/gguf-py/gguf/utility.py +80 -0
  505. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +4 -4
  506. package/cpp/llama.cpp/include/llama.h +44 -21
  507. package/cpp/llama.cpp/media/llama1-icon-transparent.png +0 -0
  508. package/cpp/llama.cpp/media/llama1-icon-transparent.svg +77 -0
  509. package/cpp/llama.cpp/media/llama1-icon.png +0 -0
  510. package/cpp/llama.cpp/media/llama1-icon.svg +87 -0
  511. package/cpp/llama.cpp/requirements/requirements-all.txt +2 -0
  512. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -3
  513. package/cpp/llama.cpp/requirements/requirements-convert_legacy_llama.txt +3 -1
  514. package/cpp/llama.cpp/requirements/requirements-tool_bench.txt +1 -1
  515. package/cpp/llama.cpp/src/CMakeLists.txt +101 -0
  516. package/cpp/llama.cpp/src/llama-adapter.cpp +33 -0
  517. package/cpp/llama.cpp/src/llama-adapter.h +3 -0
  518. package/cpp/llama.cpp/src/llama-arch.cpp +344 -14
  519. package/cpp/llama.cpp/src/llama-arch.h +50 -0
  520. package/cpp/llama.cpp/src/llama-batch.cpp +63 -31
  521. package/cpp/llama.cpp/src/llama-batch.h +13 -2
  522. package/cpp/llama.cpp/src/llama-chat.cpp +85 -3
  523. package/cpp/llama.cpp/src/llama-chat.h +4 -0
  524. package/cpp/llama.cpp/src/llama-context.cpp +300 -45
  525. package/cpp/llama.cpp/src/llama-context.h +16 -6
  526. package/cpp/llama.cpp/src/llama-cparams.h +2 -1
  527. package/cpp/llama.cpp/src/llama-grammar.cpp +17 -9
  528. package/cpp/llama.cpp/src/llama-graph.cpp +226 -64
  529. package/cpp/llama.cpp/src/llama-graph.h +27 -5
  530. package/cpp/llama.cpp/src/llama-hparams.cpp +53 -2
  531. package/cpp/llama.cpp/src/llama-hparams.h +48 -8
  532. package/cpp/llama.cpp/src/llama-impl.cpp +3 -3
  533. package/cpp/llama.cpp/src/llama-impl.h +2 -0
  534. package/cpp/llama.cpp/src/llama-kv-cache-iswa.cpp +13 -3
  535. package/cpp/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
  536. package/cpp/llama.cpp/src/llama-kv-cache.cpp +120 -62
  537. package/cpp/llama.cpp/src/llama-kv-cache.h +13 -4
  538. package/cpp/llama.cpp/src/llama-kv-cells.h +44 -2
  539. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +19 -9
  540. package/cpp/llama.cpp/src/llama-memory-hybrid.h +2 -0
  541. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +38 -17
  542. package/cpp/llama.cpp/src/llama-memory-recurrent.h +5 -2
  543. package/cpp/llama.cpp/src/llama-memory.h +3 -0
  544. package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
  545. package/cpp/llama.cpp/src/llama-model.cpp +1070 -12614
  546. package/cpp/llama.cpp/src/llama-model.h +40 -4
  547. package/cpp/llama.cpp/src/llama-quant.cpp +14 -6
  548. package/cpp/llama.cpp/src/llama-sampling.cpp +243 -136
  549. package/cpp/llama.cpp/src/llama-vocab.cpp +43 -3
  550. package/cpp/llama.cpp/src/llama-vocab.h +43 -39
  551. package/cpp/llama.cpp/src/llama.cpp +69 -10
  552. package/cpp/llama.cpp/src/models/afmoe.cpp +187 -0
  553. package/cpp/llama.cpp/src/models/apertus.cpp +125 -0
  554. package/cpp/llama.cpp/src/models/arcee.cpp +135 -0
  555. package/cpp/llama.cpp/src/models/arctic.cpp +138 -0
  556. package/cpp/llama.cpp/src/models/arwkv7.cpp +86 -0
  557. package/cpp/llama.cpp/src/models/baichuan.cpp +122 -0
  558. package/cpp/llama.cpp/src/models/bailingmoe.cpp +144 -0
  559. package/cpp/llama.cpp/src/models/bailingmoe2.cpp +135 -0
  560. package/cpp/llama.cpp/src/models/bert.cpp +176 -0
  561. package/cpp/llama.cpp/src/models/bitnet.cpp +160 -0
  562. package/cpp/llama.cpp/src/models/bloom.cpp +101 -0
  563. package/cpp/llama.cpp/src/models/chameleon.cpp +178 -0
  564. package/cpp/llama.cpp/src/models/chatglm.cpp +132 -0
  565. package/cpp/llama.cpp/src/models/codeshell.cpp +111 -0
  566. package/cpp/llama.cpp/src/models/cogvlm.cpp +100 -0
  567. package/cpp/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
  568. package/cpp/llama.cpp/src/models/command-r.cpp +122 -0
  569. package/cpp/llama.cpp/src/models/dbrx.cpp +123 -0
  570. package/cpp/llama.cpp/src/models/deci.cpp +135 -0
  571. package/cpp/llama.cpp/src/models/deepseek.cpp +144 -0
  572. package/cpp/llama.cpp/src/models/deepseek2.cpp +237 -0
  573. package/cpp/llama.cpp/src/models/dots1.cpp +134 -0
  574. package/cpp/llama.cpp/src/models/dream.cpp +105 -0
  575. package/cpp/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
  576. package/cpp/llama.cpp/src/models/ernie4-5.cpp +110 -0
  577. package/cpp/llama.cpp/src/models/exaone.cpp +114 -0
  578. package/cpp/llama.cpp/src/models/exaone4.cpp +123 -0
  579. package/cpp/llama.cpp/src/models/falcon-h1.cpp +113 -0
  580. package/cpp/llama.cpp/src/models/falcon.cpp +120 -0
  581. package/cpp/llama.cpp/src/models/gemma-embedding.cpp +120 -0
  582. package/cpp/llama.cpp/src/models/gemma.cpp +112 -0
  583. package/cpp/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
  584. package/cpp/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
  585. package/cpp/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
  586. package/cpp/llama.cpp/src/models/glm4-moe.cpp +153 -0
  587. package/cpp/llama.cpp/src/models/glm4.cpp +127 -0
  588. package/cpp/llama.cpp/src/models/gpt2.cpp +105 -0
  589. package/cpp/llama.cpp/src/models/gptneox.cpp +144 -0
  590. package/cpp/llama.cpp/src/models/granite-hybrid.cpp +196 -0
  591. package/cpp/llama.cpp/src/models/granite.cpp +211 -0
  592. package/cpp/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
  593. package/cpp/llama.cpp/src/models/grok.cpp +159 -0
  594. package/cpp/llama.cpp/src/models/grovemoe.cpp +141 -0
  595. package/cpp/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
  596. package/cpp/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
  597. package/cpp/llama.cpp/src/models/internlm2.cpp +120 -0
  598. package/cpp/llama.cpp/src/models/jais.cpp +86 -0
  599. package/cpp/llama.cpp/src/models/jamba.cpp +106 -0
  600. package/cpp/llama.cpp/src/models/lfm2.cpp +173 -0
  601. package/cpp/llama.cpp/src/models/llada-moe.cpp +122 -0
  602. package/cpp/llama.cpp/src/models/llada.cpp +99 -0
  603. package/cpp/llama.cpp/src/models/llama-iswa.cpp +174 -0
  604. package/cpp/llama.cpp/src/models/llama.cpp +155 -0
  605. package/cpp/llama.cpp/src/models/mamba.cpp +55 -0
  606. package/cpp/llama.cpp/src/models/minicpm3.cpp +199 -0
  607. package/cpp/llama.cpp/src/models/minimax-m2.cpp +124 -0
  608. package/cpp/llama.cpp/src/models/models.h +485 -0
  609. package/cpp/llama.cpp/src/models/mpt.cpp +126 -0
  610. package/cpp/llama.cpp/src/models/nemotron-h.cpp +121 -0
  611. package/cpp/llama.cpp/src/models/nemotron.cpp +122 -0
  612. package/cpp/llama.cpp/src/models/neo-bert.cpp +104 -0
  613. package/cpp/llama.cpp/src/models/olmo.cpp +121 -0
  614. package/cpp/llama.cpp/src/models/olmo2.cpp +150 -0
  615. package/cpp/llama.cpp/src/models/olmoe.cpp +124 -0
  616. package/cpp/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
  617. package/cpp/llama.cpp/src/models/openelm.cpp +124 -0
  618. package/cpp/llama.cpp/src/models/orion.cpp +123 -0
  619. package/cpp/llama.cpp/src/models/pangu-embedded.cpp +121 -0
  620. package/cpp/llama.cpp/src/models/phi2.cpp +121 -0
  621. package/cpp/llama.cpp/src/models/phi3.cpp +152 -0
  622. package/cpp/llama.cpp/src/models/plamo.cpp +110 -0
  623. package/cpp/llama.cpp/src/models/plamo2.cpp +316 -0
  624. package/cpp/llama.cpp/src/models/plm.cpp +168 -0
  625. package/cpp/llama.cpp/src/models/qwen.cpp +108 -0
  626. package/cpp/llama.cpp/src/models/qwen2.cpp +117 -0
  627. package/cpp/llama.cpp/src/models/qwen2moe.cpp +151 -0
  628. package/cpp/llama.cpp/src/models/qwen2vl.cpp +117 -0
  629. package/cpp/llama.cpp/src/models/qwen3.cpp +117 -0
  630. package/cpp/llama.cpp/src/models/qwen3moe.cpp +124 -0
  631. package/cpp/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
  632. package/cpp/llama.cpp/src/models/qwen3vl.cpp +141 -0
  633. package/cpp/llama.cpp/src/models/refact.cpp +94 -0
  634. package/cpp/llama.cpp/src/models/rwkv6-base.cpp +162 -0
  635. package/cpp/llama.cpp/src/models/rwkv6.cpp +94 -0
  636. package/cpp/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
  637. package/cpp/llama.cpp/src/models/rwkv7-base.cpp +135 -0
  638. package/cpp/llama.cpp/src/models/rwkv7.cpp +90 -0
  639. package/cpp/llama.cpp/src/models/seed-oss.cpp +124 -0
  640. package/cpp/llama.cpp/src/models/smallthinker.cpp +120 -0
  641. package/cpp/llama.cpp/src/models/smollm3.cpp +128 -0
  642. package/cpp/llama.cpp/src/models/stablelm.cpp +146 -0
  643. package/cpp/llama.cpp/src/models/starcoder.cpp +100 -0
  644. package/cpp/llama.cpp/src/models/starcoder2.cpp +121 -0
  645. package/cpp/llama.cpp/src/models/t5-dec.cpp +166 -0
  646. package/cpp/llama.cpp/src/models/t5-enc.cpp +96 -0
  647. package/cpp/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
  648. package/cpp/llama.cpp/src/models/xverse.cpp +108 -0
  649. package/cpp/llama.cpp/src/unicode.cpp +77 -0
  650. package/cpp/llama.cpp/src/unicode.h +43 -0
  651. package/cpp/llama.cpp/vendor/cpp-httplib/CMakeLists.txt +94 -0
  652. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.cpp +9339 -0
  653. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +433 -8222
  654. package/cpp/llama.cpp/vendor/cpp-httplib/patch-boringssl.cmake +6 -0
  655. package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +4179 -1900
  656. package/cpp/llama.cpp/vendor/minja/chat-template.hpp +9 -2
  657. package/cpp/llama.cpp/vendor/minja/minja.hpp +101 -22
  658. package/cpp/rn-completion.cpp +3 -27
  659. package/ios/include/chat.h +16 -3
  660. package/ios/include/common/minja/chat-template.hpp +9 -2
  661. package/ios/include/common/minja/minja.hpp +101 -22
  662. package/ios/include/common.h +57 -19
  663. package/ios/include/json-schema-to-grammar.h +2 -0
  664. package/ios/include/llama.h +44 -21
  665. package/ios/include/log.h +12 -4
  666. package/ios/include/sampling.h +3 -1
  667. package/ios/libs/llama.xcframework/Info.plist +20 -20
  668. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  669. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6399 -5557
  670. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +19 -1
  671. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +1 -1
  672. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-metal.h +1 -6
  673. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +199 -6
  674. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +44 -21
  675. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  676. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  677. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6362 -5520
  678. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4813 -4241
  679. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +19 -1
  680. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +1 -1
  681. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +1 -6
  682. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +199 -6
  683. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +44 -21
  684. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  685. package/package.json +10 -4
  686. package/cpp/llama.cpp/ggml/src/ggml-cann/Doxyfile +0 -2579
  687. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -371
  688. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  689. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -379
  690. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  691. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -495
  692. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -486
  693. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  694. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  695. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  696. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  697. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  698. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  699. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  700. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  701. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  702. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  703. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  704. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  705. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  706. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  707. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  708. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  709. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  710. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  711. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  712. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  713. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  714. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  715. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  716. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  717. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  718. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  719. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  720. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  721. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  722. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  723. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  724. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  725. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  726. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  727. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  728. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  729. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  730. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  731. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  732. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  733. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  734. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  735. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  736. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  737. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  738. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  739. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  740. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  741. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  742. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  743. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  744. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  745. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  746. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  747. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  748. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  749. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  750. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  751. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  752. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  753. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  754. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  755. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  756. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  757. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  758. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  759. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  760. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  761. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  762. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  763. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  764. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  765. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  766. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  767. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  768. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  769. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  770. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  771. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  772. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  773. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  774. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  775. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  776. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  777. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  778. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  779. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +0 -6886
  780. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -154
  781. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
  782. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
  783. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
  784. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +0 -97
  785. package/cpp/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  786. package/cpp/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  787. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  788. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
  789. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
  790. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  791. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
  792. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
  793. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  794. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
  795. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
  796. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  797. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
  798. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
  799. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  800. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
  801. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
  802. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  803. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
  804. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
  805. package/cpp/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  806. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  807. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
  808. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
  809. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  810. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
  811. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
  812. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  813. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
  814. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
  815. package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
  816. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  817. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
  818. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
  819. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  820. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
  821. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
  822. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  823. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
  824. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
  825. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  826. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
  827. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
  828. package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +0 -171
  829. package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r-plus-tool_use.jinja +0 -202
  830. package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja +0 -156
  831. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +0 -124
  832. package/cpp/llama.cpp/models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja +0 -152
  833. package/cpp/llama.cpp/models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja +0 -152
  834. package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +0 -62
  835. package/cpp/llama.cpp/models/templates/Qwen-Qwen2.5-7B-Instruct.jinja +0 -54
  836. package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +0 -85
  837. package/cpp/llama.cpp/models/templates/README.md +0 -25
  838. package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja +0 -1
  839. package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja +0 -1
  840. package/cpp/llama.cpp/models/templates/fireworks-ai-llama-3-firefunction-v2.jinja +0 -57
  841. package/cpp/llama.cpp/models/templates/google-gemma-2-2b-it.jinja +0 -4
  842. package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +0 -59
  843. package/cpp/llama.cpp/models/templates/llama-cpp-deepseek-r1.jinja +0 -76
  844. package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +0 -34
  845. package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.1.jinja +0 -58
  846. package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.2.jinja +0 -287
  847. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja +0 -109
  848. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja +0 -93
  849. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja +0 -109
  850. package/cpp/llama.cpp/models/templates/microsoft-Phi-3.5-mini-instruct.jinja +0 -8
  851. package/cpp/llama.cpp/models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja +0 -87
  852. package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +0 -43
  853. package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +0 -331
  854. package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +0 -105
  855. package/cpp/llama.cpp/prompts/LLM-questions.txt +0 -49
  856. package/cpp/llama.cpp/prompts/alpaca.txt +0 -1
  857. package/cpp/llama.cpp/prompts/assistant.txt +0 -31
  858. package/cpp/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
  859. package/cpp/llama.cpp/prompts/chat-with-bob.txt +0 -7
  860. package/cpp/llama.cpp/prompts/chat-with-qwen.txt +0 -1
  861. package/cpp/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
  862. package/cpp/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
  863. package/cpp/llama.cpp/prompts/chat.txt +0 -28
  864. package/cpp/llama.cpp/prompts/dan-modified.txt +0 -1
  865. package/cpp/llama.cpp/prompts/dan.txt +0 -1
  866. package/cpp/llama.cpp/prompts/mnemonics.txt +0 -93
  867. package/cpp/llama.cpp/prompts/parallel-questions.txt +0 -43
  868. package/cpp/llama.cpp/prompts/reason-act.txt +0 -18
  869. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  870. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  871. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
  872. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4247
  873. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-alloc.h +0 -76
  874. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +0 -354
  875. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-blas.h +0 -25
  876. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +0 -145
  877. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-metal.h +0 -66
  878. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +0 -256
  879. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +0 -2492
  880. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/gguf.h +0 -202
  881. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -1391
  882. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Modules/module.modulemap +0 -17
  883. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Resources/Info.plist +0 -32
  884. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-alloc.h +0 -76
  885. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +0 -354
  886. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-blas.h +0 -25
  887. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +0 -145
  888. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-metal.h +0 -66
  889. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +0 -256
  890. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +0 -2492
  891. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/gguf.h +0 -202
  892. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -1391
  893. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Modules/module.modulemap +0 -17
  894. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Resources/Info.plist +0 -32
  895. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  896. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-alloc.h +0 -76
  897. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +0 -354
  898. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-blas.h +0 -25
  899. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +0 -145
  900. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-metal.h +0 -66
  901. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +0 -256
  902. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +0 -2492
  903. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/gguf.h +0 -202
  904. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -1391
  905. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Modules/module.modulemap +0 -17
  906. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Resources/Info.plist +0 -32
  907. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  908. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  909. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  910. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  911. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5561
  912. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
  913. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +0 -354
  914. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-blas.h +0 -25
  915. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
  916. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-metal.h +0 -66
  917. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +0 -256
  918. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +0 -2492
  919. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/gguf.h +0 -202
  920. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -1391
  921. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Info.plist +0 -35
  922. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Modules/module.modulemap +0 -17
  923. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  924. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  925. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  926. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
  927. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4246
  928. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
  929. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
  930. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
  931. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
  932. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
  933. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
  934. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
  935. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
  936. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
  937. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Info.plist +0 -35
  938. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
  939. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  940. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  941. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  942. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5558
  943. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
  944. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +0 -354
  945. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-blas.h +0 -25
  946. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
  947. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-metal.h +0 -66
  948. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +0 -256
  949. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +0 -2492
  950. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/gguf.h +0 -202
  951. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -1391
  952. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Info.plist +0 -32
  953. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Modules/module.modulemap +0 -17
  954. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  955. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  956. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  957. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5520
  958. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4243
  959. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
  960. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
  961. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
  962. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
  963. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
  964. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
  965. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
  966. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
  967. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
  968. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Info.plist +0 -32
  969. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
  970. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  971. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
  972. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
  973. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
  974. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
  975. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +0 -0
  976. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +0 -0
  977. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
  978. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
  979. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
@@ -0,0 +1,1527 @@
1
+ #import "ggml-metal-device.h"
2
+
3
+ #import "ggml-impl.h"
4
+ #import "ggml-threading.h"
5
+
6
+ #include <Foundation/Foundation.h>
7
+
8
+ #include <Metal/Metal.h>
9
+
10
+ #include <stdatomic.h>
11
+
12
// not every SDK defines TARGET_OS_VISION - default it to 0 so it can be used in #if expressions
#ifndef TARGET_OS_VISION
#define TARGET_OS_VISION 0
#endif

// residency sets are compiled in only when the SDK's max-allowed OS version meets the
// per-platform threshold below (macOS on non-x86_64 targets, iOS, tvOS, visionOS)
#if (!TARGET_CPU_X86_64 && TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000) || \
    (TARGET_OS_IOS    && __IPHONE_OS_VERSION_MAX_ALLOWED >= 180000) || \
    (TARGET_OS_TV     && __TV_OS_VERSION_MAX_ALLOWED     >= 180000) || \
    (TARGET_OS_VISION && __VISION_OS_VERSION_MAX_ALLOWED >= 200000)
#define GGML_METAL_HAS_RESIDENCY_SETS 1
#endif

// local stand-ins for the MTLGPUFamilyMetalX values (not available in some environments)
static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
static const NSInteger MTLGPUFamilyMetal4_GGML = 5002;

// virtual address for GPU memory allocations
static atomic_uintptr_t g_addr_device = 0x000000400ULL;
30
+
31
#if !GGML_METAL_EMBED_LIBRARY
// Empty marker class. It exists only so that +[NSBundle bundleForClass:] can be
// used to find the bundle this code lives in when the Metal library is not embedded.
@interface GGMLMetalClass : NSObject
@end

@implementation GGMLMetalClass
@end
#endif
38
+
39
+ //
40
+ // MTLFunctionConstantValues wrapper
41
+ //
42
+
43
// box around the function-constant values passed when specializing a Metal function
struct ggml_metal_cv {
    MTLFunctionConstantValues * obj; // created in ggml_metal_cv_init, released in ggml_metal_cv_free
};
46
+
47
// Allocates a wrapper holding a fresh, empty MTLFunctionConstantValues object.
// The result is disposed of with ggml_metal_cv_free().
ggml_metal_cv_t ggml_metal_cv_init(void) {
    ggml_metal_cv_t cv = calloc(1, sizeof(struct ggml_metal_cv));

    // +new is alloc+init: the wrapper holds the owning reference
    cv->obj = [MTLFunctionConstantValues new];

    return cv;
}
54
+
55
// Releases the wrapped MTLFunctionConstantValues and frees the wrapper.
// Passing NULL is a no-op (same convention as ggml_metal_library_free).
void ggml_metal_cv_free(ggml_metal_cv_t cv) {
    if (!cv) {
        return;
    }

    [cv->obj release];
    free(cv);
}
59
+
60
// Records `value` as a 16-bit integer (MTLDataTypeShort) function constant at index `idx`.
void ggml_metal_cv_set_int16(ggml_metal_cv_t cv, int16_t value, int32_t idx) {
    const int16_t v = value;

    [cv->obj setConstantValue:&v
                         type:MTLDataTypeShort
                      atIndex:idx];
}
63
+
64
// Records `value` as a 32-bit integer (MTLDataTypeInt) function constant at index `idx`.
void ggml_metal_cv_set_int32(ggml_metal_cv_t cv, int32_t value, int32_t idx) {
    const int32_t v = value;

    [cv->obj setConstantValue:&v
                         type:MTLDataTypeInt
                      atIndex:idx];
}
67
+
68
// Records `value` as a boolean (MTLDataTypeBool) function constant at index `idx`.
void ggml_metal_cv_set_bool(ggml_metal_cv_t cv, bool value, int32_t idx) {
    const bool v = value;

    [cv->obj setConstantValue:&v
                         type:MTLDataTypeBool
                      atIndex:idx];
}
71
+
72
+ //
73
+ // MTLComputePipelineState wrapper
74
+ //
75
+
76
// compute pipeline state plus the dispatch hints stored alongside it
struct ggml_metal_pipeline {
    id<MTLComputePipelineState> obj; // released in ggml_metal_pipeline_free

    // suggested dispatch sizes
    int nsg;

    int nr0;
    int nr1;

    size_t smem;
};
87
+
88
// Allocates an empty pipeline wrapper: no pipeline state object and all dispatch
// hints set to zero. The result is disposed of with ggml_metal_pipeline_free().
ggml_metal_pipeline_t ggml_metal_pipeline_init(void) {
    ggml_metal_pipeline_t pipeline = calloc(1, sizeof(struct ggml_metal_pipeline));

    // spell out the initial state explicitly
    pipeline->obj  = nil;
    pipeline->nsg  = 0;
    pipeline->nr0  = 0;
    pipeline->nr1  = 0;
    pipeline->smem = 0;

    return pipeline;
}
101
+
102
// Releases the wrapped pipeline state (if any) and frees the wrapper.
// Passing NULL is a no-op (same convention as ggml_metal_library_free).
void ggml_metal_pipeline_free(ggml_metal_pipeline_t pipeline) {
    if (!pipeline) {
        return;
    }

    // messaging nil is harmless, so a wrapper without a pipeline state is fine here
    [pipeline->obj release];

    free(pipeline);
}
107
+
108
// Stores the `nsg` dispatch hint on the pipeline.
void ggml_metal_pipeline_set_nsg(ggml_metal_pipeline_t pipeline, int nsg) {
    (*pipeline).nsg = nsg;
}
111
+
112
// Returns the `nsg` dispatch hint stored on the pipeline.
int ggml_metal_pipeline_get_nsg(ggml_metal_pipeline_t pipeline) {
    const int nsg = pipeline->nsg;

    return nsg;
}
115
+
116
// Stores the `nr0` dispatch hint on the pipeline.
void ggml_metal_pipeline_set_nr0(ggml_metal_pipeline_t pipeline, int nr0) {
    (*pipeline).nr0 = nr0;
}
119
+
120
// Returns the `nr0` dispatch hint stored on the pipeline.
int ggml_metal_pipeline_get_nr0(ggml_metal_pipeline_t pipeline) {
    const int nr0 = pipeline->nr0;

    return nr0;
}
123
+
124
// Stores the `nr1` dispatch hint on the pipeline.
void ggml_metal_pipeline_set_nr1(ggml_metal_pipeline_t pipeline, int nr1) {
    (*pipeline).nr1 = nr1;
}
127
+
128
// Returns the `nr1` dispatch hint stored on the pipeline.
int ggml_metal_pipeline_get_nr1(ggml_metal_pipeline_t pipeline) {
    const int nr1 = pipeline->nr1;

    return nr1;
}
131
+
132
// Stores the `smem` size hint on the pipeline.
void ggml_metal_pipeline_set_smem(ggml_metal_pipeline_t pipeline, size_t smem) {
    (*pipeline).smem = smem;
}
135
+
136
// Returns the `smem` size hint stored on the pipeline.
size_t ggml_metal_pipeline_get_smem(ggml_metal_pipeline_t pipeline) {
    const size_t smem = pipeline->smem;

    return smem;
}
139
+
140
// Returns the pipeline state's maxTotalThreadsPerThreadgroup, converted to int.
int ggml_metal_pipeline_max_theads_per_threadgroup(ggml_metal_pipeline_t pipeline) {
    id<MTLComputePipelineState> state = pipeline->obj;

    return [state maxTotalThreadsPerThreadgroup];
}
143
+
144
// a Metal library together with the device it was built for and its pipeline cache
struct ggml_metal_library {
    id<MTLLibrary> obj;    // released in ggml_metal_library_free
    id<MTLDevice>  device; // stored as-is; this file neither retains nor releases it

    ggml_metal_pipelines_t pipelines; // cache of compiled pipelines
};
150
+
151
// Loads (or compiles) the default ggml Metal library for the device `dev`.
//
// load order:
//
// - first check if the library is embedded (GGML_METAL_EMBED_LIBRARY)
// - then check if a pre-compiled default.metallib is in the bundle or next to the binary
// - if not found, load the ggml-metal.metal source and compile it
// - if that fails, return NULL
//
// Success of every Cocoa/Metal call is decided by its return value, not by the
// NSError out-parameter: Metal may fill the NSError with compile warnings while
// still returning a valid library, and an NSError left over from an earlier call
// must not fail a later one.
//
// The returned object is disposed of with ggml_metal_library_free().
ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
    id<MTLLibrary> library = nil;
    id<MTLDevice>  device  = ggml_metal_device_get_obj(dev);

    // TODO: move to a function
    {
        const int64_t t_start = ggml_time_us();

        NSError  * error = nil;
        NSString * src   = nil;

#if GGML_METAL_EMBED_LIBRARY
        GGML_LOG_INFO("%s: using embedded metal library\n", __func__);

        extern const char ggml_metallib_start[];
        extern const char ggml_metallib_end[];

        // owned (alloc/init) - released below once the library has been compiled
        src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
#else

#ifdef SWIFT_PACKAGE
        NSBundle * bundle = SWIFTPM_MODULE_BUNDLE;
#else
        NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
#endif

        NSFileManager * fm = [NSFileManager defaultManager];

        NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
        if (path_lib == nil) {
            // Try to find the resource in the directory where the current binary located.
            NSString * bin_cur = [[NSProcessInfo processInfo] arguments][0];
            NSString * bin_dir = [bin_cur stringByDeletingLastPathComponent];

            NSString * path_lib_default = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]];
            if ([fm isReadableFileAtPath:path_lib_default]) {
                GGML_LOG_INFO("%s: found '%s'\n", __func__, [path_lib_default UTF8String]);

                // The symlink probing is best effort: its errors are deliberately not
                // collected, so they cannot be mistaken for a library loading failure.
                NSDictionary * atts = [fm attributesOfItemAtPath:path_lib_default error:nil];
                if (atts && [atts[NSFileType] isEqualToString:NSFileTypeSymbolicLink]) {
                    // Optionally, if this is a symlink, try to resolve it.
                    path_lib_default = [fm destinationOfSymbolicLinkAtPath:path_lib_default error:nil];
                    if (path_lib_default && [path_lib_default length] > 0 && ![path_lib_default hasPrefix:@"/"]) {
                        // It is a relative path, adding the binary directory as directory prefix.
                        path_lib_default = [NSString pathWithComponents:@[bin_dir, path_lib_default]];
                    }
                    if (!path_lib_default || ![fm isReadableFileAtPath:path_lib_default]) {
                        // Link to the resource could not be resolved.
                        path_lib_default = nil;
                    } else {
                        GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [path_lib_default UTF8String]);
                    }
                }
            } else {
                // The resource couldn't be found in the binary's directory.
                path_lib_default = nil;
            }

            path_lib = path_lib_default;
        }

        if (path_lib != nil) {
            // pre-compiled library found
            NSURL * libURL = [NSURL fileURLWithPath:path_lib];
            GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);

            library = [device newLibraryWithURL:libURL error:&error];
            if (!library) {
                GGML_LOG_ERROR("%s: error: %s\n", __func__, error ? [[error description] UTF8String] : "unknown error");
                return NULL;
            }
        } else {
            GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);

            NSString * path_source;
            NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];

            GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");

            if (path_resource) {
                path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
            } else {
                path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
            }

            if (path_source == nil) {
                GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
                path_source = @"ggml-metal.metal";
            }

            GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);

            // autoreleased - must not be released manually
            src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
            if (!src) {
                GGML_LOG_ERROR("%s: error: %s\n", __func__, error ? [[error description] UTF8String] : "unknown error");
                return NULL;
            }
        }
#endif

        if (!library) {
            if (!src) {
                // e.g. the embedded bytes could not be decoded as UTF-8
                GGML_LOG_ERROR("%s: error: %s\n", __func__, "failed to load the Metal library source");
                return NULL;
            }

            @autoreleasepool {
                // dictionary of preprocessor macros
                NSMutableDictionary * prep = [NSMutableDictionary dictionary];

                if (ggml_metal_device_get_props(dev)->has_bfloat) {
                    [prep setObject:@"1" forKey:@"GGML_METAL_HAS_BF16"];
                }

                if (ggml_metal_device_get_props(dev)->has_tensor) {
                    [prep setObject:@"1" forKey:@"GGML_METAL_HAS_TENSOR"];
                }

#if GGML_METAL_EMBED_LIBRARY
                [prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
#endif

                MTLCompileOptions * options = [MTLCompileOptions new];
                options.preprocessorMacros = prep;

                //[options setFastMathEnabled:false];

                error = nil;
                library = [device newLibraryWithSource:src options:options error:&error];

                // release the options before any early return so they are not leaked
#if !__has_feature(objc_arc)
                [options release];
#endif

                if (!library) {
                    // log while still inside the pool - the NSError is autoreleased
                    GGML_LOG_ERROR("%s: error: %s\n", __func__, error ? [[error description] UTF8String] : "unknown error");

#if GGML_METAL_EMBED_LIBRARY
                    [src release];
#endif
                    return NULL;
                }

                if (error) {
                    // the library was built, the NSError only carries compiler diagnostics
                    GGML_LOG_WARN("%s: warning: %s\n", __func__, [[error description] UTF8String]);
                }
            }
        }

#if GGML_METAL_EMBED_LIBRARY
        [src release];
#endif // GGML_METAL_EMBED_LIBRARY

        GGML_LOG_INFO("%s: loaded in %.3f sec\n", __func__, (ggml_time_us() - t_start) / 1e6);
    }

    ggml_metal_library_t res = calloc(1, sizeof(struct ggml_metal_library));
    if (!res) {
        GGML_LOG_ERROR("%s: calloc failed\n", __func__);
        [library release];
        return NULL;
    }

    res->obj       = library;
    res->device    = device;
    res->pipelines = ggml_metal_pipelines_init();

    return res;
}
305
+
306
// Compiles a Metal library for the device `dev` from the NUL-terminated UTF-8
// string `source`, with no preprocessor macros defined.
//
// `verbose` adds the compiler message and timing to the log output.
//
// Returns NULL when `source` is NULL, cannot be decoded, does not compile, or
// when allocation fails. As before, any diagnostic reported by the compiler
// through the NSError out-parameter counts as a failure. The returned object is
// disposed of with ggml_metal_library_free().
ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev, const char * source, bool verbose) {
    if (source == NULL) {
        GGML_LOG_ERROR("%s: source is NULL\n", __func__);
        return NULL;
    }

    id<MTLDevice>  device  = ggml_metal_device_get_obj(dev);
    id<MTLLibrary> library = nil;
    NSError      * error   = nil;

    const int64_t t_start = ggml_time_us();

    NSString * src = [[NSString alloc] initWithBytes:source
                                              length:strlen(source)
                                            encoding:NSUTF8StringEncoding];
    if (!src) {
        GGML_LOG_ERROR("%s: failed to create NSString from source\n", __func__);
        return NULL;
    }

    @autoreleasepool {
        NSMutableDictionary * prep = [NSMutableDictionary dictionary];

        MTLCompileOptions * options = [MTLCompileOptions new];
        options.preprocessorMacros = prep;

        library = [device newLibraryWithSource:src options:options error:&error];
        if (error) {
            if (verbose) {
                GGML_LOG_ERROR("%s: error compiling source: %s\n", __func__, [[error description] UTF8String]);
            } else {
                GGML_LOG_ERROR("%s: error compiling source\n", __func__);
            }

            // a library may still have been returned together with the diagnostic:
            // it is owned by us (new...), so release it instead of just dropping it
            [library release];
            library = nil;
        }

        [options release];
    }

    [src release];

    if (!library) {
        if (verbose) {
            GGML_LOG_ERROR("%s: failed to create Metal library from source\n", __func__);
        }

        return NULL;
    }

    if (verbose) {
        GGML_LOG_INFO("%s: compiled in %.3f sec\n", __func__, (ggml_time_us() - t_start) / 1e6);
    }

    ggml_metal_library_t res = calloc(1, sizeof(struct ggml_metal_library));
    if (!res) {
        GGML_LOG_ERROR("%s: calloc failed\n", __func__);

        // do not leak the compiled library when the wrapper cannot be allocated
        [library release];

        return NULL;
    }

    res->obj       = library;
    res->device    = device;
    res->pipelines = ggml_metal_pipelines_init();

    return res;
}
371
+
372
// Releases the Metal library object, frees the pipeline cache and then the wrapper.
// NULL is accepted and ignored.
void ggml_metal_library_free(ggml_metal_library_t lib) {
    if (lib == NULL) {
        return;
    }

    // sending -release to nil does nothing, so the library object needs no separate check
    [lib->obj release];

    ggml_metal_pipelines_free(lib->pipelines);

    free(lib);
}
385
+
386
// Looks `name` up in the library's pipeline cache and returns whatever
// ggml_metal_pipelines_get() yields for it.
ggml_metal_pipeline_t ggml_metal_library_get_pipeline(ggml_metal_library_t lib, const char * name) {
    ggml_metal_pipeline_t cached = ggml_metal_pipelines_get(lib->pipelines, name);

    return cached;
}
389
+
390
// Returns the pipeline cached under `name`, compiling and caching it first if needed.
//
// `base` is the name of the Metal function in the library; `cv` (may be NULL)
// supplies function constants used to specialize it. The whole lookup/compile/
// insert sequence runs inside the ggml critical section.
//
// Returns NULL if the function cannot be created, the pipeline state cannot be
// built, or the resulting pipeline reports zero threads. A wrapper that was
// allocated for a failed compile is freed before returning.
ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv) {
    // note: the pipelines are cached in the library per device, so they are shared across all metal contexts
    ggml_critical_section_start();

    ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
    if (res) {
        ggml_critical_section_end();

        return res;
    }

    res = ggml_metal_pipeline_init();

    @autoreleasepool {
        NSError * error = nil;

        NSString * base_func = [NSString stringWithUTF8String:base];

        GGML_LOG_DEBUG("%s: compiling pipeline: base = '%s', name = '%s'\n", __func__, base, name);

        id<MTLFunction> mtl_function;
        if (!cv) {
            mtl_function = [lib->obj newFunctionWithName:base_func];
        } else {
            mtl_function = [lib->obj newFunctionWithName:base_func constantValues:cv->obj error:&error];
        }
        if (!mtl_function) {
            ggml_critical_section_end();

            GGML_LOG_ERROR("%s: failed to compile pipeline: base = '%s', name = '%s'\n", __func__, base, name);
            if (error) {
                GGML_LOG_ERROR("%s: %s\n", __func__, [[error description] UTF8String]);
            }

            // the wrapper was never added to the cache - free it instead of leaking it
            ggml_metal_pipeline_free(res);

            return NULL;
        }

        error = nil;
        res->obj = [lib->device newComputePipelineStateWithFunction:mtl_function error:&error];

        [mtl_function release];

        if (!res->obj) {
            ggml_critical_section_end();

            // report why the pipeline state could not be created
            GGML_LOG_ERROR("%s: failed to compile pipeline: base = '%s', name = '%s'\n", __func__, base, name);
            if (error) {
                GGML_LOG_ERROR("%s: %s\n", __func__, [[error description] UTF8String]);
            }

            ggml_metal_pipeline_free(res);

            return NULL;
        }

        GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, name, (void *) res->obj,
                (int) res->obj.maxTotalThreadsPerThreadgroup,
                (int) res->obj.threadExecutionWidth);

        if (res->obj.maxTotalThreadsPerThreadgroup == 0 || res->obj.threadExecutionWidth == 0) {
            ggml_critical_section_end();

            GGML_LOG_ERROR("%s: incompatible pipeline %s\n", __func__, name);

            // releases the pipeline state and the wrapper
            ggml_metal_pipeline_free(res);

            return NULL;
        }

        ggml_metal_pipelines_add(lib->pipelines, name, res);
    }

    ggml_critical_section_end();

    return res;
}
450
+
451
+ //
452
+ // MTLComputeCommandEncoder wrapper
453
+ //
454
+
455
// box around a compute command encoder
struct ggml_metal_encoder {
    id<MTLComputeCommandEncoder> obj; // retained in ggml_metal_encoder_init, released in ggml_metal_encoder_free
};
458
+
459
// Creates a compute command encoder on the command buffer `cmd_buf_raw` and wraps it.
// With `concurrent` set, the encoder is created with MTLDispatchTypeConcurrent.
// The wrapper holds its own retain on the encoder; dispose of it with ggml_metal_encoder_free().
ggml_metal_encoder_t ggml_metal_encoder_init(ggml_metal_cmd_buf_t cmd_buf_raw, bool concurrent) {
    id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;

    id<MTLComputeCommandEncoder> enc = concurrent
        ? [cmd_buf computeCommandEncoderWithDispatchType: MTLDispatchTypeConcurrent]
        : [cmd_buf computeCommandEncoder];

    ggml_metal_encoder_t res = calloc(1, sizeof(struct ggml_metal_encoder));

    // the encoder is not returned from a new/alloc/copy method, so take our own reference
    res->obj = [enc retain];

    return res;
}
474
+
475
// Release the wrapped Metal encoder and free the wrapper itself.
void ggml_metal_encoder_free(ggml_metal_encoder_t encoder) {
    id<MTLComputeCommandEncoder> mtl_encoder = encoder->obj;

    // balances the retain taken in ggml_metal_encoder_init()
    [mtl_encoder release];

    free(encoder);
}
479
+
480
// Open a named debug group on the encoder; `name` is interpreted as a UTF-8 C string.
void ggml_metal_encoder_debug_group_push(ggml_metal_encoder_t encoder, const char * name) {
    NSString * label = [NSString stringWithCString:name encoding:NSUTF8StringEncoding];

    [encoder->obj pushDebugGroup:label];
}
483
+
484
// Close the most recently pushed debug group on the encoder.
void ggml_metal_encoder_debug_group_pop(ggml_metal_encoder_t encoder) {
    id<MTLComputeCommandEncoder> mtl_encoder = encoder->obj;

    [mtl_encoder popDebugGroup];
}
487
+
488
// Bind the compute pipeline state wrapped by `pipeline` to the encoder.
void ggml_metal_encoder_set_pipeline(ggml_metal_encoder_t encoder, ggml_metal_pipeline_t pipeline) {
    id<MTLComputeCommandEncoder> mtl_encoder = encoder->obj;

    [mtl_encoder setComputePipelineState:pipeline->obj];
}
491
+
492
// Pass `size` bytes starting at `data` to the kernel at buffer argument index `idx`.
void ggml_metal_encoder_set_bytes(ggml_metal_encoder_t encoder, void * data, size_t size, int idx) {
    id<MTLComputeCommandEncoder> mtl_encoder = encoder->obj;

    [mtl_encoder setBytes:data length:size atIndex:idx];
}
495
+
496
// Bind the Metal buffer and byte offset described by `buffer` at buffer argument index `idx`.
void ggml_metal_encoder_set_buffer(ggml_metal_encoder_t encoder, struct ggml_metal_buffer_id buffer, int idx) {
    id<MTLComputeCommandEncoder> mtl_encoder = encoder->obj;

    [mtl_encoder setBuffer:buffer.metal offset:buffer.offs atIndex:idx];
}
499
+
500
// Set the threadgroup memory length (in bytes) for threadgroup argument index `idx`.
void ggml_metal_encoder_set_threadgroup_memory_size(ggml_metal_encoder_t encoder, size_t size, int idx) {
    id<MTLComputeCommandEncoder> mtl_encoder = encoder->obj;

    [mtl_encoder setThreadgroupMemoryLength:size atIndex:idx];
}
503
+
504
// Dispatch a grid of (tg0, tg1, tg2) threadgroups, each with (tptg0, tptg1, tptg2) threads.
void ggml_metal_encoder_dispatch_threadgroups(ggml_metal_encoder_t encoder, int tg0, int tg1, int tg2, int tptg0, int tptg1, int tptg2) {
    const MTLSize threadgroups            = MTLSizeMake(tg0, tg1, tg2);
    const MTLSize threads_per_threadgroup = MTLSizeMake(tptg0, tptg1, tptg2);

    [encoder->obj dispatchThreadgroups:threadgroups threadsPerThreadgroup:threads_per_threadgroup];
}
507
+
508
// Insert a memory barrier with buffer scope (MTLBarrierScopeBuffers) into the encoder.
void ggml_metal_encoder_memory_barrier(ggml_metal_encoder_t encoder) {
    id<MTLComputeCommandEncoder> mtl_encoder = encoder->obj;

    [mtl_encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
}
511
+
512
// Finish encoding; the wrapper is still owned by the caller and must be freed separately.
void ggml_metal_encoder_end_encoding(ggml_metal_encoder_t encoder) {
    id<MTLComputeCommandEncoder> mtl_encoder = encoder->obj;

    [mtl_encoder endEncoding];
}
515
+
516
+ struct ggml_metal_device { // per-device context created by ggml_metal_device_init and destroyed by ggml_metal_device_free
517
+ id<MTLDevice> mtl_device; // result of MTLCreateSystemDefaultDevice(); nil when no Metal device is available
518
+
519
+ // a single global queue shared by all Metal backends
520
+ // technically not needed for devices with unified memory, but enables discrete GPUs support
521
+ // ref: https://github.com/ggml-org/llama.cpp/pull/15906
522
+ id<MTLCommandQueue> mtl_queue; // created with newCommandQueue; may be nil if creation failed (only logged)
523
+
524
+ ggml_metal_library_t library; // created by ggml_metal_library_init(dev); may be NULL if creation failed (only logged)
525
+
526
+ struct ggml_metal_device_props props; // capability flags and limits filled in by ggml_metal_device_init
527
+ };
528
+
529
// Metal source of a minimal matmul2d kernel that uses the tensor API.
// `elem_type` must be a string literal naming the element type of the input tensors ("half" or "bfloat").
#define GGML_METAL_TENSOR_PROBE_SRC(elem_type) \
    "\n" \
    "#include <metal_stdlib> \n" \
    "#include <metal_tensor> \n" \
    "#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h> \n" \
    " \n" \
    "using namespace metal; \n" \
    "using namespace mpp::tensor_ops; \n" \
    " \n" \
    "kernel void dummy_kernel( \n" \
    " tensor<device " elem_type ", dextents<int32_t, 2>> A [[buffer(0)]], \n" \
    " tensor<device " elem_type ", dextents<int32_t, 2>> B [[buffer(1)]], \n" \
    " device float * C [[buffer(2)]], \n" \
    " uint2 tgid [[threadgroup_position_in_grid]]) \n" \
    "{ \n" \
    " auto tA = A.slice(0, (int)tgid.y); \n" \
    " auto tB = B.slice((int)tgid.x, 0); \n" \
    " \n" \
    " matmul2d< \n" \
    " matmul2d_descriptor(8, 8, dynamic_extent), \n" \
    " execution_simdgroups<4>> mm; \n" \
    " \n" \
    " auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>(); \n" \
    " \n" \
    " auto sA = tA.slice(0, 0); \n" \
    " auto sB = tB.slice(0, 0); \n" \
    " mm.run(sB, sA, cT); \n" \
    " \n" \
    " auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(4, 4)); \n" \
    " \n" \
    " cT.store(tC); \n" \
    "}"

// Returns true if `src` builds into a library and its "dummy_kernel" compiles into a pipeline on `dev`.
// The temporary library is freed before returning.
static bool ggml_metal_device_probe_kernel(ggml_metal_device_t dev, const char * src) {
    ggml_metal_library_t lib = ggml_metal_library_init_from_source(dev, src, false);
    if (lib == NULL) {
        return false;
    }

    ggml_metal_pipeline_t ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);

    const bool ok = ppl ? true : false;

    ggml_metal_library_free(lib);

    return ok;
}

// Allocate a device context, open the system default Metal device and populate `props`.
//
// Returns NULL only if the context itself cannot be allocated. If no Metal device is available, the
// zero-initialized context is returned (mtl_device == nil). Failures to create the command queue or
// the library are logged but do not abort initialization.
//
// Environment variables consulted: GGML_METAL_BF16_DISABLE, GGML_METAL_TENSOR_DISABLE,
// GGML_METAL_TENSOR_ENABLE, GGML_METAL_NO_RESIDENCY, GGML_METAL_SHARED_BUFFERS_DISABLE.
ggml_metal_device_t ggml_metal_device_init(void) {
    ggml_metal_device_t dev = calloc(1, sizeof(struct ggml_metal_device));

    assert(dev != NULL);
    if (dev == NULL) {
        // assert() is compiled out under NDEBUG - never dereference a failed allocation
        GGML_LOG_ERROR("%s: error: failed to allocate device context\n", __func__);
        return NULL;
    }

    dev->mtl_device = MTLCreateSystemDefaultDevice();
    if (dev->mtl_device == nil) {
        return dev;
    }

    dev->mtl_queue = [dev->mtl_device newCommandQueue];
    if (dev->mtl_queue == nil) {
        GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
    }

    dev->props.has_simdgroup_reduction  = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
    dev->props.has_simdgroup_reduction |= [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];

    dev->props.has_simdgroup_mm = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
    dev->props.has_unified_memory = dev->mtl_device.hasUnifiedMemory;

    dev->props.has_bfloat  = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
    dev->props.has_bfloat |= [dev->mtl_device supportsFamily:MTLGPUFamilyApple6];
    if (getenv("GGML_METAL_BF16_DISABLE") != NULL) {
        dev->props.has_bfloat = false;
    }

    dev->props.has_tensor = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal4_GGML];
    if (getenv("GGML_METAL_TENSOR_DISABLE") != NULL) {
        dev->props.has_tensor = false;
    }

    // note: disable the tensor API by default for old chips because with the current implementation it is not useful
    // - M2 Ultra: ~5% slower
    // - M4, M4 Max: no significant difference
    //
    // TODO: try to update the tensor API kernels to at least match the simdgroup performance
    if (getenv("GGML_METAL_TENSOR_ENABLE") == NULL &&
        ![[dev->mtl_device name] containsString:@"M5"] &&
        ![[dev->mtl_device name] containsString:@"M6"] &&
        ![[dev->mtl_device name] containsString:@"A19"] &&
        ![[dev->mtl_device name] containsString:@"A20"]) {
        GGML_LOG_WARN("%s: tensor API disabled for pre-M5 and pre-A19 devices\n", __func__);
        dev->props.has_tensor = false;
    }

    // double-check that the tensor API compiles
    if (dev->props.has_tensor) {
        GGML_LOG_INFO("%s: testing tensor API for f16 support\n", __func__);
        if (!ggml_metal_device_probe_kernel(dev, GGML_METAL_TENSOR_PROBE_SRC("half"))) {
            GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
            dev->props.has_tensor = false;
        }
    }

    // try to compile a dummy kernel to determine if the tensor API is supported for bfloat
    if (dev->props.has_tensor && dev->props.has_bfloat) {
        GGML_LOG_INFO("%s: testing tensor API for bfloat support\n", __func__);
        if (!ggml_metal_device_probe_kernel(dev, GGML_METAL_TENSOR_PROBE_SRC("bfloat"))) {
            GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
            dev->props.has_bfloat = false;
        }
    }

    dev->props.use_residency_sets = true;
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
    dev->props.use_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == nil;
#endif

    dev->props.use_shared_buffers = dev->props.has_unified_memory;
    if (getenv("GGML_METAL_SHARED_BUFFERS_DISABLE") != NULL) {
        dev->props.use_shared_buffers = false;
    }

    dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];

    dev->props.max_buffer_size            = dev->mtl_device.maxBufferLength;
    dev->props.max_working_set_size       = dev->mtl_device.recommendedMaxWorkingSetSize;
    dev->props.max_theadgroup_memory_size = dev->mtl_device.maxThreadgroupMemoryLength;

    // props was zeroed by calloc, so copying at most sizeof - 1 bytes keeps the name NUL-terminated
    strncpy(dev->props.name, [[dev->mtl_device name] UTF8String], sizeof(dev->props.name) - 1);

    dev->library = ggml_metal_library_init(dev);
    if (!dev->library) {
        GGML_LOG_ERROR("%s: error: failed to create library\n", __func__);
    }

    // --------------------------------------------------

    // print MTL GPU family:
    GGML_LOG_INFO("%s: GPU name: %s\n", __func__, dev->props.name);

    // determine max supported GPU family
    // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
    // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
    {
        for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
            if ([dev->mtl_device supportsFamily:i]) {
                GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i);
                break;
            }
        }

        for (int i = MTLGPUFamilyCommon1 + 5; i >= MTLGPUFamilyCommon1; --i) {
            if ([dev->mtl_device supportsFamily:i]) {
                GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyCommon%d (%d)\n", __func__, i - (int) MTLGPUFamilyCommon1 + 1, i);
                break;
            }
        }

        for (int i = MTLGPUFamilyMetal3_GGML + 5; i >= MTLGPUFamilyMetal3_GGML; --i) {
            if ([dev->mtl_device supportsFamily:i]) {
                GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyMetal%d (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3_GGML + 3, i);
                break;
            }
        }
    }

    GGML_LOG_INFO("%s: simdgroup reduction = %s\n", __func__, dev->props.has_simdgroup_reduction ? "true" : "false");
    GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, dev->props.has_simdgroup_mm ? "true" : "false");
    GGML_LOG_INFO("%s: has unified memory = %s\n", __func__, dev->props.has_unified_memory ? "true" : "false");
    GGML_LOG_INFO("%s: has bfloat = %s\n", __func__, dev->props.has_bfloat ? "true" : "false");
    GGML_LOG_INFO("%s: has tensor = %s\n", __func__, dev->props.has_tensor ? "true" : "false");
    GGML_LOG_INFO("%s: use residency sets = %s\n", __func__, dev->props.use_residency_sets ? "true" : "false");
    GGML_LOG_INFO("%s: use shared buffers = %s\n", __func__, dev->props.use_shared_buffers ? "true" : "false");

#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
    if (@available(macOS 10.12, iOS 16.0, *)) {
        GGML_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, dev->props.max_working_set_size / 1e6);
    }
#endif

    return dev;
}

#undef GGML_METAL_TENSOR_PROBE_SRC
746
+
747
// Tear down a device context: free its library, release the command queue and the Metal device,
// then free the context itself.
void ggml_metal_device_free(ggml_metal_device_t dev) {
    assert(dev != NULL);

    ggml_metal_library_free(dev->library);
    dev->library = NULL;

    // sending -release to nil is a no-op, so no explicit nil checks are needed
    [dev->mtl_queue release];
    dev->mtl_queue = nil;

    [dev->mtl_device release];
    dev->mtl_device = nil;

    free(dev);
}
765
+
766
// Return the underlying id<MTLDevice> as an untyped pointer (no ownership is transferred).
void * ggml_metal_device_get_obj(ggml_metal_device_t dev) {
    id<MTLDevice> mtl_device = dev->mtl_device;

    return mtl_device;
}
769
+
770
// Return the device's command queue as an untyped pointer (no ownership is transferred).
void * ggml_metal_device_get_queue(ggml_metal_device_t dev) {
    id<MTLCommandQueue> mtl_queue = dev->mtl_queue;

    return mtl_queue;
}
773
+
774
// Return the library created for this device during ggml_metal_device_init().
ggml_metal_library_t ggml_metal_device_get_library(ggml_metal_device_t dev) {
    ggml_metal_library_t library = dev->library;

    return library;
}
777
+
778
// Report the device memory budget.
//
// *total receives the device's recommendedMaxWorkingSetSize and *free receives what is left of it
// after subtracting currentAllocatedSize. On OS versions where the query is unavailable both are 0.
void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) {
    if (@available(macOS 10.12, iOS 16.0, *)) {
        const size_t budget = dev->mtl_device.recommendedMaxWorkingSetSize;
        const size_t in_use = dev->mtl_device.currentAllocatedSize;

        *total = budget;

        // the allocated size may exceed the recommended working set; clamp instead of letting the
        // unsigned subtraction wrap around to a huge "free" value
        *free = in_use < budget ? budget - in_use : 0;
    } else {
        *free = 0;
        *total = 0;
    }
}
787
+
788
// Decide whether the Metal backend can execute `op` on `dev`.
//
// The decision is based on the device capability flags (simdgroup reduction / matrix multiply,
// bfloat) and on the op kind, tensor types and contiguity. Ops that are not listed are rejected.
bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_tensor * op) {
    const bool has_simdgroup_mm        = dev->props.has_simdgroup_mm;
    const bool has_simdgroup_reduction = dev->props.has_simdgroup_reduction;
    const bool has_bfloat              = dev->props.has_bfloat;

    // without bfloat support, reject any op that produces BF16 or reads it from its first 3 sources
    if (!has_bfloat) {
        if (op->type == GGML_TYPE_BF16) {
            return false;
        }

        for (size_t i = 0, n = 3; i < n; ++i) {
            if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) {
                return false;
            }
        }
    }

    switch (op->op) {
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
                case GGML_UNARY_OP_TANH:
                case GGML_UNARY_OP_RELU:
                case GGML_UNARY_OP_SIGMOID:
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_GELU_ERF:
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_SILU:
                case GGML_UNARY_OP_ELU:
                case GGML_UNARY_OP_NEG:
                case GGML_UNARY_OP_ABS:
                case GGML_UNARY_OP_SGN:
                case GGML_UNARY_OP_STEP:
                case GGML_UNARY_OP_HARDSWISH:
                case GGML_UNARY_OP_HARDSIGMOID:
                case GGML_UNARY_OP_EXP:
                    return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
                default:
                    return false;
            }
        case GGML_OP_GLU:
            switch (ggml_get_glu_op(op)) {
                case GGML_GLU_OP_REGLU:
                case GGML_GLU_OP_GEGLU:
                case GGML_GLU_OP_SWIGLU:
                case GGML_GLU_OP_SWIGLU_OAI:
                case GGML_GLU_OP_GEGLU_ERF:
                case GGML_GLU_OP_GEGLU_QUICK:
                    return ggml_is_contiguous_1(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
                default:
                    return false;
            }
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_TRANSPOSE:
        case GGML_OP_PERMUTE:
        case GGML_OP_CONCAT:
            return true;
        case GGML_OP_ADD:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_ADD_ID:
            return op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_ACC:
        case GGML_OP_REPEAT:
        case GGML_OP_SCALE:
        case GGML_OP_CONV_TRANSPOSE_1D:
            return true;
        case GGML_OP_CONV_TRANSPOSE_2D:
            return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]) &&
                (op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32) &&
                op->src[1]->type == GGML_TYPE_F32 &&
                op->type == GGML_TYPE_F32;
        case GGML_OP_CLAMP:
            return op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_SQR:
        case GGML_OP_SQRT:
        case GGML_OP_SIN:
        case GGML_OP_COS:
        case GGML_OP_LOG:
            return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_SUM:
            return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]);
        case GGML_OP_SUM_ROWS:
        case GGML_OP_CUMSUM:
        case GGML_OP_MEAN:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_GROUP_NORM:
            return has_simdgroup_reduction && ggml_is_contiguous_rows(op->src[0]);
        case GGML_OP_L2_NORM:
            return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
        case GGML_OP_ARGMAX:
            return has_simdgroup_reduction;
        case GGML_OP_NORM:
        case GGML_OP_RMS_NORM:
            return has_simdgroup_reduction && (ggml_is_contiguous_rows(op->src[0]));
        case GGML_OP_ROPE:
            return true;
        case GGML_OP_IM2COL:
            return ggml_is_contiguous(op->src[1]) && op->src[1]->type == GGML_TYPE_F32 && (op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32);
        case GGML_OP_CONV_2D:
            return ggml_is_contiguous(op->src[0]) &&
                op->src[1]->type == GGML_TYPE_F32 &&
                op->type == GGML_TYPE_F32 &&
                (op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32);
        case GGML_OP_POOL_1D:
            return false;
        case GGML_OP_UPSCALE:
            return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
        case GGML_OP_POOL_2D:
            return op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_PAD:
            return (ggml_get_op_params_i32(op, 0) == 0) && (ggml_get_op_params_i32(op, 2) == 0) &&
                   (ggml_get_op_params_i32(op, 4) == 0) && (ggml_get_op_params_i32(op, 6) == 0);
        case GGML_OP_PAD_REFLECT_1D:
        case GGML_OP_TIMESTEP_EMBEDDING:
        case GGML_OP_LEAKY_RELU:
            return op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_ARGSORT:
        case GGML_OP_ARANGE:
            return true;
        case GGML_OP_FLASH_ATTN_EXT:
            // for new head sizes, add cases here
            switch (op->src[0]->ne[0]) {
                case 32:
                case 40:
                case 64:
                case 72:
                case 80:
                case 96:
                case 112:
                case 128:
                case 192:
                case 256:
                    break;
                default:
                    // this also covers the DeepSeek head size (576)
                    // TODO: disabled for now, until optimized
                    return false;
            }
            if (op->src[1]->type != op->src[2]->type) {
                return false;
            }
            return has_simdgroup_mm; // TODO: over-restricted for vec-kernels
        case GGML_OP_SSM_CONV:
        case GGML_OP_SSM_SCAN:
            return has_simdgroup_reduction;
        case GGML_OP_RWKV_WKV6:
        case GGML_OP_RWKV_WKV7:
            return true;
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
            return has_simdgroup_reduction;
        case GGML_OP_CPY:
        case GGML_OP_DUP:
        case GGML_OP_CONT:
            {
                // supported (source type -> destination type) conversions
                switch (op->src[0]->type) {
                    case GGML_TYPE_F32:
                        switch (op->type) {
                            case GGML_TYPE_F32:
                            case GGML_TYPE_F16:
                            case GGML_TYPE_BF16:
                            case GGML_TYPE_Q8_0:
                            case GGML_TYPE_Q4_0:
                            case GGML_TYPE_Q4_1:
                            case GGML_TYPE_Q5_0:
                            case GGML_TYPE_Q5_1:
                            case GGML_TYPE_IQ4_NL:
                            case GGML_TYPE_I32:
                                return true;
                            default:
                                return false;
                        }
                    case GGML_TYPE_F16:
                        switch (op->type) {
                            case GGML_TYPE_F32:
                            case GGML_TYPE_F16:
                                return true;
                            default:
                                return false;
                        }
                    case GGML_TYPE_BF16:
                        switch (op->type) {
                            case GGML_TYPE_F32:
                            case GGML_TYPE_BF16:
                                return true;
                            default:
                                return false;
                        }
                    case GGML_TYPE_Q4_0:
                    case GGML_TYPE_Q4_1:
                    case GGML_TYPE_Q5_0:
                    case GGML_TYPE_Q5_1:
                    case GGML_TYPE_Q8_0:
                        switch (op->type) {
                            case GGML_TYPE_F32:
                            case GGML_TYPE_F16:
                                return true;
                            default:
                                return false;
                        }
                    case GGML_TYPE_I32:
                        return op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_I32;
                    default:
                        return false;
                };
            }
        case GGML_OP_GET_ROWS:
            return true;
        case GGML_OP_SET_ROWS:
            {
                if (op->src[0]->type != GGML_TYPE_F32) {
                    return false;
                }

                switch (op->type) {
                    case GGML_TYPE_F32:
                    case GGML_TYPE_F16:
                    case GGML_TYPE_BF16:
                    case GGML_TYPE_Q8_0:
                    case GGML_TYPE_Q4_0:
                    case GGML_TYPE_Q4_1:
                    case GGML_TYPE_Q5_0:
                    case GGML_TYPE_Q5_1:
                    case GGML_TYPE_IQ4_NL:
                        return true;
                    default:
                        return false;
                };
            }
        case GGML_OP_OPT_STEP_ADAMW:
        case GGML_OP_OPT_STEP_SGD:
            return has_simdgroup_reduction;
        default:
            return false;
    }
}
1027
+
1028
// Return a read-only pointer to the properties stored inside the device context.
// The pointer refers to memory owned by `dev`.
const struct ggml_metal_device_props * ggml_metal_device_get_props(ggml_metal_device_t dev) {
    const struct ggml_metal_device_props * props = &dev->props;

    return props;
}
1031
+
1032
+ //
1033
+ // device buffers
1034
+ //
1035
+
1036
+ // max memory buffers that can be mapped to the device (capacity of the buffers[] array in struct ggml_metal_buffer)
1037
+ #define GGML_METAL_MAX_BUFFERS 64
1038
+
1039
+ struct ggml_metal_buffer_wrapper { // one view of a ggml_metal_buffer: an address range and the MTLBuffer created for it
1040
+ void * data; // start address of the view
1041
+ size_t size; // size recorded for the view, in bytes
1042
+
1043
+ id<MTLBuffer> metal; // Metal buffer for the view; nil when the view has zero length
1044
+ };
1045
+
1046
+ struct ggml_metal_buffer { // storage behind ggml_metal_buffer_t; created by ggml_metal_buffer_init or ggml_metal_buffer_map
1047
+ void * all_data; // base address of the whole buffer
1048
+ size_t all_size; // total size of the buffer in bytes
1049
+
1050
+ // if false, the Metal buffer data is allocated in private GPU memory and is not shared with the host
1051
+ bool is_shared;
1052
+ bool owned; // true when created by ggml_metal_buffer_init; false when mapping caller-provided memory (ggml_metal_buffer_map)
1053
+
1054
+ // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
1055
+ int n_buffers; // number of valid entries in buffers[]
1056
+ struct ggml_metal_buffer_wrapper buffers[GGML_METAL_MAX_BUFFERS];
1057
+
1058
+ bool use_residency_sets; // copied from the device props when the buffer is created
1059
+
1060
+ // optional MTLResidencySet
1061
+ // note: cannot use an explicit "id<MTLResidencySet>" here because it is not available on certain OSes
1062
+ id rset;
1063
+
1064
+ // pointers to global device objects
1065
+ id<MTLDevice> device;
1066
+ id<MTLCommandQueue> queue;
1067
+ };
1068
+
1069
// Debug helper: log the size of a freshly allocated buffer together with the device's current
// allocation total. Compiles to a no-op when GGML_METAL_NDEBUG is defined or on unsupported targets.
static void ggml_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
#ifndef GGML_METAL_NDEBUG
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
    const double size_mib = size_aligned / 1024.0 / 1024.0;

    if (@available(macOS 10.12, iOS 16.0, *)) {
        GGML_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n",
                __func__,
                size_mib,
                device.currentAllocatedSize / 1024.0 / 1024.0,
                device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);

        // going over the recommended working set is allowed, but worth flagging
        if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
            GGML_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
        }
    } else {
        GGML_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
                __func__,
                size_mib,
                device.currentAllocatedSize / 1024.0 / 1024.0);
    }
#endif
#endif
    GGML_UNUSED(device);
    GGML_UNUSED(size_aligned);
}
1093
+
1094
+ // rset init
1095
// Create the optional residency set for `buf` and add all of its Metal buffers to it.
//
// Returns true when residency sets are disabled, unsupported at build time or on this OS version, or
// when the set was created and residency requested. Returns false only when creating the set failed.
static bool ggml_metal_buffer_rset_init(ggml_metal_buffer_t buf) {
    buf->rset = nil;

    if (!buf->use_residency_sets) {
        return true;
    }

#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
    if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) {
        MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
        desc.label = @"ggml_metal";
        desc.initialCapacity = buf->n_buffers;

        // this file uses manual reference counting, so locals are not zero-initialized
        NSError * error = nil;
        buf->rset = [buf->device newResidencySetWithDescriptor:desc error:&error];

        [desc release];

        // per Cocoa convention the returned object signals failure; `error` only carries the reason
        if (buf->rset == nil) {
            GGML_LOG_ERROR("%s: error: %s\n", __func__, error ? [[error description] UTF8String] : "unknown error");
            return false;
        }

        for (int i = 0; i < buf->n_buffers; i++) {
            [buf->rset addAllocation:buf->buffers[i].metal];
        }

        [buf->rset commit];
        [buf->rset requestResidency];

        return true;
    }
#endif

    return true;
}
1131
+
1132
+ // rset free
1133
// End residency for the buffer's residency set (if one was created), drop its allocations and
// release it. No-op when residency sets are not compiled in or not available on this OS.
static void ggml_metal_buffer_rset_free(ggml_metal_buffer_t buf) {
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
    if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) {
        if (buf->rset == nil) {
            return;
        }

        [buf->rset endResidency];
        [buf->rset removeAllAllocations];
        [buf->rset release];
    }
#else
    GGML_UNUSED(buf);
#endif
}
1146
+
1147
// Allocate `n` bytes of host memory for a shared buffer.
// Uses vm_allocate on macOS and a page-aligned posix_memalign elsewhere; returns NULL on failure.
static void * ggml_metal_host_malloc(size_t n) {
    void * ptr = NULL;

#if TARGET_OS_OSX
    if (vm_allocate((vm_map_t) mach_task_self(), (void *) &ptr, n, VM_FLAGS_ANYWHERE) != KERN_SUCCESS) {
        GGML_LOG_ERROR("%s: error: vm_allocate failed\n", __func__);
        return NULL;
    }
#else
    if (posix_memalign((void **) &ptr, sysconf(_SC_PAGESIZE), n) != 0) {
        GGML_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
        return NULL;
    }
#endif

    return ptr;
}
1166
+
1167
// Allocate a new buffer of `size` bytes (rounded up to a whole number of pages) on `dev`.
//
// When `shared` is requested and the device uses shared buffers, host memory is allocated and wrapped
// in a shared MTLBuffer without copying. Otherwise a private MTLBuffer is created and `all_data` is a
// virtual address taken from the g_addr_device counter.
//
// Returns NULL on failure; everything allocated up to that point is released.
ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size, bool shared) {
    ggml_metal_buffer_t res = calloc(1, sizeof(struct ggml_metal_buffer));
    if (res == NULL) {
        GGML_LOG_ERROR("%s: error: failed to allocate buffer context\n", __func__);
        return NULL;
    }

    const size_t size_page = sysconf(_SC_PAGESIZE);

    size_t size_aligned = size;
    if ((size_aligned % size_page) != 0) {
        size_aligned += (size_page - (size_aligned % size_page));
    }

    const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);

    shared = shared && props_dev->use_shared_buffers;

    // allocate shared buffer if the device supports it and it is required by the buffer type
    if (shared) {
        res->all_data = ggml_metal_host_malloc(size_aligned);
        res->is_shared = true;
    } else {
        // use virtual address from g_addr_device counter
        res->all_data = (void *) atomic_fetch_add_explicit(&g_addr_device, size_aligned, memory_order_relaxed);
        res->is_shared = false;
    }
    res->all_size = size_aligned;

    res->owned = true;

    res->device = ggml_metal_device_get_obj(dev);
    res->queue  = ggml_metal_device_get_queue(dev);

    res->n_buffers = 1;

    if (res->all_data != NULL) {
        res->buffers[0].size  = size;
        res->buffers[0].metal = nil;

        if (size_aligned > 0) {
            // `shared` already includes the props_dev->use_shared_buffers condition
            if (shared) {
                res->buffers[0].metal = [res->device newBufferWithBytesNoCopy:res->all_data
                                                                       length:size_aligned
                                                                      options:MTLResourceStorageModeShared
                                                                  deallocator:nil];
            } else {
                res->buffers[0].metal = [res->device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
            }
        }

        res->buffers[0].data = res->all_data;
    }

    if (size_aligned > 0 && (res->all_data == NULL || res->buffers[0].metal == nil)) {
        GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
        if (res->all_data == NULL) {
            // nothing besides the context itself was allocated
            free(res);
        } else {
            // also gives back the host allocation of a shared buffer
            ggml_metal_buffer_free(res);
        }
        return NULL;
    }

    res->use_residency_sets = props_dev->use_residency_sets;

    if (!ggml_metal_buffer_rset_init(res)) {
        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
        // release the MTLBuffer and any host memory instead of leaking them
        ggml_metal_buffer_free(res);
        return NULL;
    }

    //ggml_metal_log_allocated_size(device, size_aligned);

    return res;
}
1235
+
1236
// Wrap caller-owned host memory [ptr, ptr + size) in one or more shared MTLBuffers without copying.
//
// The range is first extended down to a page boundary. If it fits in the device's max buffer size a
// single MTLBuffer is created; otherwise it is covered by overlapping views so that a tensor of up to
// `max_tensor_size` bytes always fits entirely inside one view.
//
// The returned buffer does not own `ptr` (owned == false). Returns NULL on failure; any views created
// up to that point are released.
ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, size_t size, size_t max_tensor_size) {
    ggml_metal_buffer_t res = calloc(1, sizeof(struct ggml_metal_buffer));
    if (res == NULL) {
        GGML_LOG_ERROR("%s: error: failed to allocate buffer context\n", __func__);
        return NULL;
    }

    res->all_data = ptr;
    res->all_size = size;

    res->is_shared = true;
    res->owned = false;

    res->n_buffers = 0;

    const size_t size_page = sysconf(_SC_PAGESIZE);

    // page-align the data ptr
    {
        const uintptr_t offs = (uintptr_t) ptr % size_page;
        ptr = (void *) ((char *) ptr - offs);
        size += offs;
    }

    size_t size_aligned = size;
    if ((size_aligned % size_page) != 0) {
        size_aligned += (size_page - (size_aligned % size_page));
    }

    res->device = ggml_metal_device_get_obj(dev);
    res->queue  = ggml_metal_device_get_queue(dev);

    const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);

    // the buffer fits into the max buffer size allowed by the device
    if (size_aligned <= props_dev->max_buffer_size) {
        res->buffers[res->n_buffers].data  = ptr;
        res->buffers[res->n_buffers].size  = size;
        res->buffers[res->n_buffers].metal = nil;

        if (size_aligned > 0) {
            res->buffers[res->n_buffers].metal = [res->device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];

            if (res->buffers[res->n_buffers].metal == nil) {
                GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
                free(res);
                return NULL;
            }
        }

        ggml_metal_log_allocated_size(res->device, size_aligned);

        ++res->n_buffers;
    } else {
        // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
        // one of the views
        const size_t size_ovlp = ((max_tensor_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
        const size_t size_view = props_dev->max_buffer_size;

        // the step must be positive, otherwise the loop below would never advance (or the subtraction would wrap)
        if (size_ovlp >= size_view) {
            GGML_LOG_ERROR("%s: error: max tensor size (%zu) does not fit into the max buffer size (%zu)\n", __func__, max_tensor_size, size_view);
            free(res);
            return NULL;
        }

        const size_t size_step = size_view - size_ovlp;

        for (size_t i = 0; i < size; i += size_step) {
            // never write past the fixed-size view table
            if (res->n_buffers >= GGML_METAL_MAX_BUFFERS) {
                GGML_LOG_ERROR("%s: error: too many buffers required (max = %d)\n", __func__, GGML_METAL_MAX_BUFFERS);
                // owned == false, so this releases the views created so far but leaves the caller's memory alone
                ggml_metal_buffer_free(res);
                return NULL;
            }

            const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);

            res->buffers[res->n_buffers].data  = (void *) ((uint8_t *) ptr + i);
            res->buffers[res->n_buffers].size  = size_step_aligned;
            res->buffers[res->n_buffers].metal = nil;

            if (size_step_aligned > 0) {
                res->buffers[res->n_buffers].metal = [res->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];

                if (res->buffers[res->n_buffers].metal == nil) {
                    GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
                    ggml_metal_buffer_free(res);
                    return NULL;
                }
            }

            ggml_metal_log_allocated_size(res->device, size_step_aligned);

            if (i + size_step < size) {
                GGML_LOG_INFO("\n");
            }

            ++res->n_buffers;
        }
    }

    res->use_residency_sets = props_dev->use_residency_sets;

    if (!ggml_metal_buffer_rset_init(res)) {
        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
        ggml_metal_buffer_free(res);
        return NULL;
    }

    return res;
}
1329
+
1330
// Destroy a buffer: release every Metal view, tear down the residency set, give back the host
// allocation if this buffer owns shared host memory, and finally free the context.
void ggml_metal_buffer_free(ggml_metal_buffer_t buf) {
    const int n_views = buf->n_buffers;
    for (int idx = 0; idx < n_views; ++idx) {
        [buf->buffers[idx].metal release];
    }

    ggml_metal_buffer_rset_free(buf);

    // only memory obtained via ggml_metal_host_malloc() is ours to give back
    const bool owns_host_memory = buf->is_shared && buf->owned;
    if (owns_host_memory) {
#if TARGET_OS_OSX
        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)buf->all_data, buf->all_size);
#else
        free(buf->all_data);
#endif
    }

    free(buf);
}
1347
+
1348
// Return the base address recorded for the buffer (`all_data`).
void * ggml_metal_buffer_get_base(ggml_metal_buffer_t buf) {
    void * base = buf->all_data;

    return base;
}
1351
+
1352
// Report whether the buffer's memory is shared with the host (as opposed to private GPU memory).
bool ggml_metal_buffer_is_shared(ggml_metal_buffer_t buf) {
    const bool shared = buf->is_shared;

    return shared;
}
1355
+
1356
// Fill `size` bytes of `tensor`, starting `offset` bytes into its data, with `value`.
//
// Shared buffers are written directly with memset. For private buffers a blit fill is encoded on the
// buffer's queue and the call blocks until the command buffer has completed.
void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
    if (buf->is_shared) {
        memset((char *) tensor->data + offset, value, size);
        return;
    }

    @autoreleasepool {
        // dst
        struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf, tensor);
        bid_dst.offs += offset;

        id<MTLCommandQueue>  queue   = buf->queue;
        id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];

        {
            id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

            // NSMakeRange takes (location, length): the length is `size`, not the end offset
            [encoder fillBuffer:bid_dst.metal
                          range:NSMakeRange(bid_dst.offs, size)
                          value:value];

            [encoder endEncoding];
        }

        [cmd_buf commit];
        [cmd_buf waitUntilCompleted];
    }
}
1384
+
1385
// Copies `size` bytes from `data` into `tensor`'s data, starting `offset` bytes into the tensor.
//
// When the buffer is shared the bytes are copied directly with memcpy. Otherwise `data` is
// wrapped (without copying) in a temporary Metal buffer, a blit copy is encoded on buf->queue,
// and the call blocks until the command buffer's completion handler fires.
void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    if (buf->is_shared) {
        memcpy((char *) tensor->data + offset, data, size);
        return;
    }

    @autoreleasepool {
        // src
        void * data_ptr = (void *)(uintptr_t) data; // "const cast" the src data
        id<MTLBuffer> buf_src = [buf->device newBufferWithBytesNoCopy:data_ptr
                                                               length:size
                                                              options:MTLResourceStorageModeShared
                                                          deallocator:nil];

        GGML_ASSERT(buf_src);

        // dst
        struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf, tensor);
        bid_dst.offs += offset;

        // note: for experimentation purposes, here we use a semaphore to wait for the copy to complete
        //       this is an alternative to waitUntilCompleted, which should be faster, but doesn't seem to make much difference
        dispatch_semaphore_t completion_semaphore = dispatch_semaphore_create(0);

        id<MTLCommandQueue>  queue   = buf->queue;
        id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];

        {
            id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

            [encoder copyFromBuffer:buf_src
                       sourceOffset:0
                           toBuffer:bid_dst.metal
                  destinationOffset:bid_dst.offs
                               size:size];

            [encoder endEncoding];
        }

        [cmd_buf addCompletedHandler:^(id<MTLCommandBuffer> cb) {
            // TODO: can check for errors here
            GGML_UNUSED(cb);

            dispatch_semaphore_signal(completion_semaphore);
        }];

        [cmd_buf commit];

        dispatch_semaphore_wait(completion_semaphore, DISPATCH_TIME_FOREVER);
        dispatch_release(completion_semaphore);

        //[cmd_buf waitUntilCompleted];

        // buf_src was obtained from a `new...` method, so this function owns it. The command
        // buffer uses unretained references and has completed at this point, so the wrapper can
        // be released. The deallocator is nil, so the caller's memory behind `data` is untouched.
        [buf_src release];
    }
}
1439
+
1440
// Copies `size` bytes of `tensor`'s data, starting `offset` bytes into the tensor, into `data`.
//
// When the buffer is shared the bytes are copied directly with memcpy. Otherwise `data` is
// wrapped (without copying) in a temporary Metal buffer, a blit copy is encoded on buf->queue,
// and the call blocks until the command buffer has completed.
void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    if (buf->is_shared) {
        memcpy(data, (const char *) tensor->data + offset, size);
        return;
    }

    @autoreleasepool {
        // src
        struct ggml_metal_buffer_id bid_src = ggml_metal_buffer_get_id(buf, tensor);
        bid_src.offs += offset;

        // dst
        id<MTLBuffer> buf_dst = [buf->device newBufferWithBytesNoCopy:data
                                                               length:size
                                                              options:MTLResourceStorageModeShared
                                                          deallocator:nil];

        GGML_ASSERT(buf_dst);

        id<MTLCommandQueue>  queue   = buf->queue;
        id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];

        {
            id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

            [encoder copyFromBuffer:bid_src.metal
                       sourceOffset:bid_src.offs
                           toBuffer:buf_dst
                  destinationOffset:0
                               size:size];

            [encoder endEncoding];
        }

        [cmd_buf commit];
        [cmd_buf waitUntilCompleted];

        // buf_dst was obtained from a `new...` method, so this function owns it. The command
        // buffer uses unretained references and has completed at this point, so the wrapper can
        // be released. The deallocator is nil, so the caller's memory behind `data` is untouched.
        [buf_dst release];
    }
}
1478
+
1479
// Fills the buffer with `value`.
//
// Shared buffers are cleared on the host: memset over all_data for all_size bytes.
// Non-shared buffers are cleared with a blit fill on buf->queue, and the call blocks until the
// command buffer has completed.
void ggml_metal_buffer_clear(ggml_metal_buffer_t buf, uint8_t value) {
    if (!buf->is_shared) {
        @autoreleasepool {
            id<MTLCommandBuffer> cmd_buf = [buf->queue commandBufferWithUnretainedReferences];

            id<MTLBlitCommandEncoder> blit = [cmd_buf blitCommandEncoder];

            // NOTE(review): only the first view (buffers[0]) is filled here - presumably
            // non-shared buffers always consist of a single view; confirm against the allocator
            const NSRange whole_view = NSMakeRange(0, buf->buffers[0].size);

            [blit fillBuffer:buf->buffers[0].metal
                       range:whole_view
                       value:value];

            [blit endEncoding];

            [cmd_buf commit];
            [cmd_buf waitUntilCompleted];
        }

        return;
    }

    memset(buf->all_data, value, buf->all_size);
}
1503
+
1504
// Finds the Metal buffer view of `buf` that fully contains the data of tensor `t`.
//
// Returns the view's Metal buffer together with the byte offset of t->data inside that view.
// If no view contains the whole tensor, an error is logged and { nil, 0 } is returned.
struct ggml_metal_buffer_id ggml_metal_buffer_get_id(ggml_metal_buffer_t buf, const struct ggml_tensor * t) {
    struct ggml_metal_buffer_id found = { nil, 0 };

    const int64_t nbytes = ggml_nbytes(t);

    for (int iv = 0; iv < buf->n_buffers; ++iv) {
        // signed distance from the start of this view to the tensor data
        const int64_t rel = (int64_t) t->data - (int64_t) buf->buffers[iv].data;

        // skip views that start after the tensor or end before the tensor does
        if (rel < 0 || rel + nbytes > (int64_t) buf->buffers[iv].size) {
            continue;
        }

        found.metal = buf->buffers[iv].metal;
        found.offs  = (size_t) rel;

        return found;
    }

    GGML_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);

    return found;
}