@novastera-oss/llamarn 0.4.1 → 0.4.3-beta4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (976)
  1. package/RNLlamaCpp.podspec +3 -0
  2. package/android/CMakeLists.txt +2 -0
  3. package/android/src/main/cpp/include/llama.h +44 -21
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  12. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  13. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  14. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  15. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  17. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  18. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  19. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  20. package/cpp/build-info.cpp +2 -2
  21. package/cpp/llama.cpp/CMakeLists.txt +12 -0
  22. package/cpp/llama.cpp/CODEOWNERS +116 -10
  23. package/cpp/llama.cpp/CONTRIBUTING.md +30 -3
  24. package/cpp/llama.cpp/README.md +13 -5
  25. package/cpp/llama.cpp/build-xcframework.sh +5 -0
  26. package/cpp/llama.cpp/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
  27. package/cpp/llama.cpp/common/CMakeLists.txt +12 -2
  28. package/cpp/llama.cpp/common/arg.cpp +303 -795
  29. package/cpp/llama.cpp/common/arg.h +2 -3
  30. package/cpp/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
  31. package/cpp/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
  32. package/cpp/llama.cpp/common/chat-parser.cpp +156 -15
  33. package/cpp/llama.cpp/common/chat-parser.h +13 -0
  34. package/cpp/llama.cpp/common/chat.cpp +1147 -88
  35. package/cpp/llama.cpp/common/chat.h +16 -3
  36. package/cpp/llama.cpp/common/common.cpp +70 -15
  37. package/cpp/llama.cpp/common/common.h +57 -19
  38. package/cpp/llama.cpp/common/download.cpp +1072 -0
  39. package/cpp/llama.cpp/common/download.h +55 -0
  40. package/cpp/llama.cpp/common/http.h +73 -0
  41. package/cpp/llama.cpp/common/json-partial.cpp +70 -2
  42. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +61 -22
  43. package/cpp/llama.cpp/common/json-schema-to-grammar.h +2 -0
  44. package/cpp/llama.cpp/common/log.cpp +59 -2
  45. package/cpp/llama.cpp/common/log.h +12 -4
  46. package/cpp/llama.cpp/common/sampling.cpp +84 -8
  47. package/cpp/llama.cpp/common/sampling.h +3 -1
  48. package/cpp/llama.cpp/common/speculative.cpp +1 -1
  49. package/cpp/llama.cpp/convert_hf_to_gguf.py +1608 -233
  50. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +6 -1
  51. package/cpp/llama.cpp/convert_lora_to_gguf.py +37 -5
  52. package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -28
  53. package/cpp/llama.cpp/ggml/include/ggml-backend.h +19 -1
  54. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +1 -1
  55. package/cpp/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
  56. package/cpp/llama.cpp/ggml/include/ggml-metal.h +1 -6
  57. package/cpp/llama.cpp/ggml/include/ggml-rpc.h +7 -9
  58. package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +2 -1
  59. package/cpp/llama.cpp/ggml/include/ggml.h +199 -6
  60. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +38 -0
  61. package/cpp/llama.cpp/ggml/src/ggml-alloc.c +299 -130
  62. package/cpp/llama.cpp/ggml/src/ggml-backend-impl.h +4 -4
  63. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +21 -5
  64. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +99 -2
  65. package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  66. package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +1 -0
  67. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +57 -45
  68. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +138 -47
  69. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +1584 -1773
  70. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +201 -317
  71. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +146 -187
  72. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +771 -713
  73. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +135 -77
  74. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
  75. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
  76. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +16 -17
  77. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +318 -145
  78. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  79. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +155 -60
  80. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +8 -8
  81. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
  82. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +14 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -9
  84. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +108 -64
  85. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +14 -4
  86. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +530 -87
  87. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +37 -45
  88. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +349 -127
  89. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +947 -1218
  90. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -4
  91. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +143 -29
  92. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +82 -76
  93. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  94. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  95. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +7 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +233 -28
  100. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +326 -66
  101. package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +12 -3
  102. package/cpp/llama.cpp/ggml/src/ggml-cuda/argsort.cu +102 -6
  103. package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +110 -76
  104. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +167 -38
  105. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +6 -11
  106. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +12 -0
  107. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
  108. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +245 -151
  109. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cuh +1 -5
  110. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +341 -289
  111. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
  112. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cuh +1233 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +6 -6
  115. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
  116. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +123 -220
  117. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +41 -39
  118. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +715 -45
  119. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +150 -0
  120. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cuh +1 -0
  121. package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +321 -24
  122. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +93 -351
  123. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +828 -1
  124. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cu +164 -0
  125. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cuh +5 -0
  126. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +3 -166
  127. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1 -1
  128. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cu +371 -78
  129. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cuh +3 -2
  130. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +279 -147
  131. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cuh +1 -1
  132. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +97 -85
  133. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad.cu +46 -23
  134. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +63 -54
  135. package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +12 -10
  136. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +192 -77
  137. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cuh +2 -0
  138. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +10 -9
  139. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +137 -75
  140. package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cu +39 -0
  141. package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cuh +7 -0
  142. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  143. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  144. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  145. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  146. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  147. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  148. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  149. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  150. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  151. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  152. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  153. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  154. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  155. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  156. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  157. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  158. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  159. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  160. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  161. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  162. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  163. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  164. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  165. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  166. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  167. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  168. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  169. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  170. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  171. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  172. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  173. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  174. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  175. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  176. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  177. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  178. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  179. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  180. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  181. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  182. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  183. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  184. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  185. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  186. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  187. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
  188. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  189. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  190. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  191. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  192. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  193. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  194. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  195. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  196. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  197. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  198. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  199. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  200. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  201. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  202. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  203. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  204. package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +336 -0
  205. package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +16 -0
  206. package/cpp/llama.cpp/ggml/src/ggml-cuda/tsembd.cu +3 -3
  207. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +105 -11
  208. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +36 -0
  209. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +87 -6
  210. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +28 -12
  211. package/cpp/llama.cpp/ggml/src/ggml-hexagon/CMakeLists.txt +68 -0
  212. package/cpp/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3807 -0
  213. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +40 -0
  214. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +442 -0
  215. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
  216. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  217. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +40 -0
  218. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +69 -0
  219. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +119 -0
  220. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +156 -0
  221. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +64 -0
  222. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  223. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-exp.c +93 -0
  224. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.c +60 -0
  225. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
  226. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.c +960 -0
  227. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +1032 -0
  228. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +829 -0
  229. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2223 -0
  230. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
  231. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +418 -0
  232. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
  233. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +255 -0
  234. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
  235. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  236. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.c +448 -0
  237. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +220 -0
  238. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +8 -13
  239. package/cpp/llama.cpp/ggml/src/ggml-impl.h +110 -12
  240. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +6 -5
  241. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  242. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  243. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  244. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.m +599 -0
  245. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1662 -0
  246. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +251 -0
  247. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +1527 -0
  248. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +244 -39
  249. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +3844 -0
  250. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +90 -0
  251. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.cpp +723 -0
  252. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +3453 -1907
  253. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +3 -1
  254. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +10 -0
  255. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1331 -109
  256. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +126 -0
  257. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +31 -4
  258. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +35 -7
  259. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +31 -4
  260. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  261. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  262. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
  263. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  264. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
  265. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
  266. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  267. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  268. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  269. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  270. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  271. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  272. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  273. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
  274. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
  275. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
  276. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +123 -10
  277. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
  278. package/cpp/llama.cpp/ggml/src/ggml-quants.c +1 -0
  279. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +341 -161
  280. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  281. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +6 -5
  282. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +74 -15
  283. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +50 -30
  284. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +10 -4
  285. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +166 -99
  286. package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  287. package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  288. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +72 -94
  289. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +67 -49
  290. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +21 -31
  291. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +252 -316
  292. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +6 -2
  293. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +9 -6
  294. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +359 -142
  295. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  296. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  297. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +80 -60
  298. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +201 -132
  299. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +230 -55
  300. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.hpp +2 -0
  301. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.cpp +97 -0
  302. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.hpp +24 -0
  303. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.cpp +72 -0
  304. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.hpp +8 -0
  305. package/cpp/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  306. package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  307. package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  308. package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.cpp +122 -0
  309. package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.hpp +20 -0
  310. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +50 -41
  311. package/cpp/llama.cpp/ggml/src/ggml-sycl/set.cpp +73 -0
  312. package/cpp/llama.cpp/ggml/src/ggml-sycl/set.hpp +5 -0
  313. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +45 -36
  314. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +330 -165
  315. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +4 -0
  316. package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
  317. package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  318. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +12 -6
  319. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +16 -12
  320. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
  321. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4184 -2159
  322. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  323. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
  324. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
  325. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  326. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
  327. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  328. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
  329. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
  330. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  331. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  332. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
  333. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
  334. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
  335. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
  336. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +53 -30
  337. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
  338. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
  339. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
  340. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +13 -6
  341. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  342. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
  343. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
  344. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
  345. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +138 -2
  346. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
  347. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
  348. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  349. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
  350. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
  351. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
  352. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
  353. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
  354. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
  355. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
  356. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
  357. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
  358. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
  359. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  360. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
  361. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
  362. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
  363. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
  364. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
  365. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
  366. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  367. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
  368. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
  369. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
  370. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -2
  371. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  372. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +52 -14
  373. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +50 -12
  374. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +61 -12
  375. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +54 -12
  376. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +5 -1
  377. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  378. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
  379. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
  380. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
  381. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
  382. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
  383. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
  384. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +10 -2
  385. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
  386. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
  387. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
  388. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
  389. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  390. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  391. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +15 -7
  392. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
  393. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
  394. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
  395. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
  396. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
  397. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +1 -1
  398. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +229 -0
  399. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +33 -0
  400. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +1 -1
  401. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +1 -1
  402. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +1 -1
  403. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +1 -1
  404. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +1 -1
  405. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +1 -1
  406. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +1 -1
  407. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
  408. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
  409. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +3 -5
  410. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +1 -1
  411. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +3 -5
  412. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +3 -5
  413. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +1 -1
  414. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  415. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +106 -634
  416. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +118 -9
  417. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +556 -0
  418. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +70 -0
  419. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +77 -214
  420. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +589 -0
  421. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
  422. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
  423. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  424. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
  425. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
  426. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
  427. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +25 -4
  428. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
  429. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +55 -5
  430. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
  431. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
  432. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
  433. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
  434. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +45 -3
  435. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
  436. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
  437. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
  438. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +227 -0
  439. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
  440. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +5 -52
  441. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +5 -35
  442. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +5 -35
  443. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +27 -0
  444. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +5 -41
  445. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  446. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
  447. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
  448. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
  449. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
  450. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
  451. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
  452. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
  453. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  454. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
  455. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
  456. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
  457. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +140 -0
  458. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  459. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
  460. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +1 -1
  461. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
  462. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
  463. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
  464. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
  465. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +171 -0
  466. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  467. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +79 -29
  468. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -12
  469. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +471 -196
  470. package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +8 -0
  471. package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1690 -383
  472. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
  473. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  474. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  475. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
  476. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +57 -10
  477. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  478. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  479. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +25 -912
  480. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
  481. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
  482. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
  483. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
  484. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  485. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  486. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
  487. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/{set_rows.wgsl → set_rows.tmpl.wgsl} +38 -8
  488. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  489. package/cpp/llama.cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
  490. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +96 -314
  491. package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  492. package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  493. package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
  494. package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
  495. package/cpp/llama.cpp/ggml/src/ggml.c +440 -17
  496. package/cpp/llama.cpp/ggml/src/gguf.cpp +104 -29
  497. package/cpp/llama.cpp/gguf-py/gguf/constants.py +363 -13
  498. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +64 -0
  499. package/cpp/llama.cpp/gguf-py/gguf/lazy.py +8 -3
  500. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +6 -0
  501. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +156 -18
  502. package/cpp/llama.cpp/gguf-py/gguf/utility.py +80 -0
  503. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +4 -4
  504. package/cpp/llama.cpp/include/llama.h +44 -21
  505. package/cpp/llama.cpp/media/llama1-icon-transparent.png +0 -0
  506. package/cpp/llama.cpp/media/llama1-icon-transparent.svg +77 -0
  507. package/cpp/llama.cpp/media/llama1-icon.png +0 -0
  508. package/cpp/llama.cpp/media/llama1-icon.svg +87 -0
  509. package/cpp/llama.cpp/requirements/requirements-all.txt +2 -0
  510. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -3
  511. package/cpp/llama.cpp/requirements/requirements-convert_legacy_llama.txt +3 -1
  512. package/cpp/llama.cpp/requirements/requirements-tool_bench.txt +1 -1
  513. package/cpp/llama.cpp/src/CMakeLists.txt +101 -0
  514. package/cpp/llama.cpp/src/llama-adapter.cpp +33 -0
  515. package/cpp/llama.cpp/src/llama-adapter.h +3 -0
  516. package/cpp/llama.cpp/src/llama-arch.cpp +344 -14
  517. package/cpp/llama.cpp/src/llama-arch.h +50 -0
  518. package/cpp/llama.cpp/src/llama-batch.cpp +63 -31
  519. package/cpp/llama.cpp/src/llama-batch.h +13 -2
  520. package/cpp/llama.cpp/src/llama-chat.cpp +85 -3
  521. package/cpp/llama.cpp/src/llama-chat.h +4 -0
  522. package/cpp/llama.cpp/src/llama-context.cpp +300 -45
  523. package/cpp/llama.cpp/src/llama-context.h +16 -6
  524. package/cpp/llama.cpp/src/llama-cparams.h +2 -1
  525. package/cpp/llama.cpp/src/llama-grammar.cpp +17 -9
  526. package/cpp/llama.cpp/src/llama-graph.cpp +226 -64
  527. package/cpp/llama.cpp/src/llama-graph.h +27 -5
  528. package/cpp/llama.cpp/src/llama-hparams.cpp +53 -2
  529. package/cpp/llama.cpp/src/llama-hparams.h +48 -8
  530. package/cpp/llama.cpp/src/llama-impl.cpp +3 -3
  531. package/cpp/llama.cpp/src/llama-impl.h +2 -0
  532. package/cpp/llama.cpp/src/llama-kv-cache-iswa.cpp +13 -3
  533. package/cpp/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
  534. package/cpp/llama.cpp/src/llama-kv-cache.cpp +120 -62
  535. package/cpp/llama.cpp/src/llama-kv-cache.h +13 -4
  536. package/cpp/llama.cpp/src/llama-kv-cells.h +44 -2
  537. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +19 -9
  538. package/cpp/llama.cpp/src/llama-memory-hybrid.h +2 -0
  539. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +38 -17
  540. package/cpp/llama.cpp/src/llama-memory-recurrent.h +5 -2
  541. package/cpp/llama.cpp/src/llama-memory.h +3 -0
  542. package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
  543. package/cpp/llama.cpp/src/llama-model.cpp +1070 -12614
  544. package/cpp/llama.cpp/src/llama-model.h +40 -4
  545. package/cpp/llama.cpp/src/llama-quant.cpp +14 -6
  546. package/cpp/llama.cpp/src/llama-sampling.cpp +243 -136
  547. package/cpp/llama.cpp/src/llama-vocab.cpp +43 -3
  548. package/cpp/llama.cpp/src/llama-vocab.h +43 -39
  549. package/cpp/llama.cpp/src/llama.cpp +69 -10
  550. package/cpp/llama.cpp/src/models/afmoe.cpp +187 -0
  551. package/cpp/llama.cpp/src/models/apertus.cpp +125 -0
  552. package/cpp/llama.cpp/src/models/arcee.cpp +135 -0
  553. package/cpp/llama.cpp/src/models/arctic.cpp +138 -0
  554. package/cpp/llama.cpp/src/models/arwkv7.cpp +86 -0
  555. package/cpp/llama.cpp/src/models/baichuan.cpp +122 -0
  556. package/cpp/llama.cpp/src/models/bailingmoe.cpp +144 -0
  557. package/cpp/llama.cpp/src/models/bailingmoe2.cpp +135 -0
  558. package/cpp/llama.cpp/src/models/bert.cpp +176 -0
  559. package/cpp/llama.cpp/src/models/bitnet.cpp +160 -0
  560. package/cpp/llama.cpp/src/models/bloom.cpp +101 -0
  561. package/cpp/llama.cpp/src/models/chameleon.cpp +178 -0
  562. package/cpp/llama.cpp/src/models/chatglm.cpp +132 -0
  563. package/cpp/llama.cpp/src/models/codeshell.cpp +111 -0
  564. package/cpp/llama.cpp/src/models/cogvlm.cpp +100 -0
  565. package/cpp/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
  566. package/cpp/llama.cpp/src/models/command-r.cpp +122 -0
  567. package/cpp/llama.cpp/src/models/dbrx.cpp +123 -0
  568. package/cpp/llama.cpp/src/models/deci.cpp +135 -0
  569. package/cpp/llama.cpp/src/models/deepseek.cpp +144 -0
  570. package/cpp/llama.cpp/src/models/deepseek2.cpp +237 -0
  571. package/cpp/llama.cpp/src/models/dots1.cpp +134 -0
  572. package/cpp/llama.cpp/src/models/dream.cpp +105 -0
  573. package/cpp/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
  574. package/cpp/llama.cpp/src/models/ernie4-5.cpp +110 -0
  575. package/cpp/llama.cpp/src/models/exaone.cpp +114 -0
  576. package/cpp/llama.cpp/src/models/exaone4.cpp +123 -0
  577. package/cpp/llama.cpp/src/models/falcon-h1.cpp +113 -0
  578. package/cpp/llama.cpp/src/models/falcon.cpp +120 -0
  579. package/cpp/llama.cpp/src/models/gemma-embedding.cpp +120 -0
  580. package/cpp/llama.cpp/src/models/gemma.cpp +112 -0
  581. package/cpp/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
  582. package/cpp/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
  583. package/cpp/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
  584. package/cpp/llama.cpp/src/models/glm4-moe.cpp +153 -0
  585. package/cpp/llama.cpp/src/models/glm4.cpp +127 -0
  586. package/cpp/llama.cpp/src/models/gpt2.cpp +105 -0
  587. package/cpp/llama.cpp/src/models/gptneox.cpp +144 -0
  588. package/cpp/llama.cpp/src/models/granite-hybrid.cpp +196 -0
  589. package/cpp/llama.cpp/src/models/granite.cpp +211 -0
  590. package/cpp/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
  591. package/cpp/llama.cpp/src/models/grok.cpp +159 -0
  592. package/cpp/llama.cpp/src/models/grovemoe.cpp +141 -0
  593. package/cpp/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
  594. package/cpp/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
  595. package/cpp/llama.cpp/src/models/internlm2.cpp +120 -0
  596. package/cpp/llama.cpp/src/models/jais.cpp +86 -0
  597. package/cpp/llama.cpp/src/models/jamba.cpp +106 -0
  598. package/cpp/llama.cpp/src/models/lfm2.cpp +173 -0
  599. package/cpp/llama.cpp/src/models/llada-moe.cpp +122 -0
  600. package/cpp/llama.cpp/src/models/llada.cpp +99 -0
  601. package/cpp/llama.cpp/src/models/llama-iswa.cpp +174 -0
  602. package/cpp/llama.cpp/src/models/llama.cpp +155 -0
  603. package/cpp/llama.cpp/src/models/mamba.cpp +55 -0
  604. package/cpp/llama.cpp/src/models/minicpm3.cpp +199 -0
  605. package/cpp/llama.cpp/src/models/minimax-m2.cpp +124 -0
  606. package/cpp/llama.cpp/src/models/models.h +485 -0
  607. package/cpp/llama.cpp/src/models/mpt.cpp +126 -0
  608. package/cpp/llama.cpp/src/models/nemotron-h.cpp +121 -0
  609. package/cpp/llama.cpp/src/models/nemotron.cpp +122 -0
  610. package/cpp/llama.cpp/src/models/neo-bert.cpp +104 -0
  611. package/cpp/llama.cpp/src/models/olmo.cpp +121 -0
  612. package/cpp/llama.cpp/src/models/olmo2.cpp +150 -0
  613. package/cpp/llama.cpp/src/models/olmoe.cpp +124 -0
  614. package/cpp/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
  615. package/cpp/llama.cpp/src/models/openelm.cpp +124 -0
  616. package/cpp/llama.cpp/src/models/orion.cpp +123 -0
  617. package/cpp/llama.cpp/src/models/pangu-embedded.cpp +121 -0
  618. package/cpp/llama.cpp/src/models/phi2.cpp +121 -0
  619. package/cpp/llama.cpp/src/models/phi3.cpp +152 -0
  620. package/cpp/llama.cpp/src/models/plamo.cpp +110 -0
  621. package/cpp/llama.cpp/src/models/plamo2.cpp +316 -0
  622. package/cpp/llama.cpp/src/models/plm.cpp +168 -0
  623. package/cpp/llama.cpp/src/models/qwen.cpp +108 -0
  624. package/cpp/llama.cpp/src/models/qwen2.cpp +117 -0
  625. package/cpp/llama.cpp/src/models/qwen2moe.cpp +151 -0
  626. package/cpp/llama.cpp/src/models/qwen2vl.cpp +117 -0
  627. package/cpp/llama.cpp/src/models/qwen3.cpp +117 -0
  628. package/cpp/llama.cpp/src/models/qwen3moe.cpp +124 -0
  629. package/cpp/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
  630. package/cpp/llama.cpp/src/models/qwen3vl.cpp +141 -0
  631. package/cpp/llama.cpp/src/models/refact.cpp +94 -0
  632. package/cpp/llama.cpp/src/models/rwkv6-base.cpp +162 -0
  633. package/cpp/llama.cpp/src/models/rwkv6.cpp +94 -0
  634. package/cpp/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
  635. package/cpp/llama.cpp/src/models/rwkv7-base.cpp +135 -0
  636. package/cpp/llama.cpp/src/models/rwkv7.cpp +90 -0
  637. package/cpp/llama.cpp/src/models/seed-oss.cpp +124 -0
  638. package/cpp/llama.cpp/src/models/smallthinker.cpp +120 -0
  639. package/cpp/llama.cpp/src/models/smollm3.cpp +128 -0
  640. package/cpp/llama.cpp/src/models/stablelm.cpp +146 -0
  641. package/cpp/llama.cpp/src/models/starcoder.cpp +100 -0
  642. package/cpp/llama.cpp/src/models/starcoder2.cpp +121 -0
  643. package/cpp/llama.cpp/src/models/t5-dec.cpp +166 -0
  644. package/cpp/llama.cpp/src/models/t5-enc.cpp +96 -0
  645. package/cpp/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
  646. package/cpp/llama.cpp/src/models/xverse.cpp +108 -0
  647. package/cpp/llama.cpp/src/unicode.cpp +77 -0
  648. package/cpp/llama.cpp/src/unicode.h +43 -0
  649. package/cpp/llama.cpp/vendor/cpp-httplib/CMakeLists.txt +94 -0
  650. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.cpp +9339 -0
  651. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +433 -8222
  652. package/cpp/llama.cpp/vendor/cpp-httplib/patch-boringssl.cmake +6 -0
  653. package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +4179 -1900
  654. package/cpp/llama.cpp/vendor/minja/chat-template.hpp +9 -2
  655. package/cpp/llama.cpp/vendor/minja/minja.hpp +101 -22
  656. package/ios/include/chat.h +16 -3
  657. package/ios/include/common/minja/chat-template.hpp +9 -2
  658. package/ios/include/common/minja/minja.hpp +101 -22
  659. package/ios/include/common.h +57 -19
  660. package/ios/include/json-schema-to-grammar.h +2 -0
  661. package/ios/include/llama.h +44 -21
  662. package/ios/include/log.h +12 -4
  663. package/ios/include/sampling.h +3 -1
  664. package/ios/libs/llama.xcframework/Info.plist +20 -20
  665. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  666. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6399 -5557
  667. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +19 -1
  668. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +1 -1
  669. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-metal.h +1 -6
  670. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +199 -6
  671. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +44 -21
  672. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  673. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  674. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6362 -5520
  675. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4813 -4241
  676. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +19 -1
  677. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +1 -1
  678. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +1 -6
  679. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +199 -6
  680. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +44 -21
  681. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  682. package/package.json +10 -4
  683. package/cpp/llama.cpp/ggml/src/ggml-cann/Doxyfile +0 -2579
  684. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -371
  685. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  686. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -379
  687. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  688. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -495
  689. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -486
  690. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  691. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  692. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  693. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  694. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  695. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  696. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  697. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  698. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  699. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  700. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  701. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  702. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  703. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  704. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  705. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  706. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  707. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  708. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  709. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  710. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  711. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  712. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  713. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  714. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  715. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  716. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  717. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  718. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  719. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  720. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  721. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  722. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  723. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  724. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  725. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  726. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  727. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  728. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  729. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  730. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  731. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  732. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  733. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  734. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  735. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  736. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  737. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  738. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  739. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  740. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  741. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  742. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  743. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  744. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  745. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  746. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  747. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  748. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  749. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  750. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  751. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  752. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  753. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  754. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  755. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  756. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  757. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  758. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  759. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  760. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  761. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  762. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  763. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  764. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  765. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  766. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  767. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  768. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  769. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  770. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  771. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  772. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  773. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  774. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  775. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  776. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +0 -6886
  777. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -154
  778. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
  779. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
  780. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
  781. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +0 -97
  782. package/cpp/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  783. package/cpp/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  784. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  785. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
  786. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
  787. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  788. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
  789. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
  790. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  791. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
  792. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
  793. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  794. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
  795. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
  796. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  797. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
  798. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
  799. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  800. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
  801. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
  802. package/cpp/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  803. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  804. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
  805. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
  806. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  807. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
  808. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
  809. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  810. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
  811. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
  812. package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
  813. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  814. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
  815. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
  816. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  817. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
  818. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
  819. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  820. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
  821. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
  822. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  823. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
  824. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
  825. package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +0 -171
  826. package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r-plus-tool_use.jinja +0 -202
  827. package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja +0 -156
  828. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +0 -124
  829. package/cpp/llama.cpp/models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja +0 -152
  830. package/cpp/llama.cpp/models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja +0 -152
  831. package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +0 -62
  832. package/cpp/llama.cpp/models/templates/Qwen-Qwen2.5-7B-Instruct.jinja +0 -54
  833. package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +0 -85
  834. package/cpp/llama.cpp/models/templates/README.md +0 -25
  835. package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja +0 -1
  836. package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja +0 -1
  837. package/cpp/llama.cpp/models/templates/fireworks-ai-llama-3-firefunction-v2.jinja +0 -57
  838. package/cpp/llama.cpp/models/templates/google-gemma-2-2b-it.jinja +0 -4
  839. package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +0 -59
  840. package/cpp/llama.cpp/models/templates/llama-cpp-deepseek-r1.jinja +0 -76
  841. package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +0 -34
  842. package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.1.jinja +0 -58
  843. package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.2.jinja +0 -287
  844. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja +0 -109
  845. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja +0 -93
  846. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja +0 -109
  847. package/cpp/llama.cpp/models/templates/microsoft-Phi-3.5-mini-instruct.jinja +0 -8
  848. package/cpp/llama.cpp/models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja +0 -87
  849. package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +0 -43
  850. package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +0 -331
  851. package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +0 -105
  852. package/cpp/llama.cpp/prompts/LLM-questions.txt +0 -49
  853. package/cpp/llama.cpp/prompts/alpaca.txt +0 -1
  854. package/cpp/llama.cpp/prompts/assistant.txt +0 -31
  855. package/cpp/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
  856. package/cpp/llama.cpp/prompts/chat-with-bob.txt +0 -7
  857. package/cpp/llama.cpp/prompts/chat-with-qwen.txt +0 -1
  858. package/cpp/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
  859. package/cpp/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
  860. package/cpp/llama.cpp/prompts/chat.txt +0 -28
  861. package/cpp/llama.cpp/prompts/dan-modified.txt +0 -1
  862. package/cpp/llama.cpp/prompts/dan.txt +0 -1
  863. package/cpp/llama.cpp/prompts/mnemonics.txt +0 -93
  864. package/cpp/llama.cpp/prompts/parallel-questions.txt +0 -43
  865. package/cpp/llama.cpp/prompts/reason-act.txt +0 -18
  866. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  867. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  868. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
  869. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4247
  870. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-alloc.h +0 -76
  871. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +0 -354
  872. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-blas.h +0 -25
  873. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +0 -145
  874. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-metal.h +0 -66
  875. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +0 -256
  876. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +0 -2492
  877. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/gguf.h +0 -202
  878. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -1391
  879. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Modules/module.modulemap +0 -17
  880. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Resources/Info.plist +0 -32
  881. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-alloc.h +0 -76
  882. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +0 -354
  883. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-blas.h +0 -25
  884. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +0 -145
  885. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-metal.h +0 -66
  886. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +0 -256
  887. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +0 -2492
  888. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/gguf.h +0 -202
  889. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -1391
  890. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Modules/module.modulemap +0 -17
  891. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Resources/Info.plist +0 -32
  892. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  893. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-alloc.h +0 -76
  894. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +0 -354
  895. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-blas.h +0 -25
  896. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +0 -145
  897. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-metal.h +0 -66
  898. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +0 -256
  899. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +0 -2492
  900. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/gguf.h +0 -202
  901. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -1391
  902. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Modules/module.modulemap +0 -17
  903. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Resources/Info.plist +0 -32
  904. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  905. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  906. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  907. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  908. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5561
  909. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
  910. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +0 -354
  911. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-blas.h +0 -25
  912. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
  913. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-metal.h +0 -66
  914. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +0 -256
  915. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +0 -2492
  916. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/gguf.h +0 -202
  917. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -1391
  918. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Info.plist +0 -35
  919. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Modules/module.modulemap +0 -17
  920. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  921. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  922. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  923. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
  924. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4246
  925. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
  926. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
  927. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
  928. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
  929. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
  930. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
  931. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
  932. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
  933. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
  934. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Info.plist +0 -35
  935. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
  936. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  937. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  938. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  939. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5558
  940. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
  941. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +0 -354
  942. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-blas.h +0 -25
  943. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
  944. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-metal.h +0 -66
  945. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +0 -256
  946. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +0 -2492
  947. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/gguf.h +0 -202
  948. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -1391
  949. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Info.plist +0 -32
  950. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Modules/module.modulemap +0 -17
  951. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  952. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  953. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  954. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5520
  955. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4243
  956. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
  957. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
  958. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
  959. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
  960. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
  961. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
  962. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
  963. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
  964. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
  965. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Info.plist +0 -32
  966. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
  967. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  968. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
  969. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
  970. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
  971. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
  972. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +0 -0
  973. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +0 -0
  974. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
  975. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
  976. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
@@ -0,0 +1,3844 @@
1
+ #include "ggml-metal-ops.h"
2
+
3
+ #include "ggml.h"
4
+ #include "ggml-impl.h"
5
+ #include "ggml-backend-impl.h"
6
+
7
+ #include "ggml-metal-impl.h"
8
+ #include "ggml-metal-common.h"
9
+ #include "ggml-metal-device.h"
10
+
11
+ #include <cassert>
12
+ #include <algorithm>
13
+ #include <limits>
14
+ #include <cmath>
15
+
16
+ static ggml_metal_buffer_id ggml_metal_get_buffer_id(const ggml_tensor * t) {
17
+ if (!t) {
18
+ return { nullptr, 0 };
19
+ }
20
+
21
+ ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
22
+
23
+ ggml_metal_buffer_t ctx = (ggml_metal_buffer_t) buffer->context;
24
+
25
+ return ggml_metal_buffer_get_id(ctx, t);
26
+ }
27
+
28
+ struct ggml_metal_op {
29
+ ggml_metal_op(
30
+ ggml_metal_device_t dev,
31
+ ggml_metal_cmd_buf_t cmd_buf,
32
+ ggml_cgraph * gf,
33
+ int idx_start,
34
+ int idx_end,
35
+ bool use_fusion,
36
+ bool use_concurrency,
37
+ bool use_capture,
38
+ int debug_graph,
39
+ int debug_fusion) {
40
+ this->dev = dev;
41
+ this->lib = ggml_metal_device_get_library(dev);
42
+ this->enc = ggml_metal_encoder_init(cmd_buf, use_concurrency);
43
+ this->mem_ranges = ggml_mem_ranges_init(debug_graph);
44
+ this->idx_start = idx_start;
45
+ this->idx_end = idx_end;
46
+ this->use_fusion = use_fusion;
47
+ this->use_concurrency = use_concurrency;
48
+ this->use_capture = use_capture;
49
+ this->debug_graph = debug_graph;
50
+ this->debug_fusion = debug_fusion;
51
+ this->gf = gf;
52
+
53
+ idxs.reserve(gf->n_nodes);
54
+
55
+ // filter empty nodes
56
+ // TODO: this can be removed when the allocator starts filtering them earlier
57
+ // https://github.com/ggml-org/llama.cpp/pull/16130#issuecomment-3327905830
58
+ for (int i = idx_start; i < idx_end; i++) {
59
+ if (!ggml_op_is_empty(gf->nodes[i]->op) && !ggml_is_empty(gf->nodes[i])) {
60
+ idxs.push_back(i);
61
+ }
62
+ }
63
+ }
64
+
65
+ ~ggml_metal_op() {
66
+ ggml_metal_encoder_end_encoding(this->enc);
67
+ ggml_metal_encoder_free(this->enc);
68
+ ggml_mem_ranges_free(this->mem_ranges);
69
+ }
70
+
71
+ int n_nodes() const {
72
+ return idxs.size();
73
+ }
74
+
75
+ ggml_tensor * node(int i) const {
76
+ assert(i >= 0 && i < (int) idxs.size());
77
+ return ggml_graph_node(gf, idxs[i]);
78
+ }
79
+
80
+ bool can_fuse(int i0, const ggml_op * ops, int n_ops) const {
81
+ assert(use_fusion);
82
+ assert(i0 >= 0 && i0 < n_nodes());
83
+
84
+ if (i0 + n_ops > n_nodes()) {
85
+ return false;
86
+ }
87
+
88
+ return ggml_can_fuse_ext(gf, idxs.data() + i0, ops, n_ops);
89
+ }
90
+
91
+ ggml_metal_device_t dev;
92
+ ggml_metal_library_t lib;
93
+ ggml_metal_encoder_t enc;
94
+ ggml_mem_ranges_t mem_ranges;
95
+
96
+ bool use_fusion;
97
+ bool use_concurrency;
98
+ bool use_capture;
99
+
100
+ int debug_graph;
101
+ int debug_fusion;
102
+
103
+ private:
104
+ ggml_cgraph * gf;
105
+
106
+ int idx_start;
107
+ int idx_end;
108
+
109
+ // non-empty node indices
110
+ std::vector<int> idxs;
111
+ };
112
+
113
+ ggml_metal_op_t ggml_metal_op_init(
114
+ ggml_metal_device_t dev,
115
+ ggml_metal_cmd_buf_t cmd_buf,
116
+ ggml_cgraph * gf,
117
+ int idx_start,
118
+ int idx_end,
119
+ bool use_fusion,
120
+ bool use_concurrency,
121
+ bool use_capture,
122
+ int debug_graph,
123
+ int debug_fusion) {
124
+ ggml_metal_op_t res = new ggml_metal_op(
125
+ dev,
126
+ cmd_buf,
127
+ gf,
128
+ idx_start,
129
+ idx_end,
130
+ use_fusion,
131
+ use_concurrency,
132
+ use_capture,
133
+ debug_graph,
134
+ debug_fusion);
135
+
136
+ return res;
137
+ }
138
+
139
+ void ggml_metal_op_free(ggml_metal_op_t ctx) {
140
+ delete ctx;
141
+ }
142
+
143
+ int ggml_metal_op_n_nodes(ggml_metal_op_t ctx) {
144
+ return ctx->n_nodes();
145
+ }
146
+
147
+ static bool ggml_metal_op_concurrency_reset(ggml_metal_op_t ctx) {
148
+ if (!ctx->mem_ranges) {
149
+ return true;
150
+ }
151
+
152
+ ggml_metal_encoder_memory_barrier(ctx->enc);
153
+
154
+ ggml_mem_ranges_reset(ctx->mem_ranges);
155
+
156
+ return true;
157
+ }
158
+
159
+ static bool ggml_metal_op_concurrency_check(ggml_metal_op_t ctx, const ggml_tensor * node) {
160
+ if (!ctx->mem_ranges) {
161
+ return false;
162
+ }
163
+
164
+ return ggml_mem_ranges_check(ctx->mem_ranges, node);
165
+ }
166
+
167
+ static bool ggml_metal_op_concurrency_add(ggml_metal_op_t ctx, const ggml_tensor * node) {
168
+ if (!ctx->mem_ranges) {
169
+ return true;
170
+ }
171
+
172
+ return ggml_mem_ranges_add(ctx->mem_ranges, node);
173
+ }
174
+
175
+ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
176
+ struct ggml_tensor * node = ctx->node(idx);
177
+
178
+ //GGML_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, idx, ggml_op_name(node->op));
179
+
180
+ if (ggml_is_empty(node)) {
181
+ return 1;
182
+ }
183
+
184
+ switch (node->op) {
185
+ case GGML_OP_NONE:
186
+ case GGML_OP_RESHAPE:
187
+ case GGML_OP_VIEW:
188
+ case GGML_OP_TRANSPOSE:
189
+ case GGML_OP_PERMUTE:
190
+ {
191
+ // noop -> next node
192
+ if (ctx->debug_graph > 0) {
193
+ GGML_LOG_DEBUG("%s: node[%5d] - %-12s %s\n", __func__, idx, ggml_op_name(node->op), "(noop)");
194
+ }
195
+ } return 1;
196
+ default:
197
+ {
198
+ } break;
199
+ }
200
+
201
+ if (!ggml_metal_device_supports_op(ctx->dev, node)) {
202
+ GGML_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(node));
203
+ GGML_ABORT("unsupported op");
204
+ }
205
+
206
+ int n_fuse = 1;
207
+
208
+ // check if the current node can run concurrently with other nodes before it
209
+ // the condition is that:
210
+ // - the current node cannot write to any previous src or dst ranges
211
+ // - the current node cannot read from any previous dst ranges
212
+ //
213
+ // if the condition is not satisfied, we put a memory barrier and clear all ranges
214
+ // otherwise, we add the new ranges to the encoding context and process the node concurrently
215
+ //
216
+ {
217
+ const bool is_concurrent = ggml_metal_op_concurrency_check(ctx, node);
218
+
219
+ if (!is_concurrent) {
220
+ ggml_metal_op_concurrency_reset(ctx);
221
+ }
222
+
223
+ if (ctx->debug_graph > 0) {
224
+ GGML_LOG_DEBUG("%s: node[%5d] - %-12s %s\n", __func__, idx, ggml_op_name(node->op), is_concurrent ? "(concurrent)" : "");
225
+ }
226
+ if (ctx->debug_graph > 1) {
227
+ GGML_TENSOR_LOCALS( int64_t, ne0, node->src[0], ne);
228
+ GGML_TENSOR_LOCALS(uint64_t, nb0, node->src[0], nb);
229
+ GGML_TENSOR_LOCALS( int64_t, ne1, node->src[1], ne);
230
+ GGML_TENSOR_LOCALS(uint64_t, nb1, node->src[1], nb);
231
+ GGML_TENSOR_LOCALS( int64_t, ne2, node->src[2], ne);
232
+ GGML_TENSOR_LOCALS(uint64_t, nb2, node->src[2], nb);
233
+ GGML_TENSOR_LOCALS( int64_t, ne3, node->src[3], ne);
234
+ GGML_TENSOR_LOCALS(uint64_t, nb3, node->src[3], nb);
235
+ GGML_TENSOR_LOCALS( int64_t, ne, node, ne);
236
+ GGML_TENSOR_LOCALS(uint64_t, nb, node, nb);
237
+
238
+ if (node->src[0]) {
239
+ GGML_LOG_DEBUG("%s: src0 - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(node->src[0]->type), ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03,
240
+ ggml_is_contiguous(node->src[0]), node->src[0]->name);
241
+ }
242
+ if (node->src[1]) {
243
+ GGML_LOG_DEBUG("%s: src1 - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(node->src[1]->type), ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13,
244
+ ggml_is_contiguous(node->src[1]), node->src[1]->name);
245
+ }
246
+ if (node->src[2]) {
247
+ GGML_LOG_DEBUG("%s: src2 - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(node->src[2]->type), ne20, ne21, ne22, ne23, nb20, nb21, nb22, nb23,
248
+ ggml_is_contiguous(node->src[2]), node->src[2]->name);
249
+ }
250
+ if (node->src[3]) {
251
+ GGML_LOG_DEBUG("%s: src3 - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(node->src[3]->type), ne30, ne31, ne32, ne33, nb30, nb31, nb32, nb33,
252
+ ggml_is_contiguous(node->src[3]), node->src[3]->name);
253
+ }
254
+ if (node) {
255
+ GGML_LOG_DEBUG("%s: node - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(node->type), ne0, ne1, ne2, ne3, nb0, nb1, nb2, nb3,
256
+ node->name);
257
+ }
258
+ }
259
+ }
260
+
261
+ switch (node->op) {
262
+ case GGML_OP_CONCAT:
263
+ {
264
+ n_fuse = ggml_metal_op_concat(ctx, idx);
265
+ } break;
266
+ case GGML_OP_ADD:
267
+ case GGML_OP_SUB:
268
+ case GGML_OP_MUL:
269
+ case GGML_OP_DIV:
270
+ {
271
+ n_fuse = ggml_metal_op_bin(ctx, idx);
272
+ } break;
273
+ case GGML_OP_ADD_ID:
274
+ {
275
+ n_fuse = ggml_metal_op_add_id(ctx, idx);
276
+ } break;
277
+ case GGML_OP_REPEAT:
278
+ {
279
+ n_fuse = ggml_metal_op_repeat(ctx, idx);
280
+ } break;
281
+ case GGML_OP_ACC:
282
+ {
283
+ n_fuse = ggml_metal_op_acc(ctx, idx);
284
+ } break;
285
+ case GGML_OP_SCALE:
286
+ {
287
+ n_fuse = ggml_metal_op_scale(ctx, idx);
288
+ } break;
289
+ case GGML_OP_CLAMP:
290
+ {
291
+ n_fuse = ggml_metal_op_clamp(ctx, idx);
292
+ } break;
293
+ case GGML_OP_SQR:
294
+ case GGML_OP_SQRT:
295
+ case GGML_OP_SIN:
296
+ case GGML_OP_COS:
297
+ case GGML_OP_LOG:
298
+ case GGML_OP_UNARY:
299
+ {
300
+ n_fuse = ggml_metal_op_unary(ctx, idx);
301
+ } break;
302
+ case GGML_OP_GLU:
303
+ {
304
+ n_fuse = ggml_metal_op_glu(ctx, idx);
305
+ } break;
306
+ case GGML_OP_SUM:
307
+ {
308
+ n_fuse = ggml_metal_op_sum(ctx, idx);
309
+ } break;
310
+ case GGML_OP_SUM_ROWS:
311
+ case GGML_OP_MEAN:
312
+ {
313
+ n_fuse = ggml_metal_op_sum_rows(ctx, idx);
314
+ } break;
315
+ case GGML_OP_CUMSUM:
316
+ {
317
+ n_fuse = ggml_metal_op_cumsum(ctx, idx);
318
+ } break;
319
+ case GGML_OP_SOFT_MAX:
320
+ {
321
+ n_fuse = ggml_metal_op_soft_max(ctx, idx);
322
+ } break;
323
+ case GGML_OP_SSM_CONV:
324
+ {
325
+ n_fuse = ggml_metal_op_ssm_conv(ctx, idx);
326
+ } break;
327
+ case GGML_OP_SSM_SCAN:
328
+ {
329
+ n_fuse = ggml_metal_op_ssm_scan(ctx, idx);
330
+ } break;
331
+ case GGML_OP_RWKV_WKV6:
332
+ case GGML_OP_RWKV_WKV7:
333
+ {
334
+ n_fuse = ggml_metal_op_rwkv(ctx, idx);
335
+ } break;
336
+ case GGML_OP_MUL_MAT:
337
+ {
338
+ n_fuse = ggml_metal_op_mul_mat(ctx, idx);
339
+ } break;
340
+ case GGML_OP_MUL_MAT_ID:
341
+ {
342
+ n_fuse = ggml_metal_op_mul_mat_id(ctx, idx);
343
+ } break;
344
+ case GGML_OP_GET_ROWS:
345
+ {
346
+ n_fuse = ggml_metal_op_get_rows(ctx, idx);
347
+ } break;
348
+ case GGML_OP_SET_ROWS:
349
+ {
350
+ n_fuse = ggml_metal_op_set_rows(ctx, idx);
351
+ } break;
352
+ case GGML_OP_L2_NORM:
353
+ {
354
+ n_fuse = ggml_metal_op_l2_norm(ctx, idx);
355
+ } break;
356
+ case GGML_OP_GROUP_NORM:
357
+ {
358
+ n_fuse = ggml_metal_op_group_norm(ctx, idx);
359
+ } break;
360
+ case GGML_OP_NORM:
361
+ case GGML_OP_RMS_NORM:
362
+ {
363
+ n_fuse = ggml_metal_op_norm(ctx, idx);
364
+ } break;
365
+ case GGML_OP_ROPE:
366
+ {
367
+ n_fuse = ggml_metal_op_rope(ctx, idx);
368
+ } break;
369
+ case GGML_OP_IM2COL:
370
+ {
371
+ n_fuse = ggml_metal_op_im2col(ctx, idx);
372
+ } break;
373
+ case GGML_OP_CONV_2D:
374
+ {
375
+ n_fuse = ggml_metal_op_conv_2d(ctx, idx);
376
+ } break;
377
+ case GGML_OP_CONV_TRANSPOSE_1D:
378
+ {
379
+ n_fuse = ggml_metal_op_conv_transpose_1d(ctx, idx);
380
+ } break;
381
+ case GGML_OP_CONV_TRANSPOSE_2D:
382
+ {
383
+ n_fuse = ggml_metal_op_conv_transpose_2d(ctx, idx);
384
+ } break;
385
+ case GGML_OP_UPSCALE:
386
+ {
387
+ n_fuse = ggml_metal_op_upscale(ctx, idx);
388
+ } break;
389
+ case GGML_OP_PAD:
390
+ {
391
+ n_fuse = ggml_metal_op_pad(ctx, idx);
392
+ } break;
393
+ case GGML_OP_PAD_REFLECT_1D:
394
+ {
395
+ n_fuse = ggml_metal_op_pad_reflect_1d(ctx, idx);
396
+ } break;
397
+ case GGML_OP_ARANGE:
398
+ {
399
+ n_fuse = ggml_metal_op_arange(ctx, idx);
400
+ } break;
401
+ case GGML_OP_TIMESTEP_EMBEDDING:
402
+ {
403
+ n_fuse = ggml_metal_op_timestep_embedding(ctx, idx);
404
+ } break;
405
+ case GGML_OP_ARGSORT:
406
+ {
407
+ n_fuse = ggml_metal_op_argsort(ctx, idx);
408
+ } break;
409
+ case GGML_OP_LEAKY_RELU:
410
+ {
411
+ n_fuse = ggml_metal_op_leaky_relu(ctx, idx);
412
+ } break;
413
+ case GGML_OP_FLASH_ATTN_EXT:
414
+ {
415
+ n_fuse = ggml_metal_op_flash_attn_ext(ctx, idx);
416
+ } break;
417
+ case GGML_OP_DUP:
418
+ case GGML_OP_CPY:
419
+ case GGML_OP_CONT:
420
+ {
421
+ n_fuse = ggml_metal_op_cpy(ctx, idx);
422
+ } break;
423
+ case GGML_OP_POOL_2D:
424
+ {
425
+ n_fuse = ggml_metal_op_pool_2d(ctx, idx);
426
+ } break;
427
+ case GGML_OP_ARGMAX:
428
+ {
429
+ n_fuse = ggml_metal_op_argmax(ctx, idx);
430
+ } break;
431
+ case GGML_OP_OPT_STEP_ADAMW:
432
+ {
433
+ n_fuse = ggml_metal_op_opt_step_adamw(ctx, idx);
434
+ } break;
435
+ case GGML_OP_OPT_STEP_SGD:
436
+ {
437
+ n_fuse = ggml_metal_op_opt_step_sgd(ctx, idx);
438
+ } break;
439
+ default:
440
+ {
441
+ GGML_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(node->op));
442
+ GGML_ABORT("fatal error");
443
+ }
444
+ }
445
+
446
+ if (ctx->debug_graph > 0) {
447
+ if (n_fuse > 1) {
448
+ GGML_LOG_DEBUG("%s: fuse %d ops\n", __func__, n_fuse);
449
+ }
450
+ }
451
+
452
+ // update the mem ranges in the encoding context
453
+ for (int i = 0; i < n_fuse; ++i) {
454
+ if (!ggml_metal_op_concurrency_add(ctx, ctx->node(idx + i))) {
455
+ ggml_metal_op_concurrency_reset(ctx);
456
+ }
457
+ }
458
+
459
+ return n_fuse;
460
+ }
461
+
462
+ int ggml_metal_op_encode(ggml_metal_op_t ctx, int idx) {
463
+ if (ctx->use_capture) {
464
+ ggml_metal_encoder_debug_group_push(ctx->enc, ggml_op_desc(ctx->node(idx)));
465
+ }
466
+
467
+ int res = ggml_metal_op_encode_impl(ctx, idx);
468
+ if (idx + res > ctx->n_nodes()) {
469
+ GGML_ABORT("fusion error: nodes spanning multiple encoders have been fused. this indicates a bug in the fusion logic %s",
470
+ "https://github.com/ggml-org/llama.cpp/pull/14849");
471
+ }
472
+
473
+ if (ctx->use_capture) {
474
+ ggml_metal_encoder_debug_group_pop(ctx->enc);
475
+ }
476
+
477
+ return res;
478
+ }
479
+
480
+ int ggml_metal_op_concat(ggml_metal_op_t ctx, int idx) {
481
+ ggml_tensor * op = ctx->node(idx);
482
+
483
+ ggml_metal_library_t lib = ctx->lib;
484
+ ggml_metal_encoder_t enc = ctx->enc;
485
+
486
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
487
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
488
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
489
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
490
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
491
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
492
+
493
+ const int32_t dim = ((const int32_t *) op->op_params)[0];
494
+
495
+ ggml_metal_kargs_concat args = {
496
+ /*.ne00 =*/ ne00,
497
+ /*.ne01 =*/ ne01,
498
+ /*.ne02 =*/ ne02,
499
+ /*.ne03 =*/ ne03,
500
+ /*.nb00 =*/ nb00,
501
+ /*.nb01 =*/ nb01,
502
+ /*.nb02 =*/ nb02,
503
+ /*.nb03 =*/ nb03,
504
+ /*.ne10 =*/ ne10,
505
+ /*.ne11 =*/ ne11,
506
+ /*.ne12 =*/ ne12,
507
+ /*.ne13 =*/ ne13,
508
+ /*.nb10 =*/ nb10,
509
+ /*.nb11 =*/ nb11,
510
+ /*.nb12 =*/ nb12,
511
+ /*.nb13 =*/ nb13,
512
+ /*.ne0 =*/ ne0,
513
+ /*.ne1 =*/ ne1,
514
+ /*.ne2 =*/ ne2,
515
+ /*.ne3 =*/ ne3,
516
+ /*.nb0 =*/ nb0,
517
+ /*.nb1 =*/ nb1,
518
+ /*.nb2 =*/ nb2,
519
+ /*.nb3 =*/ nb3,
520
+ /*.dim =*/ dim,
521
+ };
522
+
523
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_base(lib, GGML_OP_CONCAT);
524
+
525
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
526
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
527
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
528
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
529
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
530
+
531
+ const int nth = std::min(1024, ne0);
532
+
533
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
534
+
535
+ return 1;
536
+ }
537
+
538
+ int ggml_metal_op_repeat(ggml_metal_op_t ctx, int idx) {
539
+ ggml_tensor * op = ctx->node(idx);
540
+
541
+ ggml_metal_library_t lib = ctx->lib;
542
+ ggml_metal_encoder_t enc = ctx->enc;
543
+
544
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
545
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
546
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
547
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
548
+
549
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_repeat(lib, op->type);
550
+
551
+ ggml_metal_kargs_repeat args = {
552
+ /*.ne00 =*/ ne00,
553
+ /*.ne01 =*/ ne01,
554
+ /*.ne02 =*/ ne02,
555
+ /*.ne03 =*/ ne03,
556
+ /*.nb00 =*/ nb00,
557
+ /*.nb01 =*/ nb01,
558
+ /*.nb02 =*/ nb02,
559
+ /*.nb03 =*/ nb03,
560
+ /*.ne0 =*/ ne0,
561
+ /*.ne1 =*/ ne1,
562
+ /*.ne2 =*/ ne2,
563
+ /*.ne3 =*/ ne3,
564
+ /*.nb0 =*/ nb0,
565
+ /*.nb1 =*/ nb1,
566
+ /*.nb2 =*/ nb2,
567
+ /*.nb3 =*/ nb3,
568
+ };
569
+
570
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
571
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
572
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
573
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
574
+
575
+ const int nth = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne0);
576
+
577
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
578
+
579
+ return 1;
580
+ }
581
+
582
+ int ggml_metal_op_acc(ggml_metal_op_t ctx, int idx) {
583
+ ggml_tensor * op = ctx->node(idx);
584
+
585
+ ggml_metal_library_t lib = ctx->lib;
586
+ ggml_metal_encoder_t enc = ctx->enc;
587
+
588
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
589
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
590
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
591
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
592
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
593
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
594
+
595
+ GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
596
+ GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
597
+ GGML_ASSERT(op->type == GGML_TYPE_F32);
598
+
599
+ GGML_ASSERT(ggml_is_contiguous(op->src[0]));
600
+ GGML_ASSERT(ggml_is_contiguous(op->src[1]));
601
+
602
+ const size_t pnb1 = ((const int32_t *) op->op_params)[0];
603
+ const size_t pnb2 = ((const int32_t *) op->op_params)[1];
604
+ const size_t pnb3 = ((const int32_t *) op->op_params)[2];
605
+ const size_t offs = ((const int32_t *) op->op_params)[3];
606
+
607
+ const bool inplace = (bool) ((const int32_t *) op->op_params)[4];
608
+
609
+ if (!inplace) {
610
+ // run a separete kernel to cpy src->dst
611
+ // not sure how to avoid this
612
+ // TODO: make a simpler cpy_bytes kernel
613
+
614
+ //const id<MTLComputePipelineState> pipeline = ctx->pipelines[GGML_METAL_PIPELINE_TYPE_CPY_F32_F32].obj;
615
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
616
+
617
+ ggml_metal_kargs_cpy args = {
618
+ /*.nk0 =*/ ne00,
619
+ /*.ne00 =*/ ne00,
620
+ /*.ne01 =*/ ne01,
621
+ /*.ne02 =*/ ne02,
622
+ /*.ne03 =*/ ne03,
623
+ /*.nb00 =*/ nb00,
624
+ /*.nb01 =*/ nb01,
625
+ /*.nb02 =*/ nb02,
626
+ /*.nb03 =*/ nb03,
627
+ /*.ne0 =*/ ne0,
628
+ /*.ne1 =*/ ne1,
629
+ /*.ne2 =*/ ne2,
630
+ /*.ne3 =*/ ne3,
631
+ /*.nb0 =*/ nb0,
632
+ /*.nb1 =*/ nb1,
633
+ /*.nb2 =*/ nb2,
634
+ /*.nb3 =*/ nb3,
635
+ };
636
+
637
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
638
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
639
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
640
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
641
+
642
+ const int nth = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne00);
643
+
644
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
645
+
646
+ ggml_metal_op_concurrency_reset(ctx);
647
+ }
648
+
649
+ ggml_metal_kargs_bin args = {
650
+ /*.ne00 =*/ ne00,
651
+ /*.ne01 =*/ ne01,
652
+ /*.ne02 =*/ ne02,
653
+ /*.ne03 =*/ ne03,
654
+ /*.nb00 =*/ nb00,
655
+ /*.nb01 =*/ pnb1,
656
+ /*.nb02 =*/ pnb2,
657
+ /*.nb03 =*/ pnb3,
658
+ /*.ne10 =*/ ne10,
659
+ /*.ne11 =*/ ne11,
660
+ /*.ne12 =*/ ne12,
661
+ /*.ne13 =*/ ne13,
662
+ /*.nb10 =*/ nb10,
663
+ /*.nb11 =*/ nb11,
664
+ /*.nb12 =*/ nb12,
665
+ /*.nb13 =*/ nb13,
666
+ /*.ne0 =*/ ne0,
667
+ /*.ne1 =*/ ne1,
668
+ /*.ne2 =*/ ne2,
669
+ /*.ne3 =*/ ne3,
670
+ /*.nb0 =*/ nb0,
671
+ /*.nb1 =*/ pnb1,
672
+ /*.nb2 =*/ pnb2,
673
+ /*.nb3 =*/ pnb3,
674
+ /*.offs =*/ offs,
675
+ /*.o1 =*/ { 0 },
676
+ };
677
+
678
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_bin(lib, GGML_OP_ADD, 1, false);
679
+
680
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
681
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
682
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
683
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
684
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
685
+
686
+ const int nth = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne00);
687
+
688
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne11, ne12, ne13, nth, 1, 1);
689
+
690
+ return 1;
691
+ }
692
+
693
+ int ggml_metal_op_scale(ggml_metal_op_t ctx, int idx) {
694
+ ggml_tensor * op = ctx->node(idx);
695
+
696
+ ggml_metal_library_t lib = ctx->lib;
697
+ ggml_metal_encoder_t enc = ctx->enc;
698
+
699
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
700
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
701
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
702
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
703
+
704
+ float scale;
705
+ float bias;
706
+ memcpy(&scale, ((const int32_t *) op->op_params) + 0, sizeof(float));
707
+ memcpy(&bias, ((const int32_t *) op->op_params) + 1, sizeof(float));
708
+
709
+ ggml_metal_kargs_scale args = {
710
+ /*.scale =*/ scale,
711
+ /*.bias =*/ bias,
712
+ };
713
+
714
+ int64_t n = ggml_nelements(op);
715
+
716
+ if (n % 4 == 0) {
717
+ n /= 4;
718
+ }
719
+
720
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
721
+
722
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
723
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
724
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
725
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
726
+
727
+ ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1);
728
+
729
+ return 1;
730
+ }
731
+
732
+ int ggml_metal_op_clamp(ggml_metal_op_t ctx, int idx) {
733
+ ggml_tensor * op = ctx->node(idx);
734
+
735
+ ggml_metal_library_t lib = ctx->lib;
736
+ ggml_metal_encoder_t enc = ctx->enc;
737
+
738
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
739
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
740
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
741
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
742
+
743
+ float min;
744
+ float max;
745
+ memcpy(&min, ((const int32_t *) op->op_params) + 0, sizeof(float));
746
+ memcpy(&max, ((const int32_t *) op->op_params) + 1, sizeof(float));
747
+
748
+ ggml_metal_kargs_clamp args = {
749
+ /*.min =*/ min,
750
+ /*.max =*/ max,
751
+ };
752
+
753
+ int64_t n = ggml_nelements(op);
754
+
755
+ if (n % 4 == 0) {
756
+ n /= 4;
757
+ }
758
+
759
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
760
+
761
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
762
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
763
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
764
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
765
+
766
+ ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1);
767
+
768
+ return 1;
769
+ }
770
+
771
+ int ggml_metal_op_unary(ggml_metal_op_t ctx, int idx) {
772
+ ggml_tensor * op = ctx->node(idx);
773
+
774
+ ggml_metal_library_t lib = ctx->lib;
775
+ ggml_metal_encoder_t enc = ctx->enc;
776
+
777
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
778
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
779
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
780
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
781
+
782
+ int64_t n = ggml_nelements(op);
783
+
784
+ if (n % 4 == 0) {
785
+ n /= 4;
786
+ }
787
+
788
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
789
+
790
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
791
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 0);
792
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 1);
793
+
794
+ ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1);
795
+
796
+ return 1;
797
+ }
798
+
799
+ int ggml_metal_op_glu(ggml_metal_op_t ctx, int idx) {
800
+ ggml_tensor * op = ctx->node(idx);
801
+
802
+ ggml_metal_library_t lib = ctx->lib;
803
+ ggml_metal_encoder_t enc = ctx->enc;
804
+
805
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
806
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
807
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
808
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
809
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
810
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
811
+
812
+ if (op->src[1]) {
813
+ GGML_ASSERT(ggml_are_same_shape(op->src[0], op->src[1]));
814
+ }
815
+
816
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_glu(lib, op);
817
+
818
+ const int32_t swp = ggml_get_op_params_i32(op, 1);
819
+ const float alpha = ggml_get_op_params_f32(op, 2);
820
+ const float limit = ggml_get_op_params_f32(op, 3);
821
+
822
+ const int32_t i00 = swp ? ne0 : 0;
823
+ const int32_t i10 = swp ? 0 : ne0;
824
+
825
+ ggml_metal_kargs_glu args = {
826
+ /*.ne00 =*/ ne00,
827
+ /*.nb01 =*/ nb01,
828
+ /*.ne10 =*/ op->src[1] ? ne10 : ne00,
829
+ /*.nb11 =*/ op->src[1] ? nb11 : nb01,
830
+ /*.ne0 =*/ ne0,
831
+ /*.nb1 =*/ nb1,
832
+ /*.i00 =*/ op->src[1] ? 0 : i00,
833
+ /*.i10 =*/ op->src[1] ? 0 : i10,
834
+ /*.alpha=*/ alpha,
835
+ /*.limit=*/ limit
836
+ };
837
+
838
+ const int64_t nrows = ggml_nrows(op->src[0]);
839
+
840
+ const int32_t nth = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne00/2);
841
+
842
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
843
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
844
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
845
+ if (op->src[1]) {
846
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
847
+ } else {
848
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 2);
849
+ }
850
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
851
+
852
+ ggml_metal_encoder_dispatch_threadgroups(enc, nrows, 1, 1, nth, 1, 1);
853
+
854
+ return 1;
855
+ }
856
+
857
+ int ggml_metal_op_sum(ggml_metal_op_t ctx, int idx) {
858
+ ggml_tensor * op = ctx->node(idx);
859
+
860
+ ggml_metal_library_t lib = ctx->lib;
861
+ ggml_metal_encoder_t enc = ctx->enc;
862
+
863
+ const uint64_t n = (uint64_t) ggml_nelements(op->src[0]);
864
+
865
+ ggml_metal_kargs_sum args = {
866
+ /*.np =*/ n,
867
+ };
868
+
869
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_sum(lib, op);
870
+
871
+ int nth = 32; // SIMD width
872
+
873
+ while (nth < (int) n && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
874
+ nth *= 2;
875
+ }
876
+
877
+ nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
878
+ nth = std::min(nth, (int) n);
879
+
880
+ const int nsg = (nth + 31) / 32;
881
+
882
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
883
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
884
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
885
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
886
+
887
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, nsg * sizeof(float), 0);
888
+
889
+ ggml_metal_encoder_dispatch_threadgroups(enc, 1, 1, 1, nth, 1, 1);
890
+
891
+ return 1;
892
+ }
893
+
894
+ int ggml_metal_op_sum_rows(ggml_metal_op_t ctx, int idx) {
895
+ ggml_tensor * op = ctx->node(idx);
896
+
897
+ ggml_metal_library_t lib = ctx->lib;
898
+ ggml_metal_encoder_t enc = ctx->enc;
899
+
900
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
901
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
902
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
903
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
904
+
905
+ ggml_metal_kargs_sum_rows args = {
906
+ /*.ne00 =*/ ne00,
907
+ /*.ne01 =*/ ne01,
908
+ /*.ne02 =*/ ne02,
909
+ /*.ne03 =*/ ne03,
910
+ /*.nb00 =*/ nb00,
911
+ /*.nb01 =*/ nb01,
912
+ /*.nb02 =*/ nb02,
913
+ /*.nb03 =*/ nb03,
914
+ /*.ne0 =*/ ne0,
915
+ /*.ne1 =*/ ne1,
916
+ /*.ne2 =*/ ne2,
917
+ /*.ne3 =*/ ne3,
918
+ /*.nb0 =*/ nb0,
919
+ /*.nb1 =*/ nb1,
920
+ /*.nb2 =*/ nb2,
921
+ /*.nb3 =*/ nb3,
922
+ };
923
+
924
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_sum_rows(lib, op);
925
+
926
+ int nth = 32; // SIMD width
927
+
928
+ while (nth < ne00 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
929
+ nth *= 2;
930
+ }
931
+
932
+ nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
933
+ nth = std::min(nth, ne00);
934
+
935
+ const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
936
+
937
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
938
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
939
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
940
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
941
+
942
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
943
+
944
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
945
+
946
+ return 1;
947
+ }
948
+
949
+ int ggml_metal_op_cumsum(ggml_metal_op_t ctx, int idx) {
950
+ ggml_tensor * op = ctx->node(idx);
951
+
952
+ ggml_metal_library_t lib = ctx->lib;
953
+ ggml_metal_encoder_t enc = ctx->enc;
954
+
955
+ GGML_ASSERT(ggml_is_contiguous_rows(op->src[0]));
956
+
957
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
958
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
959
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
960
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
961
+
962
+ ggml_metal_pipeline_t pipeline_blk = ggml_metal_library_get_pipeline_cumsum_blk(lib, op);
963
+
964
+ int nth = 1;
965
+ while (nth < ne00 && 2*nth <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline_blk)) {
966
+ nth *= 2;
967
+ }
968
+
969
+ GGML_ASSERT(ne00 <= nth*nth);
970
+
971
+ const int64_t net0 = (ne00 + nth - 1) / nth;
972
+ const int64_t net1 = ne01;
973
+ const int64_t net2 = ne02;
974
+ const int64_t net3 = ne03;
975
+
976
+ const uint64_t nbt0 = sizeof(float);
977
+ const uint64_t nbt1 = net0*nbt0;
978
+ const uint64_t nbt2 = net1*nbt1;
979
+ const uint64_t nbt3 = net2*nbt2;
980
+
981
+ const size_t smem = GGML_PAD(32*sizeof(float), 16);
982
+
983
+ ggml_metal_buffer_id bid_src0 = ggml_metal_get_buffer_id(op->src[0]);
984
+ ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(op);
985
+
986
+ ggml_metal_buffer_id bid_tmp = bid_dst;
987
+ bid_tmp.offs += ggml_nbytes(op);
988
+
989
+ {
990
+ ggml_metal_kargs_cumsum_blk args = {
991
+ /*.ne00 =*/ ne00,
992
+ /*.ne01 =*/ ne01,
993
+ /*.ne02 =*/ ne02,
994
+ /*.ne03 =*/ ne03,
995
+ /*.nb00 =*/ nb00,
996
+ /*.nb01 =*/ nb01,
997
+ /*.nb02 =*/ nb02,
998
+ /*.nb03 =*/ nb03,
999
+ /*.net0 =*/ net0,
1000
+ /*.net1 =*/ net1,
1001
+ /*.net2 =*/ net2,
1002
+ /*.net3 =*/ net3,
1003
+ /*.nbt0 =*/ nbt0,
1004
+ /*.nbt1 =*/ nbt1,
1005
+ /*.nbt2 =*/ nbt2,
1006
+ /*.nbt3 =*/ nbt3,
1007
+ /*.outb =*/ ne00 > nth,
1008
+ };
1009
+
1010
+ ggml_metal_encoder_set_pipeline(enc, pipeline_blk);
1011
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
1012
+ ggml_metal_encoder_set_buffer (enc, bid_src0, 1);
1013
+ ggml_metal_encoder_set_buffer (enc, bid_tmp, 2);
1014
+ ggml_metal_encoder_set_buffer (enc, bid_dst, 3);
1015
+
1016
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
1017
+
1018
+ ggml_metal_encoder_dispatch_threadgroups(enc, net0*ne01, ne02, ne03, nth, 1, 1);
1019
+ }
1020
+
1021
+ if (ne00 > nth) {
1022
+ ggml_metal_op_concurrency_reset(ctx);
1023
+
1024
+ {
1025
+ ggml_metal_kargs_cumsum_blk args = {
1026
+ /*.ne00 =*/ net0,
1027
+ /*.ne01 =*/ net1,
1028
+ /*.ne02 =*/ net2,
1029
+ /*.ne03 =*/ net3,
1030
+ /*.nb00 =*/ nbt0,
1031
+ /*.nb01 =*/ nbt1,
1032
+ /*.nb02 =*/ nbt2,
1033
+ /*.nb03 =*/ nbt3,
1034
+ /*.net0 =*/ net0,
1035
+ /*.net1 =*/ net1,
1036
+ /*.net2 =*/ net2,
1037
+ /*.net3 =*/ net3,
1038
+ /*.nbt0 =*/ nbt0,
1039
+ /*.nbt1 =*/ nbt1,
1040
+ /*.nbt2 =*/ nbt2,
1041
+ /*.nbt3 =*/ nbt3,
1042
+ /*.outb =*/ false,
1043
+ };
1044
+
1045
+ ggml_metal_encoder_set_pipeline(enc, pipeline_blk);
1046
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
1047
+ ggml_metal_encoder_set_buffer (enc, bid_tmp, 1);
1048
+ ggml_metal_encoder_set_buffer (enc, bid_tmp, 2);
1049
+ ggml_metal_encoder_set_buffer (enc, bid_tmp, 3);
1050
+
1051
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
1052
+
1053
+ ggml_metal_encoder_dispatch_threadgroups(enc, net1, net2, net3, nth, 1, 1);
1054
+ }
1055
+
1056
+ ggml_metal_op_concurrency_reset(ctx);
1057
+
1058
+ {
1059
+ ggml_metal_pipeline_t pipeline_add = ggml_metal_library_get_pipeline_cumsum_add(lib, op);
1060
+
1061
+ ggml_metal_kargs_cumsum_add args = {
1062
+ /*.ne00 =*/ ne00,
1063
+ /*.ne01 =*/ ne01,
1064
+ /*.ne02 =*/ ne02,
1065
+ /*.ne03 =*/ ne03,
1066
+ /*.nb00 =*/ nb00,
1067
+ /*.nb01 =*/ nb01,
1068
+ /*.nb02 =*/ nb02,
1069
+ /*.nb03 =*/ nb03,
1070
+ /*.net0 =*/ net0,
1071
+ /*.net1 =*/ net1,
1072
+ /*.net2 =*/ net2,
1073
+ /*.net3 =*/ net3,
1074
+ /*.nbt0 =*/ nbt0,
1075
+ /*.nbt1 =*/ nbt1,
1076
+ /*.nbt2 =*/ nbt2,
1077
+ /*.nbt3 =*/ nbt3,
1078
+ };
1079
+
1080
+ ggml_metal_encoder_set_pipeline(enc, pipeline_add);
1081
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
1082
+ ggml_metal_encoder_set_buffer (enc, bid_tmp, 1);
1083
+ ggml_metal_encoder_set_buffer (enc, bid_dst, 2);
1084
+
1085
+ ggml_metal_encoder_dispatch_threadgroups(enc, net0*ne01, ne02, ne03, nth, 1, 1);
1086
+ }
1087
+ }
1088
+
1089
+ return 1;
1090
+ }
1091
+
1092
+ int ggml_metal_op_get_rows(ggml_metal_op_t ctx, int idx) {
1093
+ ggml_tensor * op = ctx->node(idx);
1094
+
1095
+ ggml_metal_library_t lib = ctx->lib;
1096
+ ggml_metal_encoder_t enc = ctx->enc;
1097
+
1098
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
1099
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
1100
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
1101
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
1102
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
1103
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
1104
+
1105
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_get_rows(lib, op->src[0]->type);
1106
+
1107
+ ggml_metal_kargs_get_rows args = {
1108
+ /*.ne00t =*/ ggml_is_quantized(op->src[0]->type) ? ne00/16 : ne00,
1109
+ /*.ne00 =*/ ne00,
1110
+ /*.nb01 =*/ nb01,
1111
+ /*.nb02 =*/ nb02,
1112
+ /*.nb03 =*/ nb03,
1113
+ /*.ne10 =*/ ne10,
1114
+ /*.nb10 =*/ nb10,
1115
+ /*.nb11 =*/ nb11,
1116
+ /*.nb12 =*/ nb12,
1117
+ /*.nb1 =*/ nb1,
1118
+ /*.nb2 =*/ nb2,
1119
+ /*.nb3 =*/ nb3,
1120
+ };
1121
+
1122
+ const int nth = std::min(args.ne00t, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
1123
+
1124
+ const int nw0 = (args.ne00t + nth - 1)/nth;
1125
+
1126
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
1127
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
1128
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
1129
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
1130
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
1131
+
1132
+ ggml_metal_encoder_dispatch_threadgroups(enc, nw0*ne10, ne11, ne12, nth, 1, 1);
1133
+
1134
+ return 1;
1135
+ }
1136
+
1137
+ int ggml_metal_op_set_rows(ggml_metal_op_t ctx, int idx) {
1138
+ ggml_tensor * op = ctx->node(idx);
1139
+
1140
+ ggml_metal_library_t lib = ctx->lib;
1141
+ ggml_metal_encoder_t enc = ctx->enc;
1142
+
1143
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
1144
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
1145
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
1146
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
1147
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
1148
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
1149
+
1150
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_set_rows(lib, op->src[1]->type, op->type);
1151
+
1152
+ const int32_t nk0 = ne0/ggml_blck_size(op->type);
1153
+
1154
+ int nth = 32; // SIMD width
1155
+
1156
+ while (nth < nk0 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
1157
+ nth *= 2;
1158
+ }
1159
+
1160
+ int nrptg = 1;
1161
+ if (nth > nk0) {
1162
+ nrptg = (nth + nk0 - 1)/nk0;
1163
+ nth = nk0;
1164
+
1165
+ if (nrptg*nth > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
1166
+ nrptg--;
1167
+ }
1168
+ }
1169
+
1170
+ nth = std::min(nth, nk0);
1171
+
1172
+ ggml_metal_kargs_set_rows args = {
1173
+ /*.nk0 =*/ nk0,
1174
+ /*.ne01 =*/ ne01,
1175
+ /*.nb01 =*/ nb01,
1176
+ /*.nb02 =*/ nb02,
1177
+ /*.nb03 =*/ nb03,
1178
+ /*.ne11 =*/ ne11,
1179
+ /*.ne12 =*/ ne12,
1180
+ /*.nb10 =*/ nb10,
1181
+ /*.nb11 =*/ nb11,
1182
+ /*.nb12 =*/ nb12,
1183
+ /*.nb1 =*/ nb1,
1184
+ /*.nb2 =*/ nb2,
1185
+ /*.nb3 =*/ nb3,
1186
+ };
1187
+
1188
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
1189
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
1190
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
1191
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
1192
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
1193
+
1194
+ ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nrptg - 1)/nrptg, ne02, ne03, nth, nrptg, 1);
1195
+
1196
+ return 1;
1197
+ }
1198
+
1199
+ int ggml_metal_op_soft_max(ggml_metal_op_t ctx, int idx) {
1200
+ ggml_tensor * op = ctx->node(idx);
1201
+
1202
+ ggml_metal_library_t lib = ctx->lib;
1203
+ ggml_metal_encoder_t enc = ctx->enc;
1204
+
1205
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
1206
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
1207
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
1208
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
1209
+ GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
1210
+ GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
1211
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
1212
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
1213
+
1214
+ float scale;
1215
+ float max_bias;
1216
+
1217
+ memcpy(&scale, ((const int32_t *) op->op_params) + 0, sizeof(scale));
1218
+ memcpy(&max_bias, ((const int32_t *) op->op_params) + 1, sizeof(max_bias));
1219
+
1220
+ const uint32_t n_head = op->src[0]->ne[2];
1221
+ const int32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
1222
+
1223
+ const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
1224
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
1225
+
1226
+ // softmax
1227
+
1228
+ ggml_metal_kargs_soft_max args = {
1229
+ /*.ne00 =*/ ne00,
1230
+ /*.ne01 =*/ ne01,
1231
+ /*.ne02 =*/ ne02,
1232
+ /*.nb01 =*/ nb01,
1233
+ /*.nb02 =*/ nb02,
1234
+ /*.nb03 =*/ nb03,
1235
+ /*.ne11 =*/ ne11,
1236
+ /*.ne12 =*/ ne12,
1237
+ /*.ne13 =*/ ne13,
1238
+ /*.nb11 =*/ nb11,
1239
+ /*.nb12 =*/ nb12,
1240
+ /*.nb13 =*/ nb13,
1241
+ /*.nb1 =*/ nb1,
1242
+ /*.nb2 =*/ nb2,
1243
+ /*.nb3 =*/ nb3,
1244
+ /*.scale =*/ scale,
1245
+ /*.max_bias =*/ max_bias,
1246
+ /*.m0 =*/ m0,
1247
+ /*.m1 =*/ m1,
1248
+ /*.n_head_log2 =*/ n_head_log2,
1249
+ };
1250
+
1251
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_soft_max(lib, op);
1252
+
1253
+ int nth = 32; // SIMD width
1254
+
1255
+ if (ne00%4 == 0) {
1256
+ while (nth < ne00/4 && nth*ne01*ne02*ne03 < 256) {
1257
+ nth *= 2;
1258
+ }
1259
+ } else {
1260
+ while (nth < ne00 && nth*ne01*ne02*ne03 < 256) {
1261
+ nth *= 2;
1262
+ }
1263
+ }
1264
+
1265
+ const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
1266
+
1267
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
1268
+ ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
1269
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 1);
1270
+ if (op->src[1]) {
1271
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[1]), 2);
1272
+ } else {
1273
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 2);
1274
+ }
1275
+ if (op->src[2]) {
1276
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[2]), 3);
1277
+ } else {
1278
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 3);
1279
+ }
1280
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op), 4);
1281
+
1282
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
1283
+
1284
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
1285
+
1286
+ return 1;
1287
+ }
1288
+
1289
+ int ggml_metal_op_ssm_conv(ggml_metal_op_t ctx, int idx) {
1290
+ ggml_tensor * op = ctx->node(idx);
1291
+
1292
+ ggml_metal_library_t lib = ctx->lib;
1293
+ ggml_metal_encoder_t enc = ctx->enc;
1294
+
1295
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
1296
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
1297
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
1298
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
1299
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
1300
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
1301
+
1302
+ ggml_metal_kargs_ssm_conv args = {
1303
+ /*.ne00 =*/ ne00,
1304
+ /*.ne01 =*/ ne01,
1305
+ /*.ne02 =*/ ne02,
1306
+ /*.nb00 =*/ nb00,
1307
+ /*.nb01 =*/ nb01,
1308
+ /*.nb02 =*/ nb02,
1309
+ /*.ne10 =*/ ne10,
1310
+ /*.ne11 =*/ ne11,
1311
+ /*.nb10 =*/ nb10,
1312
+ /*.nb11 =*/ nb11,
1313
+ /*.ne0 =*/ ne0,
1314
+ /*.ne1 =*/ ne1,
1315
+ /*.ne2 =*/ ne2,
1316
+ /*.nb0 =*/ nb0,
1317
+ /*.nb1 =*/ nb1,
1318
+ /*.nb2 =*/ nb2,
1319
+ };
1320
+
1321
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_ssm_conv(lib, op);
1322
+
1323
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
1324
+ ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
1325
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 1);
1326
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[1]), 2);
1327
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op), 3);
1328
+
1329
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne1, ne02, 1, 1, 1);
1330
+
1331
+ return 1;
1332
+ }
1333
+
1334
+ int ggml_metal_op_ssm_scan(ggml_metal_op_t ctx, int idx) {
1335
+ ggml_tensor * op = ctx->node(idx);
1336
+
1337
+ ggml_metal_library_t lib = ctx->lib;
1338
+ ggml_metal_encoder_t enc = ctx->enc;
1339
+
1340
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
1341
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
1342
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
1343
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
1344
+ GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
1345
+ GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
1346
+ GGML_TENSOR_LOCALS( int32_t, ne3, op->src[3], ne);
1347
+ GGML_TENSOR_LOCALS(uint64_t, nb3, op->src[3], nb);
1348
+ GGML_TENSOR_LOCALS( int32_t, ne4, op->src[4], ne);
1349
+ GGML_TENSOR_LOCALS(uint64_t, nb4, op->src[4], nb);
1350
+ GGML_TENSOR_LOCALS( int32_t, ne5, op->src[5], ne);
1351
+ GGML_TENSOR_LOCALS(uint64_t, nb5, op->src[5], nb);
1352
+ GGML_TENSOR_LOCALS( int32_t, ne6, op->src[6], ne);
1353
+ GGML_TENSOR_LOCALS(uint64_t, nb6, op->src[6], nb);
1354
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
1355
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
1356
+
1357
+ const ggml_tensor * src3 = op->src[3];
1358
+ const ggml_tensor * src4 = op->src[4];
1359
+ const ggml_tensor * src5 = op->src[5];
1360
+ const ggml_tensor * src6 = op->src[6];
1361
+
1362
+ GGML_ASSERT(src3);
1363
+ GGML_ASSERT(src4);
1364
+ GGML_ASSERT(src5);
1365
+ GGML_ASSERT(src6);
1366
+
1367
+ const int64_t d_state = ne00;
1368
+ const int64_t d_inner = ne01;
1369
+ const int64_t n_head = ne02;
1370
+ const int64_t n_group = ne41;
1371
+ const int64_t n_seq_tokens = ne12;
1372
+ const int64_t n_seqs = ne13;
1373
+
1374
+ ggml_metal_kargs_ssm_scan args = {
1375
+ /*.d_state =*/ d_state,
1376
+ /*.d_inner =*/ d_inner,
1377
+ /*.n_head =*/ n_head,
1378
+ /*.n_group =*/ n_group,
1379
+ /*.n_seq_tokens =*/ n_seq_tokens,
1380
+ /*.n_seqs =*/ n_seqs,
1381
+ /*.s_off =*/ ggml_nelements(op->src[1]) * sizeof(float),
1382
+ /*.nb00 =*/ nb00,
1383
+ /*.nb01 =*/ nb01,
1384
+ /*.nb02 =*/ nb02,
1385
+ /*.nb03 =*/ nb03,
1386
+ /*.nb10 =*/ nb10,
1387
+ /*.nb11 =*/ nb11,
1388
+ /*.nb12 =*/ nb12,
1389
+ /*.ns12 =*/ nb12/nb10,
1390
+ /*.nb13 =*/ nb13,
1391
+ /*.nb20 =*/ nb20,
1392
+ /*.nb21 =*/ nb21,
1393
+ /*.ns21 =*/ nb21/nb20,
1394
+ /*.nb22 =*/ nb22,
1395
+ /*.ne30 =*/ ne30,
1396
+ /*.nb31 =*/ nb31,
1397
+ /*.nb41 =*/ nb41,
1398
+ /*.nb42 =*/ nb42,
1399
+ /*.ns42 =*/ nb42/nb40,
1400
+ /*.nb43 =*/ nb43,
1401
+ /*.nb51 =*/ nb51,
1402
+ /*.nb52 =*/ nb52,
1403
+ /*.ns52 =*/ nb52/nb50,
1404
+ /*.nb53 =*/ nb53,
1405
+ /*.nb0 =*/ nb0,
1406
+ };
1407
+
1408
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_ssm_scan(lib, op);
1409
+
1410
+ GGML_ASSERT(d_state <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
1411
+
1412
+ const size_t sms = ggml_metal_pipeline_get_smem(pipeline);
1413
+
1414
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
1415
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
1416
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
1417
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
1418
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[2]), 3);
1419
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[3]), 4);
1420
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[4]), 5);
1421
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[5]), 6);
1422
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[6]), 7);
1423
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 8);
1424
+
1425
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, sms, 0);
1426
+
1427
+ ggml_metal_encoder_dispatch_threadgroups(enc, d_inner, n_head, n_seqs, d_state, 1, 1);
1428
+
1429
+ return 1;
1430
+ }
1431
+
1432
+ int ggml_metal_op_rwkv(ggml_metal_op_t ctx, int idx) {
1433
+ ggml_tensor * op = ctx->node(idx);
1434
+
1435
+ ggml_metal_library_t lib = ctx->lib;
1436
+ ggml_metal_encoder_t enc = ctx->enc;
1437
+
1438
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
1439
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
1440
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
1441
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
1442
+
1443
+ const int64_t B = op->op == GGML_OP_RWKV_WKV6 ? op->src[5]->ne[1] : op->src[6]->ne[1];
1444
+ const int64_t T = op->src[0]->ne[2];
1445
+ const int64_t C = op->ne[0];
1446
+ const int64_t H = op->src[0]->ne[1];
1447
+
1448
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_rwkv(lib, op);
1449
+
1450
+ int ida = 0;
1451
+
1452
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
1453
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), ida++);
1454
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), ida++);
1455
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[2]), ida++);
1456
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[3]), ida++);
1457
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[4]), ida++);
1458
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[5]), ida++);
1459
+ if (op->op == GGML_OP_RWKV_WKV7) {
1460
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[6]), ida++);
1461
+ }
1462
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), ida++);
1463
+ ggml_metal_encoder_set_bytes (enc, (void *) &B, sizeof(B), ida++);
1464
+ ggml_metal_encoder_set_bytes (enc, (void *) &T, sizeof(T), ida++);
1465
+ ggml_metal_encoder_set_bytes (enc, (void *) &C, sizeof(C), ida++);
1466
+ ggml_metal_encoder_set_bytes (enc, (void *) &H, sizeof(H), ida++);
1467
+
1468
+ ggml_metal_encoder_dispatch_threadgroups(enc, B * H, 1, 1, C/H, 1, 1);
1469
+
1470
+ return 1;
1471
+ }
1472
+
1473
+ int ggml_metal_op_cpy(ggml_metal_op_t ctx, int idx) {
1474
+ ggml_tensor * op = ctx->node(idx);
1475
+
1476
+ ggml_metal_library_t lib = ctx->lib;
1477
+ ggml_metal_encoder_t enc = ctx->enc;
1478
+
1479
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
1480
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
1481
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
1482
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
1483
+
1484
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
1485
+
1486
+ GGML_ASSERT(ne00 % ggml_blck_size(op->src[0]->type) == 0);
1487
+
1488
+ int64_t nk0 = ne00;
1489
+ if (ggml_is_quantized(op->src[0]->type)) {
1490
+ nk0 = ne00/16;
1491
+ } else if (ggml_is_quantized(op->type)) {
1492
+ nk0 = ne00/ggml_blck_size(op->type);
1493
+ }
1494
+
1495
+ int nth = std::min<int>(nk0, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
1496
+
1497
+ // when rows are small, we can batch them together in a single threadgroup
1498
+ int nrptg = 1;
1499
+
1500
+ // TODO: relax this constraint in the future
1501
+ if (ggml_blck_size(op->src[0]->type) == 1 && ggml_blck_size(op->type) == 1) {
1502
+ if (nth > nk0) {
1503
+ nrptg = (nth + nk0 - 1)/nk0;
1504
+ nth = nk0;
1505
+
1506
+ if (nrptg*nth > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
1507
+ nrptg--;
1508
+ }
1509
+ }
1510
+ }
1511
+
1512
+ nth = std::min<int>(nth, nk0);
1513
+
1514
+ ggml_metal_kargs_cpy args = {
1515
+ /*.nk0 =*/ nk0,
1516
+ /*.ne00 =*/ ne00,
1517
+ /*.ne01 =*/ ne01,
1518
+ /*.ne02 =*/ ne02,
1519
+ /*.ne03 =*/ ne03,
1520
+ /*.nb00 =*/ nb00,
1521
+ /*.nb01 =*/ nb01,
1522
+ /*.nb02 =*/ nb02,
1523
+ /*.nb03 =*/ nb03,
1524
+ /*.ne0 =*/ ne0,
1525
+ /*.ne1 =*/ ne1,
1526
+ /*.ne2 =*/ ne2,
1527
+ /*.ne3 =*/ ne3,
1528
+ /*.nb0 =*/ nb0,
1529
+ /*.nb1 =*/ nb1,
1530
+ /*.nb2 =*/ nb2,
1531
+ /*.nb3 =*/ nb3,
1532
+ };
1533
+
1534
+ const int nw0 = nrptg == 1 ? (nk0 + nth - 1)/nth : 1;
1535
+
1536
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
1537
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
1538
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
1539
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
1540
+
1541
+ ggml_metal_encoder_dispatch_threadgroups(enc, nw0*(ne01 + nrptg - 1)/nrptg, ne02, ne03, nth, nrptg, 1);
1542
+
1543
+ return 1;
1544
+ }
1545
+
1546
+ int ggml_metal_op_pool_2d(ggml_metal_op_t ctx, int idx) {
1547
+ ggml_tensor * op = ctx->node(idx);
1548
+
1549
+ ggml_metal_library_t lib = ctx->lib;
1550
+ ggml_metal_encoder_t enc = ctx->enc;
1551
+
1552
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
1553
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
1554
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
1555
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
1556
+
1557
+ const int32_t * opts = op->op_params;
1558
+ ggml_op_pool op_pool = (ggml_op_pool) opts[0];
1559
+
1560
+ const int32_t k0 = opts[1];
1561
+ const int32_t k1 = opts[2];
1562
+ const int32_t s0 = opts[3];
1563
+ const int32_t s1 = opts[4];
1564
+ const int32_t p0 = opts[5];
1565
+ const int32_t p1 = opts[6];
1566
+
1567
+ const int64_t IH = op->src[0]->ne[1];
1568
+ const int64_t IW = op->src[0]->ne[0];
1569
+
1570
+ const int64_t N = op->ne[3];
1571
+ const int64_t OC = op->ne[2];
1572
+ const int64_t OH = op->ne[1];
1573
+ const int64_t OW = op->ne[0];
1574
+
1575
+ const int64_t np = N * OC * OH * OW;
1576
+
1577
+ ggml_metal_kargs_pool_2d args_pool_2d = {
1578
+ /* .k0 = */ k0,
1579
+ /* .k1 = */ k1,
1580
+ /* .s0 = */ s0,
1581
+ /* .s1 = */ s1,
1582
+ /* .p0 = */ p0,
1583
+ /* .p1 = */ p1,
1584
+ /* .IH = */ IH,
1585
+ /* .IW = */ IW,
1586
+ /* .OH = */ OH,
1587
+ /* .OW = */ OW,
1588
+ /* .np = */ np
1589
+ };
1590
+
1591
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_pool_2d(lib, op, op_pool);
1592
+
1593
+ const int nth = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), (int) np);
1594
+ const int ntg = (np + nth - 1) / nth;
1595
+
1596
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
1597
+ ggml_metal_encoder_set_bytes (enc, &args_pool_2d, sizeof(args_pool_2d), 0);
1598
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
1599
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
1600
+
1601
+ ggml_metal_encoder_dispatch_threadgroups(enc, ntg, 1, 1, nth, 1, 1);
1602
+
1603
+ return 1;
1604
+ }
1605
+
1606
+ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
1607
+ ggml_tensor * op = ctx->node(idx);
1608
+
1609
+ ggml_metal_library_t lib = ctx->lib;
1610
+ ggml_metal_encoder_t enc = ctx->enc;
1611
+
1612
+ const ggml_metal_device_props * props_dev = ggml_metal_device_get_props(ctx->dev);
1613
+
1614
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
1615
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
1616
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
1617
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
1618
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
1619
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
1620
+
1621
+ GGML_ASSERT(ne00 == ne10);
1622
+
1623
+ GGML_ASSERT(ne12 % ne02 == 0);
1624
+ GGML_ASSERT(ne13 % ne03 == 0);
1625
+
1626
+ const int16_t r2 = ne12/ne02;
1627
+ const int16_t r3 = ne13/ne03;
1628
+
1629
+ // find the break-even point where the matrix-matrix kernel becomes more efficient compared
1630
+ // to the matrix-vector kernel
1631
+ const int ne11_mm_min = 8;
1632
+
1633
+ // first try to use small-batch mat-mv kernels
1634
+ // these should be efficient for BS [2, ~8]
1635
+ if (op->src[1]->type == GGML_TYPE_F32 && (ne00%128 == 0) &&
1636
+ (
1637
+ (
1638
+ (
1639
+ op->src[0]->type == GGML_TYPE_F32 || // TODO: helper function
1640
+ op->src[0]->type == GGML_TYPE_F16 ||
1641
+ op->src[0]->type == GGML_TYPE_Q4_0 ||
1642
+ op->src[0]->type == GGML_TYPE_Q4_1 ||
1643
+ op->src[0]->type == GGML_TYPE_Q5_0 ||
1644
+ op->src[0]->type == GGML_TYPE_Q5_1 ||
1645
+ op->src[0]->type == GGML_TYPE_Q8_0 ||
1646
+ op->src[0]->type == GGML_TYPE_MXFP4 ||
1647
+ op->src[0]->type == GGML_TYPE_IQ4_NL ||
1648
+ false) && (ne11 >= 2 && ne11 <= 8)
1649
+ ) ||
1650
+ (
1651
+ (
1652
+ op->src[0]->type == GGML_TYPE_Q4_K ||
1653
+ op->src[0]->type == GGML_TYPE_Q5_K ||
1654
+ op->src[0]->type == GGML_TYPE_Q6_K ||
1655
+ false) && (ne11 >= 4 && ne11 <= 8)
1656
+ )
1657
+ )
1658
+ ) {
1659
+ // TODO: determine the optimal parameters based on grid utilization
1660
+ // I still don't know why we should not always use the maximum available threads:
1661
+ //
1662
+ // nsg = pipeline.maxTotalThreadsPerThreadgroup / 32
1663
+ //
1664
+ // my current hypothesis is that the work grid is not evenly divisible for different nsg
1665
+ // values and there can be some tail effects when nsg is high. need to confirm this
1666
+ //
1667
+ const int nsg = 2; // num simdgroups per threadgroup
1668
+
1669
+ // num threads along row per simdgroup
1670
+ int16_t nxpsg = 0;
1671
+ if (ne00 % 256 == 0 && ne11 < 3) {
1672
+ nxpsg = 16;
1673
+ } else if (ne00 % 128 == 0) {
1674
+ nxpsg = 8;
1675
+ } else {
1676
+ nxpsg = 4;
1677
+ }
1678
+
1679
+ const int16_t nypsg = 32/nxpsg; // num threads along col per simdgroup (i.e. a simdgroup processes that many src0 rows at a time)
1680
+ const int16_t r0ptg = nypsg*nsg; // num src0 rows per threadgroup
1681
+ int16_t r1ptg = 4; // num src1 rows per threadgroup
1682
+
1683
+ // note: not sure how optimal are those across all different hardware. there might be someting cleverer
1684
+ switch (ne11) {
1685
+ case 2:
1686
+ r1ptg = 2; break;
1687
+ case 3:
1688
+ case 6:
1689
+ r1ptg = 3; break;
1690
+ case 4:
1691
+ case 7:
1692
+ case 8:
1693
+ r1ptg = 4; break;
1694
+ case 5:
1695
+ r1ptg = 5; break;
1696
+ default:
1697
+ GGML_ABORT("unsupported ne11");
1698
+ };
1699
+
1700
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mv_ext(lib, op->src[0]->type, op->src[1]->type, nsg, nxpsg, r1ptg);
1701
+
1702
+ ggml_metal_kargs_mul_mv_ext args = {
1703
+ /*.ne00 =*/ ne00,
1704
+ /*.ne01 =*/ ne01,
1705
+ /*.ne02 =*/ ne02,
1706
+ /*.nb00 =*/ nb00,
1707
+ /*.nb01 =*/ nb01,
1708
+ /*.nb02 =*/ nb02,
1709
+ /*.nb03 =*/ nb03,
1710
+ /*.ne10 =*/ ne10,
1711
+ /*.ne11 =*/ ne11,
1712
+ /*.ne12 =*/ ne12,
1713
+ /*.nb10 =*/ nb10,
1714
+ /*.nb11 =*/ nb11,
1715
+ /*.nb12 =*/ nb12,
1716
+ /*.nb13 =*/ nb13,
1717
+ /*.ne0 =*/ ne0,
1718
+ /*.ne1 =*/ ne1,
1719
+ /*.r2 =*/ r2,
1720
+ /*.r3 =*/ r3,
1721
+ };
1722
+
1723
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
1724
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
1725
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
1726
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
1727
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
1728
+
1729
+ ggml_metal_encoder_dispatch_threadgroups(enc, ((ne01 + r0ptg - 1)/r0ptg), ((ne11 + r1ptg - 1)/r1ptg), ne12*ne13, 32, nsg, 1);
1730
+ } else if (
1731
+ !ggml_is_transposed(op->src[0]) &&
1732
+ !ggml_is_transposed(op->src[1]) &&
1733
+ // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
1734
+ // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
1735
+ props_dev->has_simdgroup_mm && ne00 >= 64 && ne11 > ne11_mm_min) {
1736
+ //GGML_LOG_INFO("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
1737
+
1738
+ // some Metal matrix data types require aligned pointers
1739
+ // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
1740
+ //switch (op->src[0]->type) {
1741
+ // case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break;
1742
+ // case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break;
1743
+ // case GGML_TYPE_BF16: GGML_ASSERT(nb01 % 8 == 0); break;
1744
+ // default: break;
1745
+ //}
1746
+
1747
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mm(lib, op);
1748
+
1749
+ ggml_metal_kargs_mul_mm args = {
1750
+ /*.ne00 =*/ ne00,
1751
+ /*.ne02 =*/ ne02,
1752
+ /*.nb01 =*/ nb01,
1753
+ /*.nb02 =*/ nb02,
1754
+ /*.nb03 =*/ nb03,
1755
+ /*.ne12 =*/ ne12,
1756
+ /*.nb10 =*/ nb10,
1757
+ /*.nb11 =*/ nb11,
1758
+ /*.nb12 =*/ nb12,
1759
+ /*.nb13 =*/ nb13,
1760
+ /*.ne0 =*/ ne0,
1761
+ /*.ne1 =*/ ne1,
1762
+ /*.r2 =*/ r2,
1763
+ /*.r3 =*/ r3,
1764
+ };
1765
+
1766
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
1767
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
1768
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
1769
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
1770
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
1771
+
1772
+ const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
1773
+
1774
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
1775
+ ggml_metal_encoder_dispatch_threadgroups(enc, ((ne11 + 31)/32), ((ne01 + 63)/64), ne12*ne13, 128, 1, 1);
1776
+ } else {
1777
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mv(lib, op);
1778
+
1779
+ const int nr0 = ggml_metal_pipeline_get_nr0(pipeline);
1780
+ const int nr1 = ggml_metal_pipeline_get_nr1(pipeline);
1781
+ const int nsg = ggml_metal_pipeline_get_nsg(pipeline);
1782
+
1783
+ const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
1784
+
1785
+ ggml_metal_kargs_mul_mv args = {
1786
+ /*.ne00 =*/ ne00,
1787
+ /*.ne01 =*/ ne01,
1788
+ /*.ne02 =*/ ne02,
1789
+ /*.nb00 =*/ nb00,
1790
+ /*.nb01 =*/ nb01,
1791
+ /*.nb02 =*/ nb02,
1792
+ /*.nb03 =*/ nb03,
1793
+ /*.ne10 =*/ ne10,
1794
+ /*.ne11 =*/ ne11,
1795
+ /*.ne12 =*/ ne12,
1796
+ /*.nb10 =*/ nb10,
1797
+ /*.nb11 =*/ nb11,
1798
+ /*.nb12 =*/ nb12,
1799
+ /*.nb13 =*/ nb13,
1800
+ /*.ne0 =*/ ne0,
1801
+ /*.ne1 =*/ ne1,
1802
+ /*.nr0 =*/ nr0,
1803
+ /*.r2 =*/ r2,
1804
+ /*.r3 =*/ r3,
1805
+ };
1806
+
1807
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
1808
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
1809
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
1810
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
1811
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
1812
+
1813
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
1814
+
1815
+ if (op->src[0]->type == GGML_TYPE_F32 ||
1816
+ op->src[0]->type == GGML_TYPE_F16 ||
1817
+ op->src[0]->type == GGML_TYPE_BF16 ||
1818
+ op->src[0]->type == GGML_TYPE_Q8_0) {
1819
+ ggml_metal_encoder_dispatch_threadgroups(enc, ((ne01 + nr0 - 1)/(nr0)), ((ne11 + nr1 - 1)/nr1), ne12*ne13, 32, nsg, 1);
1820
+ } else {
1821
+ ggml_metal_encoder_dispatch_threadgroups(enc, ((ne01 + nr0*nsg - 1)/(nr0*nsg)), ((ne11 + nr1 - 1)/nr1), ne12*ne13, 32, nsg, 1);
1822
+ }
1823
+ }
1824
+
1825
+ return 1;
1826
+ }
1827
+
1828
+ size_t ggml_metal_op_mul_mat_id_extra_tpe(const ggml_tensor * op) {
1829
+ assert(op->op == GGML_OP_MUL_MAT_ID);
1830
+
1831
+ const int64_t ne02 = op->src[0]->ne[2]; // n_expert
1832
+
1833
+ return ggml_type_size(GGML_TYPE_I32)*ne02;
1834
+ }
1835
+
1836
+ size_t ggml_metal_op_mul_mat_id_extra_ids(const ggml_tensor * op) {
1837
+ assert(op->op == GGML_OP_MUL_MAT_ID);
1838
+
1839
+ const int64_t ne02 = op->src[0]->ne[2]; // n_expert
1840
+ const int64_t ne21 = op->src[2]->ne[1]; // n_token
1841
+
1842
+ return ggml_type_size(GGML_TYPE_I32)*ne02*ne21;
1843
+ }
1844
+
1845
+ int ggml_metal_op_mul_mat_id(ggml_metal_op_t ctx, int idx) {
1846
+ ggml_tensor * op = ctx->node(idx);
1847
+
1848
+ ggml_metal_library_t lib = ctx->lib;
1849
+ ggml_metal_encoder_t enc = ctx->enc;
1850
+
1851
+ const ggml_metal_device_props * props_dev = ggml_metal_device_get_props(ctx->dev);
1852
+
1853
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
1854
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
1855
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
1856
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
1857
+ GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
1858
+ GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
1859
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
1860
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
1861
+
1862
+ // src2 = ids
1863
+ GGML_ASSERT(op->src[2]->type == GGML_TYPE_I32);
1864
+
1865
+ GGML_ASSERT(!ggml_is_transposed(op->src[0]));
1866
+ GGML_ASSERT(!ggml_is_transposed(op->src[1]));
1867
+
1868
+ GGML_ASSERT(ne03 == 1);
1869
+ GGML_ASSERT(ne13 == 1);
1870
+
1871
+ ggml_metal_buffer_id bid_src0 = ggml_metal_get_buffer_id(op->src[0]);
1872
+ ggml_metal_buffer_id bid_src1 = ggml_metal_get_buffer_id(op->src[1]);
1873
+ ggml_metal_buffer_id bid_src2 = ggml_metal_get_buffer_id(op->src[2]);
1874
+ ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(op);
1875
+
1876
+ const uint32_t r2 = 1;
1877
+ const uint32_t r3 = 1;
1878
+
1879
+ // find the break-even point where the matrix-matrix kernel becomes more efficient compared
1880
+ // to the matrix-vector kernel
1881
+ // ne20 = n_used_experts
1882
+ // ne21 = n_rows (batch size)
1883
+ const int ne21_mm_id_min = 32;
1884
+
1885
+ if (props_dev->has_simdgroup_mm && ne00 >= 64 && (ne21 >= ne21_mm_id_min)) {
1886
+ // some Metal matrix data types require aligned pointers
1887
+ // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
1888
+ //switch (op->src[0]->type) {
1889
+ // case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break;
1890
+ // case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break;
1891
+ // case GGML_TYPE_BF16: GGML_ASSERT(nb01 % 8 == 0); break;
1892
+ // default: break;
1893
+ //}
1894
+
1895
+ // extra buffers for intermediate id mapping
1896
+ ggml_metal_buffer_id bid_tpe = bid_dst;
1897
+ bid_tpe.offs += ggml_nbytes(op);
1898
+
1899
+ ggml_metal_buffer_id bid_ids = bid_tpe;
1900
+ bid_ids.offs += ggml_metal_op_mul_mat_id_extra_tpe(op);
1901
+
1902
+ {
1903
+ ggml_metal_kargs_mul_mm_id_map0 args = {
1904
+ ne02,
1905
+ ne10,
1906
+ ne11, // n_expert_used (bcast)
1907
+ nb11,
1908
+ nb12,
1909
+ ne21, // n_tokens
1910
+ ne20, // n_expert_used
1911
+ nb21,
1912
+ };
1913
+
1914
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mm_id_map0(lib, ne02, ne20);
1915
+
1916
+ const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
1917
+
1918
+ GGML_ASSERT(ne02 <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
1919
+
1920
+ GGML_ASSERT(smem <= props_dev->max_theadgroup_memory_size);
1921
+
1922
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
1923
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
1924
+ ggml_metal_encoder_set_buffer (enc, bid_src2, 1);
1925
+ ggml_metal_encoder_set_buffer (enc, bid_tpe, 2);
1926
+ ggml_metal_encoder_set_buffer (enc, bid_ids, 3);
1927
+
1928
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
1929
+
1930
+ ggml_metal_encoder_dispatch_threadgroups(enc, 1, 1, 1, ne02, 1, 1);
1931
+ }
1932
+
1933
+ // this barrier is always needed because the next kernel has to wait for the id maps to be computed
1934
+ ggml_metal_op_concurrency_reset(ctx);
1935
+
1936
+ {
1937
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mm_id(lib, op);
1938
+
1939
+ ggml_metal_kargs_mul_mm_id args = {
1940
+ /*.ne00 =*/ ne00,
1941
+ /*.ne02 =*/ ne02,
1942
+ /*.nb01 =*/ nb01,
1943
+ /*.nb02 =*/ nb02,
1944
+ /*.nb03 =*/ nb03,
1945
+ /*.ne11 =*/ ne11, // n_expert_used (bcast)
1946
+ /*.nb10 =*/ nb10,
1947
+ /*.nb11 =*/ nb11,
1948
+ /*.nb12 =*/ nb12,
1949
+ /*.nb13 =*/ nb13,
1950
+ /*.ne20 =*/ ne20, // n_expert_used
1951
+ /*.ne21 =*/ ne21, // n_tokens
1952
+ /*.ne0 =*/ ne0,
1953
+ /*.ne1 =*/ ne1,
1954
+ /*.r2 =*/ r2,
1955
+ /*.r3 =*/ r3,
1956
+ };
1957
+
1958
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
1959
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
1960
+ ggml_metal_encoder_set_buffer (enc, bid_src0, 1);
1961
+ ggml_metal_encoder_set_buffer (enc, bid_src1, 2);
1962
+ ggml_metal_encoder_set_buffer (enc, bid_tpe, 3);
1963
+ ggml_metal_encoder_set_buffer (enc, bid_ids, 4);
1964
+ ggml_metal_encoder_set_buffer (enc, bid_dst, 5);
1965
+
1966
+ const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
1967
+
1968
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
1969
+
1970
+ ggml_metal_encoder_dispatch_threadgroups(enc, (ne21 + 31)/32, (ne01 + 63)/64, ne02, 128, 1, 1);
1971
+ }
1972
+ } else {
1973
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mv_id(lib, op);
1974
+
1975
+ const int nr0 = ggml_metal_pipeline_get_nr0(pipeline);
1976
+ const int nr1 = ggml_metal_pipeline_get_nr1(pipeline);
1977
+ const int nsg = ggml_metal_pipeline_get_nsg(pipeline);
1978
+
1979
+ const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
1980
+
1981
+ ggml_metal_kargs_mul_mv_id args = {
1982
+ /*.nei0 =*/ ne20,
1983
+ /*.nei1 =*/ ne21,
1984
+ /*.nbi1 =*/ nb21,
1985
+ /*.ne00 =*/ ne00,
1986
+ /*.ne01 =*/ ne01,
1987
+ /*.ne02 =*/ ne02,
1988
+ /*.nb00 =*/ nb00,
1989
+ /*.nb01 =*/ nb01,
1990
+ /*.nb02 =*/ nb02,
1991
+ /*.ne10 =*/ ne10,
1992
+ /*.ne11 =*/ ne11,
1993
+ /*.ne12 =*/ ne12,
1994
+ /*.ne13 =*/ ne13,
1995
+ /*.nb10 =*/ nb10,
1996
+ /*.nb11 =*/ nb11,
1997
+ /*.nb12 =*/ nb12,
1998
+ /*.ne0 =*/ ne0,
1999
+ /*.ne1 =*/ ne1,
2000
+ /*.nb1 =*/ nb1,
2001
+ /*.nr0 =*/ nr0,
2002
+ };
2003
+
2004
+ if (ggml_is_quantized(op->src[0]->type)) {
2005
+ GGML_ASSERT(ne00 >= nsg*nr0);
2006
+ }
2007
+
2008
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
2009
+ ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
2010
+ ggml_metal_encoder_set_buffer(enc, bid_src0, 1);
2011
+ ggml_metal_encoder_set_buffer(enc, bid_src1, 2);
2012
+ ggml_metal_encoder_set_buffer(enc, bid_dst, 3);
2013
+ ggml_metal_encoder_set_buffer(enc, bid_src2, 4);
2014
+
2015
+ const int64_t _ne1 = 1;
2016
+ const int64_t ne123 = ne20*ne21;
2017
+
2018
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
2019
+
2020
+ if (op->src[0]->type == GGML_TYPE_F32 ||
2021
+ op->src[0]->type == GGML_TYPE_F16 ||
2022
+ op->src[0]->type == GGML_TYPE_BF16 ||
2023
+ op->src[0]->type == GGML_TYPE_Q8_0) {
2024
+ ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nr0 - 1)/(nr0), (_ne1 + nr1 - 1)/nr1, ne123, 32, nsg, 1);
2025
+ } else {
2026
+ ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nr0*nsg - 1)/(nr0*nsg), (_ne1 + nr1 - 1)/nr1, ne123, 32, nsg, 1);
2027
+ }
2028
+ }
2029
+
2030
+ return 1;
2031
+ }
2032
+
2033
+ int ggml_metal_op_add_id(ggml_metal_op_t ctx, int idx) {
2034
+ ggml_tensor * op = ctx->node(idx);
2035
+
2036
+ ggml_metal_library_t lib = ctx->lib;
2037
+ ggml_metal_encoder_t enc = ctx->enc;
2038
+
2039
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
2040
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
2041
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
2042
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
2043
+ GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
2044
+ GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
2045
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
2046
+
2047
+ GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
2048
+ GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
2049
+ GGML_ASSERT(op->src[2]->type == GGML_TYPE_I32);
2050
+ GGML_ASSERT(op->type == GGML_TYPE_F32);
2051
+
2052
+ GGML_ASSERT(ggml_is_contiguous_rows(op->src[0]));
2053
+
2054
+ ggml_metal_kargs_add_id args = {
2055
+ /*.ne0 =*/ ne0,
2056
+ /*.ne1 =*/ ne1,
2057
+ /*.nb01 =*/ nb01,
2058
+ /*.nb02 =*/ nb02,
2059
+ /*.nb11 =*/ nb11,
2060
+ /*.nb21 =*/ nb21,
2061
+ };
2062
+
2063
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_base(lib, GGML_OP_ADD_ID);
2064
+
2065
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
2066
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
2067
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
2068
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
2069
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[2]), 3);
2070
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 4);
2071
+
2072
+ const int nth = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne00);
2073
+
2074
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, 1, nth, 1, 1);
2075
+
2076
+ return 1;
2077
+ }
2078
+
2079
+ bool ggml_metal_op_flash_attn_ext_use_vec(const ggml_tensor * op) {
2080
+ assert(op->op == GGML_OP_FLASH_ATTN_EXT);
2081
+
2082
+ const int64_t ne00 = op->src[0]->ne[0]; // head size
2083
+ const int64_t ne01 = op->src[0]->ne[1]; // batch size
2084
+
2085
+ // use vec kernel if the batch size is small and if the head size is supported
2086
+ return (ne01 < 20) && (ne00 % 32 == 0);
2087
+ }
2088
+
2089
+ size_t ggml_metal_op_flash_attn_ext_extra_pad(const ggml_tensor * op) {
2090
+ assert(op->op == GGML_OP_FLASH_ATTN_EXT);
2091
+
2092
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
2093
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
2094
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
2095
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
2096
+ GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
2097
+ GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
2098
+ GGML_TENSOR_LOCALS( int32_t, ne3, op->src[3], ne);
2099
+ GGML_TENSOR_LOCALS(uint64_t, nb3, op->src[3], nb);
2100
+
2101
+ size_t res = 0;
2102
+
2103
+ const bool has_mask = op->src[3] != nullptr;
2104
+
2105
+ if (ggml_metal_op_flash_attn_ext_use_vec(op)) {
2106
+ // note: always reserve the padding space to avoid graph reallocations
2107
+ //const bool has_kvpad = ne11 % OP_FLASH_ATTN_EXT_VEC_NCPSG != 0;
2108
+ const bool has_kvpad = true;
2109
+
2110
+ if (has_kvpad) {
2111
+ res += OP_FLASH_ATTN_EXT_VEC_NCPSG*(
2112
+ nb11*ne12*ne13 +
2113
+ nb21*ne22*ne23 +
2114
+ (has_mask ? ggml_type_size(GGML_TYPE_F16)*ne31*ne32*ne33 : 0));
2115
+ }
2116
+ } else {
2117
+ //const bool has_kvpad = ne11 % OP_FLASH_ATTN_EXT_NCPSG != 0;
2118
+ const bool has_kvpad = true;
2119
+
2120
+ if (has_kvpad) {
2121
+ res += OP_FLASH_ATTN_EXT_NCPSG*(
2122
+ nb11*ne12*ne13 +
2123
+ nb21*ne22*ne23 +
2124
+ (has_mask ? ggml_type_size(GGML_TYPE_F16)*ne31*ne32*ne33 : 0));
2125
+ }
2126
+ }
2127
+
2128
+ return res;
2129
+ }
2130
+
2131
+ size_t ggml_metal_op_flash_attn_ext_extra_blk(const ggml_tensor * op) {
2132
+ assert(op->op == GGML_OP_FLASH_ATTN_EXT);
2133
+
2134
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
2135
+ //GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
2136
+ //GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
2137
+ //GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
2138
+ //GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
2139
+ //GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
2140
+ GGML_TENSOR_LOCALS( int32_t, ne3, op->src[3], ne);
2141
+ GGML_TENSOR_LOCALS(uint64_t, nb3, op->src[3], nb);
2142
+
2143
+ size_t res = 0;
2144
+
2145
+ const bool has_mask = op->src[3] != nullptr;
2146
+
2147
+ if (!has_mask) {
2148
+ return res;
2149
+ }
2150
+
2151
+ const bool is_vec = ggml_metal_op_flash_attn_ext_use_vec(op);
2152
+
2153
+ // this optimization is not useful for the vector kernels
2154
+ // note: always reserve the blk buffer to avoid graph reallocations
2155
+ //if (is_vec) {
2156
+ // return res;
2157
+ //}
2158
+
2159
+ const int nqptg = is_vec ? OP_FLASH_ATTN_EXT_VEC_NQPTG : OP_FLASH_ATTN_EXT_NQPTG;
2160
+ const int ncpsg = is_vec ? OP_FLASH_ATTN_EXT_VEC_NCPSG : OP_FLASH_ATTN_EXT_NCPSG;
2161
+
2162
+ const int64_t ne1 = (ne01 + nqptg - 1)/nqptg;
2163
+ const int64_t ne0 = (ne30 + ncpsg - 1)/ncpsg;
2164
+
2165
+ res += GGML_PAD(ggml_type_size(GGML_TYPE_I8)*ne0*ne1*ne32*ne33, 32);
2166
+
2167
+ return res;
2168
+ }
2169
+
2170
+ size_t ggml_metal_op_flash_attn_ext_extra_tmp(const ggml_tensor * op) {
2171
+ assert(op->op == GGML_OP_FLASH_ATTN_EXT);
2172
+
2173
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
2174
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
2175
+ //GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
2176
+ //GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
2177
+ GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
2178
+ GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
2179
+ //GGML_TENSOR_LOCALS( int32_t, ne3, op->src[3], ne);
2180
+ //GGML_TENSOR_LOCALS(uint64_t, nb3, op->src[3], nb);
2181
+
2182
+ size_t res = 0;
2183
+
2184
+ // note: always reserve the temp buffer to avoid graph reallocations
2185
+ //if (ggml_metal_op_flash_attn_ext_use_vec(op)) {
2186
+ if (true) {
2187
+ const int64_t nwg = 32;
2188
+ const int64_t ne01_max = std::min(ne01, 32);
2189
+
2190
+ // temp buffer for writing the results from each workgroup
2191
+ // - ne20: the size of the Value head
2192
+ // - + 2: the S and M values for each intermediate result
2193
+ res += ggml_type_size(GGML_TYPE_F32)*(ne01_max*ne02*ne03*nwg*(ne20 + 2));
2194
+ }
2195
+
2196
+ return res;
2197
+ }
2198
+
2199
+ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
2200
+ ggml_tensor * op = ctx->node(idx);
2201
+
2202
+ ggml_metal_library_t lib = ctx->lib;
2203
+ ggml_metal_encoder_t enc = ctx->enc;
2204
+
2205
+ const ggml_metal_device_props * props_dev = ggml_metal_device_get_props(ctx->dev);
2206
+
2207
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
2208
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
2209
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
2210
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
2211
+ GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
2212
+ GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
2213
+ GGML_TENSOR_LOCALS( int32_t, ne3, op->src[3], ne);
2214
+ GGML_TENSOR_LOCALS(uint64_t, nb3, op->src[3], nb);
2215
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
2216
+ GGML_TENSOR_LOCALS( int32_t, nb, op, nb);
2217
+
2218
+ GGML_ASSERT(ne00 % 4 == 0);
2219
+
2220
+ GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
2221
+ GGML_ASSERT(op->src[1]->type == op->src[2]->type);
2222
+
2223
+ //GGML_ASSERT(ggml_are_same_shape (src1, src2));
2224
+ GGML_ASSERT(ne11 == ne21);
2225
+ GGML_ASSERT(ne12 == ne22);
2226
+
2227
+ GGML_ASSERT(!op->src[3] || op->src[3]->type == GGML_TYPE_F16);
2228
+ GGML_ASSERT(!op->src[3] || op->src[3]->ne[1] >= op->src[0]->ne[1] &&
2229
+ "the Flash-Attention Metal kernel requires the mask to be at least n_queries big");
2230
+
2231
+ float scale;
2232
+ float max_bias;
2233
+ float logit_softcap;
2234
+
2235
+ memcpy(&scale, ((const int32_t *) op->op_params) + 0, sizeof(scale));
2236
+ memcpy(&max_bias, ((const int32_t *) op->op_params) + 1, sizeof(max_bias));
2237
+ memcpy(&logit_softcap, ((const int32_t *) op->op_params) + 2, sizeof(logit_softcap));
2238
+
2239
+ if (logit_softcap != 0.0f) {
2240
+ scale /= logit_softcap;
2241
+ }
2242
+
2243
+ const bool has_mask = op->src[3] != NULL;
2244
+ const bool has_sinks = op->src[4] != NULL;
2245
+ const bool has_bias = max_bias != 0.0f;
2246
+ const bool has_scap = logit_softcap != 0.0f;
2247
+
2248
+ const uint32_t n_head = op->src[0]->ne[2];
2249
+ const int32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
2250
+
2251
+ const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
2252
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
2253
+
2254
+ GGML_ASSERT(ne01 < 65536);
2255
+
2256
+ ggml_metal_buffer_id bid_src0 = ggml_metal_get_buffer_id(op->src[0]);
2257
+ ggml_metal_buffer_id bid_src1 = ggml_metal_get_buffer_id(op->src[1]);
2258
+ ggml_metal_buffer_id bid_src2 = ggml_metal_get_buffer_id(op->src[2]);
2259
+ ggml_metal_buffer_id bid_src3 = has_mask ? ggml_metal_get_buffer_id(op->src[3]) : bid_src0;
2260
+ ggml_metal_buffer_id bid_src4 = has_sinks ? ggml_metal_get_buffer_id(op->src[4]) : bid_src0;
2261
+
2262
+ ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(op);
2263
+
2264
+ ggml_metal_buffer_id bid_pad = bid_dst;
2265
+ bid_pad.offs += ggml_nbytes(op);
2266
+
2267
+ ggml_metal_buffer_id bid_blk = bid_pad;
2268
+ bid_blk.offs += ggml_metal_op_flash_attn_ext_extra_pad(op);
2269
+
2270
+ ggml_metal_buffer_id bid_tmp = bid_blk;
2271
+ bid_tmp.offs += ggml_metal_op_flash_attn_ext_extra_blk(op);
2272
+
2273
+ if (!ggml_metal_op_flash_attn_ext_use_vec(op)) {
2274
+ // half8x8 kernel
2275
+ const int nqptg = OP_FLASH_ATTN_EXT_NQPTG; // queries per threadgroup
2276
+ const int ncpsg = OP_FLASH_ATTN_EXT_NCPSG; // cache values per simdgroup
2277
+
2278
+ GGML_ASSERT(nqptg <= 32);
2279
+ GGML_ASSERT(nqptg % 8 == 0);
2280
+ GGML_ASSERT(ncpsg % 32 == 0);
2281
+
2282
+ bool need_sync = false;
2283
+
2284
+ const bool has_kvpad = ne11 % ncpsg != 0;
2285
+
2286
+ if (has_kvpad) {
2287
+ assert(ggml_metal_op_flash_attn_ext_extra_pad(op) != 0);
2288
+
2289
+ ggml_metal_kargs_flash_attn_ext_pad args0 = {
2290
+ /*.ne11 =*/ne11,
2291
+ /*.ne_12_2 =*/ne12,
2292
+ /*.ne_12_3 =*/ne13,
2293
+ /*.nb11 =*/nb11,
2294
+ /*.nb12 =*/nb12,
2295
+ /*.nb13 =*/nb13,
2296
+ /*.nb21 =*/nb21,
2297
+ /*.nb22 =*/nb22,
2298
+ /*.nb23 =*/nb23,
2299
+ /*.ne31 =*/ne31,
2300
+ /*.ne32 =*/ne32,
2301
+ /*.ne33 =*/ne33,
2302
+ /*.nb31 =*/nb31,
2303
+ /*.nb32 =*/nb32,
2304
+ /*.nb33 =*/nb33,
2305
+ };
2306
+
2307
+ ggml_metal_pipeline_t pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_pad(lib, op, has_mask, ncpsg);
2308
+
2309
+ ggml_metal_encoder_set_pipeline(enc, pipeline0);
2310
+ ggml_metal_encoder_set_bytes (enc, &args0, sizeof(args0), 0);
2311
+ ggml_metal_encoder_set_buffer (enc, bid_src1, 1);
2312
+ ggml_metal_encoder_set_buffer (enc, bid_src2, 2);
2313
+ ggml_metal_encoder_set_buffer (enc, bid_src3, 3);
2314
+ ggml_metal_encoder_set_buffer (enc, bid_pad, 4);
2315
+
2316
+ assert(ne12 == ne22);
2317
+ assert(ne13 == ne23);
2318
+
2319
+ ggml_metal_encoder_dispatch_threadgroups(enc, ncpsg, std::max(ne12, ne32), std::max(ne13, ne33), 32, 1, 1);
2320
+
2321
+ need_sync = true;
2322
+ }
2323
+
2324
+ if (has_mask) {
2325
+ assert(ggml_metal_op_flash_attn_ext_extra_blk(op) != 0);
2326
+
2327
+ ggml_metal_kargs_flash_attn_ext_blk args0 = {
2328
+ /*.ne01 =*/ ne01,
2329
+ /*.ne30 =*/ ne30,
2330
+ /*.ne31 =*/ ne31,
2331
+ /*.ne32 =*/ ne32,
2332
+ /*.ne33 =*/ ne33,
2333
+ /*.nb31 =*/ nb31,
2334
+ /*.nb32 =*/ nb32,
2335
+ /*.nb33 =*/ nb33,
2336
+ };
2337
+
2338
+ ggml_metal_pipeline_t pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_blk(lib, op, nqptg, ncpsg);
2339
+
2340
+ ggml_metal_encoder_set_pipeline(enc, pipeline0);
2341
+ ggml_metal_encoder_set_bytes (enc, &args0, sizeof(args0), 0);
2342
+ ggml_metal_encoder_set_buffer (enc, bid_src3, 1);
2343
+ ggml_metal_encoder_set_buffer (enc, bid_blk, 2);
2344
+
2345
+ const int32_t nblk1 = ((ne01 + nqptg - 1)/nqptg);
2346
+ const int32_t nblk0 = ((ne30 + ncpsg - 1)/ncpsg);
2347
+
2348
+ ggml_metal_encoder_dispatch_threadgroups(enc, nblk0, nblk1, ne32*ne33, 32, 1, 1);
2349
+
2350
+ need_sync = true;
2351
+ }
2352
+
2353
+ if (need_sync) {
2354
+ ggml_metal_op_concurrency_reset(ctx);
2355
+ }
2356
+
2357
+ const int is_q = ggml_is_quantized(op->src[1]->type) ? 1 : 0;
2358
+
2359
+ // 2*(2*ncpsg)
2360
+ // ncpsg soft_max values + ncpsg mask values
2361
+ //
2362
+ // 16*32*(nsg)
2363
+ // the shared memory needed for the simdgroups to load the KV cache
2364
+ // each thread loads (dequantizes) 16 head elements, there are 32 threads in the SG
2365
+ //
2366
+ #define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*GGML_PAD(ne20, 64) + 2*(2*ncpsg)) + is_q*(16*32*(nsg)))*(sizeof(float)/2), 16))
2367
+
2368
+ //int64_t nsgmax = 4;
2369
+ //
2370
+ //if (is_q) {
2371
+ // nsgmax = 2;
2372
+ // while (true) {
2373
+ // const size_t smem = FATTN_SMEM(nsgmax);
2374
+ // if (smem > props_dev->max_theadgroup_memory_size) {
2375
+ // break;
2376
+ // }
2377
+ // nsgmax *= 2;
2378
+ // }
2379
+ // nsgmax /= 2;
2380
+ //}
2381
+
2382
+ // simdgroups per threadgroup (a.k.a. warps)
2383
+ //nsg = ne01 <= nqptg ? MAX(4, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))) : 4;
2384
+ int32_t nsg = 4;
2385
+
2386
+ const size_t smem = FATTN_SMEM(nsg);
2387
+
2388
+ ggml_metal_kargs_flash_attn_ext args = {
2389
+ /*.ne01 =*/ ne01,
2390
+ /*.ne02 =*/ ne02,
2391
+ /*.ne03 =*/ ne03,
2392
+ /*.nb01 =*/ nb01,
2393
+ /*.nb02 =*/ nb02,
2394
+ /*.nb03 =*/ nb03,
2395
+ /*.ne11 =*/ ne11,
2396
+ /*.ne_12_2 =*/ ne12,
2397
+ /*.ne_12_3 =*/ ne13,
2398
+ /*.ns10 =*/ int32_t(nb11/nb10),
2399
+ /*.nb11 =*/ nb11,
2400
+ /*.nb12 =*/ nb12,
2401
+ /*.nb13 =*/ nb13,
2402
+ /*.ns20 =*/ int32_t(nb21/nb20),
2403
+ /*.nb21 =*/ nb21,
2404
+ /*.nb22 =*/ nb22,
2405
+ /*.nb23 =*/ nb23,
2406
+ /*.ne31 =*/ ne31,
2407
+ /*.ne32 =*/ ne32,
2408
+ /*.ne33 =*/ ne33,
2409
+ /*.nb31 =*/ nb31,
2410
+ /*.nb32 =*/ nb32,
2411
+ /*.nb33 =*/ nb33,
2412
+ /*.ne1 =*/ ne1,
2413
+ /*.ne2 =*/ ne2,
2414
+ /*.ne3 =*/ ne3,
2415
+ /*.scale =*/ scale,
2416
+ /*.max_bias =*/ max_bias,
2417
+ /*.m0 =*/ m0,
2418
+ /*.m1 =*/ m1,
2419
+ /*.n_head_log2 =*/ n_head_log2,
2420
+ /*.logit_softcap =*/ logit_softcap,
2421
+ };
2422
+
2423
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_flash_attn_ext(lib, op, has_mask, has_sinks, has_bias, has_scap, has_kvpad, nsg);
2424
+
2425
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
2426
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
2427
+ ggml_metal_encoder_set_buffer (enc, bid_src0, 1);
2428
+ ggml_metal_encoder_set_buffer (enc, bid_src1, 2);
2429
+ ggml_metal_encoder_set_buffer (enc, bid_src2, 3);
2430
+ ggml_metal_encoder_set_buffer (enc, bid_src3, 4);
2431
+ ggml_metal_encoder_set_buffer (enc, bid_src4, 5);
2432
+ ggml_metal_encoder_set_buffer (enc, bid_pad, 6);
2433
+ ggml_metal_encoder_set_buffer (enc, bid_blk, 7);
2434
+ ggml_metal_encoder_set_buffer (enc, bid_dst, 8);
2435
+
2436
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
2437
+
2438
+ ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, ne02, ne03, 32, nsg, 1);
2439
+ #undef FATTN_SMEM
2440
+ } else {
2441
+ // half4x4 kernel
2442
+ const int nqptg = OP_FLASH_ATTN_EXT_VEC_NQPTG; // queries per threadgroup
2443
+ const int ncpsg = OP_FLASH_ATTN_EXT_VEC_NCPSG; // cache values per simdgroup !! sync with kernel template arguments !!
2444
+ const int nkpsg = 1*ncpsg;
2445
+
2446
+ GGML_ASSERT(nqptg <= 32);
2447
+ GGML_ASSERT(nqptg % 1 == 0);
2448
+ GGML_ASSERT(ncpsg % 32 == 0);
2449
+
2450
+ bool need_sync = false;
2451
+
2452
+ const bool has_kvpad = ne11 % ncpsg != 0;
2453
+
2454
+ if (has_kvpad) {
2455
+ assert(ggml_metal_op_flash_attn_ext_extra_pad(op) != 0);
2456
+
2457
+ ggml_metal_kargs_flash_attn_ext_pad args0 = {
2458
+ /*.ne11 =*/ne11,
2459
+ /*.ne_12_2 =*/ne12,
2460
+ /*.ne_12_3 =*/ne13,
2461
+ /*.nb11 =*/nb11,
2462
+ /*.nb12 =*/nb12,
2463
+ /*.nb13 =*/nb13,
2464
+ /*.nb21 =*/nb21,
2465
+ /*.nb22 =*/nb22,
2466
+ /*.nb23 =*/nb23,
2467
+ /*.ne31 =*/ne31,
2468
+ /*.ne32 =*/ne32,
2469
+ /*.ne33 =*/ne33,
2470
+ /*.nb31 =*/nb31,
2471
+ /*.nb32 =*/nb32,
2472
+ /*.nb33 =*/nb33,
2473
+ };
2474
+
2475
+ ggml_metal_pipeline_t pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_pad(lib, op, has_mask, ncpsg);
2476
+
2477
+ ggml_metal_encoder_set_pipeline(enc, pipeline0);
2478
+ ggml_metal_encoder_set_bytes (enc, &args0, sizeof(args0), 0);
2479
+ ggml_metal_encoder_set_buffer (enc, bid_src1, 1);
2480
+ ggml_metal_encoder_set_buffer (enc, bid_src2, 2);
2481
+ ggml_metal_encoder_set_buffer (enc, bid_src3, 3);
2482
+ ggml_metal_encoder_set_buffer (enc, bid_pad, 4);
2483
+
2484
+ assert(ne12 == ne22);
2485
+ assert(ne13 == ne23);
2486
+
2487
+ ggml_metal_encoder_dispatch_threadgroups(enc, ncpsg, std::max(ne12, ne32), std::max(ne13, ne33), 32, 1, 1);
2488
+
2489
+ need_sync = true;
2490
+ }
2491
+
2492
+ if (need_sync) {
2493
+ ggml_metal_op_concurrency_reset(ctx);
2494
+ }
2495
+
2496
+ // ne00 + 2*ncpsg*(nsg)
2497
+ // for each query, we load it as f16 in shared memory (ne00)
2498
+ // and store the soft_max values and the mask
2499
+ //
2500
+ // ne20*(nsg)
2501
+ // each simdgroup has a full f32 head vector in shared mem to accumulate results
2502
+ //
2503
+ #define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(GGML_PAD(ne00, 128) + 4*ncpsg*(nsg)) + 2*GGML_PAD(ne20, 128)*(nsg))*(sizeof(float)/2), 16))
2504
+
2505
+ int64_t nsgmax = 2;
2506
+ while (true) {
2507
+ const size_t smem = FATTN_SMEM(nsgmax);
2508
+ // avoid using more than half of the threadgroup memory - can cause slowdowns, especially for large head sizes
2509
+ if (smem > props_dev->max_theadgroup_memory_size/2) {
2510
+ break;
2511
+ }
2512
+ nsgmax *= 2;
2513
+ }
2514
+ nsgmax /= 2;
2515
+
2516
+ // simdgroups per threadgroup (a.k.a. warps)
2517
+ //const int64_t nsgt = MAX(2, MIN(nsgmax, MIN((ne11 + nkpsg - 1)/(nkpsg), (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32)));
2518
+ const int64_t nsgt = MAX(2, MIN(nsgmax, MIN((ne11 + nkpsg - 1)/(nkpsg), (int64_t) 1024/32)));
2519
+
2520
+ int64_t nsg = 1;
2521
+ while (nsg <= nsgt) {
2522
+ nsg *= 2;
2523
+ }
2524
+ nsg /= 2;
2525
+
2526
+ // workgroups
2527
+ // each workgroup handles nsg*nkpsg cache values
2528
+ int32_t nwg = 1;
2529
+ if (false) {
2530
+ // for small KV caches, we could launch a single workgroup and write the results directly to dst
2531
+ // however, this does not lead to significant improvement, so disabled
2532
+ nwg = 1;
2533
+ nsg = 4;
2534
+ } else {
2535
+ nwg = 32;
2536
+ nsg = 1;
2537
+ while (2*nwg*nsg*nkpsg < ne11 && nsg < 4) {
2538
+ nsg *= 2;
2539
+ }
2540
+ }
2541
+
2542
+ ggml_metal_kargs_flash_attn_ext_vec args = {
2543
+ /*.ne01 =*/ ne01,
2544
+ /*.ne02 =*/ ne02,
2545
+ /*.ne03 =*/ ne03,
2546
+ /*.nb01 =*/ nb01,
2547
+ /*.nb02 =*/ nb02,
2548
+ /*.nb03 =*/ nb03,
2549
+ /*.ne11 =*/ ne11,
2550
+ /*.ne_12_2 =*/ ne12,
2551
+ /*.ne_12_3 =*/ ne13,
2552
+ /*.ns10 =*/ int32_t(nb11/nb10),
2553
+ /*.nb11 =*/ nb11,
2554
+ /*.nb12 =*/ nb12,
2555
+ /*.nb13 =*/ nb13,
2556
+ /*.ns20 =*/ int32_t(nb21/nb20),
2557
+ /*.nb21 =*/ nb21,
2558
+ /*.nb22 =*/ nb22,
2559
+ /*.nb23 =*/ nb23,
2560
+ /*.ne31 =*/ ne31,
2561
+ /*.ne32 =*/ ne32,
2562
+ /*.ne33 =*/ ne33,
2563
+ /*.nb31 =*/ nb31,
2564
+ /*.nb32 =*/ nb32,
2565
+ /*.nb33 =*/ nb33,
2566
+ /*.ne1 =*/ ne1,
2567
+ /*.ne2 =*/ ne2,
2568
+ /*.ne3 =*/ ne3,
2569
+ /*.scale =*/ scale,
2570
+ /*.max_bias =*/ max_bias,
2571
+ /*.m0 =*/ m0,
2572
+ /*.m1 =*/ m1,
2573
+ /*.n_head_log2 =*/ n_head_log2,
2574
+ /*.logit_softcap =*/ logit_softcap,
2575
+ };
2576
+
2577
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_flash_attn_ext_vec(lib, op, has_mask, has_sinks, has_bias, has_scap, has_kvpad, nsg, nwg);
2578
+
2579
+ GGML_ASSERT(nsg*32 <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
2580
+
2581
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
2582
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
2583
+ ggml_metal_encoder_set_buffer (enc, bid_src0, 1);
2584
+ ggml_metal_encoder_set_buffer (enc, bid_src1, 2);
2585
+ ggml_metal_encoder_set_buffer (enc, bid_src2, 3);
2586
+ ggml_metal_encoder_set_buffer (enc, bid_src3, 4);
2587
+ ggml_metal_encoder_set_buffer (enc, bid_src4, 5);
2588
+
2589
+ const size_t smem = FATTN_SMEM(nsg);
2590
+
2591
+ //printf("smem: %zu, max: %zu, nsg = %d, nsgmax = %d\n", smem, props_dev->max_theadgroup_memory_size, (int) nsg, (int) nsgmax);
2592
+ GGML_ASSERT(smem <= props_dev->max_theadgroup_memory_size);
2593
+
2594
+ if (nwg == 1) {
2595
+ assert(ggml_metal_op_flash_attn_ext_extra_tmp(op) == 0);
2596
+
2597
+ // using 1 workgroup -> write the result directly into dst
2598
+ ggml_metal_encoder_set_buffer(enc, bid_pad, 6);
2599
+ ggml_metal_encoder_set_buffer(enc, bid_dst, 7);
2600
+
2601
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
2602
+
2603
+ ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, ne02, ne03*nwg, 32, nsg, 1);
2604
+ } else {
2605
+ // sanity checks
2606
+ assert(ggml_metal_op_flash_attn_ext_extra_tmp(op) != 0);
2607
+
2608
+ GGML_ASSERT(ne01*ne02*ne03 == ne1*ne2*ne3);
2609
+ GGML_ASSERT((uint64_t)ne1*ne2*ne3 <= (1u << 31));
2610
+
2611
+ // write the results from each workgroup into a temp buffer
2612
+ ggml_metal_encoder_set_buffer(enc, bid_pad, 6);
2613
+ ggml_metal_encoder_set_buffer(enc, bid_tmp, 7);
2614
+
2615
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
2616
+ ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, ne02, ne03*nwg, 32, nsg, 1);
2617
+
2618
+ // sync the 2 kernels
2619
+ ggml_metal_op_concurrency_reset(ctx);
2620
+
2621
+ // reduce the results from the workgroups
2622
+ {
2623
+ const int32_t nrows = ne1*ne2*ne3;
2624
+
2625
+ ggml_metal_kargs_flash_attn_ext_vec_reduce args0 = {
2626
+ nrows,
2627
+ };
2628
+
2629
+ ggml_metal_pipeline_t pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_vec_reduce(lib, op, ne20, nwg);
2630
+
2631
+ ggml_metal_encoder_set_pipeline(enc, pipeline0);
2632
+ ggml_metal_encoder_set_bytes (enc, &args0, sizeof(args0), 0);
2633
+ ggml_metal_encoder_set_buffer (enc, bid_tmp, 1);
2634
+ ggml_metal_encoder_set_buffer (enc, bid_dst, 2);
2635
+
2636
+ ggml_metal_encoder_dispatch_threadgroups(enc, nrows, 1, 1, 32*nwg, 1, 1);
2637
+ }
2638
+ }
2639
+ #undef FATTN_SMEM
2640
+ }
2641
+
2642
+ return 1;
2643
+ }
2644
+
2645
+ int ggml_metal_op_bin(ggml_metal_op_t ctx, int idx) { // Encodes the binary-op kernel for graph node idx, optionally fused with a chain of following ADD nodes; returns the number of graph nodes consumed (n_fuse).
2646
+ ggml_tensor * op = ctx->node(idx);
2647
+
2648
+ ggml_metal_library_t lib = ctx->lib;
2649
+ ggml_metal_encoder_t enc = ctx->enc;
2650
+
2651
+ const bool use_fusion = ctx->use_fusion;
2652
+
2653
+ const int debug_fusion = ctx->debug_fusion;
2654
+
2655
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
2656
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
2657
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
2658
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
2659
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
2660
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
2661
+
2662
+ GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
2663
+ GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
2664
+
2665
+ GGML_ASSERT(ggml_is_contiguous_rows(op->src[0]));
2666
+ GGML_ASSERT(ggml_is_contiguous_rows(op->src[1]));
2667
+
2668
+ bool bcast_row = false; // set below when src1 is a single contiguous row and the row-broadcast pipeline is selected
2669
+
2670
+ ggml_metal_buffer_id bid_src0 = ggml_metal_get_buffer_id(op->src[0]);
2671
+ ggml_metal_buffer_id bid_src1 = ggml_metal_get_buffer_id(op->src[1]);
2672
+ ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(op);
2673
+
2674
+ ggml_metal_kargs_bin args = {
2675
+ /*.ne00 =*/ ne00,
2676
+ /*.ne01 =*/ ne01,
2677
+ /*.ne02 =*/ ne02,
2678
+ /*.ne03 =*/ ne03,
2679
+ /*.nb00 =*/ nb00,
2680
+ /*.nb01 =*/ nb01,
2681
+ /*.nb02 =*/ nb02,
2682
+ /*.nb03 =*/ nb03,
2683
+ /*.ne10 =*/ ne10,
2684
+ /*.ne11 =*/ ne11,
2685
+ /*.ne12 =*/ ne12,
2686
+ /*.ne13 =*/ ne13,
2687
+ /*.nb10 =*/ nb10,
2688
+ /*.nb11 =*/ nb11,
2689
+ /*.nb12 =*/ nb12,
2690
+ /*.nb13 =*/ nb13,
2691
+ /*.ne0 =*/ ne0,
2692
+ /*.ne1 =*/ ne1,
2693
+ /*.ne2 =*/ ne2,
2694
+ /*.ne3 =*/ ne3,
2695
+ /*.nb0 =*/ nb0,
2696
+ /*.nb1 =*/ nb1,
2697
+ /*.nb2 =*/ nb2,
2698
+ /*.nb3 =*/ nb3,
2699
+ /*.offs =*/ 0,
2700
+ /*.o1 =*/ { bid_src1.offs }, // o1[0] is the offset of src1; o1[1..] are filled in by the fusion loop below
2701
+ };
2702
+
2703
+ ggml_op fops[8];
2704
+
2705
+ int n_fuse = 1; // number of graph nodes covered by this dispatch (1 = no fusion)
2706
+
2707
+ // fusion pattern - a chain of ADDs where each result feeds the next node as src0:
2708
+ // c[0] = add(a, b[0])
2709
+ // c[1] = add(c[0], b[1])
2710
+ // c[2] = add(c[1], b[2])
2711
+ if (use_fusion) {
2712
+ fops[0] = GGML_OP_ADD;
2713
+ fops[1] = GGML_OP_ADD;
2714
+ fops[2] = GGML_OP_ADD;
2715
+ fops[3] = GGML_OP_ADD;
2716
+ fops[4] = GGML_OP_ADD;
2717
+ fops[5] = GGML_OP_ADD;
2718
+ fops[6] = GGML_OP_ADD;
2719
+ fops[7] = GGML_OP_ADD;
2720
+
2721
+ // note: in metal, we sometimes encode the graph in parallel so we have to avoid fusing ops
2722
+ // across splits. NOTE(review): idx_end is not referenced in this function - presumably ctx->can_fuse() enforces the split boundary; confirm
2723
+ for (n_fuse = 0; n_fuse <= 6; ++n_fuse) { // at most 7 extensions, i.e. 8 nodes in total (matches fops[8])
2724
+ if (!ctx->can_fuse(idx + n_fuse, fops + n_fuse, 2)) {
2725
+ break;
2726
+ }
2727
+
2728
+ ggml_tensor * f0 = ctx->node(idx + n_fuse);
2729
+ ggml_tensor * f1 = ctx->node(idx + n_fuse + 1);
2730
+
2731
+ if (f0 != f1->src[0]) { // the next node must take the previous result as its src0
2732
+ break;
2733
+ }
2734
+
2735
+ // b[0] === b[1] === ... (every fused src1 operand must have the same layout)
2736
+ if (!ggml_are_same_layout(f0->src[1], f1->src[1])) {
2737
+ break;
2738
+ }
2739
+
2740
+ // only fuse ops if src1 is in the same Metal buffer
2741
+ ggml_metal_buffer_id bid_fuse = ggml_metal_get_buffer_id(f1->src[1]);
2742
+ if (bid_fuse.metal != bid_src1.metal) {
2743
+ break;
2744
+ }
2745
+
2746
+ //ctx->fuse_cnt[ops[n_fuse + 1]->op]++;
2747
+
2748
+ args.o1[n_fuse + 1] = bid_fuse.offs;
2749
+ }
2750
+
2751
+ ++n_fuse; // turn the count of successful extensions into a node count
2752
+
2753
+ if (debug_fusion > 1 && n_fuse > 1) {
2754
+ GGML_LOG_DEBUG("%s: fuse: ADD x %d\n", __func__, n_fuse);
2755
+ }
2756
+ }
2757
+
2758
+ // the offsets of src1 and all fused buffers are relative to the start of the src1 buffer
2759
+ bid_src1.offs = 0;
2760
+
2761
+ ggml_metal_pipeline_t pipeline = nullptr;
2762
+
2763
+ if (ggml_nelements(op->src[1]) == ne10 && ggml_is_contiguous(op->src[1]) && ne00 % 4 == 0 && ne10 % 4 == 0) {
2764
+ GGML_ASSERT(ggml_is_contiguous(op->src[0]));
2765
+
2766
+ // src1 is a row
2767
+ GGML_ASSERT(ne11 == 1);
2768
+
2769
+ pipeline = ggml_metal_library_get_pipeline_bin(lib, op->op, n_fuse, true);
2770
+
2771
+ bcast_row = true;
2772
+ } else {
2773
+ pipeline = ggml_metal_library_get_pipeline_bin(lib, op->op, n_fuse, false);
2774
+ }
2775
+
2776
+ if (n_fuse > 1) { // fused: the result is written to the last node of the chain
2777
+ bid_dst = ggml_metal_get_buffer_id(ctx->node(idx + n_fuse - 1));
2778
+
2779
+ for (int i = 1; i < n_fuse; ++i) { // NOTE(review): presumably resets concurrency tracking once any fused node conflicts with pending work - confirm against ggml_metal_op_concurrency_check
2780
+ if (!ggml_metal_op_concurrency_check(ctx, ctx->node(idx + i))) {
2781
+ ggml_metal_op_concurrency_reset(ctx);
2782
+
2783
+ break;
2784
+ }
2785
+ }
2786
+ }
2787
+
2788
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
2789
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
2790
+ ggml_metal_encoder_set_buffer (enc, bid_src0, 1);
2791
+ ggml_metal_encoder_set_buffer (enc, bid_src1, 2);
2792
+ ggml_metal_encoder_set_buffer (enc, bid_dst, 3);
2793
+
2794
+ if (bcast_row) {
2795
+ const int64_t n = ggml_nelements(op)/4; // number of 4-element groups in the output
2796
+
2797
+ ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1); // n threadgroups of a single thread each
2798
+ } else {
2799
+ int nth = 32;
2800
+
2801
+ while (16*nth < ne0 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) { // double nth while 16*nth < ne0, bounded by the pipeline limit
2802
+ nth *= 2;
2803
+ }
2804
+
2805
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
2806
+ }
2807
+
2808
+ return n_fuse;
2809
+ }
2810
+
2811
+ int ggml_metal_op_l2_norm(ggml_metal_op_t ctx, int idx) { // Encodes the L2-norm kernel for graph node idx, dispatching one threadgroup per row of src0; returns 1 (a single node is consumed).
2812
+ ggml_tensor * op = ctx->node(idx);
2813
+
2814
+ ggml_metal_library_t lib = ctx->lib;
2815
+ ggml_metal_encoder_t enc = ctx->enc;
2816
+
2817
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
2818
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
2819
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
2820
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
2821
+
2822
+ float eps;
2823
+ memcpy(&eps, op->op_params, sizeof(float)); // eps is read from the start of op_params
2824
+
2825
+ int nth = 32; // SIMD width (starting thread count, adjusted below)
2826
+
2827
+ ggml_metal_kargs_l2_norm args = {
2828
+ /*.ne00 =*/ ne00,
2829
+ /*.ne00_4 =*/ ne00/4, // NOTE(review): integer division - presumably ne00 is a multiple of 4; confirm
2830
+ /*.nb01 =*/ nb01,
2831
+ /*.eps =*/ eps,
2832
+ };
2833
+
2834
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_l2_norm(lib, op);
2835
+
2836
+ while (nth < ne00/4 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) { // grow nth in powers of two until it reaches ne00/4 or the pipeline limit
2837
+ nth *= 2;
2838
+ }
2839
+
2840
+ nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
2841
+ nth = std::min(nth, ne00/4); // never use more threads than ne00/4
2842
+
2843
+ const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
2844
+
2845
+ const int64_t nrows = ggml_nrows(op->src[0]);
2846
+
2847
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
2848
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
2849
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
2850
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
2851
+
2852
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
2853
+
2854
+ ggml_metal_encoder_dispatch_threadgroups(enc, nrows, 1, 1, nth, 1, 1); // nrows threadgroups of nth threads
2855
+
2856
+ return 1;
2857
+ }
2858
+
2859
+ int ggml_metal_op_group_norm(ggml_metal_op_t ctx, int idx) { // Encodes the group-norm kernel for graph node idx, dispatching ngrp threadgroups of 32 threads; returns 1 (a single node is consumed).
2860
+ ggml_tensor * op = ctx->node(idx);
2861
+
2862
+ ggml_metal_library_t lib = ctx->lib;
2863
+ ggml_metal_encoder_t enc = ctx->enc;
2864
+
2865
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
2866
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
2867
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
2868
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
2869
+
2870
+ const int32_t ngrp = ((const int32_t *) op->op_params)[0]; // op_params[0] holds ngrp
2871
+
2872
+ float eps;
2873
+ memcpy(&eps, op->op_params + 1, sizeof(float)); // eps is read from op_params + 1 (element index 1 - assumes op_params is an int32_t array; confirm)
2874
+
2875
+ ggml_metal_kargs_group_norm args = {
2876
+ /*.ne00 =*/ ne00,
2877
+ /*.ne01 =*/ ne01,
2878
+ /*.ne02 =*/ ne02,
2879
+ /*.nb00 =*/ nb00,
2880
+ /*.nb01 =*/ nb01,
2881
+ /*.nb02 =*/ nb02,
2882
+ /*.ngrp =*/ ngrp,
2883
+ /*.eps =*/ eps,
2884
+ };
2885
+
2886
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_group_norm(lib, op);
2887
+
2888
+ int nth = 32; // SIMD width (kept fixed - the scaling logic below is disabled)
2889
+ //while (nth < ne00/4 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
2890
+ // nth *= 2;
2891
+ //}
2892
+
2893
+ //nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
2894
+ //nth = std::min(nth, ne00/4);
2895
+
2896
+ const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
2897
+
2898
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
2899
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
2900
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
2901
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
2902
+
2903
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
2904
+
2905
+ ggml_metal_encoder_dispatch_threadgroups(enc, ngrp, 1, 1, nth, 1, 1); // ngrp threadgroups of nth threads
2906
+
2907
+ return 1;
2908
+ }
2909
+
2910
+ int ggml_metal_op_norm(ggml_metal_op_t ctx, int idx) { // Encodes the norm kernel for graph node idx, optionally fused with a following MUL and ADD; returns the number of graph nodes consumed (n_fuse).
2911
+ ggml_tensor * op = ctx->node(idx);
2912
+
2913
+ ggml_metal_library_t lib = ctx->lib;
2914
+ ggml_metal_encoder_t enc = ctx->enc;
2915
+
2916
+ const bool use_fusion = ctx->use_fusion;
2917
+
2918
+ const int debug_fusion = ctx->debug_fusion;
2919
+
2920
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
2921
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
2922
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
2923
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
2924
+
2925
+ float eps;
2926
+ memcpy(&eps, op->op_params, sizeof(float)); // eps is read from the start of op_params
2927
+
2928
+ ggml_metal_buffer_id bid_src0 = ggml_metal_get_buffer_id(op->src[0]);
2929
+ ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(op);
2930
+
2931
+ ggml_metal_kargs_norm args = {
2932
+ /*.ne00 =*/ ne00,
2933
+ /*.ne00_t =*/ ne00 % 4 == 0 ? ne00/4 : ne00, // ne00/4 when ne00 is divisible by 4, otherwise ne00
2934
+ /*.nb1 =*/ nb1,
2935
+ /*.nb2 =*/ nb2,
2936
+ /*.nb3 =*/ nb3,
2937
+ /*.eps =*/ eps,
2938
+ /*.nef1 =*/ { ne01 }, // index 0 of nef*/nbf* describes src0; indices 1.. are filled in for fused operands
2939
+ /*.nef2 =*/ { ne02 },
2940
+ /*.nef3 =*/ { ne03 },
2941
+ /*.nbf1 =*/ { nb01 },
2942
+ /*.nbf2 =*/ { nb02 },
2943
+ /*.nbf3 =*/ { nb03 },
2944
+ };
2945
+
2946
+ ggml_op fops[8];
2947
+
2948
+ int n_fuse = 1; // number of graph nodes covered by this dispatch (1 = no fusion)
2949
+
2950
+ ggml_metal_buffer_id bid_fuse[2] = { bid_src0, bid_src0 }; // default to src0 so that buffer slots 2 and 3 are always bound
2951
+
2952
+ // fusion pattern:
2953
+ // d[0] = norm(a)
2954
+ // d[1] = mul(d[0], b)
2955
+ // d[2] = add(d[1], c)
2956
+ if (use_fusion) {
2957
+ fops[0] = op->op;
2958
+ fops[1] = GGML_OP_MUL;
2959
+ fops[2] = GGML_OP_ADD;
2960
+
2961
+ for (n_fuse = 0; n_fuse <= 1; ++n_fuse) { // at most 2 extensions: MUL first, then ADD
2962
+ if (!ctx->can_fuse(idx + n_fuse, fops + n_fuse, 2)) {
2963
+ break;
2964
+ }
2965
+
2966
+ ggml_tensor * f0 = ctx->node(idx + n_fuse);
2967
+ ggml_tensor * f1 = ctx->node(idx + n_fuse + 1);
2968
+
2969
+ if (f0 != f1->src[0]) { // the next node must take the previous result as its src0
2970
+ break;
2971
+ }
2972
+
2973
+ if (f1->src[1]->ne[0] != op->ne[0]) { // the fused operand must have the same row length as the output
2974
+ break;
2975
+ }
2976
+
2977
+ if (!ggml_is_contiguous_rows(f1->src[1])) {
2978
+ break;
2979
+ }
2980
+
2981
+ if (f1->type != GGML_TYPE_F32) {
2982
+ break;
2983
+ }
2984
+
2985
+ //ctx->fuse_cnt[f1->op]++;
2986
+
2987
+ bid_fuse[n_fuse] = ggml_metal_get_buffer_id(f1->src[1]);
2988
+
2989
+ args.nef1[n_fuse + 1] = f1->src[1]->ne[1];
2990
+ args.nef2[n_fuse + 1] = f1->src[1]->ne[2];
2991
+ args.nef3[n_fuse + 1] = f1->src[1]->ne[3];
2992
+
2993
+ args.nbf1[n_fuse + 1] = f1->src[1]->nb[1];
2994
+ args.nbf2[n_fuse + 1] = f1->src[1]->nb[2];
2995
+ args.nbf3[n_fuse + 1] = f1->src[1]->nb[3];
2996
+ }
2997
+
2998
+ ++n_fuse; // turn the count of successful extensions into a node count
2999
+
3000
+ if (debug_fusion > 1 && n_fuse > 1) {
3001
+ if (n_fuse == 2) {
3002
+ GGML_LOG_DEBUG("%s: fuse: %s + MUL\n", __func__, ggml_op_name(op->op));
3003
+ }
3004
+ if (n_fuse == 3) {
3005
+ GGML_LOG_DEBUG("%s: fuse: %s + MUL + ADD\n", __func__, ggml_op_name(op->op));
3006
+ }
3007
+ }
3008
+ }
3009
+
3010
+ if (n_fuse > 1) { // fused: the result is written to the last node of the chain
3011
+ bid_dst = ggml_metal_get_buffer_id(ctx->node(idx + n_fuse - 1));
3012
+
3013
+ for (int i = 1; i < n_fuse; ++i) { // NOTE(review): presumably resets concurrency tracking once any fused node conflicts with pending work - confirm against ggml_metal_op_concurrency_check
3014
+ if (!ggml_metal_op_concurrency_check(ctx, ctx->node(idx + i))) {
3015
+ ggml_metal_op_concurrency_reset(ctx);
3016
+
3017
+ break;
3018
+ }
3019
+ }
3020
+ }
3021
+
3022
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_norm(lib, op, n_fuse);
3023
+
3024
+ int nth = 32; // SIMD width (starting thread count, adjusted below)
3025
+
3026
+ while (nth < args.ne00_t && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) { // grow nth in powers of two until it reaches ne00_t or the pipeline limit
3027
+ nth *= 2;
3028
+ }
3029
+
3030
+ nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
3031
+ nth = std::min(nth, args.ne00_t); // never use more threads than ne00_t
3032
+
3033
+ const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
3034
+
3035
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
3036
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
3037
+ ggml_metal_encoder_set_buffer (enc, bid_src0, 1);
3038
+ ggml_metal_encoder_set_buffer (enc, bid_fuse[0], 2);
3039
+ ggml_metal_encoder_set_buffer (enc, bid_fuse[1], 3);
3040
+ ggml_metal_encoder_set_buffer (enc, bid_dst, 4);
3041
+
3042
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
3043
+
3044
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
3045
+
3046
+ return n_fuse;
3047
+ }
3047
+
3048
+ int ggml_metal_op_rope(ggml_metal_op_t ctx, int idx) { // Encodes the RoPE kernel for graph node idx, dispatching (ne01, ne02, ne03) threadgroups; returns 1 (a single node is consumed).
3049
+ ggml_tensor * op = ctx->node(idx);
3050
+
3051
+ ggml_metal_library_t lib = ctx->lib;
3052
+ ggml_metal_encoder_t enc = ctx->enc;
3053
+
3054
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
3055
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
3056
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
3057
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
3058
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
3059
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
3060
+
3061
+ // make sure we have one or more position ids (ne10) per token (ne02)
3062
+ GGML_ASSERT(ne10 % ne02 == 0);
3063
+ GGML_ASSERT(ne10 >= ne02);
3064
+
3065
+ const int nth = std::min(1024, ne00); // threads per threadgroup: ne00, capped at 1024
3066
+
3067
+ const int n_past = ((const int32_t *) op->op_params)[0];
3068
+ const int n_dims = ((const int32_t *) op->op_params)[1];
3069
+ //const int mode = ((const int32_t *) op->op_params)[2];
3070
+ // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
3071
+ const int n_ctx_orig = ((const int32_t *) op->op_params)[4];
3072
+
3073
+ float freq_base;
3074
+ float freq_scale;
3075
+ float ext_factor;
3076
+ float attn_factor;
3077
+ float beta_fast;
3078
+ float beta_slow;
3079
+
3080
+ memcpy(&freq_base, (const int32_t *) op->op_params + 5, sizeof(float)); // op_params[5..10] carry the float parameters, copied out bit-for-bit
3081
+ memcpy(&freq_scale, (const int32_t *) op->op_params + 6, sizeof(float));
3082
+ memcpy(&ext_factor, (const int32_t *) op->op_params + 7, sizeof(float));
3083
+ memcpy(&attn_factor, (const int32_t *) op->op_params + 8, sizeof(float));
3084
+ memcpy(&beta_fast, (const int32_t *) op->op_params + 9, sizeof(float));
3085
+ memcpy(&beta_slow, (const int32_t *) op->op_params + 10, sizeof(float));
3086
+
3087
+ // mrope sections (op_params[11..14])
3088
+ const int sect_0 = ((const int32_t *) op->op_params)[11];
3089
+ const int sect_1 = ((const int32_t *) op->op_params)[12];
3090
+ const int sect_2 = ((const int32_t *) op->op_params)[13];
3091
+ const int sect_3 = ((const int32_t *) op->op_params)[14];
3092
+
3093
+ ggml_metal_kargs_rope args = {
3094
+ /*.ne00 =*/ ne00,
3095
+ /*.ne01 =*/ ne01,
3096
+ /*.ne02 =*/ ne02,
3097
+ /*.ne03 =*/ ne03,
3098
+ /*.nb00 =*/ nb00,
3099
+ /*.nb01 =*/ nb01,
3100
+ /*.nb02 =*/ nb02,
3101
+ /*.nb03 =*/ nb03,
3102
+ /*.ne0 =*/ ne0,
3103
+ /*.ne1 =*/ ne1,
3104
+ /*.ne2 =*/ ne2,
3105
+ /*.ne3 =*/ ne3,
3106
+ /*.nb0 =*/ nb0,
3107
+ /*.nb1 =*/ nb1,
3108
+ /*.nb2 =*/ nb2,
3109
+ /*.nb3 =*/ nb3,
3110
+ /*.n_past =*/ n_past,
3111
+ /*.n_dims =*/ n_dims,
3112
+ /*.n_ctx_orig =*/ n_ctx_orig,
3113
+ /*.freq_base =*/ freq_base,
3114
+ /*.freq_scale =*/ freq_scale,
3115
+ /*.ext_factor =*/ ext_factor,
3116
+ /*.attn_factor =*/ attn_factor,
3117
+ /*.beta_fast =*/ beta_fast,
3118
+ /*.beta_slow =*/ beta_slow,
3119
+ /*.sect_0 =*/ sect_0,
3120
+ /*.sect_1 =*/ sect_1,
3121
+ /*.sect_2 =*/ sect_2,
3122
+ /*.sect_3 =*/ sect_3,
3123
+ /*.src2 =*/ op->src[2] != nullptr, // true when the optional src2 tensor is present
3124
+ };
3125
+
3126
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_rope(lib, op);
3127
+
3128
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
3129
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
3130
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
3131
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
3132
+ if (op->src[2]) { // slot 3 is always bound: src2 when present ...
3133
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[2]), 3);
3134
+ } else { // ... otherwise src0 as a placeholder (presumably ignored by the kernel when args.src2 is false - confirm)
3135
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 3);
3136
+ }
3137
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 4);
3138
+
3139
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
3140
+
3141
+ return 1;
3142
+ }
3143
+
3144
+ int ggml_metal_op_im2col(ggml_metal_op_t ctx, int idx) { // Encodes the im2col kernel for graph node idx, reading src1 and writing the node's own buffer; returns 1 (a single node is consumed).
3145
+ ggml_tensor * op = ctx->node(idx);
3146
+
3147
+ ggml_metal_library_t lib = ctx->lib;
3148
+ ggml_metal_encoder_t enc = ctx->enc;
3149
+
3150
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
3151
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
3152
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
3153
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
3154
+
3155
+ const int32_t s0 = ((const int32_t *)(op->op_params))[0]; // op_params[0..5] are forwarded to the kernel as s0, s1, p0, p1, d0, d1
3156
+ const int32_t s1 = ((const int32_t *)(op->op_params))[1];
3157
+ const int32_t p0 = ((const int32_t *)(op->op_params))[2];
3158
+ const int32_t p1 = ((const int32_t *)(op->op_params))[3];
3159
+ const int32_t d0 = ((const int32_t *)(op->op_params))[4];
3160
+ const int32_t d1 = ((const int32_t *)(op->op_params))[5];
3161
+
3162
+ const bool is_2D = ((const int32_t *)(op->op_params))[6] == 1; // op_params[6] == 1 selects the 2D layout; otherwise the height-related sizes below collapse to 1
3163
+
3164
+ const int32_t N = op->src[1]->ne[is_2D ? 3 : 2]; // N, IC, IH, IW are taken from src1
3165
+ const int32_t IC = op->src[1]->ne[is_2D ? 2 : 1];
3166
+ const int32_t IH = is_2D ? op->src[1]->ne[1] : 1;
3167
+ const int32_t IW = op->src[1]->ne[0];
3168
+
3169
+ const int32_t KH = is_2D ? op->src[0]->ne[1] : 1; // KH, KW are taken from src0
3170
+ const int32_t KW = op->src[0]->ne[0];
3171
+
3172
+ const int32_t OH = is_2D ? op->ne[2] : 1; // OH, OW are taken from the output tensor
3173
+ const int32_t OW = op->ne[1];
3174
+
3175
+ const int32_t CHW = IC * KH * KW;
3176
+
3177
+ const uint64_t ofs0 = op->src[1]->nb[is_2D ? 3 : 2] / 4; // byte strides of src1 divided by 4 - NOTE(review): presumably converts to 4-byte element strides; confirm src1 is F32
3178
+ const uint64_t ofs1 = op->src[1]->nb[is_2D ? 2 : 1] / 4;
3179
+
3180
+ ggml_metal_kargs_im2col args = {
3181
+ /*.ofs0 =*/ ofs0,
3182
+ /*.ofs1 =*/ ofs1,
3183
+ /*.IW =*/ IW,
3184
+ /*.IH =*/ IH,
3185
+ /*.CHW =*/ CHW,
3186
+ /*.s0 =*/ s0,
3187
+ /*.s1 =*/ s1,
3188
+ /*.p0 =*/ p0,
3189
+ /*.p1 =*/ p1,
3190
+ /*.d0 =*/ d0,
3191
+ /*.d1 =*/ d1,
3192
+ /*.N =*/ N,
3193
+ /*.KH =*/ KH,
3194
+ /*.KW =*/ KW,
3195
+ /*.KHW =*/ KH * KW,
3196
+ };
3197
+
3198
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_im2col(lib, op);
3199
+
3200
+ GGML_ASSERT(KH*KW <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); // a threadgroup spans ntptg0*KH*KW threads, so KH*KW alone must fit within the limit
3201
+
3202
+ const uint64_t ntptg0 = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)/(KH*KW), N); // use the remaining thread budget along the first threadgroup dimension, capped at N
3203
+
3204
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
3205
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
3206
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 1);
3207
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
3208
+
3209
+ ggml_metal_encoder_dispatch_threadgroups(enc, IC, OH, OW, ntptg0, KH, KW); // (IC, OH, OW) threadgroups of (ntptg0, KH, KW) threads
3210
+
3211
+ return 1;
3212
+ }
3213
+
3214
+ int ggml_metal_op_conv_2d(ggml_metal_op_t ctx, int idx) {
3215
+ ggml_tensor * op = ctx->node(idx);
3216
+
3217
+ ggml_metal_library_t lib = ctx->lib;
3218
+ ggml_metal_encoder_t enc = ctx->enc;
3219
+
3220
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
3221
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
3222
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
3223
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
3224
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
3225
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
3226
+
3227
+ GGML_ASSERT(ggml_is_contiguous(op->src[0]));
3228
+ GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
3229
+ GGML_ASSERT(op->type == GGML_TYPE_F32);
3230
+ GGML_ASSERT(op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32);
3231
+
3232
+ const int32_t s0 = ((const int32_t *) op->op_params)[0];
3233
+ const int32_t s1 = ((const int32_t *) op->op_params)[1];
3234
+ const int32_t p0 = ((const int32_t *) op->op_params)[2];
3235
+ const int32_t p1 = ((const int32_t *) op->op_params)[3];
3236
+ const int32_t d0 = ((const int32_t *) op->op_params)[4];
3237
+ const int32_t d1 = ((const int32_t *) op->op_params)[5];
3238
+
3239
+ ggml_metal_kargs_conv_2d args = {
3240
+ /*.nb00 =*/ nb00,
3241
+ /*.nb01 =*/ nb01,
3242
+ /*.nb02 =*/ nb02,
3243
+ /*.nb03 =*/ nb03,
3244
+ /*.nb10 =*/ nb10,
3245
+ /*.nb11 =*/ nb11,
3246
+ /*.nb12 =*/ nb12,
3247
+ /*.nb13 =*/ nb13,
3248
+ /*.nb0 =*/ nb0,
3249
+ /*.nb1 =*/ nb1,
3250
+ /*.nb2 =*/ nb2,
3251
+ /*.nb3 =*/ nb3,
3252
+ /*.IW =*/ ne10,
3253
+ /*.IH =*/ ne11,
3254
+ /*.KW =*/ ne00,
3255
+ /*.KH =*/ ne01,
3256
+ /*.IC =*/ ne02,
3257
+ /*.OC =*/ ne03,
3258
+ /*.OW =*/ ne0,
3259
+ /*.OH =*/ ne1,
3260
+ /*.N =*/ ne3,
3261
+ /*.s0 =*/ s0,
3262
+ /*.s1 =*/ s1,
3263
+ /*.p0 =*/ p0,
3264
+ /*.p1 =*/ p1,
3265
+ /*.d0 =*/ d0,
3266
+ /*.d1 =*/ d1,
3267
+ };
3268
+
3269
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_conv_2d(lib, op);
3270
+
3271
+ int nth = ggml_metal_pipeline_max_theads_per_threadgroup(pipeline);
3272
+ nth = std::min(nth, 256);
3273
+ nth = std::max(nth, 1);
3274
+
3275
+ const uint64_t n_out = ggml_nelements(op);
3276
+
3277
+ uint64_t tg = (n_out + nth - 1)/nth;
3278
+ tg = std::max<uint64_t>(tg, 1);
3279
+ tg = std::min<uint64_t>(tg, (uint64_t) std::numeric_limits<int>::max());
3280
+
3281
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
3282
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
3283
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
3284
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
3285
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
3286
+
3287
+ ggml_metal_encoder_dispatch_threadgroups(enc, tg, 1, 1, nth, 1, 1);
3288
+
3289
+ return 1;
3290
+ }
3291
+
3292
+ int ggml_metal_op_conv_transpose_1d(ggml_metal_op_t ctx, int idx) {
3293
+ ggml_tensor * op = ctx->node(idx);
3294
+
3295
+ ggml_metal_library_t lib = ctx->lib;
3296
+ ggml_metal_encoder_t enc = ctx->enc;
3297
+
3298
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
3299
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
3300
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
3301
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
3302
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
3303
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
3304
+
3305
+ const int32_t s0 = ((const int32_t *)(op->op_params))[0];
3306
+
3307
+ const int32_t IC = op->src[1]->ne[1];
3308
+ const int32_t IL = op->src[1]->ne[0];
3309
+
3310
+ const int32_t K = op->src[0]->ne[0];
3311
+
3312
+ const int32_t OL = op->ne[0];
3313
+ const int32_t OC = op->ne[1];
3314
+
3315
+ ggml_metal_kargs_conv_transpose_1d args = {
3316
+ /*.IC =*/ IC,
3317
+ /*.IL =*/ IL,
3318
+ /*.K =*/ K,
3319
+ /*.s0 =*/ s0,
3320
+ /*.nb0 =*/ nb0,
3321
+ /*.nb1 =*/ nb1,
3322
+ };
3323
+
3324
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_conv_transpose_1d(lib, op);
3325
+
3326
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
3327
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
3328
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
3329
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
3330
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
3331
+
3332
+ ggml_metal_encoder_dispatch_threadgroups(enc, OL, OC, 1, 1, 1, 1);
3333
+
3334
+ return 1;
3335
+ }
3336
+
3337
+ int ggml_metal_op_conv_transpose_2d(ggml_metal_op_t ctx, int idx) {
3338
+ ggml_tensor * op = ctx->node(idx);
3339
+
3340
+ ggml_metal_library_t lib = ctx->lib;
3341
+ ggml_metal_encoder_t enc = ctx->enc;
3342
+
3343
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
3344
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
3345
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
3346
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
3347
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
3348
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
3349
+
3350
+ const int32_t s0 = ((const int32_t *)(op->op_params))[0];
3351
+
3352
+ const int32_t IC = op->src[1]->ne[2];
3353
+ const int32_t IH = op->src[1]->ne[1];
3354
+ const int32_t IW = op->src[1]->ne[0];
3355
+
3356
+ const int32_t KH = op->src[0]->ne[1];
3357
+ const int32_t KW = op->src[0]->ne[0];
3358
+
3359
+ const int32_t OW = op->ne[0];
3360
+ const int32_t OH = op->ne[1];
3361
+ const int32_t OC = op->ne[2];
3362
+
3363
+ ggml_metal_kargs_conv_transpose_2d args = {
3364
+ /*.IC =*/ IC,
3365
+ /*.IH =*/ IH,
3366
+ /*.IW =*/ IW,
3367
+ /*.KH =*/ KH,
3368
+ /*.KW =*/ KW,
3369
+ /*.OC =*/ OC,
3370
+ /*.s0 =*/ s0,
3371
+ /*.nb0 =*/ nb0,
3372
+ /*.nb1 =*/ nb1,
3373
+ /*.nb2 =*/ nb2,
3374
+ };
3375
+
3376
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_conv_transpose_2d(lib, op);
3377
+
3378
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
3379
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
3380
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
3381
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
3382
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
3383
+
3384
+ // Metal requires buffer size to be multiple of 16 bytes
3385
+ const size_t smem = GGML_PAD(KW * KH * sizeof(float), 16);
3386
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
3387
+
3388
+ ggml_metal_encoder_dispatch_threadgroups(enc, OW, OH, OC, KW, KH, 1);
3389
+
3390
+ return 1;
3391
+ }
3392
+
3393
+ int ggml_metal_op_upscale(ggml_metal_op_t ctx, int idx) {
3394
+ ggml_tensor * op = ctx->node(idx);
3395
+
3396
+ ggml_metal_library_t lib = ctx->lib;
3397
+ ggml_metal_encoder_t enc = ctx->enc;
3398
+
3399
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
3400
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
3401
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
3402
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
3403
+
3404
+ const float sf0 = (float)ne0/op->src[0]->ne[0];
3405
+ const float sf1 = (float)ne1/op->src[0]->ne[1];
3406
+ const float sf2 = (float)ne2/op->src[0]->ne[2];
3407
+ const float sf3 = (float)ne3/op->src[0]->ne[3];
3408
+
3409
+ ggml_metal_kargs_upscale args = {
3410
+ /*.ne00 =*/ ne00,
3411
+ /*.ne01 =*/ ne01,
3412
+ /*.ne02 =*/ ne02,
3413
+ /*.ne03 =*/ ne03,
3414
+ /*.nb00 =*/ nb00,
3415
+ /*.nb01 =*/ nb01,
3416
+ /*.nb02 =*/ nb02,
3417
+ /*.nb03 =*/ nb03,
3418
+ /*.ne0 =*/ ne0,
3419
+ /*.ne1 =*/ ne1,
3420
+ /*.ne2 =*/ ne2,
3421
+ /*.ne3 =*/ ne3,
3422
+ /*.nb0 =*/ nb0,
3423
+ /*.nb1 =*/ nb1,
3424
+ /*.nb2 =*/ nb2,
3425
+ /*.nb3 =*/ nb3,
3426
+ /*.sf0 =*/ sf0,
3427
+ /*.sf1 =*/ sf1,
3428
+ /*.sf2 =*/ sf2,
3429
+ /*.sf3 =*/ sf3
3430
+ };
3431
+
3432
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_upscale(lib, op);
3433
+
3434
+ const int nth = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne0);
3435
+
3436
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
3437
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
3438
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
3439
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
3440
+
3441
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
3442
+
3443
+ return 1;
3444
+ }
3445
+
3446
+ int ggml_metal_op_pad(ggml_metal_op_t ctx, int idx) {
3447
+ ggml_tensor * op = ctx->node(idx);
3448
+
3449
+ ggml_metal_library_t lib = ctx->lib;
3450
+ ggml_metal_encoder_t enc = ctx->enc;
3451
+
3452
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
3453
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
3454
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
3455
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
3456
+
3457
+ ggml_metal_kargs_pad args = {
3458
+ /*.ne00 =*/ ne00,
3459
+ /*.ne01 =*/ ne01,
3460
+ /*.ne02 =*/ ne02,
3461
+ /*.ne03 =*/ ne03,
3462
+ /*.nb00 =*/ nb00,
3463
+ /*.nb01 =*/ nb01,
3464
+ /*.nb02 =*/ nb02,
3465
+ /*.nb03 =*/ nb03,
3466
+ /*.ne0 =*/ ne0,
3467
+ /*.ne1 =*/ ne1,
3468
+ /*.ne2 =*/ ne2,
3469
+ /*.ne3 =*/ ne3,
3470
+ /*.nb0 =*/ nb0,
3471
+ /*.nb1 =*/ nb1,
3472
+ /*.nb2 =*/ nb2,
3473
+ /*.nb3 =*/ nb3
3474
+ };
3475
+
3476
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_pad(lib, op);
3477
+
3478
+ const int nth = std::min(1024, ne0);
3479
+
3480
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
3481
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
3482
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
3483
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
3484
+
3485
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
3486
+
3487
+ return 1;
3488
+ }
3489
+
3490
+ int ggml_metal_op_pad_reflect_1d(ggml_metal_op_t ctx, int idx) {
3491
+ ggml_tensor * op = ctx->node(idx);
3492
+
3493
+ ggml_metal_library_t lib = ctx->lib;
3494
+ ggml_metal_encoder_t enc = ctx->enc;
3495
+
3496
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
3497
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
3498
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
3499
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
3500
+
3501
+ ggml_metal_kargs_pad_reflect_1d args = {
3502
+ /*.ne00 =*/ ne00,
3503
+ /*.ne01 =*/ ne01,
3504
+ /*.ne02 =*/ ne02,
3505
+ /*.ne03 =*/ ne03,
3506
+ /*.nb00 =*/ nb00,
3507
+ /*.nb01 =*/ nb01,
3508
+ /*.nb02 =*/ nb02,
3509
+ /*.nb03 =*/ nb03,
3510
+ /*.ne0 =*/ ne0,
3511
+ /*.ne1 =*/ ne1,
3512
+ /*.ne2 =*/ ne2,
3513
+ /*.ne3 =*/ ne3,
3514
+ /*.nb0 =*/ nb0,
3515
+ /*.nb1 =*/ nb1,
3516
+ /*.nb2 =*/ nb2,
3517
+ /*.nb3 =*/ nb3,
3518
+ /*.p0 =*/ ((const int32_t *)(op->op_params))[0],
3519
+ /*.p1 =*/ ((const int32_t *)(op->op_params))[1]
3520
+ };
3521
+
3522
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_pad_reflect_1d(lib, op);
3523
+
3524
+ const int nth = std::min(1024, ne0);
3525
+
3526
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
3527
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
3528
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
3529
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
3530
+
3531
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
3532
+
3533
+ return 1;
3534
+ }
3535
+
3536
+ int ggml_metal_op_arange(ggml_metal_op_t ctx, int idx) {
3537
+ ggml_tensor * op = ctx->node(idx);
3538
+
3539
+ ggml_metal_library_t lib = ctx->lib;
3540
+ ggml_metal_encoder_t enc = ctx->enc;
3541
+
3542
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
3543
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
3544
+
3545
+ float start;
3546
+ float step;
3547
+
3548
+ memcpy(&start, ((const int32_t *) op->op_params) + 0, sizeof(float));
3549
+ memcpy(&step, ((const int32_t *) op->op_params) + 2, sizeof(float));
3550
+
3551
+ ggml_metal_kargs_arange args = {
3552
+ /*.ne0 =*/ ne0,
3553
+ /*.start =*/ start,
3554
+ /*.step =*/ step
3555
+ };
3556
+
3557
+ const int nth = std::min(1024, ne0);
3558
+
3559
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_arange(lib, op);
3560
+
3561
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
3562
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
3563
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 1);
3564
+
3565
+ ggml_metal_encoder_dispatch_threadgroups(enc, 1, 1, 1, nth, 1, 1);
3566
+
3567
+ return 1;
3568
+ }
3569
+
3570
+ int ggml_metal_op_timestep_embedding(ggml_metal_op_t ctx, int idx) {
3571
+ ggml_tensor * op = ctx->node(idx);
3572
+
3573
+ ggml_metal_library_t lib = ctx->lib;
3574
+ ggml_metal_encoder_t enc = ctx->enc;
3575
+
3576
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
3577
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
3578
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
3579
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
3580
+
3581
+ const int dim = op->op_params[0];
3582
+ const int max_period = op->op_params[1];
3583
+
3584
+ ggml_metal_kargs_timestep_embedding args = {
3585
+ /*.nb1 =*/ nb1,
3586
+ /*.dim =*/ dim,
3587
+ /*.max_period =*/ max_period,
3588
+ };
3589
+
3590
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_timestep_embedding(lib, op);
3591
+
3592
+ const int nth = std::max(1, std::min(1024, dim/2));
3593
+
3594
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
3595
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
3596
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
3597
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
3598
+
3599
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne00, 1, 1, nth, 1, 1);
3600
+
3601
+ return 1;
3602
+ }
3603
+
3604
+ int ggml_metal_op_argmax(ggml_metal_op_t ctx, int idx) {
3605
+ ggml_tensor * op = ctx->node(idx);
3606
+
3607
+ ggml_metal_library_t lib = ctx->lib;
3608
+ ggml_metal_encoder_t enc = ctx->enc;
3609
+
3610
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
3611
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
3612
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
3613
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
3614
+
3615
+ ggml_metal_kargs_argmax args = {
3616
+ /*.ne00 = */ ne00,
3617
+ /*.nb01 = */ nb01,
3618
+ };
3619
+
3620
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_argmax(lib, op);
3621
+
3622
+ const int64_t nrows = ggml_nrows(op->src[0]);
3623
+
3624
+ int nth = 32; // SIMD width
3625
+ while (nth < ne00 && nth*ne01*ne02*ne03 < 256) {
3626
+ nth *= 2;
3627
+ }
3628
+
3629
+ const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
3630
+
3631
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
3632
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
3633
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
3634
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
3635
+
3636
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
3637
+
3638
+ ggml_metal_encoder_dispatch_threadgroups(enc, nrows, 1, 1, nth, 1, 1);
3639
+
3640
+ return 1;
3641
+ }
3642
+
3643
+ int ggml_metal_op_argsort(ggml_metal_op_t ctx, int idx) {
3644
+ ggml_tensor * op = ctx->node(idx);
3645
+
3646
+ ggml_metal_library_t lib = ctx->lib;
3647
+ ggml_metal_encoder_t enc = ctx->enc;
3648
+
3649
+ GGML_ASSERT(ggml_is_contiguous_rows(op->src[0]));
3650
+
3651
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
3652
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
3653
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
3654
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
3655
+
3656
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_argsort(lib, op);
3657
+
3658
+ // bitonic sort requires the number of elements to be power of 2
3659
+ int nth = 1;
3660
+ while (nth < ne00 && 2*nth <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
3661
+ nth *= 2;
3662
+ }
3663
+
3664
+ const int npr = (ne00 + nth - 1)/nth;
3665
+
3666
+ // Metal kernels require the buffer size to be multiple of 16 bytes
3667
+ // https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/1443142-setthreadgroupmemorylength
3668
+ const size_t smem = GGML_PAD(nth*sizeof(int32_t), 16);
3669
+
3670
+ ggml_metal_buffer_id bid_src0 = ggml_metal_get_buffer_id(op->src[0]);
3671
+ ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(op);
3672
+
3673
+ ggml_metal_buffer_id bid_tmp = bid_dst;
3674
+ bid_tmp.offs += ggml_nbytes(op);
3675
+
3676
+ if ((int) ceil(std::log(npr) / std::log(2)) % 2 == 1) {
3677
+ std::swap(bid_dst, bid_tmp);
3678
+ }
3679
+
3680
+ ggml_metal_kargs_argsort args = {
3681
+ /*.ne00 =*/ ne00,
3682
+ /*.ne01 =*/ ne01,
3683
+ /*.ne02 =*/ ne02,
3684
+ /*.ne03 =*/ ne03,
3685
+ /*.nb00 =*/ nb00,
3686
+ /*.nb01 =*/ nb01,
3687
+ /*.nb02 =*/ nb02,
3688
+ /*.nb03 =*/ nb03,
3689
+ };
3690
+
3691
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
3692
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
3693
+ ggml_metal_encoder_set_buffer (enc, bid_src0, 1);
3694
+ ggml_metal_encoder_set_buffer (enc, bid_dst, 2);
3695
+
3696
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
3697
+
3698
+ ggml_metal_encoder_dispatch_threadgroups(enc, npr*ne01, ne02, ne03, nth, 1, 1);
3699
+
3700
+ ggml_metal_pipeline_t pipeline_merge = ggml_metal_library_get_pipeline_argsort_merge(lib, op);
3701
+
3702
+ int len = nth;
3703
+
3704
+ while (len < ne00) {
3705
+ ggml_metal_op_concurrency_reset(ctx);
3706
+
3707
+ ggml_metal_kargs_argsort_merge args_merge = {
3708
+ .ne00 = ne00,
3709
+ .ne01 = ne01,
3710
+ .ne02 = ne02,
3711
+ .ne03 = ne03,
3712
+ .nb00 = nb00,
3713
+ .nb01 = nb01,
3714
+ .nb02 = nb02,
3715
+ .nb03 = nb03,
3716
+ .len = len,
3717
+ };
3718
+
3719
+ // merges per row
3720
+ const int nm = (ne00 + 2*len - 1) / (2*len);
3721
+
3722
+ const int nth = std::min(512, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline_merge));
3723
+
3724
+ ggml_metal_encoder_set_pipeline(enc, pipeline_merge);
3725
+ ggml_metal_encoder_set_bytes (enc, &args_merge, sizeof(args_merge), 0);
3726
+ ggml_metal_encoder_set_buffer (enc, bid_src0, 1);
3727
+ ggml_metal_encoder_set_buffer (enc, bid_dst, 2);
3728
+ ggml_metal_encoder_set_buffer (enc, bid_tmp, 3);
3729
+
3730
+ ggml_metal_encoder_dispatch_threadgroups(enc, nm*ne01, ne02, ne03, nth, 1, 1);
3731
+
3732
+ std::swap(bid_dst, bid_tmp);
3733
+
3734
+ len <<= 1;
3735
+ }
3736
+
3737
+ return 1;
3738
+ }
3739
+
3740
+ int ggml_metal_op_leaky_relu(ggml_metal_op_t ctx, int idx) {
3741
+ ggml_tensor * op = ctx->node(idx);
3742
+
3743
+ ggml_metal_library_t lib = ctx->lib;
3744
+ ggml_metal_encoder_t enc = ctx->enc;
3745
+
3746
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
3747
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
3748
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
3749
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
3750
+
3751
+ float slope;
3752
+ memcpy(&slope, op->op_params, sizeof(float));
3753
+
3754
+ ggml_metal_kargs_leaky_relu args = {
3755
+ /*.slope =*/ slope
3756
+ };
3757
+
3758
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
3759
+
3760
+ int64_t n = ggml_nelements(op);
3761
+
3762
+ if (n % 4 == 0) {
3763
+ n /= 4;
3764
+ }
3765
+
3766
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
3767
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
3768
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
3769
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
3770
+
3771
+ ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1);
3772
+
3773
+ return 1;
3774
+ }
3775
+
3776
+ int ggml_metal_op_opt_step_adamw(ggml_metal_op_t ctx, int idx) {
3777
+ ggml_tensor * op = ctx->node(idx);
3778
+
3779
+ ggml_metal_library_t lib = ctx->lib;
3780
+ ggml_metal_encoder_t enc = ctx->enc;
3781
+
3782
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
3783
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
3784
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
3785
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
3786
+
3787
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_opt_step_adamw(lib, op);
3788
+
3789
+ const int64_t np = ggml_nelements(op->src[0]);
3790
+ ggml_metal_kargs_opt_step_adamw args = {
3791
+ /*.np =*/ np,
3792
+ };
3793
+
3794
+ int ida = 0;
3795
+
3796
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
3797
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), ida++);
3798
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), ida++);
3799
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), ida++);
3800
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[2]), ida++);
3801
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[3]), ida++);
3802
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[4]), ida++);
3803
+
3804
+ const int nth = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne0);
3805
+ const int64_t n = (np + nth - 1) / nth;
3806
+
3807
+ ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, nth, 1, 1);
3808
+
3809
+ return 1;
3810
+ }
3811
+
3812
+ int ggml_metal_op_opt_step_sgd(ggml_metal_op_t ctx, int idx) {
3813
+ ggml_tensor * op = ctx->node(idx);
3814
+
3815
+ ggml_metal_library_t lib = ctx->lib;
3816
+ ggml_metal_encoder_t enc = ctx->enc;
3817
+
3818
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
3819
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
3820
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
3821
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
3822
+
3823
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_opt_step_sgd(lib, op);
3824
+
3825
+ const int64_t np = ggml_nelements(op->src[0]);
3826
+ ggml_metal_kargs_opt_step_sgd args = {
3827
+ /*.np =*/ np,
3828
+ };
3829
+
3830
+ int ida = 0;
3831
+
3832
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
3833
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), ida++);
3834
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), ida++);
3835
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), ida++);
3836
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[2]), ida++);
3837
+
3838
+ const int nth = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne0);
3839
+ const int64_t n = (np + nth - 1) / nth;
3840
+
3841
+ ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, nth, 1, 1);
3842
+
3843
+ return 1;
3844
+ }