whispercpp 1.3.2 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (664)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -3
  3. data/README.md +71 -14
  4. data/Rakefile +20 -7
  5. data/ext/.gitignore +4 -6
  6. data/ext/dependencies.rb +36 -24
  7. data/ext/extconf.rb +1 -1
  8. data/ext/options.rb +48 -184
  9. data/ext/ruby_whisper.c +18 -0
  10. data/ext/ruby_whisper_context.c +43 -12
  11. data/ext/ruby_whisper_model.c +1 -1
  12. data/ext/ruby_whisper_params.c +59 -27
  13. data/ext/ruby_whisper_segment.c +81 -4
  14. data/ext/ruby_whisper_transcribe.cpp +13 -7
  15. data/ext/ruby_whisper_vad_params.c +1 -1
  16. data/ext/sources/CMakeLists.txt +5 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/build-xcframework.sh +24 -0
  19. data/ext/sources/examples/CMakeLists.txt +1 -0
  20. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
  21. data/ext/sources/examples/addon.node/addon.cpp +154 -35
  22. data/ext/sources/examples/addon.node/index.js +10 -5
  23. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  24. data/ext/sources/examples/bench/bench.cpp +29 -18
  25. data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
  26. data/ext/sources/examples/cli/cli.cpp +7 -4
  27. data/ext/sources/examples/command/command.cpp +58 -32
  28. data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
  29. data/ext/sources/examples/common-ggml.cpp +2 -0
  30. data/ext/sources/examples/common-whisper.cpp +14 -7
  31. data/ext/sources/examples/lsp/lsp.cpp +21 -17
  32. data/ext/sources/examples/quantize/quantize.cpp +3 -0
  33. data/ext/sources/examples/server/CMakeLists.txt +3 -0
  34. data/ext/sources/examples/server/server.cpp +193 -35
  35. data/ext/sources/examples/server.py +6 -1
  36. data/ext/sources/examples/stream/stream.cpp +10 -2
  37. data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
  38. data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
  39. data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -0
  40. data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
  41. data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
  42. data/ext/sources/examples/talk-llama/llama-arch.cpp +756 -15
  43. data/ext/sources/examples/talk-llama/llama-arch.h +85 -1
  44. data/ext/sources/examples/talk-llama/llama-batch.cpp +773 -272
  45. data/ext/sources/examples/talk-llama/llama-batch.h +126 -55
  46. data/ext/sources/examples/talk-llama/llama-chat.cpp +150 -13
  47. data/ext/sources/examples/talk-llama/llama-chat.h +8 -0
  48. data/ext/sources/examples/talk-llama/llama-context.cpp +814 -542
  49. data/ext/sources/examples/talk-llama/llama-context.h +68 -32
  50. data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
  51. data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
  52. data/ext/sources/examples/talk-llama/llama-graph.cpp +787 -440
  53. data/ext/sources/examples/talk-llama/llama-graph.h +333 -153
  54. data/ext/sources/examples/talk-llama/llama-hparams.cpp +128 -6
  55. data/ext/sources/examples/talk-llama/llama-hparams.h +80 -17
  56. data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
  57. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
  58. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +137 -0
  59. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +1248 -1967
  60. data/ext/sources/examples/talk-llama/llama-kv-cache.h +218 -345
  61. data/ext/sources/examples/talk-llama/llama-kv-cells.h +164 -52
  62. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +266 -0
  63. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +139 -0
  64. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1154 -0
  65. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +182 -0
  66. data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
  67. data/ext/sources/examples/talk-llama/llama-memory.h +94 -4
  68. data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
  69. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +44 -17
  70. data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
  71. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
  72. data/ext/sources/examples/talk-llama/llama-model.cpp +11377 -5248
  73. data/ext/sources/examples/talk-llama/llama-model.h +87 -9
  74. data/ext/sources/examples/talk-llama/llama-quant.cpp +137 -16
  75. data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
  76. data/ext/sources/examples/talk-llama/llama-vocab.cpp +502 -38
  77. data/ext/sources/examples/talk-llama/llama-vocab.h +46 -0
  78. data/ext/sources/examples/talk-llama/llama.cpp +76 -17
  79. data/ext/sources/examples/talk-llama/llama.h +176 -151
  80. data/ext/sources/examples/talk-llama/talk-llama.cpp +11 -6
  81. data/ext/sources/examples/talk-llama/unicode.cpp +212 -0
  82. data/ext/sources/examples/talk-llama/unicode.h +45 -0
  83. data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
  84. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +6 -2
  85. data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
  86. data/ext/sources/ggml/CMakeLists.txt +106 -33
  87. data/ext/sources/ggml/cmake/common.cmake +24 -0
  88. data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
  89. data/ext/sources/ggml/include/ggml-backend.h +18 -2
  90. data/ext/sources/ggml/include/ggml-cpu.h +2 -0
  91. data/ext/sources/ggml/include/ggml-metal.h +1 -6
  92. data/ext/sources/ggml/include/ggml-opt.h +25 -6
  93. data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
  94. data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
  95. data/ext/sources/ggml/include/ggml.h +365 -21
  96. data/ext/sources/ggml/src/CMakeLists.txt +98 -25
  97. data/ext/sources/ggml/src/ggml-alloc.c +265 -141
  98. data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
  99. data/ext/sources/ggml/src/ggml-backend-reg.cpp +35 -13
  100. data/ext/sources/ggml/src/ggml-backend.cpp +266 -60
  101. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +4 -4
  102. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
  103. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +15 -0
  104. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  105. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
  106. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
  107. data/ext/sources/ggml/src/ggml-cann/common.h +149 -2
  108. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +521 -78
  109. data/ext/sources/ggml/src/ggml-common.h +21 -0
  110. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +165 -50
  111. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +5 -3
  112. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  113. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  114. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +3650 -0
  115. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1891 -0
  116. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2160 -0
  117. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  118. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
  119. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1897 -0
  120. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
  121. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
  122. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
  123. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
  124. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
  125. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +214 -0
  126. data/ext/sources/ggml/src/ggml-cpu/common.h +18 -3
  127. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +23 -7
  128. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +179 -110
  129. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +44 -33
  130. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  131. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
  132. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
  133. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +228 -98
  134. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +532 -1124
  135. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  136. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +3374 -2081
  137. data/ext/sources/ggml/src/ggml-cpu/ops.h +13 -8
  138. data/ext/sources/ggml/src/ggml-cpu/quants.c +1193 -0
  139. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +34 -0
  140. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1982 -0
  141. data/ext/sources/ggml/src/ggml-cpu/repack.h +120 -0
  142. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +367 -46
  143. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  144. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  145. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  146. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  147. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +3 -3
  148. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +1 -1
  149. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +272 -35
  150. data/ext/sources/ggml/src/ggml-cpu/vec.h +794 -142
  151. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
  152. data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
  153. data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
  154. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
  155. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  156. data/ext/sources/ggml/src/ggml-cuda/common.cuh +291 -81
  157. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  158. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  159. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  160. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  161. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  162. data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
  163. data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  164. data/ext/sources/ggml/src/ggml-cuda/convert.cu +117 -22
  165. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +20 -0
  166. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  167. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
  168. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  169. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  170. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +499 -368
  171. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +142 -93
  172. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
  173. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
  174. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
  175. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +90 -50
  176. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
  177. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  178. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
  179. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +636 -222
  180. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
  181. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
  182. data/ext/sources/ggml/src/ggml-cuda/mean.cu +73 -0
  183. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  184. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
  185. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
  186. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
  187. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
  188. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
  189. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +506 -0
  190. data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +4 -5
  191. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
  192. data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
  193. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
  194. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  195. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  196. data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
  197. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  198. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  199. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
  200. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  201. data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
  202. data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
  203. data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
  204. data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
  205. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
  206. data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  207. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
  208. data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
  209. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
  210. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  211. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -98
  212. data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
  213. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +23 -19
  214. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  225. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  226. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  227. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  228. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  229. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  230. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  231. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  232. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  233. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  234. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  235. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  236. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  237. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  238. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  239. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  240. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  241. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  242. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  243. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  244. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  245. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  246. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  247. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  248. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  249. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  250. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  251. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
  252. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  253. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  254. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  255. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  256. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  257. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  258. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  259. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  260. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  261. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  262. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  263. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  264. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  265. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  266. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  267. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  268. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  269. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
  270. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
  271. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
  272. data/ext/sources/ggml/src/ggml-cuda/unary.cu +179 -0
  273. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +15 -0
  274. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
  275. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  276. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  277. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
  278. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
  279. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +14 -2
  280. data/ext/sources/ggml/src/ggml-impl.h +229 -175
  281. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +21 -17
  282. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  283. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  284. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  285. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
  286. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
  287. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
  288. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
  289. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +163 -63
  290. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
  291. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
  292. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
  293. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +3208 -1575
  294. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
  295. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
  296. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +32 -0
  297. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4430 -792
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  300. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  301. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  302. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  303. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  304. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
  305. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  306. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  307. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
  308. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  309. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  310. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  311. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  312. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  313. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  314. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  315. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  316. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
  317. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
  318. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  322. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  323. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  324. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  325. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  326. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  327. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  328. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  329. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  330. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  331. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  332. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
  333. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  334. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
  335. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
  336. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  344. data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
  345. data/ext/sources/ggml/src/ggml-quants.c +117 -24
  346. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  347. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +85 -62
  348. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  349. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
  350. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +9 -0
  351. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
  352. data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
  353. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +13 -17
  354. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +21 -2
  355. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +116 -211
  356. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
  357. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  358. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +700 -1041
  359. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +20 -9
  360. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +17 -26
  361. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +2 -96
  362. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +393 -250
  363. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
  364. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +32 -8
  365. data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
  366. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -11
  367. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +125 -21
  368. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
  369. data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  370. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  371. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  372. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +4 -3
  373. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +105 -17
  374. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
  375. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4198 -1145
  376. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  377. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
  378. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  379. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
  380. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
  381. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
  382. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  383. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
  384. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
  385. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
  386. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
  387. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  388. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
  389. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
  390. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
  391. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  392. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  393. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  394. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  395. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  396. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  397. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
  398. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
  399. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
  400. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
  401. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
  402. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
  403. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  404. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  405. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  406. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  407. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
  408. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
  409. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
  410. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +19 -0
  411. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  412. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  413. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  414. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
  415. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
  416. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
  417. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
  418. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  419. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
  420. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
  421. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
  422. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
  423. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
  424. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
  425. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  426. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
  427. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
  428. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  429. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +64 -11
  430. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  431. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  432. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  433. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  434. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  435. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  436. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  437. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  438. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
  439. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
  440. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  441. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
  442. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  443. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  444. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
  445. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
  446. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  447. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
  448. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +338 -71
  449. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  450. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
  451. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
  452. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
  453. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  454. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  455. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  456. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
  457. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  458. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  459. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
  460. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
  461. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
  462. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
  463. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
  464. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
  465. data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  466. data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
  467. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
  468. data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  469. data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  470. data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
  471. data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
  472. data/ext/sources/ggml/src/ggml.c +802 -142
  473. data/ext/sources/ggml/src/ggml.cpp +26 -0
  474. data/ext/sources/ggml/src/gguf.cpp +32 -4
  475. data/ext/sources/include/whisper.h +2 -0
  476. data/ext/sources/src/CMakeLists.txt +2 -0
  477. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  478. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  479. data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
  480. data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
  481. data/ext/sources/src/whisper.cpp +241 -215
  482. data/ext/sources/tests/CMakeLists.txt +8 -1
  483. data/ext/sources/tests/test-vad-full.cpp +3 -3
  484. data/ext/sources/tests/test-vad.cpp +2 -2
  485. data/extsources.rb +15 -9
  486. data/lib/whisper/context.rb +15 -0
  487. data/lib/whisper/model/uri.rb +57 -2
  488. data/lib/whisper/segment.rb +58 -0
  489. data/sig/whisper.rbs +75 -38
  490. data/{tests → test}/helper.rb +1 -12
  491. data/{tests → test}/test_model.rb +9 -0
  492. data/test/test_package.rb +51 -0
  493. data/{tests → test}/test_params.rb +8 -0
  494. data/test/test_segment.rb +146 -0
  495. data/{tests → test}/test_whisper.rb +70 -0
  496. data/whispercpp.gemspec +2 -3
  497. metadata +246 -191
  498. data/ext/sources/.dockerignore +0 -3
  499. data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
  500. data/ext/sources/ci/run.sh +0 -336
  501. data/ext/sources/close-issue.yml +0 -28
  502. data/ext/sources/ggml/include/ggml-kompute.h +0 -50
  503. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  504. data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
  505. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
  506. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
  507. data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
  508. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  509. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  510. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  511. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  512. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  513. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  514. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  515. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  516. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  517. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  518. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -6431
  519. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  520. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
  521. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
  522. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  523. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
  524. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  525. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
  526. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
  527. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +0 -336
  528. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  529. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  530. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  531. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  532. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  533. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  534. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  535. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  536. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  537. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  538. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  539. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  540. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  541. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  542. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  543. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  544. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  545. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  546. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  547. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  548. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  549. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  550. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  551. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  552. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  553. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  554. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  555. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  556. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  557. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  558. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  559. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  560. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  561. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  562. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  563. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  564. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  565. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  566. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  567. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  568. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  569. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  570. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  571. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  572. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  573. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  574. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  575. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  576. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  577. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  578. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  579. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  580. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  581. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  582. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  583. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  584. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  585. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  586. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  587. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  588. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  589. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  590. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  591. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  592. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  593. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  594. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  595. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  596. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  597. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  598. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  599. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  600. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  601. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  602. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  603. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  604. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  605. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  606. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  607. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  608. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  609. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  610. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  611. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  612. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  613. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  614. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  615. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  616. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  617. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  618. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  619. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  620. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  621. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  622. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  623. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  624. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  625. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  626. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  627. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  628. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  629. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  630. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  631. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  632. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  633. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  634. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  635. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  636. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  637. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  638. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  639. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  640. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  641. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  642. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  643. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  644. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  645. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  646. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  647. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  648. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  649. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  650. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  651. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  652. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
  653. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -5998
  654. data/tests/test_package.rb +0 -46
  655. data/tests/test_segment.rb +0 -74
  656. /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  657. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  658. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  659. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  660. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
  661. /data/{tests → test}/test_callback.rb +0 -0
  662. /data/{tests → test}/test_error.rb +0 -0
  663. /data/{tests → test}/test_vad.rb +0 -0
  664. /data/{tests → test}/test_vad_params.rb +0 -0
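Several of the Ruby-level files above (data/ext/ruby_whisper_context.c, data/ext/ruby_whisper_params.c, data/lib/whisper/segment.rb, and the new data/test/test_segment.rb) extend the gem's transcription API. For orientation, a minimal usage sketch in the spirit of the gem's README; the model name, option names, and segment accessors here are assumptions taken from that README, not guaranteed by this diff:

    require "whisper"

    # Load a model by name; the gem resolves known model names to download URIs.
    whisper = Whisper::Context.new("base.en")

    params = Whisper::Params.new(language: "en")

    # Transcribe a 16-bit WAV file and print the whole text.
    whisper.transcribe("audio.wav", params) do |whole_text|
      puts whole_text
    end

    # Per-segment access with timestamps (the segment API is among the files changed above).
    whisper.each_segment do |segment|
      puts "[#{segment.start_time} --> #{segment.end_time}] #{segment.text}"
    end

The hunk below is from item 290 in the list above, the newly added Metal backend file (its +3158 line count matches the hunk header):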
data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -0,0 +1,3158 @@
1
+ #include "ggml-metal-ops.h"
2
+
3
+ #include "ggml.h"
4
+ #include "ggml-impl.h"
5
+ #include "ggml-backend-impl.h"
6
+
7
+ #include "ggml-metal-impl.h"
8
+ #include "ggml-metal-common.h"
9
+ #include "ggml-metal-device.h"
10
+
11
+ #include <cassert>
12
+ #include <algorithm>
13
+
14
+ static ggml_metal_buffer_id ggml_metal_get_buffer_id(const ggml_tensor * t) {
15
+ if (!t) {
16
+ return { nullptr, 0 };
17
+ }
18
+
19
+ ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
20
+
21
+ ggml_metal_buffer_t ctx = (ggml_metal_buffer_t) buffer->context;
22
+
23
+ return ggml_metal_buffer_get_id(ctx, t);
24
+ }
25
+
26
+ struct ggml_metal_op {
27
+ ggml_metal_op(
28
+ ggml_metal_device_t dev,
29
+ ggml_metal_cmd_buf_t cmd_buf,
30
+ ggml_cgraph * gf,
31
+ int idx_start,
32
+ int idx_end,
33
+ bool use_fusion,
34
+ bool use_concurrency,
35
+ bool use_capture,
36
+ int debug_graph,
37
+ int debug_fusion) {
38
+ this->dev = dev;
39
+ this->lib = ggml_metal_device_get_library(dev);
40
+ this->enc = ggml_metal_encoder_init(cmd_buf, use_concurrency);
41
+ this->mem_ranges = ggml_mem_ranges_init(debug_graph);
42
+ this->idx_start = idx_start;
43
+ this->idx_end = idx_end;
44
+ this->use_fusion = use_fusion;
45
+ this->use_concurrency = use_concurrency;
46
+ this->use_capture = use_capture;
47
+ this->debug_graph = debug_graph;
48
+ this->debug_fusion = debug_fusion;
49
+ this->gf = gf;
50
+
51
+ idxs.reserve(gf->n_nodes);
52
+
53
+ // filter empty nodes
54
+ // TODO: this can be removed when the allocator starts filtering them earlier
55
+ // https://github.com/ggml-org/llama.cpp/pull/16130#issuecomment-3327905830
56
+ for (int i = idx_start; i < idx_end; i++) {
57
+ if (!ggml_op_is_empty(gf->nodes[i]->op) && !ggml_is_empty(gf->nodes[i])) {
58
+ idxs.push_back(i);
59
+ }
60
+ }
61
+ }
62
+
63
+ ~ggml_metal_op() {
64
+ ggml_metal_encoder_end_encoding(this->enc);
65
+ ggml_metal_encoder_free(this->enc);
66
+ ggml_mem_ranges_free(this->mem_ranges);
67
+ }
68
+
69
+ int n_nodes() const {
70
+ return idxs.size();
71
+ }
72
+
73
+ ggml_tensor * node(int i) const {
74
+ assert(i >= 0 && i < (int) idxs.size());
75
+ return ggml_graph_node(gf, idxs[i]);
76
+ }
77
+
78
+ bool can_fuse(int i0, const ggml_op * ops, int n_ops) const {
79
+ assert(use_fusion);
80
+ assert(i0 >= 0 && i0 < n_nodes());
81
+
82
+ if (i0 + n_ops > n_nodes()) {
83
+ return false;
84
+ }
85
+
86
+ return ggml_can_fuse_ext(gf, idxs.data() + i0, ops, n_ops);
87
+ }
88
+
89
+ ggml_metal_device_t dev;
90
+ ggml_metal_library_t lib;
91
+ ggml_metal_encoder_t enc;
92
+ ggml_mem_ranges_t mem_ranges;
93
+
94
+ bool use_fusion;
95
+ bool use_concurrency;
96
+ bool use_capture;
97
+
98
+ int debug_graph;
99
+ int debug_fusion;
100
+
101
+ private:
102
+ ggml_cgraph * gf;
103
+
104
+ int idx_start;
105
+ int idx_end;
106
+
107
+ // non-empty node indices
108
+ std::vector<int> idxs;
109
+ };
110
+
111
+ ggml_metal_op_t ggml_metal_op_init(
112
+ ggml_metal_device_t dev,
113
+ ggml_metal_cmd_buf_t cmd_buf,
114
+ ggml_cgraph * gf,
115
+ int idx_start,
116
+ int idx_end,
117
+ bool use_fusion,
118
+ bool use_concurrency,
119
+ bool use_capture,
120
+ int debug_graph,
121
+ int debug_fusion) {
122
+ ggml_metal_op_t res = new ggml_metal_op(
123
+ dev,
124
+ cmd_buf,
125
+ gf,
126
+ idx_start,
127
+ idx_end,
128
+ use_fusion,
129
+ use_concurrency,
130
+ use_capture,
131
+ debug_graph,
132
+ debug_fusion);
133
+
134
+ return res;
135
+ }
136
+
137
+ void ggml_metal_op_free(ggml_metal_op_t ctx) {
138
+ delete ctx;
139
+ }
140
+
141
+ int ggml_metal_op_n_nodes(ggml_metal_op_t ctx) {
142
+ return ctx->n_nodes();
143
+ }
144
+
145
+ static bool ggml_metal_op_concurrency_reset(ggml_metal_op_t ctx) {
146
+ if (!ctx->mem_ranges) {
147
+ return true;
148
+ }
149
+
150
+ ggml_metal_encoder_memory_barrier(ctx->enc);
151
+
152
+ ggml_mem_ranges_reset(ctx->mem_ranges);
153
+
154
+ return true;
155
+ }
156
+
157
+ static bool ggml_metal_op_concurrency_check(ggml_metal_op_t ctx, const ggml_tensor * node) {
158
+ if (!ctx->mem_ranges) {
159
+ return false;
160
+ }
161
+
162
+ return ggml_mem_ranges_check(ctx->mem_ranges, node);
163
+ }
164
+
165
+ static bool ggml_metal_op_concurrency_add(ggml_metal_op_t ctx, const ggml_tensor * node) {
166
+ if (!ctx->mem_ranges) {
167
+ return true;
168
+ }
169
+
170
+ return ggml_mem_ranges_add(ctx->mem_ranges, node);
171
+ }
172
+
+ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
+ struct ggml_tensor * node = ctx->node(idx);
+
+ //GGML_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, idx, ggml_op_name(node->op));
+
+ if (ggml_is_empty(node)) {
+ return 1;
+ }
+
+ switch (node->op) {
+ case GGML_OP_NONE:
+ case GGML_OP_RESHAPE:
+ case GGML_OP_VIEW:
+ case GGML_OP_TRANSPOSE:
+ case GGML_OP_PERMUTE:
+ {
+ // noop -> next node
+ if (ctx->debug_graph > 0) {
+ GGML_LOG_DEBUG("%s: node[%5d] - %-12s %s\n", __func__, idx, ggml_op_name(node->op), "(noop)");
+ }
+ } return 1;
+ default:
+ {
+ } break;
+ }
+
+ if (!ggml_metal_device_supports_op(ctx->dev, node)) {
+ GGML_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(node));
+ GGML_ABORT("unsupported op");
+ }
+
+ int n_fuse = 1;
+
+ // check if the current node can run concurrently with other nodes before it
+ // the condition is that:
+ // - the current node cannot write to any previous src or dst ranges
+ // - the current node cannot read from any previous dst ranges
+ //
+ // if the condition is not satisfied, we put a memory barrier and clear all ranges
+ // otherwise, we add the new ranges to the encoding context and process the node concurrently
+ //
+ {
+ const bool is_concurrent = ggml_metal_op_concurrency_check(ctx, node);
+
+ if (!is_concurrent) {
+ ggml_metal_op_concurrency_reset(ctx);
+ }
+
+ if (ctx->debug_graph > 0) {
+ GGML_LOG_DEBUG("%s: node[%5d] - %-12s %s\n", __func__, idx, ggml_op_name(node->op), is_concurrent ? "(concurrent)" : "");
+ }
+ if (ctx->debug_graph > 1) {
+ GGML_TENSOR_LOCALS( int64_t, ne0, node->src[0], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb0, node->src[0], nb);
+ GGML_TENSOR_LOCALS( int64_t, ne1, node->src[1], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb1, node->src[1], nb);
+ GGML_TENSOR_LOCALS( int64_t, ne, node, ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb, node, nb);
+
+ if (node->src[0]) {
+ GGML_LOG_DEBUG("%s: src0 - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(node->src[0]->type), ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03,
+ ggml_is_contiguous(node->src[0]), node->src[0]->name);
+ }
+ if (node->src[1]) {
+ GGML_LOG_DEBUG("%s: src1 - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(node->src[1]->type), ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13,
+ ggml_is_contiguous(node->src[1]), node->src[1]->name);
+ }
+ if (node) {
+ GGML_LOG_DEBUG("%s: node - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(node->type), ne0, ne1, ne2, ne3, nb0, nb1, nb2, nb3,
+ node->name);
+ }
+ }
+ }
+
+ switch (node->op) {
+ case GGML_OP_CONCAT:
+ {
+ n_fuse = ggml_metal_op_concat(ctx, idx);
+ } break;
+ case GGML_OP_ADD:
+ case GGML_OP_SUB:
+ case GGML_OP_MUL:
+ case GGML_OP_DIV:
+ {
+ n_fuse = ggml_metal_op_bin(ctx, idx);
+ } break;
+ case GGML_OP_ADD_ID:
+ {
+ n_fuse = ggml_metal_op_add_id(ctx, idx);
+ } break;
+ case GGML_OP_REPEAT:
+ {
+ n_fuse = ggml_metal_op_repeat(ctx, idx);
+ } break;
+ case GGML_OP_ACC:
+ {
+ n_fuse = ggml_metal_op_acc(ctx, idx);
+ } break;
+ case GGML_OP_SCALE:
+ {
+ n_fuse = ggml_metal_op_scale(ctx, idx);
+ } break;
+ case GGML_OP_CLAMP:
+ {
+ n_fuse = ggml_metal_op_clamp(ctx, idx);
+ } break;
+ case GGML_OP_SQR:
+ case GGML_OP_SQRT:
+ case GGML_OP_SIN:
+ case GGML_OP_COS:
+ case GGML_OP_LOG:
+ case GGML_OP_UNARY:
+ {
+ n_fuse = ggml_metal_op_unary(ctx, idx);
+ } break;
+ case GGML_OP_GLU:
+ {
+ n_fuse = ggml_metal_op_glu(ctx, idx);
+ } break;
+ case GGML_OP_SUM_ROWS:
+ case GGML_OP_MEAN:
+ {
+ n_fuse = ggml_metal_op_sum_rows(ctx, idx);
+ } break;
+ case GGML_OP_SOFT_MAX:
+ {
+ n_fuse = ggml_metal_op_soft_max(ctx, idx);
+ } break;
+ case GGML_OP_SSM_CONV:
+ {
+ n_fuse = ggml_metal_op_ssm_conv(ctx, idx);
+ } break;
+ case GGML_OP_SSM_SCAN:
+ {
+ n_fuse = ggml_metal_op_ssm_scan(ctx, idx);
+ } break;
+ case GGML_OP_RWKV_WKV6:
+ case GGML_OP_RWKV_WKV7:
+ {
+ n_fuse = ggml_metal_op_rwkv(ctx, idx);
+ } break;
+ case GGML_OP_MUL_MAT:
+ {
+ n_fuse = ggml_metal_op_mul_mat(ctx, idx);
+ } break;
+ case GGML_OP_MUL_MAT_ID:
+ {
+ n_fuse = ggml_metal_op_mul_mat_id(ctx, idx);
+ } break;
+ case GGML_OP_GET_ROWS:
+ {
+ n_fuse = ggml_metal_op_get_rows(ctx, idx);
+ } break;
+ case GGML_OP_SET_ROWS:
+ {
+ n_fuse = ggml_metal_op_set_rows(ctx, idx);
+ } break;
+ case GGML_OP_L2_NORM:
+ {
+ n_fuse = ggml_metal_op_l2_norm(ctx, idx);
+ } break;
+ case GGML_OP_GROUP_NORM:
+ {
+ n_fuse = ggml_metal_op_group_norm(ctx, idx);
+ } break;
+ case GGML_OP_NORM:
+ case GGML_OP_RMS_NORM:
+ {
+ n_fuse = ggml_metal_op_norm(ctx, idx);
+ } break;
+ case GGML_OP_ROPE:
+ {
+ n_fuse = ggml_metal_op_rope(ctx, idx);
+ } break;
+ case GGML_OP_IM2COL:
+ {
+ n_fuse = ggml_metal_op_im2col(ctx, idx);
+ } break;
+ case GGML_OP_CONV_TRANSPOSE_1D:
+ {
+ n_fuse = ggml_metal_op_conv_transpose_1d(ctx, idx);
+ } break;
+ case GGML_OP_UPSCALE:
+ {
+ n_fuse = ggml_metal_op_upscale(ctx, idx);
+ } break;
+ case GGML_OP_PAD:
+ {
+ n_fuse = ggml_metal_op_pad(ctx, idx);
+ } break;
+ case GGML_OP_PAD_REFLECT_1D:
+ {
+ n_fuse = ggml_metal_op_pad_reflect_1d(ctx, idx);
+ } break;
+ case GGML_OP_ARANGE:
+ {
+ n_fuse = ggml_metal_op_arange(ctx, idx);
+ } break;
+ case GGML_OP_TIMESTEP_EMBEDDING:
+ {
+ n_fuse = ggml_metal_op_timestep_embedding(ctx, idx);
+ } break;
+ case GGML_OP_ARGSORT:
+ {
+ n_fuse = ggml_metal_op_argsort(ctx, idx);
+ } break;
+ case GGML_OP_LEAKY_RELU:
+ {
+ n_fuse = ggml_metal_op_leaky_relu(ctx, idx);
+ } break;
+ case GGML_OP_FLASH_ATTN_EXT:
+ {
+ n_fuse = ggml_metal_op_flash_attn_ext(ctx, idx);
+ } break;
+ case GGML_OP_DUP:
+ case GGML_OP_CPY:
+ case GGML_OP_CONT:
+ {
+ n_fuse = ggml_metal_op_cpy(ctx, idx);
+ } break;
+ case GGML_OP_POOL_2D:
+ {
+ n_fuse = ggml_metal_op_pool_2d(ctx, idx);
+ } break;
+ case GGML_OP_ARGMAX:
+ {
+ n_fuse = ggml_metal_op_argmax(ctx, idx);
+ } break;
+ default:
+ {
+ GGML_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(node->op));
+ GGML_ABORT("fatal error");
+ }
+ }
+
+ if (ctx->debug_graph > 0) {
+ if (n_fuse > 1) {
+ GGML_LOG_DEBUG("%s: fuse %d ops\n", __func__, n_fuse);
+ }
+ }
+
+ // update the mem ranges in the encoding context
+ for (int i = 0; i < n_fuse; ++i) {
+ if (!ggml_metal_op_concurrency_add(ctx, ctx->node(idx + i))) {
+ ggml_metal_op_concurrency_reset(ctx);
+ }
+ }
+
+ return n_fuse;
+ }
+
+ int ggml_metal_op_encode(ggml_metal_op_t ctx, int idx) {
+ if (ctx->use_capture) {
+ ggml_metal_encoder_debug_group_push(ctx->enc, ggml_op_desc(ctx->node(idx)));
+ }
+
+ int res = ggml_metal_op_encode_impl(ctx, idx);
+ if (idx + res > ctx->n_nodes()) {
+ GGML_ABORT("fusion error: nodes spanning multiple encoders have been fused. this indicates a bug in the fusion logic %s",
+ "https://github.com/ggml-org/llama.cpp/pull/14849");
+ }
+
+ if (ctx->use_capture) {
+ ggml_metal_encoder_debug_group_pop(ctx->enc);
+ }
+
+ return res;
+ }
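+
+ // editor's note: hedged sketch (not in the original source) of the expected
+ // driver loop: ggml_metal_op_encode returns how many graph nodes were consumed
+ // (n_fuse >= 1), so a caller advances by the return value rather than by 1.
+ // the helper name is hypothetical.
+ static inline void ggml_metal_op_encode_all_sketch(ggml_metal_op_t ctx) {
+ for (int idx = 0; idx < ggml_metal_op_n_nodes(ctx); ) {
+ idx += ggml_metal_op_encode(ctx, idx);
+ }
+ }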
+
+ int ggml_metal_op_concat(ggml_metal_op_t ctx, int idx) {
+ ggml_tensor * op = ctx->node(idx);
+
+ ggml_metal_library_t lib = ctx->lib;
+ ggml_metal_encoder_t enc = ctx->enc;
+
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
+
+ const int32_t dim = ((const int32_t *) op->op_params)[0];
+
+ ggml_metal_kargs_concat args = {
+ /*.ne00 =*/ ne00,
+ /*.ne01 =*/ ne01,
+ /*.ne02 =*/ ne02,
+ /*.ne03 =*/ ne03,
+ /*.nb00 =*/ nb00,
+ /*.nb01 =*/ nb01,
+ /*.nb02 =*/ nb02,
+ /*.nb03 =*/ nb03,
+ /*.ne10 =*/ ne10,
+ /*.ne11 =*/ ne11,
+ /*.ne12 =*/ ne12,
+ /*.ne13 =*/ ne13,
+ /*.nb10 =*/ nb10,
+ /*.nb11 =*/ nb11,
+ /*.nb12 =*/ nb12,
+ /*.nb13 =*/ nb13,
+ /*.ne0 =*/ ne0,
+ /*.ne1 =*/ ne1,
+ /*.ne2 =*/ ne2,
+ /*.ne3 =*/ ne3,
+ /*.nb0 =*/ nb0,
+ /*.nb1 =*/ nb1,
+ /*.nb2 =*/ nb2,
+ /*.nb3 =*/ nb3,
+ /*.dim =*/ dim,
+ };
+
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_base(lib, GGML_OP_CONCAT);
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
+
+ const int nth = std::min(1024, ne0);
+
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
+
+ return 1;
+ }
+
+ int ggml_metal_op_repeat(ggml_metal_op_t ctx, int idx) {
+ ggml_tensor * op = ctx->node(idx);
+
+ ggml_metal_library_t lib = ctx->lib;
+ ggml_metal_encoder_t enc = ctx->enc;
+
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
+
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_repeat(lib, op->type);
+
+ ggml_metal_kargs_repeat args = {
+ /*.ne00 =*/ ne00,
+ /*.ne01 =*/ ne01,
+ /*.ne02 =*/ ne02,
+ /*.ne03 =*/ ne03,
+ /*.nb00 =*/ nb00,
+ /*.nb01 =*/ nb01,
+ /*.nb02 =*/ nb02,
+ /*.nb03 =*/ nb03,
+ /*.ne0 =*/ ne0,
+ /*.ne1 =*/ ne1,
+ /*.ne2 =*/ ne2,
+ /*.ne3 =*/ ne3,
+ /*.nb0 =*/ nb0,
+ /*.nb1 =*/ nb1,
+ /*.nb2 =*/ nb2,
+ /*.nb3 =*/ nb3,
+ };
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
+
+ const int nth = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne0);
+
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
+
+ return 1;
+ }
+
+ int ggml_metal_op_acc(ggml_metal_op_t ctx, int idx) {
+ ggml_tensor * op = ctx->node(idx);
+
+ ggml_metal_library_t lib = ctx->lib;
+ ggml_metal_encoder_t enc = ctx->enc;
+
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
+
+ GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->type == GGML_TYPE_F32);
+
+ GGML_ASSERT(ggml_is_contiguous(op->src[0]));
+ GGML_ASSERT(ggml_is_contiguous(op->src[1]));
+
+ const size_t pnb1 = ((const int32_t *) op->op_params)[0];
+ const size_t pnb2 = ((const int32_t *) op->op_params)[1];
+ const size_t pnb3 = ((const int32_t *) op->op_params)[2];
+ const size_t offs = ((const int32_t *) op->op_params)[3];
+
+ const bool inplace = (bool) ((const int32_t *) op->op_params)[4];
+
+ if (!inplace) {
+ // run a separate kernel to cpy src->dst
+ // not sure how to avoid this
+ // TODO: make a simpler cpy_bytes kernel
+
+ //const id<MTLComputePipelineState> pipeline = ctx->pipelines[GGML_METAL_PIPELINE_TYPE_CPY_F32_F32].obj;
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
+
+ ggml_metal_kargs_cpy args = {
+ /*.ne00 =*/ ne00,
+ /*.ne01 =*/ ne01,
+ /*.ne02 =*/ ne02,
+ /*.ne03 =*/ ne03,
+ /*.nb00 =*/ nb00,
+ /*.nb01 =*/ nb01,
+ /*.nb02 =*/ nb02,
+ /*.nb03 =*/ nb03,
+ /*.ne0 =*/ ne0,
+ /*.ne1 =*/ ne1,
+ /*.ne2 =*/ ne2,
+ /*.ne3 =*/ ne3,
+ /*.nb0 =*/ nb0,
+ /*.nb1 =*/ nb1,
+ /*.nb2 =*/ nb2,
+ /*.nb3 =*/ nb3,
+ };
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
+
+ const int nth = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne00);
+
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
+
+ ggml_metal_op_concurrency_reset(ctx);
+ }
+
+ ggml_metal_kargs_bin args = {
+ /*.ne00 =*/ ne00,
+ /*.ne01 =*/ ne01,
+ /*.ne02 =*/ ne02,
+ /*.ne03 =*/ ne03,
+ /*.nb00 =*/ nb00,
+ /*.nb01 =*/ pnb1,
+ /*.nb02 =*/ pnb2,
+ /*.nb03 =*/ pnb3,
+ /*.ne10 =*/ ne10,
+ /*.ne11 =*/ ne11,
+ /*.ne12 =*/ ne12,
+ /*.ne13 =*/ ne13,
+ /*.nb10 =*/ nb10,
+ /*.nb11 =*/ nb11,
+ /*.nb12 =*/ nb12,
+ /*.nb13 =*/ nb13,
+ /*.ne0 =*/ ne0,
+ /*.ne1 =*/ ne1,
+ /*.ne2 =*/ ne2,
+ /*.ne3 =*/ ne3,
+ /*.nb0 =*/ nb0,
+ /*.nb1 =*/ pnb1,
+ /*.nb2 =*/ pnb2,
+ /*.nb3 =*/ pnb3,
+ /*.offs =*/ offs,
+ /*.o1 =*/ { 0 },
+ };
+
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_bin(lib, GGML_OP_ADD, 1, false);
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
+
+ const int nth = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne00);
+
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne11, ne12, ne13, nth, 1, 1);
+
+ return 1;
+ }
+
+ int ggml_metal_op_scale(ggml_metal_op_t ctx, int idx) {
+ ggml_tensor * op = ctx->node(idx);
+
+ ggml_metal_library_t lib = ctx->lib;
+ ggml_metal_encoder_t enc = ctx->enc;
+
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
+
+ float scale;
+ float bias;
+ memcpy(&scale, ((const int32_t *) op->op_params) + 0, sizeof(float));
+ memcpy(&bias, ((const int32_t *) op->op_params) + 1, sizeof(float));
+
+ ggml_metal_kargs_scale args = {
+ /*.scale =*/ scale,
+ /*.bias =*/ bias,
+ };
+
+ int64_t n = ggml_nelements(op);
+
+ if (n % 4 == 0) {
+ n /= 4;
+ }
+
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
+
+ ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1);
+
+ return 1;
+ }
+
+ int ggml_metal_op_clamp(ggml_metal_op_t ctx, int idx) {
+ ggml_tensor * op = ctx->node(idx);
+
+ ggml_metal_library_t lib = ctx->lib;
+ ggml_metal_encoder_t enc = ctx->enc;
+
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
+
+ float min;
+ float max;
+ memcpy(&min, ((const int32_t *) op->op_params) + 0, sizeof(float));
+ memcpy(&max, ((const int32_t *) op->op_params) + 1, sizeof(float));
+
+ ggml_metal_kargs_clamp args = {
+ /*.min =*/ min,
+ /*.max =*/ max,
+ };
+
+ int64_t n = ggml_nelements(op);
+
+ if (n % 4 == 0) {
+ n /= 4;
+ }
+
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
+
+ ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1);
+
+ return 1;
+ }
+
+ int ggml_metal_op_unary(ggml_metal_op_t ctx, int idx) {
+ ggml_tensor * op = ctx->node(idx);
+
+ ggml_metal_library_t lib = ctx->lib;
+ ggml_metal_encoder_t enc = ctx->enc;
+
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
+
+ int64_t n = ggml_nelements(op);
+
+ if (n % 4 == 0) {
+ n /= 4;
+ }
+
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 0);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 1);
+
+ ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1);
+
+ return 1;
+ }
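+
+ // editor's note (assumption, not stated in the source): for SCALE, CLAMP and
+ // UNARY the element count is divided by 4 when evenly divisible, which suggests
+ // the pipeline returned by ggml_metal_library_get_pipeline_unary dispatches a
+ // float4 variant in that case. e.g. ggml_nelements(op) == 4096 -> n = 1024
+ // single-thread threadgroups, each handling one float4; 4095 elements would
+ // fall back to one scalar element per thread.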
+
+ int ggml_metal_op_glu(ggml_metal_op_t ctx, int idx) {
+ ggml_tensor * op = ctx->node(idx);
+
+ ggml_metal_library_t lib = ctx->lib;
+ ggml_metal_encoder_t enc = ctx->enc;
+
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
+
+ if (op->src[1]) {
+ GGML_ASSERT(ggml_are_same_shape(op->src[0], op->src[1]));
+ }
+
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_glu(lib, op);
+
+ const int32_t swp = ggml_get_op_params_i32(op, 1);
+ const float alpha = ggml_get_op_params_f32(op, 2);
+ const float limit = ggml_get_op_params_f32(op, 3);
+
+ const int32_t i00 = swp ? ne0 : 0;
+ const int32_t i10 = swp ? 0 : ne0;
+
+ ggml_metal_kargs_glu args = {
+ /*.ne00 =*/ ne00,
+ /*.nb01 =*/ nb01,
+ /*.ne10 =*/ op->src[1] ? ne10 : ne00,
+ /*.nb11 =*/ op->src[1] ? nb11 : nb01,
+ /*.ne0 =*/ ne0,
+ /*.nb1 =*/ nb1,
+ /*.i00 =*/ op->src[1] ? 0 : i00,
+ /*.i10 =*/ op->src[1] ? 0 : i10,
+ /*.alpha=*/ alpha,
+ /*.limit=*/ limit
+ };
+
+ const int64_t nrows = ggml_nrows(op->src[0]);
+
+ const int32_t nth = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne00/2);
+
+ //[encoder setComputePipelineState:pipeline];
+ //[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+ //if (src1) {
+ //    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+ //} else {
+ //    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+ //}
+ //[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
+ //[encoder setBytes:&args length:sizeof(args) atIndex:3];
+
+ //[encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+ if (op->src[1]) {
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
+ } else {
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 2);
+ }
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
+
+ ggml_metal_encoder_dispatch_threadgroups(enc, nrows, 1, 1, nth, 1, 1);
+
+ return 1;
+ }
+
+ int ggml_metal_op_sum_rows(ggml_metal_op_t ctx, int idx) {
+ ggml_tensor * op = ctx->node(idx);
+
+ ggml_metal_library_t lib = ctx->lib;
+ ggml_metal_encoder_t enc = ctx->enc;
+
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
+
+ ggml_metal_kargs_sum_rows args = {
+ /*.ne00 =*/ ne00,
+ /*.ne01 =*/ ne01,
+ /*.ne02 =*/ ne02,
+ /*.ne03 =*/ ne03,
+ /*.nb00 =*/ nb00,
+ /*.nb01 =*/ nb01,
+ /*.nb02 =*/ nb02,
+ /*.nb03 =*/ nb03,
+ /*.ne0 =*/ ne0,
+ /*.ne1 =*/ ne1,
+ /*.ne2 =*/ ne2,
+ /*.ne3 =*/ ne3,
+ /*.nb0 =*/ nb0,
+ /*.nb1 =*/ nb1,
+ /*.nb2 =*/ nb2,
+ /*.nb3 =*/ nb3,
+ };
+
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_sum_rows(lib, op);
+
+ int nth = 32; // SIMD width
+
+ while (nth < ne00 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+ nth *= 2;
+ }
+
+ nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+ nth = std::min(nth, ne00);
+
+ const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
+
+ //[encoder setComputePipelineState:pipeline];
+ //[encoder setBytes:&args length:sizeof(args) atIndex:0];
+ //[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+ //[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
+ //[encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
+
+ //[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
+
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
+
+ return 1;
+ }
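+
+ // editor's note: worked example of the thread-count selection above
+ // (illustrative values): ne00 = 100, max threads = 1024 -> nth doubles
+ // 32 -> 64 -> 128, is clamped to min(128, 1024) = 128 and finally to
+ // min(128, 100) = 100 threads per reduced row.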
+
+ int ggml_metal_op_get_rows(ggml_metal_op_t ctx, int idx) {
+ ggml_tensor * op = ctx->node(idx);
+
+ ggml_metal_library_t lib = ctx->lib;
+ ggml_metal_encoder_t enc = ctx->enc;
+
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
+
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_get_rows(lib, op->src[0]->type);
+
+ ggml_metal_kargs_get_rows args = {
+ /*.ne00 =*/ ne00,
+ /*.nb01 =*/ nb01,
+ /*.nb02 =*/ nb02,
+ /*.ne10 =*/ ne10,
+ /*.nb10 =*/ nb10,
+ /*.nb11 =*/ nb11,
+ /*.nb1 =*/ nb1,
+ /*.nb2 =*/ nb2,
+ };
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
+
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne10, ne11, ne12, 32, 1, 1);
+
+ return 1;
+ }
+
+ int ggml_metal_op_set_rows(ggml_metal_op_t ctx, int idx) {
+ ggml_tensor * op = ctx->node(idx);
+
+ ggml_metal_library_t lib = ctx->lib;
+ ggml_metal_encoder_t enc = ctx->enc;
+
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
+
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_set_rows(lib, op->src[1]->type, op->type);
+
+ const int32_t nk0 = ne0/ggml_blck_size(op->type);
+
+ int nth = 32; // SIMD width
+
+ while (nth < nk0 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+ nth *= 2;
+ }
+
+ int nrptg = 1;
+ if (nth > nk0) {
+ nrptg = (nth + nk0 - 1)/nk0;
+ nth = nk0;
+
+ if (nrptg*nth > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+ nrptg--;
+ }
+ }
+
+ nth = std::min(nth, nk0);
+
+ ggml_metal_kargs_set_rows args = {
+ /*.nk0 =*/ nk0,
+ /*.ne01 =*/ ne01,
+ /*.nb01 =*/ nb01,
+ /*.nb02 =*/ nb02,
+ /*.nb03 =*/ nb03,
+ /*.ne11 =*/ ne11,
+ /*.ne12 =*/ ne12,
+ /*.nb10 =*/ nb10,
+ /*.nb11 =*/ nb11,
+ /*.nb12 =*/ nb12,
+ /*.nb1 =*/ nb1,
+ /*.nb2 =*/ nb2,
+ /*.nb3 =*/ nb3,
+ };
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
+
+ ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nrptg - 1)/nrptg, ne02, ne03, nth, nrptg, 1);
+
+ return 1;
+ }
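+
+ // editor's note: worked example of the row-batching logic above (illustrative
+ // values): nk0 = 24, max threads = 1024 -> the doubling loop leaves nth = 32
+ // (already >= 24), so nrptg = (32 + 24 - 1)/24 = 2 rows per threadgroup and
+ // nth = 24; the grid then launches (ne01 + 1)/2 threadgroups along x, each
+ // with a 24x2 thread shape.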
+
+ int ggml_metal_op_soft_max(ggml_metal_op_t ctx, int idx) {
+ ggml_tensor * op = ctx->node(idx);
+
+ ggml_metal_library_t lib = ctx->lib;
+ ggml_metal_encoder_t enc = ctx->enc;
+
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
+
+ float scale;
+ float max_bias;
+
+ memcpy(&scale, ((const int32_t *) op->op_params) + 0, sizeof(scale));
+ memcpy(&max_bias, ((const int32_t *) op->op_params) + 1, sizeof(max_bias));
+
+ const uint32_t n_head = op->src[0]->ne[2];
+ const int32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
+
+ const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+ // softmax
+
+ ggml_metal_kargs_soft_max args = {
+ /*.ne00 =*/ ne00,
+ /*.ne01 =*/ ne01,
+ /*.ne02 =*/ ne02,
+ /*.nb01 =*/ nb01,
+ /*.nb02 =*/ nb02,
+ /*.nb03 =*/ nb03,
+ /*.ne11 =*/ ne11,
+ /*.ne12 =*/ ne12,
+ /*.ne13 =*/ ne13,
+ /*.nb11 =*/ nb11,
+ /*.nb12 =*/ nb12,
+ /*.nb13 =*/ nb13,
+ /*.nb1 =*/ nb1,
+ /*.nb2 =*/ nb2,
+ /*.nb3 =*/ nb3,
+ /*.scale =*/ scale,
+ /*.max_bias =*/ max_bias,
+ /*.m0 =*/ m0,
+ /*.m1 =*/ m1,
+ /*.n_head_log2 =*/ n_head_log2,
+ };
+
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_soft_max(lib, op);
+
+ int nth = 32; // SIMD width
+
+ if (ne00%4 == 0) {
+ while (nth < ne00/4 && nth*ne01*ne02*ne03 < 256) {
+ nth *= 2;
+ }
+ } else {
+ while (nth < ne00 && nth*ne01*ne02*ne03 < 256) {
+ nth *= 2;
+ }
+ }
+
+ const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+ if (op->src[1]) {
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[1]), 2);
+ } else {
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 2);
+ }
+ if (op->src[2]) {
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[2]), 3);
+ } else {
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 3);
+ }
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op), 4);
+
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
+
+ return 1;
+ }
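+
+ // editor's note: worked example of the ALiBi slope terms above (illustrative
+ // values): n_head = 12 -> n_head_log2 = 1 << floor(log2(12)) = 8; with
+ // max_bias = 8.0 this gives m0 = 2^(-8/8) = 0.5 for the first 8 heads and
+ // m1 = 2^(-4/8) ~= 0.707 as the base slope for the remaining heads.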
+
+ int ggml_metal_op_ssm_conv(ggml_metal_op_t ctx, int idx) {
+ ggml_tensor * op = ctx->node(idx);
+
+ ggml_metal_library_t lib = ctx->lib;
+ ggml_metal_encoder_t enc = ctx->enc;
+
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
+
+ ggml_metal_kargs_ssm_conv args = {
+ /*.ne00 =*/ ne00,
+ /*.ne01 =*/ ne01,
+ /*.ne02 =*/ ne02,
+ /*.nb00 =*/ nb00,
+ /*.nb01 =*/ nb01,
+ /*.nb02 =*/ nb02,
+ /*.ne10 =*/ ne10,
+ /*.ne11 =*/ ne11,
+ /*.nb10 =*/ nb10,
+ /*.nb11 =*/ nb11,
+ /*.ne0 =*/ ne0,
+ /*.ne1 =*/ ne1,
+ /*.ne2 =*/ ne2,
+ /*.nb0 =*/ nb0,
+ /*.nb1 =*/ nb1,
+ /*.nb2 =*/ nb2,
+ };
+
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_ssm_conv(lib, op);
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[1]), 2);
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op), 3);
+
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne1, ne02, 1, 1, 1);
+
+ return 1;
+ }
+
+ int ggml_metal_op_ssm_scan(ggml_metal_op_t ctx, int idx) {
+ ggml_tensor * op = ctx->node(idx);
+
+ ggml_metal_library_t lib = ctx->lib;
+ ggml_metal_encoder_t enc = ctx->enc;
+
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne3, op->src[3], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb3, op->src[3], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne4, op->src[4], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb4, op->src[4], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne5, op->src[5], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb5, op->src[5], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne6, op->src[6], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb6, op->src[6], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
+
+ const ggml_tensor * src3 = op->src[3];
+ const ggml_tensor * src4 = op->src[4];
+ const ggml_tensor * src5 = op->src[5];
+ const ggml_tensor * src6 = op->src[6];
+
+ GGML_ASSERT(src3);
+ GGML_ASSERT(src4);
+ GGML_ASSERT(src5);
+ GGML_ASSERT(src6);
+
+ const int64_t d_state = ne00;
+ const int64_t d_inner = ne01;
+ const int64_t n_head = ne02;
+ const int64_t n_group = ne41;
+ const int64_t n_seq_tokens = ne12;
+ const int64_t n_seqs = ne13;
+
+ ggml_metal_kargs_ssm_scan args = {
+ /*.d_state =*/ d_state,
+ /*.d_inner =*/ d_inner,
+ /*.n_head =*/ n_head,
+ /*.n_group =*/ n_group,
+ /*.n_seq_tokens =*/ n_seq_tokens,
+ /*.n_seqs =*/ n_seqs,
+ /*.s_off =*/ ggml_nelements(op->src[1]) * sizeof(float),
+ /*.nb01 =*/ nb01,
+ /*.nb02 =*/ nb02,
+ /*.nb03 =*/ nb03,
+ /*.nb11 =*/ nb11,
+ /*.nb12 =*/ nb12,
+ /*.nb13 =*/ nb13,
+ /*.nb21 =*/ nb21,
+ /*.nb22 =*/ nb22,
+ /*.nb31 =*/ nb31,
+ /*.nb41 =*/ nb41,
+ /*.nb42 =*/ nb42,
+ /*.nb43 =*/ nb43,
+ /*.nb51 =*/ nb51,
+ /*.nb52 =*/ nb52,
+ /*.nb53 =*/ nb53,
+ };
+
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_ssm_scan(lib, op);
+
+ const size_t sms = ggml_metal_pipeline_get_smem(pipeline);
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[2]), 3);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[3]), 4);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[4]), 5);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[5]), 6);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[6]), 7);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 8);
+
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, sms, 0);
+
+ if (ne30 == 1) {
+ // Mamba-2
+ ggml_metal_encoder_dispatch_threadgroups(enc, d_inner, n_head, n_seqs, d_state, 1, 1);
+ } else {
+ GGML_ASSERT(d_inner == 1);
+ ggml_metal_encoder_dispatch_threadgroups(enc, n_head, n_seqs, 1, d_state, 1, 1);
+ }
+
+ return 1;
+ }
+
+ int ggml_metal_op_rwkv(ggml_metal_op_t ctx, int idx) {
+ ggml_tensor * op = ctx->node(idx);
+
+ ggml_metal_library_t lib = ctx->lib;
+ ggml_metal_encoder_t enc = ctx->enc;
+
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
+
+ const int64_t B = op->op == GGML_OP_RWKV_WKV6 ? op->src[5]->ne[1] : op->src[6]->ne[1];
+ const int64_t T = op->src[0]->ne[2];
+ const int64_t C = op->ne[0];
+ const int64_t H = op->src[0]->ne[1];
+
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_rwkv(lib, op);
+
+ int ida = 0;
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), ida++);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), ida++);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[2]), ida++);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[3]), ida++);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[4]), ida++);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[5]), ida++);
+ if (op->op == GGML_OP_RWKV_WKV7) {
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[6]), ida++);
+ }
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), ida++);
+ ggml_metal_encoder_set_bytes (enc, (void *) &B, sizeof(B), ida++);
+ ggml_metal_encoder_set_bytes (enc, (void *) &T, sizeof(T), ida++);
+ ggml_metal_encoder_set_bytes (enc, (void *) &C, sizeof(C), ida++);
+ ggml_metal_encoder_set_bytes (enc, (void *) &H, sizeof(H), ida++);
+
+ ggml_metal_encoder_dispatch_threadgroups(enc, B * H, 1, 1, C/H, 1, 1);
+
+ return 1;
+ }
+
+ int ggml_metal_op_cpy(ggml_metal_op_t ctx, int idx) {
+ ggml_tensor * op = ctx->node(idx);
+
+ ggml_metal_library_t lib = ctx->lib;
+ ggml_metal_encoder_t enc = ctx->enc;
+
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
+
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
+
+ GGML_ASSERT(ne00 % ggml_blck_size(op->src[0]->type) == 0);
+
+ // TODO: support
+ //const int32_t nk00 = ne00/ggml_blck_size(op->type);
+ const int32_t nk00 = ne00;
+
+ int nth = 32; // SIMD width
+
+ while (nth < nk00 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+ nth *= 2;
+ }
+
+ nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+
+ // when rows are small, we can batch them together in a single threadgroup
+ int nrptg = 1;
+
+ // TODO: relax this constraint in the future
+ if (ggml_blck_size(op->src[0]->type) == 1 && ggml_blck_size(op->type) == 1) {
+ if (nth > nk00) {
+ nrptg = (nth + nk00 - 1)/nk00;
+ nth = nk00;
+
+ if (nrptg*nth > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+ nrptg--;
+ }
+ }
+ }
+
+ nth = std::min(nth, nk00);
+
+ ggml_metal_kargs_cpy args = {
+ /*.ne00 =*/ nk00,
+ /*.ne01 =*/ ne01,
+ /*.ne02 =*/ ne02,
+ /*.ne03 =*/ ne03,
+ /*.nb00 =*/ nb00,
+ /*.nb01 =*/ nb01,
+ /*.nb02 =*/ nb02,
+ /*.nb03 =*/ nb03,
+ /*.ne0 =*/ ne0,
+ /*.ne1 =*/ ne1,
+ /*.ne2 =*/ ne2,
+ /*.ne3 =*/ ne3,
+ /*.nb0 =*/ nb0,
+ /*.nb1 =*/ nb1,
+ /*.nb2 =*/ nb2,
+ /*.nb3 =*/ nb3,
+ };
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
+
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, nrptg, 1);
+
+ return 1;
+ }
+
+ int ggml_metal_op_pool_2d(ggml_metal_op_t ctx, int idx) {
+ ggml_tensor * op = ctx->node(idx);
+
+ ggml_metal_library_t lib = ctx->lib;
+ ggml_metal_encoder_t enc = ctx->enc;
+
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
+
+ const int32_t * opts = op->op_params;
+ ggml_op_pool op_pool = (ggml_op_pool) opts[0];
+
+ const int32_t k0 = opts[1];
+ const int32_t k1 = opts[2];
+ const int32_t s0 = opts[3];
+ const int32_t s1 = opts[4];
+ const int32_t p0 = opts[5];
+ const int32_t p1 = opts[6];
+
+ const int64_t IH = op->src[0]->ne[1];
+ const int64_t IW = op->src[0]->ne[0];
+
+ const int64_t N = op->ne[3];
+ const int64_t OC = op->ne[2];
+ const int64_t OH = op->ne[1];
+ const int64_t OW = op->ne[0];
+
+ const int64_t np = N * OC * OH * OW;
+
+ ggml_metal_kargs_pool_2d args_pool_2d = {
+ /* .k0 = */ k0,
+ /* .k1 = */ k1,
+ /* .s0 = */ s0,
+ /* .s1 = */ s1,
+ /* .p0 = */ p0,
+ /* .p1 = */ p1,
+ /* .IH = */ IH,
+ /* .IW = */ IW,
+ /* .OH = */ OH,
+ /* .OW = */ OW,
+ /* .np = */ np
+ };
+
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_pool_2d(lib, op, op_pool);
+
+ const int nth = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), (int) np);
+ const int ntg = (np + nth - 1) / nth;
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes (enc, &args_pool_2d, sizeof(args_pool_2d), 0);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
+
+ ggml_metal_encoder_dispatch_threadgroups(enc, ntg, 1, 1, nth, 1, 1);
+
+ return 1;
+ }
+
+ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
+ ggml_tensor * op = ctx->node(idx);
+
+ ggml_metal_library_t lib = ctx->lib;
+ ggml_metal_encoder_t enc = ctx->enc;
+
+ const ggml_metal_device_props * props_dev = ggml_metal_device_get_props(ctx->dev);
+
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
+
+ GGML_ASSERT(ne00 == ne10);
+
+ GGML_ASSERT(ne12 % ne02 == 0);
+ GGML_ASSERT(ne13 % ne03 == 0);
+
+ const int16_t r2 = ne12/ne02;
+ const int16_t r3 = ne13/ne03;
+
+ // find the break-even point where the matrix-matrix kernel becomes more efficient compared
+ // to the matrix-vector kernel
+ const int ne11_mm_min = 8;
+
+ // first try to use small-batch mat-mv kernels
+ // these should be efficient for BS [2, ~8]
+ if (op->src[1]->type == GGML_TYPE_F32 && (ne00%128 == 0) &&
+ (
+ (
+ (
+ op->src[0]->type == GGML_TYPE_F32 || // TODO: helper function
+ op->src[0]->type == GGML_TYPE_F16 ||
+ op->src[0]->type == GGML_TYPE_Q4_0 ||
+ op->src[0]->type == GGML_TYPE_Q4_1 ||
+ op->src[0]->type == GGML_TYPE_Q5_0 ||
+ op->src[0]->type == GGML_TYPE_Q5_1 ||
+ op->src[0]->type == GGML_TYPE_Q8_0 ||
+ op->src[0]->type == GGML_TYPE_MXFP4 ||
+ op->src[0]->type == GGML_TYPE_IQ4_NL ||
+ false) && (ne11 >= 2 && ne11 <= 8)
+ ) ||
+ (
+ (
+ op->src[0]->type == GGML_TYPE_Q4_K ||
+ op->src[0]->type == GGML_TYPE_Q5_K ||
+ op->src[0]->type == GGML_TYPE_Q6_K ||
+ false) && (ne11 >= 4 && ne11 <= 8)
+ )
+ )
+ ) {
+ // TODO: determine the optimal parameters based on grid utilization
+ // I still don't know why we should not always use the maximum available threads:
+ //
+ // nsg = pipeline.maxTotalThreadsPerThreadgroup / 32
+ //
+ // my current hypothesis is that the work grid is not evenly divisible for different nsg
+ // values and there can be some tail effects when nsg is high. need to confirm this
+ //
+ const int nsg = 2; // num simdgroups per threadgroup
+
+ // num threads along row per simdgroup
+ int16_t nxpsg = 0;
+ if (ne00 % 256 == 0 && ne11 < 3) {
+ nxpsg = 16;
+ } else if (ne00 % 128 == 0) {
+ nxpsg = 8;
+ } else {
+ nxpsg = 4;
+ }
+
+ const int16_t nypsg = 32/nxpsg; // num threads along col per simdgroup (i.e. a simdgroup processes that many src0 rows at a time)
+ const int16_t r0ptg = nypsg*nsg; // num src0 rows per threadgroup
+ int16_t r1ptg = 4; // num src1 rows per threadgroup
+
+ // note: not sure how optimal these are across different hardware; there might be something cleverer
+ switch (ne11) {
+ case 2:
+ r1ptg = 2; break;
+ case 3:
+ case 6:
+ r1ptg = 3; break;
+ case 4:
+ case 7:
+ case 8:
+ r1ptg = 4; break;
+ case 5:
+ r1ptg = 5; break;
+ default:
+ GGML_ABORT("unsupported ne11");
+ };
+
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mv_ext(lib, op->src[0]->type, op->src[1]->type, nsg, nxpsg, r1ptg);
+
+ ggml_metal_kargs_mul_mv_ext args = {
+ /*.ne00 =*/ ne00,
+ /*.ne01 =*/ ne01,
+ /*.ne02 =*/ ne02,
+ /*.nb00 =*/ nb00,
+ /*.nb01 =*/ nb01,
+ /*.nb02 =*/ nb02,
+ /*.nb03 =*/ nb03,
+ /*.ne10 =*/ ne10,
+ /*.ne11 =*/ ne11,
+ /*.ne12 =*/ ne12,
+ /*.nb10 =*/ nb10,
+ /*.nb11 =*/ nb11,
+ /*.nb12 =*/ nb12,
+ /*.nb13 =*/ nb13,
+ /*.ne0 =*/ ne0,
+ /*.ne1 =*/ ne1,
+ /*.r2 =*/ r2,
+ /*.r3 =*/ r3,
+ };
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
+
+ ggml_metal_encoder_dispatch_threadgroups(enc, ((ne01 + r0ptg - 1)/r0ptg), ((ne11 + r1ptg - 1)/r1ptg), ne12*ne13, 32, nsg, 1);
+ } else if (
+ !ggml_is_transposed(op->src[0]) &&
+ !ggml_is_transposed(op->src[1]) &&
+ // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
+ // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
+ props_dev->has_simdgroup_mm && ne00 >= 64 &&
+ (ne11 > ne11_mm_min || (ggml_is_quantized(op->src[0]->type) && ne12 > 1))) {
+ //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
+
+ // some Metal matrix data types require aligned pointers
+ // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
+ //switch (op->src[0]->type) {
+ //    case GGML_TYPE_F32:  GGML_ASSERT(nb01 % 16 == 0); break;
+ //    case GGML_TYPE_F16:  GGML_ASSERT(nb01 % 8  == 0); break;
+ //    case GGML_TYPE_BF16: GGML_ASSERT(nb01 % 8  == 0); break;
+ //    default: break;
+ //}
+
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mm(lib, op);
+
+ ggml_metal_kargs_mul_mm args = {
+ /*.ne00 =*/ ne00,
+ /*.ne02 =*/ ne02,
+ /*.nb01 =*/ nb01,
+ /*.nb02 =*/ nb02,
+ /*.nb03 =*/ nb03,
+ /*.ne12 =*/ ne12,
+ /*.nb10 =*/ nb10,
+ /*.nb11 =*/ nb11,
+ /*.nb12 =*/ nb12,
+ /*.nb13 =*/ nb13,
+ /*.ne0 =*/ ne0,
+ /*.ne1 =*/ ne1,
+ /*.r2 =*/ r2,
+ /*.r3 =*/ r3,
+ };
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
+
+ const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
+
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+ ggml_metal_encoder_dispatch_threadgroups(enc, ((ne11 + 31)/32), ((ne01 + 63)/64), ne12*ne13, 128, 1, 1);
+ } else {
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mv(lib, op);
+
+ const int nr0 = ggml_metal_pipeline_get_nr0(pipeline);
+ const int nr1 = ggml_metal_pipeline_get_nr1(pipeline);
+ const int nsg = ggml_metal_pipeline_get_nsg(pipeline);
+
+ const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
+
+ ggml_metal_kargs_mul_mv args = {
+ /*.ne00 =*/ ne00,
+ /*.ne01 =*/ ne01,
+ /*.ne02 =*/ ne02,
+ /*.nb00 =*/ nb00,
+ /*.nb01 =*/ nb01,
+ /*.nb02 =*/ nb02,
+ /*.nb03 =*/ nb03,
+ /*.ne10 =*/ ne10,
+ /*.ne11 =*/ ne11,
+ /*.ne12 =*/ ne12,
+ /*.nb10 =*/ nb10,
+ /*.nb11 =*/ nb11,
+ /*.nb12 =*/ nb12,
+ /*.nb13 =*/ nb13,
+ /*.ne0 =*/ ne0,
+ /*.ne1 =*/ ne1,
+ /*.nr0 =*/ nr0,
+ /*.r2 =*/ r2,
+ /*.r3 =*/ r3,
+ };
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
+
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+
+ if (op->src[0]->type == GGML_TYPE_F32 ||
+ op->src[0]->type == GGML_TYPE_F16 ||
+ op->src[0]->type == GGML_TYPE_BF16 ||
+ op->src[0]->type == GGML_TYPE_Q8_0) {
+ ggml_metal_encoder_dispatch_threadgroups(enc, ((ne01 + nr0 - 1)/(nr0)), ((ne11 + nr1 - 1)/nr1), ne12*ne13, 32, nsg, 1);
+ } else {
+ ggml_metal_encoder_dispatch_threadgroups(enc, ((ne01 + nr0*nsg - 1)/(nr0*nsg)), ((ne11 + nr1 - 1)/nr1), ne12*ne13, 32, nsg, 1);
+ }
+ }
+
+ return 1;
+ }
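+
+ // editor's note: hedged summary of the kernel selection above: F32 src1 with
+ // 2..8 rows and a supported src0 type takes the small-batch mat-vec ("ext")
+ // kernels; larger batches on hardware with simdgroup matrix support (and
+ // ne00 >= 64) take the mat-mat kernel, which tiles the output into 64 (ne01)
+ // by 32 (ne11) blocks -- hence the (ne11 + 31)/32 x (ne01 + 63)/64 x ne12*ne13
+ // grid with 128 threads per threadgroup; everything else falls back to the
+ // generic mat-vec kernel.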
+
+ size_t ggml_metal_op_mul_mat_id_extra_tpe(const ggml_tensor * op) {
+ assert(op->op == GGML_OP_MUL_MAT_ID);
+
+ const int64_t ne02 = op->src[0]->ne[2]; // n_expert
+
+ return ggml_type_size(GGML_TYPE_I32)*ne02;
+ }
+
+ size_t ggml_metal_op_mul_mat_id_extra_ids(const ggml_tensor * op) {
+ assert(op->op == GGML_OP_MUL_MAT_ID);
+
+ const int64_t ne02 = op->src[0]->ne[2]; // n_expert
+ const int64_t ne21 = op->src[2]->ne[1]; // n_token
+
+ return ggml_type_size(GGML_TYPE_I32)*ne02*ne21;
+ }
+
+ int ggml_metal_op_mul_mat_id(ggml_metal_op_t ctx, int idx) {
+ ggml_tensor * op = ctx->node(idx);
+
+ ggml_metal_library_t lib = ctx->lib;
+ ggml_metal_encoder_t enc = ctx->enc;
+
+ const ggml_metal_device_props * props_dev = ggml_metal_device_get_props(ctx->dev);
+
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
+
+ // src2 = ids
+ GGML_ASSERT(op->src[2]->type == GGML_TYPE_I32);
+
+ GGML_ASSERT(!ggml_is_transposed(op->src[0]));
+ GGML_ASSERT(!ggml_is_transposed(op->src[1]));
+
+ GGML_ASSERT(ne03 == 1);
+ GGML_ASSERT(ne13 == 1);
+
+ ggml_metal_buffer_id bid_src0 = ggml_metal_get_buffer_id(op->src[0]);
+ ggml_metal_buffer_id bid_src1 = ggml_metal_get_buffer_id(op->src[1]);
+ ggml_metal_buffer_id bid_src2 = ggml_metal_get_buffer_id(op->src[2]);
+ ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(op);
+
+ const uint32_t r2 = 1;
+ const uint32_t r3 = 1;
+
+ // find the break-even point where the matrix-matrix kernel becomes more efficient compared
+ // to the matrix-vector kernel
+ // ne20 = n_used_experts
+ // ne21 = n_rows (batch size)
+ const int ne21_mm_id_min = 32;
+
+ if (props_dev->has_simdgroup_mm && ne00 >= 64 && (ne21 >= ne21_mm_id_min)) {
+ // some Metal matrix data types require aligned pointers
+ // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
+ //switch (op->src[0]->type) {
+ //    case GGML_TYPE_F32:  GGML_ASSERT(nb01 % 16 == 0); break;
+ //    case GGML_TYPE_F16:  GGML_ASSERT(nb01 % 8  == 0); break;
+ //    case GGML_TYPE_BF16: GGML_ASSERT(nb01 % 8  == 0); break;
+ //    default: break;
+ //}
+
+ // extra buffers for intermediate id mapping
+ ggml_metal_buffer_id bid_tpe = bid_dst;
+ bid_tpe.offs += ggml_nbytes(op);
+
+ ggml_metal_buffer_id bid_ids = bid_tpe;
+ bid_ids.offs += ggml_metal_op_mul_mat_id_extra_tpe(op);
+
+ {
+ ggml_metal_kargs_mul_mm_id_map0 args = {
+ ne02,
+ ne10,
+ ne11, // n_expert_used (bcast)
+ nb11,
+ nb12,
+ ne21, // n_tokens
+ ne20, // n_expert_used
+ nb21,
+ };
+
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mm_id_map0(lib, ne02, ne20);
+
+ const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
+
+ GGML_ASSERT(ne02 <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+
+ GGML_ASSERT(smem <= props_dev->max_theadgroup_memory_size);
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer (enc, bid_src2, 1);
+ ggml_metal_encoder_set_buffer (enc, bid_tpe, 2);
+ ggml_metal_encoder_set_buffer (enc, bid_ids, 3);
+
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+
+ ggml_metal_encoder_dispatch_threadgroups(enc, 1, 1, 1, ne02, 1, 1);
+ }
+
+ // this barrier is always needed because the next kernel has to wait for the id maps to be computed
+ ggml_metal_op_concurrency_reset(ctx);
+
+ {
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mm_id(lib, op);
+
+ ggml_metal_kargs_mul_mm_id args = {
+ /*.ne00 =*/ ne00,
+ /*.ne02 =*/ ne02,
+ /*.nb01 =*/ nb01,
+ /*.nb02 =*/ nb02,
+ /*.nb03 =*/ nb03,
+ /*.ne11 =*/ ne11, // n_expert_used (bcast)
+ /*.nb10 =*/ nb10,
+ /*.nb11 =*/ nb11,
+ /*.nb12 =*/ nb12,
+ /*.nb13 =*/ nb13,
+ /*.ne20 =*/ ne20, // n_expert_used
+ /*.ne21 =*/ ne21, // n_tokens
+ /*.ne0 =*/ ne0,
+ /*.ne1 =*/ ne1,
+ /*.r2 =*/ r2,
+ /*.r3 =*/ r3,
+ };
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer (enc, bid_src0, 1);
+ ggml_metal_encoder_set_buffer (enc, bid_src1, 2);
+ ggml_metal_encoder_set_buffer (enc, bid_tpe, 3);
+ ggml_metal_encoder_set_buffer (enc, bid_ids, 4);
+ ggml_metal_encoder_set_buffer (enc, bid_dst, 5);
+
+ const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
+
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+
+ ggml_metal_encoder_dispatch_threadgroups(enc, (ne21 + 31)/32, (ne01 + 63)/64, ne02, 128, 1, 1);
+ }
+ } else {
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mv_id(lib, op);
+
+ const int nr0 = ggml_metal_pipeline_get_nr0(pipeline);
+ const int nr1 = ggml_metal_pipeline_get_nr1(pipeline);
+ const int nsg = ggml_metal_pipeline_get_nsg(pipeline);
+
+ const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
+
+ ggml_metal_kargs_mul_mv_id args = {
+ /*.nei0 =*/ ne20,
+ /*.nei1 =*/ ne21,
+ /*.nbi1 =*/ nb21,
+ /*.ne00 =*/ ne00,
+ /*.ne01 =*/ ne01,
+ /*.ne02 =*/ ne02,
+ /*.nb00 =*/ nb00,
+ /*.nb01 =*/ nb01,
+ /*.nb02 =*/ nb02,
+ /*.ne10 =*/ ne10,
+ /*.ne11 =*/ ne11,
+ /*.ne12 =*/ ne12,
+ /*.ne13 =*/ ne13,
+ /*.nb10 =*/ nb10,
+ /*.nb11 =*/ nb11,
+ /*.nb12 =*/ nb12,
+ /*.ne0 =*/ ne0,
+ /*.ne1 =*/ ne1,
+ /*.nb1 =*/ nb1,
+ /*.nr0 =*/ nr0,
+ };
+
+ if (ggml_is_quantized(op->src[0]->type)) {
+ GGML_ASSERT(ne00 >= nsg*nr0);
+ }
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer(enc, bid_src0, 1);
+ ggml_metal_encoder_set_buffer(enc, bid_src1, 2);
+ ggml_metal_encoder_set_buffer(enc, bid_dst, 3);
+ ggml_metal_encoder_set_buffer(enc, bid_src2, 4);
+
+ const int64_t _ne1 = 1;
+ const int64_t ne123 = ne20*ne21;
+
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+
+ if (op->src[0]->type == GGML_TYPE_F32 ||
+ op->src[0]->type == GGML_TYPE_F16 ||
+ op->src[0]->type == GGML_TYPE_BF16 ||
+ op->src[0]->type == GGML_TYPE_Q8_0) {
+ ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nr0 - 1)/(nr0), (_ne1 + nr1 - 1)/nr1, ne123, 32, nsg, 1);
+ } else {
+ ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nr0*nsg - 1)/(nr0*nsg), (_ne1 + nr1 - 1)/nr1, ne123, 32, nsg, 1);
+ }
+ }
+
+ return 1;
+ }
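+
+ // editor's note: worked example of the extra scratch computed by the two
+ // helpers above (illustrative values; the "tokens per expert" reading of tpe
+ // is an assumption): with ne02 = 8 experts and ne21 = 64 tokens, bid_tpe holds
+ // 8 int32 counters (32 bytes) and bid_ids holds up to 8*64 int32 token ids
+ // (2 KiB), both appended after the dst tensor; the map0 kernel fills them and
+ // the concurrency_reset barrier ensures the mm_id kernel only runs once they
+ // are ready.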
+
1822
+ int ggml_metal_op_add_id(ggml_metal_op_t ctx, int idx) {
1823
+ ggml_tensor * op = ctx->node(idx);
1824
+
1825
+ ggml_metal_library_t lib = ctx->lib;
1826
+ ggml_metal_encoder_t enc = ctx->enc;
1827
+
1828
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
1829
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
1830
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
1831
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
1832
+ GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
1833
+ GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
1834
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
1835
+
1836
+ GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
1837
+ GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
1838
+ GGML_ASSERT(op->src[2]->type == GGML_TYPE_I32);
1839
+ GGML_ASSERT(op->type == GGML_TYPE_F32);
1840
+
1841
+ GGML_ASSERT(ggml_is_contiguous_rows(op->src[0]));
1842
+
1843
+ ggml_metal_kargs_add_id args = {
1844
+ /*.ne0 =*/ ne0,
1845
+ /*.ne1 =*/ ne1,
1846
+ /*.nb01 =*/ nb01,
1847
+ /*.nb02 =*/ nb02,
1848
+ /*.nb11 =*/ nb11,
1849
+ /*.nb21 =*/ nb21,
1850
+ };
1851
+
1852
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_base(lib, GGML_OP_ADD_ID);
1853
+
1854
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
1855
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
1856
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
1857
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
1858
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[2]), 3);
1859
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 4);
1860
+
1861
+ const int nth = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne00);
1862
+
1863
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, 1, nth, 1, 1);
1864
+
1865
+ return 1;
1866
+ }
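
Based on the kernel arguments (the ne0/ne1 extents, row strides nb01/nb02/nb11, and the ids stride nb21) and the ne01 x ne02 dispatch grid, ADD_ID adds to every src0 row the src1 row selected by the I32 ids tensor. A contiguous CPU reference sketch of that reading (the Metal kernel itself works on byte strides):

```cpp
#include <cstdint>

// CPU reference for what the ADD_ID kernel appears to compute:
// for every row (i1, i2) of src0, add the src1 row selected by ids[i1, i2].
// A plain contiguous layout is assumed here.
void add_id_ref(const float * src0, const float * src1, const int32_t * ids,
                float * dst, int ne0, int ne1, int ne2) {
    for (int i2 = 0; i2 < ne2; ++i2) {
        for (int i1 = 0; i1 < ne1; ++i1) {
            const int32_t row = ids[i2*ne1 + i1];      // selected src1 row
            const float * a = src0 + (i2*ne1 + i1)*ne0;
            const float * b = src1 + row*ne0;
            float       * d = dst  + (i2*ne1 + i1)*ne0;
            for (int i0 = 0; i0 < ne0; ++i0) {
                d[i0] = a[i0] + b[i0];
            }
        }
    }
}
```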
1867
+
1868
+ bool ggml_metal_op_flash_attn_ext_use_vec(const ggml_tensor * op) {
1869
+ assert(op->op == GGML_OP_FLASH_ATTN_EXT);
1870
+
1871
+ const int64_t ne00 = op->src[0]->ne[0]; // head size
1872
+ const int64_t ne01 = op->src[0]->ne[1]; // batch size
1873
+
1874
+ // use vec kernel if the batch size is small and if the head size is supported
1875
+ return (ne01 < 20) && (ne00 % 32 == 0);
1876
+ }
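
For reference, how this heuristic plays out on a few representative shapes; a standalone sketch with hypothetical values, not part of the diff:

```cpp
#include <cstdio>

// mirrors the heuristic above: the vec kernel is used for small batches
// with a head size that is a multiple of 32
static bool use_vec(long ne00 /* head size */, long ne01 /* batch */) {
    return (ne01 < 20) && (ne00 % 32 == 0);
}

int main() {
    printf("%d\n", use_vec(128,  1)); // 1: single-token decode -> vec kernel
    printf("%d\n", use_vec(128, 32)); // 0: large batch -> half8x8 kernel
    printf("%d\n", use_vec( 80,  1)); // 0: head size not a multiple of 32
    return 0;
}
```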
1877
+
1878
+ size_t ggml_metal_op_flash_attn_ext_extra_tmp(const ggml_tensor * op) {
1879
+ assert(op->op == GGML_OP_FLASH_ATTN_EXT);
1880
+
1881
+ const int64_t nwg = 32;
1882
+
1883
+ const int64_t ne01 = op->src[0]->ne[1];
1884
+ const int64_t ne02 = op->src[0]->ne[2];
1885
+ const int64_t ne03 = op->src[0]->ne[3];
1886
+ const int64_t ne20 = op->src[2]->ne[0];
1887
+
1888
+ // temp buffer for writing the results from each workgroup
1889
+ // - ne20: the size of the Value head
1890
+ // - + 2: the S and M values for each intermediate result
1891
+ return ggml_type_size(GGML_TYPE_F32)*(ne01*ne02*ne03*nwg*(ne20 + 2));
1892
+ }
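
A worked instance of this sizing formula, using illustrative single-token decode shapes (1 query, 32 heads, V head size 128, nwg = 32 workgroups):

```cpp
#include <cstdio>
#include <cstdint>

int main() {
    // same formula as ggml_metal_op_flash_attn_ext_extra_tmp, with
    // illustrative decode-time shapes
    const int64_t nwg  = 32;                    // workgroups per row
    const int64_t ne01 = 1, ne02 = 32, ne03 = 1;
    const int64_t ne20 = 128;                   // V head size

    const size_t bytes = sizeof(float)*(ne01*ne02*ne03*nwg*(ne20 + 2));
    printf("extra tmp = %zu bytes (~%zu KiB)\n", bytes, bytes/1024);
    // -> 532480 bytes: each workgroup stores a partial O row plus its S and M
    return 0;
}
```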
1893
+
1894
+ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
1895
+ ggml_tensor * op = ctx->node(idx);
1896
+
1897
+ ggml_metal_library_t lib = ctx->lib;
1898
+ ggml_metal_encoder_t enc = ctx->enc;
1899
+
1900
+ const ggml_metal_device_props * props_dev = ggml_metal_device_get_props(ctx->dev);
1901
+
1902
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
1903
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
1904
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
1905
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
1906
+ GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
1907
+ GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
1908
+ GGML_TENSOR_LOCALS( int32_t, ne3, op->src[3], ne);
1909
+ GGML_TENSOR_LOCALS(uint64_t, nb3, op->src[3], nb);
1910
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
1911
+ GGML_TENSOR_LOCALS( int32_t, nb, op, nb);
1912
+
1913
+ GGML_ASSERT(ne00 % 4 == 0);
1914
+ GGML_ASSERT(ne11 % 32 == 0);
1915
+
1916
+ GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
1917
+ GGML_ASSERT(op->src[1]->type == op->src[2]->type);
1918
+
1919
+ //GGML_ASSERT(ggml_are_same_shape (src1, src2));
1920
+ GGML_ASSERT(ne11 == ne21);
1921
+ GGML_ASSERT(ne12 == ne22);
1922
+
1923
+ GGML_ASSERT(!op->src[3] || op->src[3]->type == GGML_TYPE_F16);
1924
+ GGML_ASSERT(!op->src[3] || op->src[3]->ne[1] >= GGML_PAD(op->src[0]->ne[1], 8) &&
1925
+ "the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");
1926
+
1927
+ float scale;
1928
+ float max_bias;
1929
+ float logit_softcap;
1930
+
1931
+ memcpy(&scale, ((const int32_t *) op->op_params) + 0, sizeof(scale));
1932
+ memcpy(&max_bias, ((const int32_t *) op->op_params) + 1, sizeof(max_bias));
1933
+ memcpy(&logit_softcap, ((const int32_t *) op->op_params) + 2, sizeof(logit_softcap));
1934
+
1935
+ if (logit_softcap != 0.0f) {
1936
+ scale /= logit_softcap;
1937
+ }
1938
+
1939
+ const bool has_mask = op->src[3] != NULL;
1940
+ const bool has_sinks = op->src[4] != NULL;
1941
+ const bool has_bias = max_bias != 0.0f;
1942
+ const bool has_scap = logit_softcap != 0.0f;
1943
+
1944
+ const uint32_t n_head = op->src[0]->ne[2];
1945
+ const int32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
1946
+
1947
+ const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
1948
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
1949
+
1950
+ GGML_ASSERT(ne01 < 65536);
1951
+
1952
+ if (!ggml_metal_op_flash_attn_ext_use_vec(op)) {
1953
+ // half8x8 kernel
1954
+ const int64_t nqptg = 8; // queries per threadgroup !! sync with kernel template arguments !!
1955
+ const int64_t ncpsg = 64; // cache values per simdgroup !! sync with kernel template arguments !!
1956
+
1957
+ GGML_ASSERT(nqptg <= 32);
1958
+ GGML_ASSERT(nqptg % 8 == 0);
1959
+ GGML_ASSERT(ncpsg % 32 == 0);
1960
+
1961
+ const int is_q = ggml_is_quantized(op->src[1]->type) ? 1 : 0;
1962
+
1963
+ // 2*(2*ncpsg)
1964
+ // ncpsg soft_max values + ncpsg mask values
1965
+ //
1966
+ // 16*32*(nsg)
1967
+ // the shared memory needed for the simdgroups to load the KV cache
1968
+ // each thread loads (dequantizes) 16 head elements; there are 32 threads in the SG
1969
+ //
1970
+ #define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*GGML_PAD(ne20, 64) + 2*(2*ncpsg)) + is_q*(16*32*(nsg)))*(sizeof(float)/2), 16))
1971
+
1972
+ //int64_t nsgmax = 4;
1973
+ //
1974
+ //if (is_q) {
1975
+ // nsgmax = 2;
1976
+ // while (true) {
1977
+ // const size_t smem = FATTN_SMEM(nsgmax);
1978
+ // if (smem > props_dev->max_theadgroup_memory_size) {
1979
+ // break;
1980
+ // }
1981
+ // nsgmax *= 2;
1982
+ // }
1983
+ // nsgmax /= 2;
1984
+ //}
1985
+
1986
+ // simdgroups per threadgroup (a.k.a. warps)
1987
+ //nsg = ne01 <= nqptg ? MAX(4, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))) : 4;
1988
+ int32_t nsg = 4;
1989
+
1990
+ const size_t smem = FATTN_SMEM(nsg);
1991
+
1992
+ ggml_metal_kargs_flash_attn_ext args = {
1993
+ /*.ne01 =*/ ne01,
1994
+ /*.ne02 =*/ ne02,
1995
+ /*.ne03 =*/ ne03,
1996
+ /*.nb01 =*/ nb01,
1997
+ /*.nb02 =*/ nb02,
1998
+ /*.nb03 =*/ nb03,
1999
+ /*.ne11 =*/ ne11,
2000
+ /*.ne_12_2 =*/ ne12,
2001
+ /*.ne_12_3 =*/ ne13,
2002
+ /*.ns10 =*/ int32_t(nb11/nb10),
2003
+ /*.nb11 =*/ nb11,
2004
+ /*.nb12 =*/ nb12,
2005
+ /*.nb13 =*/ nb13,
2006
+ /*.ns20 =*/ int32_t(nb21/nb20),
2007
+ /*.nb21 =*/ nb21,
2008
+ /*.nb22 =*/ nb22,
2009
+ /*.nb23 =*/ nb23,
2010
+ /*.ne32 =*/ ne32,
2011
+ /*.ne33 =*/ ne33,
2012
+ /*.nb31 =*/ nb31,
2013
+ /*.nb32 =*/ nb32,
2014
+ /*.nb33 =*/ nb33,
2015
+ /*.ne1 =*/ ne1,
2016
+ /*.ne2 =*/ ne2,
2017
+ /*.ne3 =*/ ne3,
2018
+ /*.scale =*/ scale,
2019
+ /*.max_bias =*/ max_bias,
2020
+ /*.m0 =*/ m0,
2021
+ /*.m1 =*/ m1,
2022
+ /*.n_head_log2 =*/ n_head_log2,
2023
+ /*.logit_softcap =*/ logit_softcap,
2024
+ };
2025
+
2026
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_flash_attn_ext(lib, op, has_mask, has_sinks, has_bias, has_scap, nsg);
2027
+
2028
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
2029
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
2030
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
2031
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
2032
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[2]), 3);
2033
+ if (op->src[3]) {
2034
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[3]), 4);
2035
+ } else {
2036
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 4);
2037
+ }
2038
+ if (op->src[4]) {
2039
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[4]), 5);
2040
+ } else {
2041
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 5);
2042
+ }
2043
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 6);
2044
+
2045
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
2046
+
2047
+ ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, ne02, ne03, 32, nsg, 1);
2048
+ #undef FATTN_SMEM
2049
+ } else {
2050
+ // half4x4 kernel
2051
+ const int64_t nqptg = 1; // queries per threadgroup !! sync with kernel template arguments !!
2052
+ const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !!
2053
+ const int64_t nkpsg = 1*ncpsg;
2054
+
2055
+ GGML_ASSERT(nqptg <= 32);
2056
+ GGML_ASSERT(nqptg % 1 == 0);
2057
+ GGML_ASSERT(ncpsg % 32 == 0);
2058
+
2059
+ // ne00 + 2*ncpsg*(nsg)
2060
+ // for each query, we load it as f16 in shared memory (ne00)
2061
+ // and store the soft_max values and the mask
2062
+ //
2063
+ // ne20*(nsg)
2064
+ // each simdgroup has a full f32 head vector in shared mem to accumulate results
2065
+ //
2066
+ #define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(GGML_PAD(ne00, 128) + 4*ncpsg*(nsg)) + 2*GGML_PAD(ne20, 128)*(nsg))*(sizeof(float)/2), 16))
2067
+
2068
+ int64_t nsgmax = 2;
2069
+ while (true) {
2070
+ const size_t smem = FATTN_SMEM(nsgmax);
2071
+ // avoid using more than half of the threadgroup memory - it can cause slowdowns, especially for large head sizes
2072
+ if (smem > props_dev->max_theadgroup_memory_size/2) {
2073
+ break;
2074
+ }
2075
+ nsgmax *= 2;
2076
+ }
2077
+ nsgmax /= 2;
2078
+
2079
+ // simdgroups per threadgroup (a.k.a. warps)
2080
+ //const int64_t nsgt = MAX(2, MIN(nsgmax, MIN((ne11 + nkpsg - 1)/(nkpsg), (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32)));
2081
+ const int64_t nsgt = MAX(2, MIN(nsgmax, MIN((ne11 + nkpsg - 1)/(nkpsg), (int64_t) 1024/32)));
2082
+
2083
+ int64_t nsg = 1;
2084
+ while (nsg <= nsgt) {
2085
+ nsg *= 2;
2086
+ }
2087
+ nsg /= 2;
2088
+
2089
+ // workgroups
2090
+ // each workgroup handles nsg*nkpsg cache values
2091
+ int32_t nwg = 1;
2092
+ if (false) {
2093
+ // for small KV caches, we could launch a single workgroup and write the results directly to dst.
2094
+ // however, this does not lead to a significant improvement, so it is disabled
2095
+ nwg = 1;
2096
+ nsg = 4;
2097
+ } else {
2098
+ nwg = 32;
2099
+ nsg = 1;
2100
+ while (2*nwg*nsg*nkpsg < ne11 && nsg < 4) {
2101
+ nsg *= 2;
2102
+ }
2103
+ }
2104
+
2105
+ ggml_metal_kargs_flash_attn_ext_vec args = {
2106
+ /*.ne01 =*/ ne01,
2107
+ /*.ne02 =*/ ne02,
2108
+ /*.ne03 =*/ ne03,
2109
+ /*.nb01 =*/ nb01,
2110
+ /*.nb02 =*/ nb02,
2111
+ /*.nb03 =*/ nb03,
2112
+ /*.ne11 =*/ ne11,
2113
+ /*.ne_12_2 =*/ ne12,
2114
+ /*.ne_12_3 =*/ ne13,
2115
+ /*.ns10 =*/ int32_t(nb11/nb10),
2116
+ /*.nb11 =*/ nb11,
2117
+ /*.nb12 =*/ nb12,
2118
+ /*.nb13 =*/ nb13,
2119
+ /*.ns20 =*/ int32_t(nb21/nb20),
2120
+ /*.nb21 =*/ nb21,
2121
+ /*.nb22 =*/ nb22,
2122
+ /*.nb23 =*/ nb23,
2123
+ /*.ne32 =*/ ne32,
2124
+ /*.ne33 =*/ ne33,
2125
+ /*.nb31 =*/ nb31,
2126
+ /*.nb32 =*/ nb32,
2127
+ /*.nb33 =*/ nb33,
2128
+ /*.ne1 =*/ ne1,
2129
+ /*.ne2 =*/ ne2,
2130
+ /*.ne3 =*/ ne3,
2131
+ /*.scale =*/ scale,
2132
+ /*.max_bias =*/ max_bias,
2133
+ /*.m0 =*/ m0,
2134
+ /*.m1 =*/ m1,
2135
+ /*.n_head_log2 =*/ n_head_log2,
2136
+ /*.logit_softcap =*/ logit_softcap,
2137
+ };
2138
+
2139
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_flash_attn_ext_vec(lib, op, has_mask, has_sinks, has_bias, has_scap, nsg, nwg);
2140
+
2141
+ GGML_ASSERT(nsg*32 <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
2142
+
2143
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
2144
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
2145
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
2146
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
2147
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[2]), 3);
2148
+ if (op->src[3]) {
2149
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[3]), 4);
2150
+ } else {
2151
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 4);
2152
+ }
2153
+ if (op->src[4]) {
2154
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[4]), 5);
2155
+ } else {
2156
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 5);
2157
+ }
2158
+
2159
+ const size_t smem = FATTN_SMEM(nsg);
2160
+
2161
+ //printf("smem: %zu, max: %zu, nsg = %d, nsgmax = %d\n", smem, props_dev->max_theadgroup_memory_size, (int) nsg, (int) nsgmax);
2162
+ GGML_ASSERT(smem <= props_dev->max_theadgroup_memory_size);
2163
+
2164
+ if (nwg == 1) {
2165
+ // using 1 workgroup -> write the result directly into dst
2166
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op), 6);
2167
+
2168
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
2169
+
2170
+ ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, ne02, ne03*nwg, 32, nsg, 1);
2171
+ } else {
2172
+ // sanity checks
2173
+ GGML_ASSERT(ne01*ne02*ne03 == ne1*ne2*ne3);
2174
+ GGML_ASSERT((uint64_t)ne1*ne2*ne3 <= (1u << 31));
2175
+
2176
+ ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(op);
2177
+
2178
+ // write the results from each workgroup into a temp buffer
2179
+ ggml_metal_buffer_id bid_tmp = bid_dst;
2180
+ bid_tmp.offs += ggml_nbytes(op);
2181
+ ggml_metal_encoder_set_buffer(enc, bid_tmp, 6);
2182
+
2183
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
2184
+ ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, ne02, ne03*nwg, 32, nsg, 1);
2185
+
2186
+ // sync the 2 kernels
2187
+ ggml_metal_op_concurrency_reset(ctx);
2188
+
2189
+ // reduce the results from the workgroups
2190
+ {
2191
+ const int32_t nrows = ne1*ne2*ne3;
2192
+
2193
+ ggml_metal_kargs_flash_attn_ext_vec_reduce args0 = {
2194
+ nrows,
2195
+ };
2196
+
2197
+ ggml_metal_pipeline_t pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_vec_reduce(lib, op, ne20, nwg);
2198
+
2199
+ ggml_metal_encoder_set_pipeline(enc, pipeline0);
2200
+ ggml_metal_encoder_set_bytes (enc, &args0, sizeof(args0), 0);
2201
+ ggml_metal_encoder_set_buffer (enc, bid_tmp, 1);
2202
+ ggml_metal_encoder_set_buffer (enc, bid_dst, 2);
2203
+
2204
+ ggml_metal_encoder_dispatch_threadgroups(enc, nrows, 1, 1, 32*nwg, 1, 1);
2205
+ }
2206
+ }
2207
+ #undef FATTN_SMEM
2208
+ }
2209
+
2210
+ return 1;
2211
+ }
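
When nwg > 1, each workgroup writes a partial output row plus its S and M values into the temp buffer, and the reduce kernel folds those into dst. The combine step is presumably the standard split-K flash-attention reduction; a CPU sketch of that math, with the per-workgroup layout assumed rather than taken from the kernel source:

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Sketch of the usual split-K flash-attention combine. Assumed per-workgroup
// outputs: o[w] = partial output row (ne20 floats) normalized by its own sum,
// m[w] = running max of the logits seen, s[w] = sum of exp(logit - m[w]).
void fa_reduce_ref(const std::vector<std::vector<float>> & o,
                   const std::vector<float> & m,
                   const std::vector<float> & s,
                   std::vector<float> & dst) {
    const size_t nwg = o.size();

    float m_all = -INFINITY;
    for (size_t w = 0; w < nwg; ++w) m_all = std::max(m_all, m[w]);

    float s_all = 0.0f;
    for (size_t w = 0; w < nwg; ++w) s_all += s[w]*std::exp(m[w] - m_all);

    dst.assign(o[0].size(), 0.0f);
    for (size_t w = 0; w < nwg; ++w) {
        // reweight each partial result by its share of the global softmax
        const float coef = s[w]*std::exp(m[w] - m_all)/s_all;
        for (size_t i = 0; i < dst.size(); ++i) dst[i] += coef*o[w][i];
    }
}
```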
2212
+
2213
+ int ggml_metal_op_bin(ggml_metal_op_t ctx, int idx) {
2214
+ ggml_tensor * op = ctx->node(idx);
2215
+
2216
+ ggml_metal_library_t lib = ctx->lib;
2217
+ ggml_metal_encoder_t enc = ctx->enc;
2218
+
2219
+ const bool use_fusion = ctx->use_fusion;
2220
+
2221
+ const int debug_fusion = ctx->debug_fusion;
2222
+
2223
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
2224
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
2225
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
2226
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
2227
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
2228
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
2229
+
2230
+ GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
2231
+ GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
2232
+
2233
+ GGML_ASSERT(ggml_is_contiguous_rows(op->src[0]));
2234
+ GGML_ASSERT(ggml_is_contiguous_rows(op->src[1]));
2235
+
2236
+ bool bcast_row = false;
2237
+
2238
+ ggml_metal_buffer_id bid_src0 = ggml_metal_get_buffer_id(op->src[0]);
2239
+ ggml_metal_buffer_id bid_src1 = ggml_metal_get_buffer_id(op->src[1]);
2240
+ ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(op);
2241
+
2242
+ ggml_metal_kargs_bin args = {
2243
+ /*.ne00 =*/ ne00,
2244
+ /*.ne01 =*/ ne01,
2245
+ /*.ne02 =*/ ne02,
2246
+ /*.ne03 =*/ ne03,
2247
+ /*.nb00 =*/ nb00,
2248
+ /*.nb01 =*/ nb01,
2249
+ /*.nb02 =*/ nb02,
2250
+ /*.nb03 =*/ nb03,
2251
+ /*.ne10 =*/ ne10,
2252
+ /*.ne11 =*/ ne11,
2253
+ /*.ne12 =*/ ne12,
2254
+ /*.ne13 =*/ ne13,
2255
+ /*.nb10 =*/ nb10,
2256
+ /*.nb11 =*/ nb11,
2257
+ /*.nb12 =*/ nb12,
2258
+ /*.nb13 =*/ nb13,
2259
+ /*.ne0 =*/ ne0,
2260
+ /*.ne1 =*/ ne1,
2261
+ /*.ne2 =*/ ne2,
2262
+ /*.ne3 =*/ ne3,
2263
+ /*.nb0 =*/ nb0,
2264
+ /*.nb1 =*/ nb1,
2265
+ /*.nb2 =*/ nb2,
2266
+ /*.nb3 =*/ nb3,
2267
+ /*.offs =*/ 0,
2268
+ /*.o1 =*/ { bid_src1.offs },
2269
+ };
2270
+
2271
+ ggml_op fops[8];
2272
+
2273
+ int n_fuse = 1;
2274
+
2275
+ // c[0] = add(a, b[0])
2276
+ // c[1] = add(c[0], b[1])
2277
+ // c[2] = add(c[1], b[2])
2278
+ // ...
2279
+ if (use_fusion) {
2280
+ fops[0] = GGML_OP_ADD;
2281
+ fops[1] = GGML_OP_ADD;
2282
+ fops[2] = GGML_OP_ADD;
2283
+ fops[3] = GGML_OP_ADD;
2284
+ fops[4] = GGML_OP_ADD;
2285
+ fops[5] = GGML_OP_ADD;
2286
+ fops[6] = GGML_OP_ADD;
2287
+ fops[7] = GGML_OP_ADD;
2288
+
2289
+ // note: in metal, we sometimes encode the graph in parallel so we have to avoid fusing ops
2290
+ // across splits. idx_end indicates the last node in the current split
2291
+ for (n_fuse = 0; n_fuse <= 6; ++n_fuse) {
2292
+ if (!ctx->can_fuse(idx + n_fuse, fops + n_fuse, 2)) {
2293
+ break;
2294
+ }
2295
+
2296
+ ggml_tensor * f0 = ctx->node(idx + n_fuse);
2297
+ ggml_tensor * f1 = ctx->node(idx + n_fuse + 1);
2298
+
2299
+ if (f0 != f1->src[0]) {
2300
+ break;
2301
+ }
2302
+
2303
+ // b[0] === b[1] === ...
2304
+ if (!ggml_are_same_layout(f0->src[1], f1->src[1])) {
2305
+ break;
2306
+ }
2307
+
2308
+ // only fuse ops if src1 is in the same Metal buffer
2309
+ ggml_metal_buffer_id bid_fuse = ggml_metal_get_buffer_id(f1->src[1]);
2310
+ if (bid_fuse.metal != bid_src1.metal) {
2311
+ break;
2312
+ }
2313
+
2314
+ //ctx->fuse_cnt[ops[n_fuse + 1]->op]++;
2315
+
2316
+ args.o1[n_fuse + 1] = bid_fuse.offs;
2317
+ }
2318
+
2319
+ ++n_fuse;
2320
+
2321
+ if (debug_fusion > 1 && n_fuse > 1) {
2322
+ GGML_LOG_DEBUG("%s: fuse: ADD x %d\n", __func__, n_fuse);
2323
+ }
2324
+ }
2325
+
2326
+ // the offsets of src1 and all fused buffers are relative to the start of the src1 buffer
2327
+ bid_src1.offs = 0;
2328
+
2329
+ ggml_metal_pipeline_t pipeline = nullptr;
2330
+
2331
+ if (ggml_nelements(op->src[1]) == ne10 && ggml_is_contiguous(op->src[1]) && ne00 % 4 == 0 && ne10 % 4 == 0) {
2332
+ GGML_ASSERT(ggml_is_contiguous(op->src[0]));
2333
+
2334
+ // src1 is a row
2335
+ GGML_ASSERT(ne11 == 1);
2336
+
2337
+ pipeline = ggml_metal_library_get_pipeline_bin(lib, op->op, n_fuse, true);
2338
+
2339
+ bcast_row = true;
2340
+ } else {
2341
+ pipeline = ggml_metal_library_get_pipeline_bin(lib, op->op, n_fuse, false);
2342
+ }
2343
+
2344
+ if (n_fuse > 1) {
2345
+ bid_dst = ggml_metal_get_buffer_id(ctx->node(idx + n_fuse - 1));
2346
+
2347
+ for (int i = 1; i < n_fuse; ++i) {
2348
+ if (!ggml_metal_op_concurrency_check(ctx, ctx->node(idx + i))) {
2349
+ ggml_metal_op_concurrency_reset(ctx);
2350
+
2351
+ break;
2352
+ }
2353
+ }
2354
+ }
2355
+
2356
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
2357
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
2358
+ ggml_metal_encoder_set_buffer (enc, bid_src0, 1);
2359
+ ggml_metal_encoder_set_buffer (enc, bid_src1, 2);
2360
+ ggml_metal_encoder_set_buffer (enc, bid_dst, 3);
2361
+
2362
+ if (bcast_row) {
2363
+ const int64_t n = ggml_nelements(op)/4;
2364
+
2365
+ ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1);
2366
+ } else {
2367
+ int nth = 32;
2368
+
2369
+ while (16*nth < ne0 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
2370
+ nth *= 2;
2371
+ }
2372
+
2373
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
2374
+ }
2375
+
2376
+ return n_fuse;
2377
+ }
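
The fusion loop above walks up to eight consecutive ADD nodes, requiring each node to consume the previous one as src0 and all src1 tensors to share a layout and Metal buffer, so only their offsets need to be recorded in args.o1. A stripped-down sketch of just the chain-shape test (hypothetical node type; the layout and buffer checks are elided):

```cpp
// minimal stand-in for a graph node; only the fields the test needs
struct node { int op; node * src0; node * src1; };
enum { OP_ADD = 1 };

// c[0] = add(a, b0); c[1] = add(c[0], b1); ... fuses into one launch as long
// as each node's first operand is the previous node in the chain
static int count_add_chain(node ** nodes, int idx, int n, int max_fuse) {
    int n_fuse = 1;
    while (n_fuse < max_fuse && idx + n_fuse < n) {
        node * f0 = nodes[idx + n_fuse - 1];
        node * f1 = nodes[idx + n_fuse];
        if (f1->op != OP_ADD || f1->src0 != f0) break; // chain broken
        // the real code additionally checks same layout + same Metal buffer
        ++n_fuse;
    }
    return n_fuse;
}
```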
2378
+
2379
+ int ggml_metal_op_l2_norm(ggml_metal_op_t ctx, int idx) {
2380
+ ggml_tensor * op = ctx->node(idx);
2381
+
2382
+ ggml_metal_library_t lib = ctx->lib;
2383
+ ggml_metal_encoder_t enc = ctx->enc;
2384
+
2385
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
2386
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
2387
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
2388
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
2389
+
2390
+ float eps;
2391
+ memcpy(&eps, op->op_params, sizeof(float));
2392
+
2393
+ int nth = 32; // SIMD width
2394
+
2395
+ ggml_metal_kargs_l2_norm args = {
2396
+ /*.ne00 =*/ ne00,
2397
+ /*.ne00_4 =*/ ne00/4,
2398
+ /*.nb01 =*/ nb01,
2399
+ /*.eps =*/ eps,
2400
+ };
2401
+
2402
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_l2_norm(lib, op);
2403
+
2404
+ while (nth < ne00/4 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
2405
+ nth *= 2;
2406
+ }
2407
+
2408
+ nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
2409
+ nth = std::min(nth, ne00/4);
2410
+
2411
+ const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
2412
+
2413
+ const int64_t nrows = ggml_nrows(op->src[0]);
2414
+
2415
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
2416
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
2417
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
2418
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
2419
+
2420
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
2421
+
2422
+ ggml_metal_encoder_dispatch_threadgroups(enc, nrows, 1, 1, nth, 1, 1);
2423
+
2424
+ return 1;
2425
+ }
2426
+
2427
+ int ggml_metal_op_group_norm(ggml_metal_op_t ctx, int idx) {
2428
+ ggml_tensor * op = ctx->node(idx);
2429
+
2430
+ ggml_metal_library_t lib = ctx->lib;
2431
+ ggml_metal_encoder_t enc = ctx->enc;
2432
+
2433
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
2434
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
2435
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
2436
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
2437
+
2438
+ const int32_t ngrp = ((const int32_t *) op->op_params)[0];
2439
+
2440
+ float eps;
2441
+ memcpy(&eps, op->op_params + 1, sizeof(float));
2442
+
2443
+ ggml_metal_kargs_group_norm args = {
2444
+ /*.ne00 =*/ ne00,
2445
+ /*.ne01 =*/ ne01,
2446
+ /*.ne02 =*/ ne02,
2447
+ /*.nb00 =*/ nb00,
2448
+ /*.nb01 =*/ nb01,
2449
+ /*.nb02 =*/ nb02,
2450
+ /*.ngrp =*/ ngrp,
2451
+ /*.eps =*/ eps,
2452
+ };
2453
+
2454
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_group_norm(lib, op);
2455
+
2456
+ int nth = 32; // SIMD width
2457
+ //while (nth < ne00/4 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
2458
+ // nth *= 2;
2459
+ //}
2460
+
2461
+ //nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
2462
+ //nth = std::min(nth, ne00/4);
2463
+
2464
+ const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
2465
+
2466
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
2467
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
2468
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
2469
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
2470
+
2471
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
2472
+
2473
+ ggml_metal_encoder_dispatch_threadgroups(enc, ngrp, 1, 1, nth, 1, 1);
2474
+
2475
+ return 1;
2476
+ }
2477
+
2478
+ int ggml_metal_op_norm(ggml_metal_op_t ctx, int idx) {
2479
+ ggml_tensor * op = ctx->node(idx);
2480
+
2481
+ ggml_metal_library_t lib = ctx->lib;
2482
+ ggml_metal_encoder_t enc = ctx->enc;
2483
+
2484
+ const bool use_fusion = ctx->use_fusion;
2485
+
2486
+ const int debug_fusion = ctx->debug_fusion;
2487
+
2488
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
2489
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
2490
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
2491
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
2492
+
2493
+ float eps;
2494
+ memcpy(&eps, op->op_params, sizeof(float));
2495
+
2496
+ ggml_metal_buffer_id bid_src0 = ggml_metal_get_buffer_id(op->src[0]);
2497
+ ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(op);
2498
+
2499
+ ggml_metal_kargs_norm args = {
2500
+ /*.ne00 =*/ ne00,
2501
+ /*.ne00_t =*/ ne00 % 4 == 0 ? ne00/4 : ne00,
2502
+ /*.nb1 =*/ nb1,
2503
+ /*.nb2 =*/ nb2,
2504
+ /*.nb3 =*/ nb3,
2505
+ /*.eps =*/ eps,
2506
+ /*.nef1 =*/ { ne01 },
2507
+ /*.nef2 =*/ { ne02 },
2508
+ /*.nef3 =*/ { ne03 },
2509
+ /*.nbf1 =*/ { nb01 },
2510
+ /*.nbf2 =*/ { nb02 },
2511
+ /*.nbf3 =*/ { nb03 },
2512
+ };
2513
+
2514
+ ggml_op fops[8];
2515
+
2516
+ int n_fuse = 1;
2517
+
2518
+ ggml_metal_buffer_id bid_fuse[2] = { bid_src0, bid_src0 };
2519
+
2520
+ // d[0] = norm(a)
2521
+ // d[1] = mul(d[0], b)
2522
+ // d[2] = add(d[1], c)
2523
+ if (use_fusion) {
2524
+ fops[0] = op->op;
2525
+ fops[1] = GGML_OP_MUL;
2526
+ fops[2] = GGML_OP_ADD;
2527
+
2528
+ for (n_fuse = 0; n_fuse <= 1; ++n_fuse) {
2529
+ if (!ctx->can_fuse(idx + n_fuse, fops + n_fuse, 2)) {
2530
+ break;
2531
+ }
2532
+
2533
+ ggml_tensor * f0 = ctx->node(idx + n_fuse);
2534
+ ggml_tensor * f1 = ctx->node(idx + n_fuse + 1);
2535
+
2536
+ if (f0 != f1->src[0]) {
2537
+ break;
2538
+ }
2539
+
2540
+ if (f1->src[1]->ne[0] != op->ne[0]) {
2541
+ break;
2542
+ }
2543
+
2544
+ if (!ggml_is_contiguous_rows(f1->src[1])) {
2545
+ break;
2546
+ }
2547
+
2548
+ if (f1->type != GGML_TYPE_F32) {
2549
+ break;
2550
+ }
2551
+
2552
+ //ctx->fuse_cnt[f1->op]++;
2553
+
2554
+ bid_fuse[n_fuse] = ggml_metal_get_buffer_id(f1->src[1]);
2555
+
2556
+ args.nef1[n_fuse + 1] = f1->src[1]->ne[1];
2557
+ args.nef2[n_fuse + 1] = f1->src[1]->ne[2];
2558
+ args.nef3[n_fuse + 1] = f1->src[1]->ne[3];
2559
+
2560
+ args.nbf1[n_fuse + 1] = f1->src[1]->nb[1];
2561
+ args.nbf2[n_fuse + 1] = f1->src[1]->nb[2];
2562
+ args.nbf3[n_fuse + 1] = f1->src[1]->nb[3];
2563
+ }
2564
+
2565
+ ++n_fuse;
2566
+
2567
+ if (debug_fusion > 1 && n_fuse > 1) {
2568
+ if (n_fuse == 2) {
2569
+ GGML_LOG_DEBUG("%s: fuse: %s + MUL\n", __func__, ggml_op_name(op->op));
2570
+ }
2571
+ if (n_fuse == 3) {
2572
+ GGML_LOG_DEBUG("%s: fuse: %s + MUL + ADD\n", __func__, ggml_op_name(op->op));
2573
+ }
2574
+ }
2575
+ }
2576
+
2577
+ if (n_fuse > 1) {
2578
+ bid_dst = ggml_metal_get_buffer_id(ctx->node(idx + n_fuse - 1));
2579
+
2580
+ for (int i = 1; i < n_fuse; ++i) {
2581
+ if (!ggml_metal_op_concurrency_check(ctx, ctx->node(idx + i))) {
2582
+ ggml_metal_op_concurrency_reset(ctx);
2583
+
2584
+ break;
2585
+ }
2586
+ }
2587
+ }
2588
+
2589
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_norm(lib, op, n_fuse);
2590
+
2591
+ int nth = 32; // SIMD width
2592
+
2593
+ while (nth < args.ne00_t && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
2594
+ nth *= 2;
2595
+ }
2596
+
2597
+ nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
2598
+ nth = std::min(nth, args.ne00_t);
2599
+
2600
+ const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
2601
+
2602
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
2603
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
2604
+ ggml_metal_encoder_set_buffer (enc, bid_src0, 1);
2605
+ ggml_metal_encoder_set_buffer (enc, bid_fuse[0], 2);
2606
+ ggml_metal_encoder_set_buffer (enc, bid_fuse[1], 3);
2607
+ ggml_metal_encoder_set_buffer (enc, bid_dst, 4);
2608
+
2609
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
2610
+
2611
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
2612
+
2613
+ return n_fuse;
2614
+ }
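
With n_fuse == 3 the pipeline computes d = add(mul(norm(a), b), c) in a single launch, per the fusion pattern in the comments above. A CPU reference sketch for one row in the GGML_OP_NORM case (the RMS_NORM variant would drop the mean subtraction), with b and c assumed to broadcast per row:

```cpp
#include <cmath>

// d = norm(a)*b + c over one row of ne00 elements; eps comes from op_params
void norm_mul_add_ref(const float * a, const float * b, const float * c,
                      float * d, int ne00, float eps) {
    float mean = 0.0f;
    for (int i = 0; i < ne00; ++i) mean += a[i];
    mean /= ne00;

    float var = 0.0f;
    for (int i = 0; i < ne00; ++i) var += (a[i] - mean)*(a[i] - mean);
    var /= ne00;

    const float scale = 1.0f/std::sqrt(var + eps);
    for (int i = 0; i < ne00; ++i) {
        d[i] = (a[i] - mean)*scale*b[i] + c[i]; // fused MUL + ADD
    }
}
```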
2615
+
2616
+ int ggml_metal_op_rope(ggml_metal_op_t ctx, int idx) {
2617
+ ggml_tensor * op = ctx->node(idx);
2618
+
2619
+ ggml_metal_library_t lib = ctx->lib;
2620
+ ggml_metal_encoder_t enc = ctx->enc;
2621
+
2622
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
2623
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
2624
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
2625
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
2626
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
2627
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
2628
+
2629
+ // make sure we have one or more position ids (ne10) per token (ne02)
2630
+ GGML_ASSERT(ne10 % ne02 == 0);
2631
+ GGML_ASSERT(ne10 >= ne02);
2632
+
2633
+ const int nth = std::min(1024, ne00);
2634
+
2635
+ const int n_past = ((const int32_t *) op->op_params)[0];
2636
+ const int n_dims = ((const int32_t *) op->op_params)[1];
2637
+ //const int mode = ((const int32_t *) op->op_params)[2];
2638
+ // skip op_params[3] (n_ctx), used in GLM RoPE, which is not implemented in Metal
2639
+ const int n_ctx_orig = ((const int32_t *) op->op_params)[4];
2640
+
2641
+ float freq_base;
2642
+ float freq_scale;
2643
+ float ext_factor;
2644
+ float attn_factor;
2645
+ float beta_fast;
2646
+ float beta_slow;
2647
+
2648
+ memcpy(&freq_base, (const int32_t *) op->op_params + 5, sizeof(float));
2649
+ memcpy(&freq_scale, (const int32_t *) op->op_params + 6, sizeof(float));
2650
+ memcpy(&ext_factor, (const int32_t *) op->op_params + 7, sizeof(float));
2651
+ memcpy(&attn_factor, (const int32_t *) op->op_params + 8, sizeof(float));
2652
+ memcpy(&beta_fast, (const int32_t *) op->op_params + 9, sizeof(float));
2653
+ memcpy(&beta_slow, (const int32_t *) op->op_params + 10, sizeof(float));
2654
+
2655
+ // mrope
2656
+ const int sect_0 = ((const int32_t *) op->op_params)[11];
2657
+ const int sect_1 = ((const int32_t *) op->op_params)[12];
2658
+ const int sect_2 = ((const int32_t *) op->op_params)[13];
2659
+ const int sect_3 = ((const int32_t *) op->op_params)[14];
2660
+
2661
+ ggml_metal_kargs_rope args = {
2662
+ /*.ne00 =*/ ne00,
2663
+ /*.ne01 =*/ ne01,
2664
+ /*.ne02 =*/ ne02,
2665
+ /*.ne03 =*/ ne03,
2666
+ /*.nb00 =*/ nb00,
2667
+ /*.nb01 =*/ nb01,
2668
+ /*.nb02 =*/ nb02,
2669
+ /*.nb03 =*/ nb03,
2670
+ /*.ne0 =*/ ne0,
2671
+ /*.ne1 =*/ ne1,
2672
+ /*.ne2 =*/ ne2,
2673
+ /*.ne3 =*/ ne3,
2674
+ /*.nb0 =*/ nb0,
2675
+ /*.nb1 =*/ nb1,
2676
+ /*.nb2 =*/ nb2,
2677
+ /*.nb3 =*/ nb3,
2678
+ /*.n_past =*/ n_past,
2679
+ /*.n_dims =*/ n_dims,
2680
+ /*.n_ctx_orig =*/ n_ctx_orig,
2681
+ /*.freq_base =*/ freq_base,
2682
+ /*.freq_scale =*/ freq_scale,
2683
+ /*.ext_factor =*/ ext_factor,
2684
+ /*.attn_factor =*/ attn_factor,
2685
+ /*.beta_fast =*/ beta_fast,
2686
+ /*.beta_slow =*/ beta_slow,
2687
+ /* sect_0 =*/ sect_0,
2688
+ /* sect_1 =*/ sect_1,
2689
+ /* sect_2 =*/ sect_2,
2690
+ /* sect_3 =*/ sect_3,
2691
+ };
2692
+
2693
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_rope(lib, op);
2694
+
2695
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
2696
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
2697
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
2698
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
2699
+ if (op->src[2]) {
2700
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[2]), 3);
2701
+ } else {
2702
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 3);
2703
+ }
2704
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 4);
2705
+
2706
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
2707
+
2708
+ return 1;
2709
+ }
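
RoPE packs mixed integer and float parameters into the single int32 op_params array; the floats are recovered bit-for-bit with memcpy, exactly as the reads above do. A minimal sketch of that producer/consumer convention:

```cpp
#include <cstdio>
#include <cstring>
#include <cstdint>

int main() {
    int32_t op_params[15] = {0};

    // producer side: store a float into the int32 slot bit-for-bit
    const float freq_base = 10000.0f;
    memcpy(&op_params[5], &freq_base, sizeof(float));

    // consumer side: recover it the same way (no int<->float conversion)
    float out;
    memcpy(&out, &op_params[5], sizeof(float));
    printf("freq_base = %g\n", out); // -> 10000
    return 0;
}
```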
2710
+
2711
+ int ggml_metal_op_im2col(ggml_metal_op_t ctx, int idx) {
2712
+ ggml_tensor * op = ctx->node(idx);
2713
+
2714
+ ggml_metal_library_t lib = ctx->lib;
2715
+ ggml_metal_encoder_t enc = ctx->enc;
2716
+
2717
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
2718
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
2719
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
2720
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
2721
+
2722
+ const int32_t s0 = ((const int32_t *)(op->op_params))[0];
2723
+ const int32_t s1 = ((const int32_t *)(op->op_params))[1];
2724
+ const int32_t p0 = ((const int32_t *)(op->op_params))[2];
2725
+ const int32_t p1 = ((const int32_t *)(op->op_params))[3];
2726
+ const int32_t d0 = ((const int32_t *)(op->op_params))[4];
2727
+ const int32_t d1 = ((const int32_t *)(op->op_params))[5];
2728
+
2729
+ const bool is_2D = ((const int32_t *)(op->op_params))[6] == 1;
2730
+
2731
+ const int32_t N = op->src[1]->ne[is_2D ? 3 : 2];
2732
+ const int32_t IC = op->src[1]->ne[is_2D ? 2 : 1];
2733
+ const int32_t IH = is_2D ? op->src[1]->ne[1] : 1;
2734
+ const int32_t IW = op->src[1]->ne[0];
2735
+
2736
+ const int32_t KH = is_2D ? op->src[0]->ne[1] : 1;
2737
+ const int32_t KW = op->src[0]->ne[0];
2738
+
2739
+ const int32_t OH = is_2D ? op->ne[2] : 1;
2740
+ const int32_t OW = op->ne[1];
2741
+
2742
+ const int32_t CHW = IC * KH * KW;
2743
+
2744
+ const uint64_t ofs0 = op->src[1]->nb[is_2D ? 3 : 2] / 4;
2745
+ const uint64_t ofs1 = op->src[1]->nb[is_2D ? 2 : 1] / 4;
2746
+
2747
+ ggml_metal_kargs_im2col args = {
2748
+ /*.ofs0 =*/ ofs0,
2749
+ /*.ofs1 =*/ ofs1,
2750
+ /*.IW =*/ IW,
2751
+ /*.IH =*/ IH,
2752
+ /*.CHW =*/ CHW,
2753
+ /*.s0 =*/ s0,
2754
+ /*.s1 =*/ s1,
2755
+ /*.p0 =*/ p0,
2756
+ /*.p1 =*/ p1,
2757
+ /*.d0 =*/ d0,
2758
+ /*.d1 =*/ d1,
2759
+ /*.N =*/ N,
2760
+ /*.KH =*/ KH,
2761
+ /*.KW =*/ KW,
2762
+ /*.KHW =*/ KH * KW,
2763
+ };
2764
+
2765
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_im2col(lib, op);
2766
+
2767
+ GGML_ASSERT(KH*KW <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
2768
+
2769
+ const uint64_t ntptg0 = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)/(KH*KW), N);
2770
+
2771
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
2772
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
2773
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 1);
2774
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
2775
+
2776
+ ggml_metal_encoder_dispatch_threadgroups(enc, IC, OH, OW, ntptg0, KH, KW);
2777
+
2778
+ return 1;
2779
+ }
2780
+
2781
+ int ggml_metal_op_conv_transpose_1d(ggml_metal_op_t ctx, int idx) {
2782
+ ggml_tensor * op = ctx->node(idx);
2783
+
2784
+ ggml_metal_library_t lib = ctx->lib;
2785
+ ggml_metal_encoder_t enc = ctx->enc;
2786
+
2787
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
2788
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
2789
+ GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
2790
+ GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
2791
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
2792
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
2793
+
2794
+ const int32_t s0 = ((const int32_t *)(op->op_params))[0];
2795
+
2796
+ const int32_t IC = op->src[1]->ne[1];
2797
+ const int32_t IL = op->src[1]->ne[0];
2798
+
2799
+ const int32_t K = op->src[0]->ne[0];
2800
+
2801
+ const int32_t OL = op->ne[0];
2802
+ const int32_t OC = op->ne[1];
2803
+
2804
+ ggml_metal_kargs_conv_transpose_1d args = {
2805
+ /*.IC =*/ IC,
2806
+ /*.IL =*/ IL,
2807
+ /*.K =*/ K,
2808
+ /*.s0 =*/ s0,
2809
+ /*.nb0 =*/ nb0,
2810
+ /*.nb1 =*/ nb1,
2811
+ };
2812
+
2813
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_conv_transpose_1d(lib, op);
2814
+
2815
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
2816
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
2817
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
2818
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
2819
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
2820
+
2821
+ ggml_metal_encoder_dispatch_threadgroups(enc, OL, OC, 1, 1, 1, 1);
2822
+
2823
+ return 1;
2824
+ }
2825
+
2826
+ int ggml_metal_op_upscale(ggml_metal_op_t ctx, int idx) {
2827
+ ggml_tensor * op = ctx->node(idx);
2828
+
2829
+ ggml_metal_library_t lib = ctx->lib;
2830
+ ggml_metal_encoder_t enc = ctx->enc;
2831
+
2832
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
2833
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
2834
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
2835
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
2836
+
2837
+ const float sf0 = (float)ne0/op->src[0]->ne[0];
2838
+ const float sf1 = (float)ne1/op->src[0]->ne[1];
2839
+ const float sf2 = (float)ne2/op->src[0]->ne[2];
2840
+ const float sf3 = (float)ne3/op->src[0]->ne[3];
2841
+
2842
+ ggml_metal_kargs_upscale args = {
2843
+ /*.ne00 =*/ ne00,
2844
+ /*.ne01 =*/ ne01,
2845
+ /*.ne02 =*/ ne02,
2846
+ /*.ne03 =*/ ne03,
2847
+ /*.nb00 =*/ nb00,
2848
+ /*.nb01 =*/ nb01,
2849
+ /*.nb02 =*/ nb02,
2850
+ /*.nb03 =*/ nb03,
2851
+ /*.ne0 =*/ ne0,
2852
+ /*.ne1 =*/ ne1,
2853
+ /*.ne2 =*/ ne2,
2854
+ /*.ne3 =*/ ne3,
2855
+ /*.nb0 =*/ nb0,
2856
+ /*.nb1 =*/ nb1,
2857
+ /*.nb2 =*/ nb2,
2858
+ /*.nb3 =*/ nb3,
2859
+ /*.sf0 =*/ sf0,
2860
+ /*.sf1 =*/ sf1,
2861
+ /*.sf2 =*/ sf2,
2862
+ /*.sf3 =*/ sf3
2863
+ };
2864
+
2865
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_upscale(lib, op);
2866
+
2867
+ const int nth = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne0);
2868
+
2869
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
2870
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
2871
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
2872
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
2873
+
2874
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
2875
+
2876
+ return 1;
2877
+ }
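
The sf0..sf3 factors are per-dimension dst/src extent ratios. For the nearest mode (ggml upscale also supports bilinear), the mapping is simply i_src = i_dst / sf; a 1-D sketch:

```cpp
#include <vector>

// nearest-neighbor upscale along one dimension: every dst coordinate maps
// back to the source via i_src = i_dst / sf (sf matches sf0 above)
void upscale_nearest_1d(const std::vector<float> & src,
                        std::vector<float> & dst) {
    const float sf = (float) dst.size() / src.size();
    for (size_t i = 0; i < dst.size(); ++i) {
        dst[i] = src[(size_t)(i/sf)];
    }
}
```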
2878
+
2879
+ int ggml_metal_op_pad(ggml_metal_op_t ctx, int idx) {
2880
+ ggml_tensor * op = ctx->node(idx);
2881
+
2882
+ ggml_metal_library_t lib = ctx->lib;
2883
+ ggml_metal_encoder_t enc = ctx->enc;
2884
+
2885
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
2886
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
2887
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
2888
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
2889
+
2890
+ ggml_metal_kargs_pad args = {
2891
+ /*.ne00 =*/ ne00,
2892
+ /*.ne01 =*/ ne01,
2893
+ /*.ne02 =*/ ne02,
2894
+ /*.ne03 =*/ ne03,
2895
+ /*.nb00 =*/ nb00,
2896
+ /*.nb01 =*/ nb01,
2897
+ /*.nb02 =*/ nb02,
2898
+ /*.nb03 =*/ nb03,
2899
+ /*.ne0 =*/ ne0,
2900
+ /*.ne1 =*/ ne1,
2901
+ /*.ne2 =*/ ne2,
2902
+ /*.ne3 =*/ ne3,
2903
+ /*.nb0 =*/ nb0,
2904
+ /*.nb1 =*/ nb1,
2905
+ /*.nb2 =*/ nb2,
2906
+ /*.nb3 =*/ nb3
2907
+ };
2908
+
2909
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_pad(lib, op);
2910
+
2911
+ const int nth = std::min(1024, ne0);
2912
+
2913
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
2914
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
2915
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
2916
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
2917
+
2918
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
2919
+
2920
+ return 1;
2921
+ }
2922
+
2923
+ int ggml_metal_op_pad_reflect_1d(ggml_metal_op_t ctx, int idx) {
2924
+ ggml_tensor * op = ctx->node(idx);
2925
+
2926
+ ggml_metal_library_t lib = ctx->lib;
2927
+ ggml_metal_encoder_t enc = ctx->enc;
2928
+
2929
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
2930
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
2931
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
2932
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
2933
+
2934
+ ggml_metal_kargs_pad_reflect_1d args = {
2935
+ /*.ne00 =*/ ne00,
2936
+ /*.ne01 =*/ ne01,
2937
+ /*.ne02 =*/ ne02,
2938
+ /*.ne03 =*/ ne03,
2939
+ /*.nb00 =*/ nb00,
2940
+ /*.nb01 =*/ nb01,
2941
+ /*.nb02 =*/ nb02,
2942
+ /*.nb03 =*/ nb03,
2943
+ /*.ne0 =*/ ne0,
2944
+ /*.ne1 =*/ ne1,
2945
+ /*.ne2 =*/ ne2,
2946
+ /*.ne3 =*/ ne3,
2947
+ /*.nb0 =*/ nb0,
2948
+ /*.nb1 =*/ nb1,
2949
+ /*.nb2 =*/ nb2,
2950
+ /*.nb3 =*/ nb3,
2951
+ /*.p0 =*/ ((const int32_t *)(op->op_params))[0],
2952
+ /*.p1 =*/ ((const int32_t *)(op->op_params))[1]
2953
+ };
2954
+
2955
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_pad_reflect_1d(lib, op);
2956
+
2957
+ const int nth = std::min(1024, ne0);
2958
+
2959
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
2960
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
2961
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
2962
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
2963
+
2964
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
2965
+
2966
+ return 1;
2967
+ }
2968
+
2969
+ int ggml_metal_op_arange(ggml_metal_op_t ctx, int idx) {
2970
+ ggml_tensor * op = ctx->node(idx);
2971
+
2972
+ ggml_metal_library_t lib = ctx->lib;
2973
+ ggml_metal_encoder_t enc = ctx->enc;
2974
+
2975
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
2976
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
2977
+
2978
+ float start;
2979
+ float step;
2980
+
2981
+ memcpy(&start, ((const int32_t *) op->op_params) + 0, sizeof(float));
2982
+ memcpy(&step, ((const int32_t *) op->op_params) + 2, sizeof(float));
2983
+
2984
+ ggml_metal_kargs_arange args = {
2985
+ /*.ne0 =*/ ne0,
2986
+ /*.start =*/ start,
2987
+ /*.step =*/ step
2988
+ };
2989
+
2990
+ const int nth = std::min(1024, ne0);
2991
+
2992
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_arange(lib, op);
2993
+
2994
+ //[encoder setComputePipelineState:pipeline];
2995
+ //[encoder setBuffer:id_dst offset:offs_dst atIndex:0];
2996
+ //[encoder setBytes:&args length:sizeof(args) atIndex:1];
2997
+
2998
+ //[encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
2999
+
3000
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
3001
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
3002
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 1);
3003
+
3004
+ ggml_metal_encoder_dispatch_threadgroups(enc, 1, 1, 1, nth, 1, 1);
3005
+
3006
+ return 1;
3007
+ }
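
The kernel reads start from op_params word 0 and step from word 2; word 1 holds the stop value of ggml_arange, which the kernel does not need since ne0 already encodes the element count. A CPU reference:

```cpp
#include <vector>

// CPU reference for ARANGE: one row of ne0 values, start + i*step
std::vector<float> arange_ref(int ne0, float start, float step) {
    std::vector<float> out(ne0);
    for (int i = 0; i < ne0; ++i) out[i] = start + i*step;
    return out;
}
```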
3008
+
3009
+ int ggml_metal_op_timestep_embedding(ggml_metal_op_t ctx, int idx) {
3010
+ ggml_tensor * op = ctx->node(idx);
3011
+
3012
+ ggml_metal_library_t lib = ctx->lib;
3013
+ ggml_metal_encoder_t enc = ctx->enc;
3014
+
3015
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
3016
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
3017
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
3018
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
3019
+
3020
+ const int dim = op->op_params[0];
3021
+ const int max_period = op->op_params[1];
3022
+
3023
+ ggml_metal_kargs_timestep_embedding args = {
3024
+ /*.nb1 =*/ nb1,
3025
+ /*.dim =*/ dim,
3026
+ /*.max_period =*/ max_period,
3027
+ };
3028
+
3029
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_timestep_embedding(lib, op);
3030
+
3031
+ const int nth = std::max(1, std::min(1024, dim/2));
3032
+
3033
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
3034
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
3035
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
3036
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
3037
+
3038
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne00, 1, 1, nth, 1, 1);
3039
+
3040
+ return 1;
3041
+ }
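
The kernel builds the usual sinusoidal timestep embedding over dim/2 frequency pairs with the given max_period. A CPU sketch following the common diffusion-model convention (the exact cos/sin ordering inside the Metal kernel is an assumption here):

```cpp
#include <cmath>
#include <vector>

// sinusoidal timestep embedding: freq_i = exp(-log(max_period) * i / half),
// cos terms in the first half, sin terms in the second (assumed ordering);
// an odd dim leaves the final element zero
std::vector<float> timestep_embedding_ref(float t, int dim, int max_period) {
    const int half = dim/2;
    std::vector<float> out(dim, 0.0f);
    for (int i = 0; i < half; ++i) {
        const float freq = std::exp(-std::log((float) max_period)*i/half);
        out[i]        = std::cos(t*freq);
        out[half + i] = std::sin(t*freq);
    }
    return out;
}
```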
3042
+
3043
+ int ggml_metal_op_argmax(ggml_metal_op_t ctx, int idx) {
3044
+ ggml_tensor * op = ctx->node(idx);
3045
+
3046
+ ggml_metal_library_t lib = ctx->lib;
3047
+ ggml_metal_encoder_t enc = ctx->enc;
3048
+
3049
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
3050
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
3051
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
3052
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
3053
+
3054
+ ggml_metal_kargs_argmax args = {
3055
+ /*.ne00 = */ ne00,
3056
+ /*.nb01 = */ nb01,
3057
+ };
3058
+
3059
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_argmax(lib, op);
3060
+
3061
+ const int64_t nrows = ggml_nrows(op->src[0]);
3062
+
3063
+ int nth = 32; // SIMD width
3064
+ while (nth < ne00 && nth*ne01*ne02*ne03 < 256) {
3065
+ nth *= 2;
3066
+ }
3067
+
3068
+ const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
3069
+
3070
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
3071
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
3072
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
3073
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
3074
+
3075
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
3076
+
3077
+ ggml_metal_encoder_dispatch_threadgroups(enc, nrows, 1, 1, nth, 1, 1);
3078
+
3079
+ return 1;
3080
+ }
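
ARGMAX reduces each ne00-wide row to the index of its maximum; nth grows past the SIMD width only while the total thread count stays small. A trivial CPU reference for one row:

```cpp
// CPU reference for row-wise ARGMAX: index of the maximum of an ne00-row
int argmax_row_ref(const float * row, int ne00) {
    int   best_i = 0;
    float best_v = row[0];
    for (int i = 1; i < ne00; ++i) {
        if (row[i] > best_v) { best_v = row[i]; best_i = i; }
    }
    return best_i;
}
```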
3081
+
3082
+ int ggml_metal_op_argsort(ggml_metal_op_t ctx, int idx) {
3083
+ ggml_tensor * op = ctx->node(idx);
3084
+
3085
+ ggml_metal_library_t lib = ctx->lib;
3086
+ ggml_metal_encoder_t enc = ctx->enc;
3087
+
3088
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
3089
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
3090
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
3091
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
3092
+
3093
+ // bitonic sort requires the number of elements to be a power of 2
3094
+ int64_t ne00_padded = 1;
3095
+ while (ne00_padded < ne00) {
3096
+ ne00_padded *= 2;
3097
+ }
3098
+
3099
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_argsort(lib, op);
3100
+
3101
+ const int64_t nrows = ggml_nrows(op->src[0]);
3102
+
3103
+ // Metal kernels require the buffer size to be a multiple of 16 bytes
3104
+ // https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/1443142-setthreadgroupmemorylength
3105
+ const size_t smem = GGML_PAD(ne00_padded*sizeof(int32_t), 16);
3106
+
3107
+ ggml_metal_kargs_argsort args = {
3108
+ /*.ncols =*/ ne00,
3109
+ /*.ncols_pad =*/ ne00_padded
3110
+ };
3111
+
3112
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
3113
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
3114
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
3115
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
3116
+
3117
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
3118
+
3119
+ ggml_metal_encoder_dispatch_threadgroups(enc, 1, nrows, 1, ne00_padded, 1, 1);
3120
+
3121
+ return 1;
3122
+ }
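
A small sketch of the power-of-2 padding used above; the extra slots beyond ne00 are handled inside the kernel, so only the padded count changes:

```cpp
#include <cstdint>

// bitonic sort needs a power-of-2 element count; this mirrors the padding
// loop above (a branch-free alternative would use bit tricks)
int64_t next_pow2(int64_t n) {
    int64_t p = 1;
    while (p < n) p *= 2;
    return p;
}
// e.g. next_pow2(1000) == 1024: a 1000-column row sorts in a 1024-slot
// shared-memory buffer, which is also what sizes smem above
```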
3123
+
3124
+ int ggml_metal_op_leaky_relu(ggml_metal_op_t ctx, int idx) {
3125
+ ggml_tensor * op = ctx->node(idx);
3126
+
3127
+ ggml_metal_library_t lib = ctx->lib;
3128
+ ggml_metal_encoder_t enc = ctx->enc;
3129
+
3130
+ GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
3131
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
3132
+ GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
3133
+ GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
3134
+
3135
+ float slope;
3136
+ memcpy(&slope, op->op_params, sizeof(float));
3137
+
3138
+ ggml_metal_kargs_leaky_relu args = {
3139
+ /*.slope =*/ slope
3140
+ };
3141
+
3142
+ ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
3143
+
3144
+ int64_t n = ggml_nelements(op);
3145
+
3146
+ if (n % 4 == 0) {
3147
+ n /= 4;
3148
+ }
3149
+
3150
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
3151
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
3152
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
3153
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
3154
+
3155
+ ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1);
3156
+
3157
+ return 1;
3158
+ }
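
LEAKY_RELU is elementwise, which is why the launch can treat the tensor as a flat array and process four elements per thread whenever the count divides by 4. A CPU reference with the slope read from op_params:

```cpp
// CPU reference for LEAKY_RELU: y = x for x > 0, y = slope*x otherwise
static inline float leaky_relu_ref(float x, float slope) {
    return x > 0.0f ? x : slope*x;
}
```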