whispercpp 1.3.4 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (891)
  1. checksums.yaml +4 -4
  2. data/LICENSE +1 -1
  3. data/README.md +158 -44
  4. data/ext/extconf.rb +3 -2
  5. data/ext/ruby_whisper.c +34 -6
  6. data/ext/ruby_whisper.h +67 -0
  7. data/ext/ruby_whisper_context.c +236 -144
  8. data/ext/ruby_whisper_context_params.c +163 -0
  9. data/ext/ruby_whisper_model.c +12 -13
  10. data/ext/ruby_whisper_params.c +47 -24
  11. data/ext/ruby_whisper_segment.c +84 -20
  12. data/ext/ruby_whisper_token.c +371 -0
  13. data/ext/ruby_whisper_transcribe.cpp +5 -2
  14. data/ext/ruby_whisper_vad_context.c +122 -0
  15. data/ext/ruby_whisper_vad_context_detect.cpp +51 -0
  16. data/ext/ruby_whisper_vad_params.c +0 -1
  17. data/ext/ruby_whisper_vad_segment.c +138 -0
  18. data/ext/ruby_whisper_vad_segments.c +105 -0
  19. data/ext/sources/CMakeLists.txt +4 -1
  20. data/ext/sources/bindings/javascript/package.json +1 -1
  21. data/ext/sources/cmake/arm64-apple-clang.cmake +16 -0
  22. data/ext/sources/cmake/arm64-windows-llvm.cmake +16 -0
  23. data/ext/sources/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
  24. data/ext/sources/cmake/whisper-config.cmake.in +5 -40
  25. data/ext/sources/cmake/x64-windows-llvm.cmake +5 -0
  26. data/ext/sources/examples/addon.node/vad-example.js +2 -2
  27. data/ext/sources/examples/bench/bench.cpp +23 -18
  28. data/ext/sources/examples/cli/cli.cpp +129 -112
  29. data/ext/sources/examples/common-ggml.cpp +2 -0
  30. data/ext/sources/examples/lsp/CMakeLists.txt +2 -1
  31. data/ext/sources/examples/miniaudio.h +4507 -2131
  32. data/ext/sources/examples/quantize/CMakeLists.txt +2 -1
  33. data/ext/sources/examples/server/server.cpp +28 -15
  34. data/ext/sources/examples/talk-llama/CMakeLists.txt +8 -3
  35. data/ext/sources/examples/talk-llama/llama-adapter.cpp +5 -2
  36. data/ext/sources/examples/talk-llama/llama-adapter.h +7 -0
  37. data/ext/sources/examples/talk-llama/llama-arch.cpp +2378 -1988
  38. data/ext/sources/examples/talk-llama/llama-arch.h +109 -2
  39. data/ext/sources/examples/talk-llama/llama-batch.cpp +78 -34
  40. data/ext/sources/examples/talk-llama/llama-batch.h +17 -4
  41. data/ext/sources/examples/talk-llama/llama-chat.cpp +100 -4
  42. data/ext/sources/examples/talk-llama/llama-chat.h +5 -0
  43. data/ext/sources/examples/talk-llama/llama-context.cpp +1088 -403
  44. data/ext/sources/examples/talk-llama/llama-context.h +70 -23
  45. data/ext/sources/examples/talk-llama/llama-cparams.h +6 -0
  46. data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
  47. data/ext/sources/examples/talk-llama/llama-grammar.cpp +295 -60
  48. data/ext/sources/examples/talk-llama/llama-grammar.h +22 -1
  49. data/ext/sources/examples/talk-llama/llama-graph.cpp +925 -155
  50. data/ext/sources/examples/talk-llama/llama-graph.h +234 -23
  51. data/ext/sources/examples/talk-llama/llama-hparams.cpp +79 -38
  52. data/ext/sources/examples/talk-llama/llama-hparams.h +118 -18
  53. data/ext/sources/examples/talk-llama/llama-impl.cpp +11 -7
  54. data/ext/sources/examples/talk-llama/llama-impl.h +14 -2
  55. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +8 -4
  56. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +405 -140
  57. data/ext/sources/examples/talk-llama/llama-kv-cache.h +24 -10
  58. data/ext/sources/examples/talk-llama/llama-kv-cells.h +44 -2
  59. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
  60. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
  61. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +12 -10
  62. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +42 -31
  63. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +2 -2
  64. data/ext/sources/examples/talk-llama/llama-mmap.cpp +197 -45
  65. data/ext/sources/examples/talk-llama/llama-mmap.h +8 -3
  66. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +606 -116
  67. data/ext/sources/examples/talk-llama/llama-model-loader.h +41 -5
  68. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +61 -44
  69. data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
  70. data/ext/sources/examples/talk-llama/llama-model.cpp +2756 -13643
  71. data/ext/sources/examples/talk-llama/llama-model.h +112 -18
  72. data/ext/sources/examples/talk-llama/llama-quant.cpp +582 -365
  73. data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +1409 -199
  74. data/ext/sources/examples/talk-llama/llama-sampler.h +42 -0
  75. data/ext/sources/examples/talk-llama/llama-vocab.cpp +248 -82
  76. data/ext/sources/examples/talk-llama/llama-vocab.h +50 -40
  77. data/ext/sources/examples/talk-llama/llama.cpp +802 -21
  78. data/ext/sources/examples/talk-llama/llama.h +210 -39
  79. data/ext/sources/examples/talk-llama/models/afmoe.cpp +190 -0
  80. data/ext/sources/examples/talk-llama/models/apertus.cpp +125 -0
  81. data/ext/sources/examples/talk-llama/models/arcee.cpp +135 -0
  82. data/ext/sources/examples/talk-llama/models/arctic.cpp +137 -0
  83. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +86 -0
  84. data/ext/sources/examples/talk-llama/models/baichuan.cpp +123 -0
  85. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +143 -0
  86. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +133 -0
  87. data/ext/sources/examples/talk-llama/models/bert.cpp +184 -0
  88. data/ext/sources/examples/talk-llama/models/bitnet.cpp +145 -0
  89. data/ext/sources/examples/talk-llama/models/bloom.cpp +101 -0
  90. data/ext/sources/examples/talk-llama/models/chameleon.cpp +178 -0
  91. data/ext/sources/examples/talk-llama/models/chatglm.cpp +132 -0
  92. data/ext/sources/examples/talk-llama/models/codeshell.cpp +111 -0
  93. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +102 -0
  94. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +134 -0
  95. data/ext/sources/examples/talk-llama/models/command-r.cpp +122 -0
  96. data/ext/sources/examples/talk-llama/models/dbrx.cpp +122 -0
  97. data/ext/sources/examples/talk-llama/models/deci.cpp +135 -0
  98. data/ext/sources/examples/talk-llama/models/deepseek.cpp +142 -0
  99. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +262 -0
  100. data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
  101. data/ext/sources/examples/talk-llama/models/dots1.cpp +132 -0
  102. data/ext/sources/examples/talk-llama/models/dream.cpp +105 -0
  103. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +148 -0
  104. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +110 -0
  105. data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
  106. data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
  107. data/ext/sources/examples/talk-llama/models/exaone.cpp +114 -0
  108. data/ext/sources/examples/talk-llama/models/exaone4.cpp +123 -0
  109. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +111 -0
  110. data/ext/sources/examples/talk-llama/models/falcon.cpp +120 -0
  111. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +116 -0
  112. data/ext/sources/examples/talk-llama/models/gemma.cpp +112 -0
  113. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +128 -0
  114. data/ext/sources/examples/talk-llama/models/gemma3.cpp +155 -0
  115. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +384 -0
  116. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +170 -0
  117. data/ext/sources/examples/talk-llama/models/glm4.cpp +157 -0
  118. data/ext/sources/examples/talk-llama/models/gpt2.cpp +105 -0
  119. data/ext/sources/examples/talk-llama/models/gptneox.cpp +144 -0
  120. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +195 -0
  121. data/ext/sources/examples/talk-llama/models/granite.cpp +210 -0
  122. data/ext/sources/examples/talk-llama/models/grok.cpp +159 -0
  123. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +139 -0
  124. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +132 -0
  125. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +153 -0
  126. data/ext/sources/examples/talk-llama/models/internlm2.cpp +120 -0
  127. data/ext/sources/examples/talk-llama/models/jais.cpp +86 -0
  128. data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
  129. data/ext/sources/examples/talk-llama/models/jamba.cpp +106 -0
  130. data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
  131. data/ext/sources/examples/talk-llama/models/lfm2.cpp +196 -0
  132. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +122 -0
  133. data/ext/sources/examples/talk-llama/models/llada.cpp +99 -0
  134. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +178 -0
  135. data/ext/sources/examples/talk-llama/models/llama.cpp +175 -0
  136. data/ext/sources/examples/talk-llama/models/maincoder.cpp +117 -0
  137. data/ext/sources/examples/talk-llama/models/mamba-base.cpp +289 -0
  138. data/ext/sources/examples/talk-llama/models/mamba.cpp +54 -0
  139. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +129 -0
  140. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +200 -0
  141. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +123 -0
  142. data/ext/sources/examples/talk-llama/models/mistral3.cpp +160 -0
  143. data/ext/sources/examples/talk-llama/models/models.h +704 -0
  144. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +109 -0
  145. data/ext/sources/examples/talk-llama/models/mpt.cpp +126 -0
  146. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +162 -0
  147. data/ext/sources/examples/talk-llama/models/nemotron.cpp +122 -0
  148. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +104 -0
  149. data/ext/sources/examples/talk-llama/models/olmo.cpp +121 -0
  150. data/ext/sources/examples/talk-llama/models/olmo2.cpp +150 -0
  151. data/ext/sources/examples/talk-llama/models/olmoe.cpp +124 -0
  152. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +127 -0
  153. data/ext/sources/examples/talk-llama/models/openelm.cpp +124 -0
  154. data/ext/sources/examples/talk-llama/models/orion.cpp +123 -0
  155. data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
  156. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +121 -0
  157. data/ext/sources/examples/talk-llama/models/phi2.cpp +121 -0
  158. data/ext/sources/examples/talk-llama/models/phi3.cpp +152 -0
  159. data/ext/sources/examples/talk-llama/models/plamo.cpp +110 -0
  160. data/ext/sources/examples/talk-llama/models/plamo2.cpp +320 -0
  161. data/ext/sources/examples/talk-llama/models/plamo3.cpp +128 -0
  162. data/ext/sources/examples/talk-llama/models/plm.cpp +169 -0
  163. data/ext/sources/examples/talk-llama/models/qwen.cpp +108 -0
  164. data/ext/sources/examples/talk-llama/models/qwen2.cpp +126 -0
  165. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +151 -0
  166. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +117 -0
  167. data/ext/sources/examples/talk-llama/models/qwen3.cpp +120 -0
  168. data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
  169. data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
  170. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +131 -0
  171. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +525 -0
  172. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +140 -0
  173. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +132 -0
  174. data/ext/sources/examples/talk-llama/models/refact.cpp +94 -0
  175. data/ext/sources/examples/talk-llama/models/rnd1.cpp +126 -0
  176. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +164 -0
  177. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +94 -0
  178. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +86 -0
  179. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +137 -0
  180. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +90 -0
  181. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +124 -0
  182. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +126 -0
  183. data/ext/sources/examples/talk-llama/models/smollm3.cpp +128 -0
  184. data/ext/sources/examples/talk-llama/models/stablelm.cpp +146 -0
  185. data/ext/sources/examples/talk-llama/models/starcoder.cpp +100 -0
  186. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +121 -0
  187. data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
  188. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +166 -0
  189. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +96 -0
  190. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +149 -0
  191. data/ext/sources/examples/talk-llama/models/xverse.cpp +108 -0
  192. data/ext/sources/examples/talk-llama/unicode.cpp +121 -79
  193. data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +1 -1
  194. data/ext/sources/examples/whisper.wasm/index-tmpl.html +1 -1
  195. data/ext/sources/ggml/CMakeLists.txt +90 -56
  196. data/ext/sources/ggml/include/ggml-alloc.h +9 -0
  197. data/ext/sources/ggml/include/ggml-backend.h +5 -2
  198. data/ext/sources/ggml/include/ggml-cann.h +1 -1
  199. data/ext/sources/ggml/include/ggml-cpu.h +6 -0
  200. data/ext/sources/ggml/include/ggml-hexagon.h +19 -0
  201. data/ext/sources/ggml/include/ggml-openvino.h +37 -0
  202. data/ext/sources/ggml/include/ggml-opt.h +1 -1
  203. data/ext/sources/ggml/include/ggml-rpc.h +14 -12
  204. data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
  205. data/ext/sources/ggml/include/ggml-zendnn.h +22 -0
  206. data/ext/sources/ggml/include/ggml.h +246 -21
  207. data/ext/sources/ggml/src/CMakeLists.txt +85 -11
  208. data/ext/sources/ggml/src/ggml-alloc.c +128 -50
  209. data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
  210. data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
  211. data/ext/sources/ggml/src/ggml-backend-impl.h +1 -4
  212. data/ext/sources/ggml/src/ggml-backend-reg.cpp +54 -88
  213. data/ext/sources/ggml/src/ggml-backend.cpp +76 -23
  214. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +18 -4
  215. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +11 -11
  216. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +58 -46
  217. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +139 -48
  218. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +2427 -1785
  219. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +238 -362
  220. data/ext/sources/ggml/src/ggml-cann/common.h +285 -211
  221. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +663 -831
  222. data/ext/sources/ggml/src/ggml-common.h +11 -0
  223. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +170 -95
  224. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -18
  225. data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
  226. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
  227. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
  228. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +513 -27
  229. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +4192 -992
  230. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
  231. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  232. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1761 -49
  233. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
  234. data/ext/sources/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  235. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
  236. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
  237. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +124 -24
  238. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +157 -28
  239. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
  240. data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
  241. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -3
  242. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +251 -80
  243. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +19 -0
  244. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +587 -119
  245. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +33 -44
  246. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1093 -194
  247. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1284 -203
  248. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
  249. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1519 -527
  250. data/ext/sources/ggml/src/ggml-cpu/ops.h +6 -4
  251. data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
  252. data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
  253. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +3632 -781
  254. data/ext/sources/ggml/src/ggml-cpu/repack.h +129 -4
  255. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  256. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +152 -46
  257. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +3 -2
  258. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +152 -1
  259. data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +7 -0
  260. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +140 -0
  261. data/ext/sources/ggml/src/ggml-cpu/vec.h +261 -146
  262. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +72 -1
  263. data/ext/sources/ggml/src/ggml-cuda/argmax.cu +2 -2
  264. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +132 -6
  265. data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +16 -0
  266. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +33 -31
  267. data/ext/sources/ggml/src/ggml-cuda/common.cuh +474 -85
  268. data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
  269. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
  270. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
  271. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +342 -246
  272. data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +1 -5
  273. data/ext/sources/ggml/src/ggml-cuda/cumsum.cu +307 -0
  274. data/ext/sources/ggml/src/ggml-cuda/cumsum.cuh +5 -0
  275. data/ext/sources/ggml/src/ggml-cuda/diag.cu +77 -0
  276. data/ext/sources/ggml/src/ggml-cuda/diag.cuh +5 -0
  277. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +98 -74
  278. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +973 -665
  279. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +35 -741
  280. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +1255 -0
  281. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +33 -40
  282. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +40 -18
  283. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
  284. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +206 -45
  285. data/ext/sources/ggml/src/ggml-cuda/fill.cu +37 -0
  286. data/ext/sources/ggml/src/ggml-cuda/fill.cuh +3 -0
  287. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
  288. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  289. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1688 -302
  290. data/ext/sources/ggml/src/ggml-cuda/mean.cu +12 -10
  291. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +908 -48
  292. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +88 -20
  293. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +502 -90
  294. data/ext/sources/ggml/src/ggml-cuda/mmid.cu +164 -0
  295. data/ext/sources/ggml/src/ggml-cuda/mmid.cuh +5 -0
  296. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +69 -176
  297. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +532 -193
  298. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +460 -104
  299. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +5 -2
  300. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +360 -122
  301. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +2 -1
  302. data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
  303. data/ext/sources/ggml/src/ggml-cuda/pad.cu +73 -39
  304. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +152 -1
  305. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +14 -0
  306. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
  307. data/ext/sources/ggml/src/ggml-cuda/rope.cu +364 -149
  308. data/ext/sources/ggml/src/ggml-cuda/rope.cuh +2 -0
  309. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +101 -47
  310. data/ext/sources/ggml/src/ggml-cuda/set.cu +39 -0
  311. data/ext/sources/ggml/src/ggml-cuda/set.cuh +7 -0
  312. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +163 -41
  313. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +275 -0
  314. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
  315. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +68 -50
  316. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  317. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +49 -84
  318. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
  319. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
  320. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
  321. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
  322. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
  323. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
  324. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  325. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  326. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  327. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  328. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  329. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  330. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  331. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  332. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  333. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +22 -4
  334. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +95 -0
  335. data/ext/sources/ggml/src/ggml-cuda/top-k.cuh +3 -0
  336. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +275 -119
  337. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -7
  338. data/ext/sources/ggml/src/ggml-cuda/tri.cu +136 -0
  339. data/ext/sources/ggml/src/ggml-cuda/tri.cuh +5 -0
  340. data/ext/sources/ggml/src/ggml-cuda/unary.cu +160 -11
  341. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +38 -0
  342. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +163 -7
  343. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
  344. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  345. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +22 -1
  346. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +6 -0
  347. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
  348. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3325 -0
  349. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +46 -0
  350. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +813 -0
  351. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  352. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +891 -0
  353. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  354. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
  355. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +713 -0
  356. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +112 -0
  357. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  358. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
  359. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  360. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  361. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  362. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
  363. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +155 -0
  364. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +63 -0
  365. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  366. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  367. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
  368. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  369. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
  370. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  371. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  372. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  373. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  374. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  375. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  376. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  377. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  378. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  379. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +26 -0
  380. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +1199 -0
  381. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2670 -0
  382. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +497 -0
  383. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +168 -0
  384. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +419 -0
  385. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
  386. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  387. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +382 -0
  388. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  389. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  390. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  391. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  392. data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
  393. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  394. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +153 -0
  395. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +14 -13
  396. data/ext/sources/ggml/src/ggml-impl.h +129 -6
  397. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
  398. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +15 -4
  399. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
  400. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +173 -34
  401. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +912 -344
  402. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +124 -59
  403. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +588 -144
  404. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +396 -23
  405. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +1724 -421
  406. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +16 -3
  407. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +333 -114
  408. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +3050 -1539
  409. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +3 -1
  410. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +30 -1
  411. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4279 -497
  412. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
  413. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
  414. data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  415. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +267 -0
  416. data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  417. data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  418. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  419. data/ext/sources/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  420. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +4 -3
  421. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  422. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  423. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  424. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  425. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  426. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
  427. data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  428. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  429. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  430. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
  431. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
  432. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  433. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  434. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  435. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  436. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  437. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  438. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  439. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  440. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
  441. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  442. data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  443. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
  444. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
  445. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
  446. data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
  447. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
  448. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +35 -16
  449. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  450. data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  451. data/ext/sources/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  452. data/ext/sources/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  453. data/ext/sources/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  454. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
  455. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
  456. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +39 -0
  457. data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  458. data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
  459. data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  460. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
  461. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  462. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
  463. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  464. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
  465. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
  466. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  467. data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  468. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  469. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  470. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  471. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  472. data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  473. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  474. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  475. data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  476. data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  477. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  478. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  479. data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  480. data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  481. data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  482. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  483. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
  484. data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  485. data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  486. data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  487. data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  488. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  489. data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  490. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
  491. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
  492. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
  493. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
  494. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  495. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  496. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  497. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  498. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  499. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
  500. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  501. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
  502. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
  503. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
  504. data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
  505. data/ext/sources/ggml/src/ggml-quants.c +96 -5
  506. data/ext/sources/ggml/src/ggml-quants.h +3 -0
  507. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +438 -156
  508. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +59 -87
  509. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +81 -0
  510. data/ext/sources/ggml/src/ggml-sycl/add-id.hpp +8 -0
  511. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +7 -0
  512. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -29
  513. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +0 -6
  514. data/ext/sources/ggml/src/ggml-sycl/common.hpp +427 -20
  515. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +55 -44
  516. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +103 -1
  517. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
  518. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  519. data/ext/sources/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  520. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +0 -3
  521. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +18 -0
  522. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +867 -50
  523. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +401 -358
  524. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +12 -2
  525. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
  526. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
  527. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
  528. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
  529. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
  530. data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
  531. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
  532. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
  533. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +645 -155
  534. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +22 -0
  535. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +221 -66
  536. data/ext/sources/ggml/src/ggml-sycl/norm.hpp +2 -0
  537. data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
  538. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +97 -0
  539. data/ext/sources/ggml/src/ggml-sycl/pad.hpp +24 -0
  540. data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
  541. data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
  542. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +5 -0
  543. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
  544. data/ext/sources/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  545. data/ext/sources/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  546. data/ext/sources/ggml/src/ggml-sycl/roll.cpp +122 -0
  547. data/ext/sources/ggml/src/ggml-sycl/roll.hpp +20 -0
  548. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +457 -281
  549. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
  550. data/ext/sources/ggml/src/ggml-sycl/set.cpp +73 -0
  551. data/ext/sources/ggml/src/ggml-sycl/set.hpp +5 -0
  552. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +327 -162
  553. data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +4 -0
  554. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
  555. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  556. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  557. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  558. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  559. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  560. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  561. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  562. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  563. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  564. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  565. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
  566. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
  567. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
  568. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
  569. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
  570. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
  571. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
  572. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
  573. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
  574. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
  575. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
  576. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
  577. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
  578. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
  579. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
  580. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
  581. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
  582. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
  583. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
  584. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
  585. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
  586. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
  587. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
  588. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
  589. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
  590. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
  591. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
  592. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
  593. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
  594. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
  595. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
  596. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
  597. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
  598. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
  599. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
  600. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
  601. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +71 -0
  602. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
  603. data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  604. data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  605. data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  606. data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  607. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  608. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  609. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  610. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  611. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  612. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  613. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  614. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  615. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  616. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  617. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  618. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  619. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  620. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  621. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  622. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  623. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  624. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  625. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  626. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  627. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  628. data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  629. data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  630. data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  631. data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  632. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  633. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  634. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  635. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  636. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  637. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  638. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  639. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  640. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  641. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  642. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  643. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  644. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  645. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +39 -19
  646. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +5994 -3055
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +18 -10
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +47 -49
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
  663. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
  664. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
  665. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +4 -4
  666. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  667. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
  668. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
  669. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  670. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
  671. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  672. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  673. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
  674. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +9 -21
  675. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
  676. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
  677. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  678. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
  679. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  680. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
  681. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +1 -1
  682. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +1 -1
  683. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +1 -1
  684. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
  685. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
  686. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
  687. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
  688. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  689. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
  690. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
  691. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
  692. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
  693. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
  694. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
  695. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  696. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
  697. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
  698. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
  699. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
  700. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  701. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -3
  702. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  703. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +386 -160
  704. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +82 -20
  705. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +400 -174
  706. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +123 -37
  707. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  708. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +10 -9
  709. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  710. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
  711. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
  712. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
  713. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
  714. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
  715. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
  716. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
  717. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +17 -2
  718. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +2 -0
  719. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +7 -0
  720. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +4 -4
  721. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +3 -3
  722. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
  723. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
  724. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +2 -2
  725. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +2 -2
  726. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +19 -7
  727. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +2 -3
  728. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +13 -10
  729. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
  730. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
  731. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
  732. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +2 -2
  733. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mat_vec_base.comp → mul_mat_vec_base.glsl} +77 -29
  734. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
  735. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +71 -21
  736. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +41 -25
  737. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +2 -2
  738. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +44 -26
  739. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +2 -2
  740. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +2 -2
  741. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +2 -2
  742. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
  743. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
  744. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +4 -6
  745. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +2 -2
  746. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +4 -6
  747. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +4 -6
  748. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +2 -2
  749. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +39 -36
  750. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +494 -0
  751. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +88 -105
  752. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +41 -26
  753. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mm_funcs.comp → mul_mm_funcs.glsl} +69 -59
  754. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +74 -0
  755. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +92 -230
  756. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
  757. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
  758. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
  759. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  760. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
  761. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
  762. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
  763. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +21 -6
  764. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
  765. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +10 -10
  766. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
  767. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
  768. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
  769. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
  770. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +49 -4
  771. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
  772. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
  773. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
  774. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +207 -0
  775. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
  776. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +8 -49
  777. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +8 -32
  778. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +8 -32
  779. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +33 -0
  780. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +8 -38
  781. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  782. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
  783. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  784. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
  785. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
  786. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
  787. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
  788. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
  789. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +2 -2
  790. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
  791. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
  792. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
  793. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
  794. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  795. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
  796. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
  797. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
  798. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +50 -0
  799. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
  800. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  801. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
  802. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +2 -25
  803. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
  804. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
  805. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
  806. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
  807. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +1 -1
  808. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
  809. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
  810. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
  811. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +43 -0
  812. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  813. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +345 -26
  814. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +90 -12
  815. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +384 -180
  816. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  817. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +28 -2
  818. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1374 -0
  819. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +2544 -726
  820. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  821. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  822. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  823. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  824. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
  825. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
  826. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
  827. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
  828. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  829. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +73 -15
  830. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
  831. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
  832. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  833. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +72 -261
  834. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +766 -0
  835. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +147 -0
  836. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +196 -0
  837. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
  838. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  839. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  840. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +83 -17
  841. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  842. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  843. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
  844. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  845. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  846. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  847. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
  848. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +91 -0
  849. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
  850. data/ext/sources/ggml/src/ggml.c +590 -64
  851. data/ext/sources/ggml/src/gguf.cpp +229 -44
  852. data/ext/sources/include/whisper.h +1 -0
  853. data/ext/sources/src/CMakeLists.txt +3 -1
  854. data/ext/sources/src/whisper.cpp +106 -62
  855. data/ext/sources/tests/CMakeLists.txt +2 -2
  856. data/ext/sources/tests/test-vad-full.cpp +4 -2
  857. data/ext/sources/tests/test-vad.cpp +1 -1
  858. data/extsources.rb +1 -0
  859. data/lib/whisper/model/uri.rb +17 -18
  860. data/sig/whisper.rbs +162 -4
  861. data/test/test_context_params.rb +82 -0
  862. data/test/test_params.rb +16 -8
  863. data/test/test_segment.rb +0 -1
  864. data/test/test_token.rb +81 -0
  865. data/test/test_vad.rb +1 -1
  866. data/test/test_vad_context.rb +100 -0
  867. data/test/test_vad_segment.rb +19 -0
  868. data/test/test_vad_segments.rb +16 -0
  869. data/test/test_whisper.rb +27 -0
  870. data/whispercpp.gemspec +1 -1
  871. metadata +502 -37
  872. data/ext/sources/build-xcframework.sh +0 -571
  873. data/ext/sources/examples/talk-llama/llama-sampling.h +0 -32
  874. data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
  875. data/ext/sources/ggml/src/ggml-cann/Doxyfile +0 -2579
  876. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
  877. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
  878. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +0 -44
  879. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +0 -41
  880. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
  881. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
  882. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +0 -44
  883. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +0 -41
  884. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +0 -48
  885. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
  886. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
  887. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
  888. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
  889. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
  890. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
  891. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
@@ -1,12 +1,13 @@
1
1
  #import "ggml-metal-device.h"
2
2
 
3
3
  #import "ggml-impl.h"
4
- #import "ggml-threading.h"
5
4
 
6
5
  #include <Foundation/Foundation.h>
7
6
 
8
7
  #include <Metal/Metal.h>
9
8
 
9
+ #include <stdatomic.h>
10
+
10
11
  #ifndef TARGET_OS_VISION
11
12
  #define TARGET_OS_VISION 0
12
13
  #endif
@@ -19,8 +20,9 @@
19
20
  #define GGML_METAL_HAS_RESIDENCY_SETS 1
20
21
  #endif
21
22
 
22
- // overload of MTLGPUFamilyMetal3 (not available in some environments)
23
+ // overload of MTLGPUFamilyMetalX (not available in some environments)
23
24
  static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
25
+ static const NSInteger MTLGPUFamilyMetal4_GGML = 5002;
24
26
 
25
27
  #if !GGML_METAL_EMBED_LIBRARY
26
28
  // Here to assist with NSBundle Path Hack
@@ -69,14 +71,6 @@ void ggml_metal_cv_set_bool(ggml_metal_cv_t cv, bool value, int32_t idx) {
69
71
 
70
72
  struct ggml_metal_pipeline {
71
73
  id<MTLComputePipelineState> obj;
72
-
73
- // suggested dispatch sizes
74
- int nsg;
75
-
76
- int nr0;
77
- int nr1;
78
-
79
- size_t smem;
80
74
  };
81
75
 
82
76
  ggml_metal_pipeline_t ggml_metal_pipeline_init(void) {
@@ -84,10 +78,6 @@ ggml_metal_pipeline_t ggml_metal_pipeline_init(void) {
84
78
 
85
79
  *res = (struct ggml_metal_pipeline) {
86
80
  /*.obj =*/ nil,
87
- /*.nsg =*/ 0,
88
- /*.nr0 =*/ 0,
89
- /*.nr1 =*/ 0,
90
- /*.smem =*/ 0,
91
81
  };
92
82
 
93
83
  return res;
@@ -99,40 +89,8 @@ void ggml_metal_pipeline_free(ggml_metal_pipeline_t pipeline) {
99
89
  free(pipeline);
100
90
  }
101
91
 
102
- void ggml_metal_pipeline_set_nsg(ggml_metal_pipeline_t pipeline, int nsg) {
103
- pipeline->nsg = nsg;
104
- }
105
-
106
- int ggml_metal_pipeline_get_nsg(ggml_metal_pipeline_t pipeline) {
107
- return pipeline->nsg;
108
- }
109
-
110
- void ggml_metal_pipeline_set_nr0(ggml_metal_pipeline_t pipeline, int nr0) {
111
- pipeline->nr0 = nr0;
112
- }
113
-
114
- int ggml_metal_pipeline_get_nr0(ggml_metal_pipeline_t pipeline) {
115
- return pipeline->nr0;
116
- }
117
-
118
- void ggml_metal_pipeline_set_nr1(ggml_metal_pipeline_t pipeline, int nr1) {
119
- pipeline->nr1 = nr1;
120
- }
121
-
122
- int ggml_metal_pipeline_get_nr1(ggml_metal_pipeline_t pipeline) {
123
- return pipeline->nr1;
124
- }
125
-
126
- void ggml_metal_pipeline_set_smem(ggml_metal_pipeline_t pipeline, size_t smem) {
127
- pipeline->smem = smem;
128
- }
129
-
130
- size_t ggml_metal_pipeline_get_smem(ggml_metal_pipeline_t pipeline) {
131
- return pipeline->smem;
132
- }
133
-
134
- int ggml_metal_pipeline_max_theads_per_threadgroup(ggml_metal_pipeline_t pipeline) {
135
- return pipeline->obj.maxTotalThreadsPerThreadgroup;
92
+ int ggml_metal_pipeline_max_theads_per_threadgroup(struct ggml_metal_pipeline_with_params pipeline) {
93
+ return pipeline.pipeline->obj.maxTotalThreadsPerThreadgroup;
136
94
  }
137
95
 
138
96
  struct ggml_metal_library {
@@ -140,6 +98,8 @@ struct ggml_metal_library {
140
98
  id<MTLDevice> device;
141
99
 
142
100
  ggml_metal_pipelines_t pipelines; // cache of compiled pipelines
101
+
102
+ NSLock * lock;
143
103
  };
144
104
 
145
105
  ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
@@ -256,6 +216,10 @@ ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
256
216
  [prep setObject:@"1" forKey:@"GGML_METAL_HAS_BF16"];
257
217
  }
258
218
 
219
+ if (ggml_metal_device_get_props(dev)->has_tensor) {
220
+ [prep setObject:@"1" forKey:@"GGML_METAL_HAS_TENSOR"];
221
+ }
222
+
259
223
  #if GGML_METAL_EMBED_LIBRARY
260
224
  [prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
261
225
  #endif
@@ -286,9 +250,77 @@ ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
286
250
 
287
251
  ggml_metal_library_t res = calloc(1, sizeof(struct ggml_metal_library));
288
252
 
289
- res->obj = library;
290
- res->device = device;
253
+ res->obj = library;
254
+ res->device = device;
255
+ res->pipelines = ggml_metal_pipelines_init();
256
+ res->lock = [NSLock new];
257
+
258
+ return res;
259
+ }
260
+
261
+ ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev, const char * source, bool verbose) {
262
+ if (source == NULL) {
263
+ GGML_LOG_ERROR("%s: source is NULL\n", __func__);
264
+ return NULL;
265
+ }
266
+
267
+ id<MTLDevice> device = ggml_metal_device_get_obj(dev);
268
+ id<MTLLibrary> library = nil;
269
+ NSError * error = nil;
270
+
271
+ const int64_t t_start = ggml_time_us();
272
+
273
+ NSString * src = [[NSString alloc] initWithBytes:source
274
+ length:strlen(source)
275
+ encoding:NSUTF8StringEncoding];
276
+ if (!src) {
277
+ GGML_LOG_ERROR("%s: failed to create NSString from source\n", __func__);
278
+ return NULL;
279
+ }
280
+
281
+ @autoreleasepool {
282
+ NSMutableDictionary * prep = [NSMutableDictionary dictionary];
283
+
284
+ MTLCompileOptions * options = [MTLCompileOptions new];
285
+ options.preprocessorMacros = prep;
286
+
287
+ library = [device newLibraryWithSource:src options:options error:&error];
288
+ if (error) {
289
+ if (verbose) {
290
+ GGML_LOG_ERROR("%s: error compiling source: %s\n", __func__, [[error description] UTF8String]);
291
+ } else {
292
+ GGML_LOG_ERROR("%s: error compiling source\n", __func__);
293
+ }
294
+ library = nil;
295
+ }
296
+
297
+ [options release];
298
+ }
299
+
300
+ [src release];
301
+
302
+ if (!library) {
303
+ if (verbose) {
304
+ GGML_LOG_ERROR("%s: failed to create Metal library from source\n", __func__);
305
+ }
306
+
307
+ return NULL;
308
+ }
309
+
310
+ if (verbose) {
311
+ GGML_LOG_INFO("%s: compiled in %.3f sec\n", __func__, (ggml_time_us() - t_start) / 1e6);
312
+ }
313
+
314
+ ggml_metal_library_t res = calloc(1, sizeof(struct ggml_metal_library));
315
+ if (!res) {
316
+ GGML_LOG_ERROR("%s: calloc failed\n", __func__);
317
+ return NULL;
318
+ }
319
+
320
+ res->obj = library;
321
+ res->device = device;
291
322
  res->pipelines = ggml_metal_pipelines_init();
323
+ res->lock = [NSLock new];
292
324
 
293
325
  return res;
294
326
  }
@@ -304,26 +336,51 @@ void ggml_metal_library_free(ggml_metal_library_t lib) {
304
336
 
305
337
  ggml_metal_pipelines_free(lib->pipelines);
306
338
 
339
+ [lib->lock release];
340
+
307
341
  free(lib);
308
342
  }
309
343
 
310
- ggml_metal_pipeline_t ggml_metal_library_get_pipeline(ggml_metal_library_t lib, const char * name) {
311
- return ggml_metal_pipelines_get(lib->pipelines, name);
344
+ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline(ggml_metal_library_t lib, const char * name) {
345
+ [lib->lock lock];
346
+
347
+ struct ggml_metal_pipeline_with_params res = {
348
+ /*.pipeline =*/ nil,
349
+ /*.nsg =*/ 0,
350
+ /*.nr0 =*/ 0,
351
+ /*.nr1 =*/ 0,
352
+ /*.smem =*/ 0,
353
+ /*.c4 =*/ false,
354
+ /*.cnt =*/ false,
355
+ };
356
+
357
+ res.pipeline = ggml_metal_pipelines_get(lib->pipelines, name);
358
+
359
+ [lib->lock unlock];
360
+
361
+ return res;
312
362
  }
313
363
 
314
- ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv) {
315
- // note: the pipelines are cached in the library per device, so they are shared across all metal contexts
316
- ggml_critical_section_start();
364
+ struct ggml_metal_pipeline_with_params ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv) {
365
+ struct ggml_metal_pipeline_with_params res = {
366
+ /*.pipeline =*/ nil,
367
+ /*.nsg =*/ 0,
368
+ /*.nr0 =*/ 0,
369
+ /*.nr1 =*/ 0,
370
+ /*.smem =*/ 0,
371
+ /*.c4 =*/ false,
372
+ /*.cnt =*/ false,
373
+ };
374
+
375
+ [lib->lock lock];
317
376
 
318
- ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
319
- if (res) {
320
- ggml_critical_section_end();
377
+ res.pipeline = ggml_metal_pipelines_get(lib->pipelines, name);
378
+ if (res.pipeline) {
379
+ [lib->lock unlock];
321
380
 
322
381
  return res;
323
382
  }
324
383
 
325
- res = ggml_metal_pipeline_init();
326
-
327
384
  @autoreleasepool {
328
385
  NSError * error = nil;
329
386
 
@@ -338,28 +395,53 @@ ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t l
338
395
  mtl_function = [lib->obj newFunctionWithName:base_func constantValues:cv->obj error:&error];
339
396
  }
340
397
  if (!mtl_function) {
341
- ggml_critical_section_end();
398
+ [lib->lock unlock];
342
399
 
343
- GGML_LOG_ERROR("%s: error: failed to compile pipeline: base = '%s', name = '%s'\n", __func__, base, name);
400
+ GGML_LOG_ERROR("%s: failed to compile pipeline: base = '%s', name = '%s'\n", __func__, base, name);
344
401
  if (error) {
345
- GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
402
+ GGML_LOG_ERROR("%s: %s\n", __func__, [[error description] UTF8String]);
346
403
  }
347
404
 
348
- return nil;
405
+ return res;
349
406
  }
350
407
 
351
- res->obj = [lib->device newComputePipelineStateWithFunction:mtl_function error:&error];
352
-
353
- ggml_metal_pipelines_add(lib->pipelines, name, res);
408
+ id<MTLComputePipelineState> obj = [lib->device newComputePipelineStateWithFunction:mtl_function error:&error];
354
409
 
355
410
  [mtl_function release];
356
411
 
357
- GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, name, (void *) res->obj,
358
- (int) res->obj.maxTotalThreadsPerThreadgroup,
359
- (int) res->obj.threadExecutionWidth);
412
+ if (!obj) {
413
+ [lib->lock unlock];
414
+
415
+ GGML_LOG_ERROR("%s: failed to create pipeline state: base = '%s', name = '%s'\n", __func__, base, name);
416
+ if (error) {
417
+ GGML_LOG_ERROR("%s: %s\n", __func__, [[error description] UTF8String]);
418
+ }
419
+
420
+ return res;
421
+ }
422
+
423
+ GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, name,
424
+ (void *) obj,
425
+ (int) obj.maxTotalThreadsPerThreadgroup,
426
+ (int) obj.threadExecutionWidth);
427
+
428
+ if (obj.maxTotalThreadsPerThreadgroup == 0 || obj.threadExecutionWidth == 0) {
429
+ [obj release];
430
+
431
+ [lib->lock unlock];
432
+
433
+ GGML_LOG_ERROR("%s: incompatible pipeline %s\n", __func__, name);
434
+
435
+ return res;
436
+ }
437
+
438
+ res.pipeline = ggml_metal_pipeline_init();
439
+ res.pipeline->obj = obj;
440
+
441
+ ggml_metal_pipelines_add(lib->pipelines, name, res.pipeline);
360
442
  }
361
443
 
362
- ggml_critical_section_end();
444
+ [lib->lock unlock];
363
445
 
364
446
  return res;
365
447
  }
@@ -401,8 +483,8 @@ void ggml_metal_encoder_debug_group_pop (ggml_metal_encoder_t encoder) {
401
483
  [encoder->obj popDebugGroup];
402
484
  }
403
485
 
404
- void ggml_metal_encoder_set_pipeline(ggml_metal_encoder_t encoder, ggml_metal_pipeline_t pipeline) {
405
- [encoder->obj setComputePipelineState:pipeline->obj];
486
+ void ggml_metal_encoder_set_pipeline(ggml_metal_encoder_t encoder, struct ggml_metal_pipeline_with_params pipeline) {
487
+ [encoder->obj setComputePipelineState:pipeline.pipeline->obj];
406
488
  }
407
489
 
408
490
  void ggml_metal_encoder_set_bytes(ggml_metal_encoder_t encoder, void * data, size_t size, int idx) {
@@ -437,12 +519,110 @@ struct ggml_metal_device {
437
519
  // ref: https://github.com/ggml-org/llama.cpp/pull/15906
438
520
  id<MTLCommandQueue> mtl_queue;
439
521
 
522
+ ggml_metal_rsets_t rsets;
523
+
440
524
  ggml_metal_library_t library;
441
525
 
442
526
  struct ggml_metal_device_props props;
527
+
528
+ // virtual address for GPU memory allocations
529
+ atomic_uintptr_t addr_virt;
530
+ };
531
+
532
+ //
533
+ // MTLResidenceSet wrapper
534
+ //
535
+
536
+ struct ggml_metal_rsets {
537
+ NSLock * lock;
538
+
539
+ NSMutableArray * data;
540
+
541
+ // number of seconds since the last graph computation
542
+ // keep the residency sets wired for that amount of time to avoid being collected by the OS
543
+ int keep_alive_s;
544
+
545
+ // background heartbeat thread to keep the residency sets alive
546
+ atomic_bool d_stop;
547
+ atomic_int d_loop;
548
+
549
+ dispatch_group_t d_group;
443
550
  };
444
551
 
445
- ggml_metal_device_t ggml_metal_device_init(void) {
552
+ ggml_metal_rsets_t ggml_metal_rsets_init(void) {
553
+ ggml_metal_rsets_t res = calloc(1, sizeof(struct ggml_metal_rsets));
554
+
555
+ res->lock = [[NSLock alloc] init];
556
+ res->data = [[NSMutableArray alloc] init];
557
+
558
+ // by default keep the memory wired for 3 minutes
559
+ res->keep_alive_s = 3*60;
560
+
561
+ const char * GGML_METAL_RESIDENCY_KEEP_ALIVE_S = getenv("GGML_METAL_RESIDENCY_KEEP_ALIVE_S");
562
+ if (GGML_METAL_RESIDENCY_KEEP_ALIVE_S) {
563
+ res->keep_alive_s = atoi(GGML_METAL_RESIDENCY_KEEP_ALIVE_S);
564
+ }
565
+
566
+ if (res->keep_alive_s <= 0) {
567
+ res->keep_alive_s = 3*60;
568
+ }
569
+
570
+ GGML_LOG_INFO("%s: creating a residency set collection (keep_alive = %d s)\n", __func__, res->keep_alive_s);
571
+
572
+ atomic_store_explicit(&res->d_stop, false, memory_order_relaxed);
573
+ atomic_store_explicit(&res->d_loop, 2*res->keep_alive_s, memory_order_relaxed);
574
+
575
+ res->d_group = dispatch_group_create();
576
+
577
+ // start a background thread that periodically requests residency for all the currently active sets in the collection
578
+ // the requests stop after a certain amount of time (keep_alive_s) of inactivity
579
+ dispatch_queue_t d_queue = dispatch_get_global_queue(QOS_CLASS_DEFAULT, 0);
580
+ dispatch_group_async(res->d_group, d_queue, ^{
581
+ #if defined(GGML_METAL_HAS_RESIDENCY_SETS)
582
+ if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) {
583
+ while (!atomic_load_explicit(&res->d_stop, memory_order_relaxed)) {
584
+ if (atomic_load_explicit(&res->d_loop, memory_order_relaxed) > 0) {
585
+ [res->lock lock];
586
+
587
+ for (int i = 0; i < (int) res->data.count; ++i) {
588
+ [res->data[i] requestResidency];
589
+ }
590
+
591
+ atomic_fetch_sub_explicit(&res->d_loop, 1, memory_order_relaxed);
592
+
593
+ [res->lock unlock];
594
+ }
595
+
596
+ // half a second
597
+ usleep(500 * 1000);
598
+ }
599
+ }
600
+ #endif
601
+ });
602
+
603
+ return res;
604
+ }
605
+
606
+ void ggml_metal_rsets_free(ggml_metal_rsets_t rsets) {
607
+ if (rsets == NULL) {
608
+ return;
609
+ }
610
+
611
+ // note: if you hit this assert, most likely you haven't deallocated all Metal resources before exiting
612
+ GGML_ASSERT([rsets->data count] == 0);
613
+
614
+ atomic_store_explicit(&rsets->d_stop, true, memory_order_relaxed);
615
+
616
+ dispatch_group_wait(rsets->d_group, DISPATCH_TIME_FOREVER);
617
+ dispatch_release(rsets->d_group);
618
+
619
+ [rsets->data release];
620
+ [rsets->lock release];
621
+
622
+ free(rsets);
623
+ }
624
+
625
+ ggml_metal_device_t ggml_metal_device_init(int device) {
446
626
  ggml_metal_device_t dev = calloc(1, sizeof(struct ggml_metal_device));
447
627
 
448
628
  assert(dev != NULL);
@@ -456,6 +636,9 @@ ggml_metal_device_t ggml_metal_device_init(void) {
456
636
  GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
457
637
  }
458
638
 
639
+ dev->addr_virt = 0x000000400ULL;
640
+
641
+ dev->props.device = device;
459
642
  dev->props.has_simdgroup_reduction = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
460
643
  dev->props.has_simdgroup_reduction |= [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
461
644
 
@@ -464,6 +647,128 @@ ggml_metal_device_t ggml_metal_device_init(void) {
464
647
 
465
648
  dev->props.has_bfloat = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
466
649
  dev->props.has_bfloat |= [dev->mtl_device supportsFamily:MTLGPUFamilyApple6];
650
+ if (getenv("GGML_METAL_BF16_DISABLE") != NULL) {
651
+ dev->props.has_bfloat = false;
652
+ }
653
+
654
+ dev->props.has_tensor = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal4_GGML];
655
+ if (getenv("GGML_METAL_TENSOR_DISABLE") != NULL) {
656
+ dev->props.has_tensor = false;
657
+ }
658
+
659
+ // note: disable the tensor API by default for old chips because with the current implementation it is not useful
660
+ // - M2 Ultra: ~5% slower
661
+ // - M4, M4 Max: no significant difference
662
+ //
663
+ // TODO: try to update the tensor API kernels to at least match the simdgroup performance
664
+ if (getenv("GGML_METAL_TENSOR_ENABLE") == NULL &&
665
+ ![[dev->mtl_device name] containsString:@"M5"] &&
666
+ ![[dev->mtl_device name] containsString:@"M6"] &&
667
+ ![[dev->mtl_device name] containsString:@"A19"] &&
668
+ ![[dev->mtl_device name] containsString:@"A20"]) {
669
+ GGML_LOG_WARN("%s: tensor API disabled for pre-M5 and pre-A19 devices\n", __func__);
670
+ dev->props.has_tensor = false;
671
+ }
672
+
673
+ // double-check that the tensor API compiles
674
+ if (dev->props.has_tensor) {
675
+ const char * src_tensor_f16 = "\n"
676
+ "#include <metal_stdlib> \n"
677
+ "#include <metal_tensor> \n"
678
+ "#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h> \n"
679
+ " \n"
680
+ "using namespace metal; \n"
681
+ "using namespace mpp::tensor_ops; \n"
682
+ " \n"
683
+ "kernel void dummy_kernel( \n"
684
+ " tensor<device half, dextents<int32_t, 2>> A [[buffer(0)]], \n"
685
+ " tensor<device half, dextents<int32_t, 2>> B [[buffer(1)]], \n"
686
+ " device float * C [[buffer(2)]], \n"
687
+ " uint2 tgid [[threadgroup_position_in_grid]]) \n"
688
+ "{ \n"
689
+ " auto tA = A.slice(0, (int)tgid.y); \n"
690
+ " auto tB = B.slice((int)tgid.x, 0); \n"
691
+ " \n"
692
+ " matmul2d< \n"
693
+ " matmul2d_descriptor(8, 8, dynamic_extent), \n"
694
+ " execution_simdgroups<4>> mm; \n"
695
+ " \n"
696
+ " auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>(); \n"
697
+ " \n"
698
+ " auto sA = tA.slice(0, 0); \n"
699
+ " auto sB = tB.slice(0, 0); \n"
700
+ " mm.run(sB, sA, cT); \n"
701
+ " \n"
702
+ " auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(4, 4)); \n"
703
+ " \n"
704
+ " cT.store(tC); \n"
705
+ "}";
706
+
707
+ GGML_LOG_INFO("%s: testing tensor API for f16 support\n", __func__);
708
+ ggml_metal_library_t lib = ggml_metal_library_init_from_source(dev, src_tensor_f16, false);
709
+ if (lib == NULL) {
710
+ GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
711
+ dev->props.has_tensor = false;
712
+ } else {
713
+ struct ggml_metal_pipeline_with_params ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
714
+ if (!ppl.pipeline) {
715
+ GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
716
+ dev->props.has_tensor = false;
717
+ }
718
+
719
+ ggml_metal_library_free(lib);
720
+ }
721
+ }
722
+
723
+ // try to compile a dummy kernel to determine if the tensor API is supported for bfloat
724
+ if (dev->props.has_tensor && dev->props.has_bfloat) {
725
+ const char * src_tensor_bf16 = "\n"
726
+ "#include <metal_stdlib> \n"
727
+ "#include <metal_tensor> \n"
728
+ "#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h> \n"
729
+ " \n"
730
+ "using namespace metal; \n"
731
+ "using namespace mpp::tensor_ops; \n"
732
+ " \n"
733
+ "kernel void dummy_kernel( \n"
734
+ " tensor<device bfloat, dextents<int32_t, 2>> A [[buffer(0)]], \n"
735
+ " tensor<device bfloat, dextents<int32_t, 2>> B [[buffer(1)]], \n"
736
+ " device float * C [[buffer(2)]], \n"
737
+ " uint2 tgid [[threadgroup_position_in_grid]]) \n"
738
+ "{ \n"
739
+ " auto tA = A.slice(0, (int)tgid.y); \n"
740
+ " auto tB = B.slice((int)tgid.x, 0); \n"
741
+ " \n"
742
+ " matmul2d< \n"
743
+ " matmul2d_descriptor(8, 8, dynamic_extent), \n"
744
+ " execution_simdgroups<4>> mm; \n"
745
+ " \n"
746
+ " auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>(); \n"
747
+ " \n"
748
+ " auto sA = tA.slice(0, 0); \n"
749
+ " auto sB = tB.slice(0, 0); \n"
750
+ " mm.run(sB, sA, cT); \n"
751
+ " \n"
752
+ " auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(4, 4)); \n"
753
+ " \n"
754
+ " cT.store(tC); \n"
755
+ "}";
756
+
757
+ GGML_LOG_INFO("%s: testing tensor API for bfloat support\n", __func__);
758
+ ggml_metal_library_t lib = ggml_metal_library_init_from_source(dev, src_tensor_bf16, false);
759
+ if (lib == NULL) {
760
+ GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
761
+ dev->props.has_bfloat = false;
762
+ } else {
763
+ struct ggml_metal_pipeline_with_params ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
764
+ if (!ppl.pipeline) {
765
+ GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
766
+ dev->props.has_bfloat = false;
767
+ }
768
+
769
+ ggml_metal_library_free(lib);
770
+ }
771
+ }
467
772
 
468
773
  dev->props.use_residency_sets = true;
469
774
  #if defined(GGML_METAL_HAS_RESIDENCY_SETS)
@@ -471,25 +776,42 @@ ggml_metal_device_t ggml_metal_device_init(void) {
471
776
  #endif
472
777
 
473
778
  dev->props.use_shared_buffers = dev->props.has_unified_memory;
474
-
779
+ #if TARGET_OS_OSX
780
+ // In case of eGPU, shared memory may be preferable.
781
+ dev->props.use_shared_buffers |= [dev->mtl_device location] == MTLDeviceLocationExternal;
782
+ #endif
475
783
  if (getenv("GGML_METAL_SHARED_BUFFERS_DISABLE") != NULL) {
476
784
  dev->props.use_shared_buffers = false;
477
785
  }
786
+ if (getenv("GGML_METAL_SHARED_BUFFERS_ENABLE") != NULL) {
787
+ dev->props.use_shared_buffers = true;
788
+ }
478
789
 
479
790
  dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
480
791
 
792
+ dev->props.op_offload_min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
793
+
481
794
  dev->props.max_buffer_size = dev->mtl_device.maxBufferLength;
482
- dev->props.max_working_set_size = dev->mtl_device.recommendedMaxWorkingSetSize;
483
795
  dev->props.max_theadgroup_memory_size = dev->mtl_device.maxThreadgroupMemoryLength;
796
+ if (@available(macOS 10.12, iOS 16.0, *)) {
797
+ dev->props.max_working_set_size = dev->mtl_device.recommendedMaxWorkingSetSize;
798
+ } else {
799
+ dev->props.max_working_set_size = dev->mtl_device.maxBufferLength;
800
+ }
484
801
 
485
- strncpy(dev->props.name, [[dev->mtl_device name] UTF8String], sizeof(dev->props.name) - 1);
802
+ snprintf(dev->props.name, sizeof(dev->props.name), "%s%d", "MTL", device);
803
+ snprintf(dev->props.desc, sizeof(dev->props.desc), "%s", [[dev->mtl_device name] UTF8String]);
486
804
 
487
805
  dev->library = ggml_metal_library_init(dev);
488
806
  if (!dev->library) {
489
807
  GGML_LOG_ERROR("%s: error: failed to create library\n", __func__);
490
808
  }
491
809
 
492
- // --------------------------------------------------
810
+ if (dev->props.use_residency_sets) {
811
+ dev->rsets = ggml_metal_rsets_init();
812
+ } else {
813
+ dev->rsets = nil;
814
+ }
493
815
 
494
816
  // print MTL GPU family:
495
817
  GGML_LOG_INFO("%s: GPU name: %s\n", __func__, dev->props.name);
@@ -524,6 +846,7 @@ ggml_metal_device_t ggml_metal_device_init(void) {
524
846
  GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, dev->props.has_simdgroup_mm ? "true" : "false");
525
847
  GGML_LOG_INFO("%s: has unified memory = %s\n", __func__, dev->props.has_unified_memory ? "true" : "false");
526
848
  GGML_LOG_INFO("%s: has bfloat = %s\n", __func__, dev->props.has_bfloat ? "true" : "false");
849
+ GGML_LOG_INFO("%s: has tensor = %s\n", __func__, dev->props.has_tensor ? "true" : "false");
527
850
  GGML_LOG_INFO("%s: use residency sets = %s\n", __func__, dev->props.use_residency_sets ? "true" : "false");
528
851
  GGML_LOG_INFO("%s: use shared buffers = %s\n", __func__, dev->props.use_shared_buffers ? "true" : "false");
529
852
 
@@ -541,6 +864,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
541
864
  void ggml_metal_device_free(ggml_metal_device_t dev) {
542
865
  assert(dev != NULL);
543
866
 
867
+ ggml_metal_rsets_free(dev->rsets);
868
+
544
869
  ggml_metal_library_free(dev->library);
545
870
  dev->library = NULL;
546
871
 
@@ -569,6 +894,95 @@ ggml_metal_library_t ggml_metal_device_get_library(ggml_metal_device_t dev) {
569
894
  return dev->library;
570
895
  }
571
896
 
897
+ void ggml_metal_device_rsets_add(ggml_metal_device_t dev, ggml_metal_rset_t rset) {
898
+ if (rset == nil) {
899
+ return;
900
+ }
901
+
902
+ GGML_ASSERT(dev->rsets);
903
+
904
+ [dev->rsets->lock lock];
905
+
906
+ [dev->rsets->data addObject:rset];
907
+
908
+ [dev->rsets->lock unlock];
909
+ }
910
+
911
+ void ggml_metal_device_rsets_rm(ggml_metal_device_t dev, ggml_metal_rset_t rset) {
912
+ if (rset == nil) {
913
+ return;
914
+ }
915
+
916
+ GGML_ASSERT(dev->rsets);
917
+
918
+ [dev->rsets->lock lock];
919
+
920
+ [dev->rsets->data removeObject:rset];
921
+
922
+ [dev->rsets->lock unlock];
923
+ }
924
+
925
+ void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) {
926
+ if (dev->rsets == NULL) {
927
+ return;
928
+ }
929
+
930
+ atomic_store_explicit(&dev->rsets->d_loop, 2*dev->rsets->keep_alive_s, memory_order_relaxed);
931
+ }
932
+
933
+ struct ggml_metal_event {
934
+ void * obj; // id<MTLEvent>
935
+
936
+ atomic_int value;
937
+ };
938
+
939
+ void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
940
+ id<MTLEvent> event = (id<MTLEvent>)ev->obj;
941
+
942
+ id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
943
+
944
+ [cmd_buf encodeSignalEvent:event value:atomic_fetch_add_explicit(&ev->value, 1, memory_order_relaxed) + 1];
945
+ }
946
+
947
+ void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
948
+ id<MTLEvent> event = (id<MTLEvent>)ev->obj;
949
+
950
+ id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
951
+
952
+ [cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
953
+ }
954
+
955
+ ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
956
+ id<MTLEvent> event = [dev->mtl_device newEvent];
957
+
958
+ ggml_metal_event_t ev = calloc(1, sizeof(struct ggml_metal_event));
959
+
960
+ ev->obj = (__bridge void *)event;
961
+ ev->value = 0;
962
+
963
+ return ev;
964
+ }
965
+
966
+ void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev) {
967
+ id<MTLEvent> event = ev->obj;
968
+ [event release];
969
+
970
+ free(ev);
971
+
972
+ GGML_UNUSED(dev);
973
+ }
974
+
975
+ void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev) {
976
+ @autoreleasepool {
977
+ id<MTLEvent> event = ev->obj;
978
+
979
+ id<MTLCommandBuffer> cmd_buf = [dev->mtl_queue commandBuffer];
980
+ [cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
981
+ [cmd_buf commit];
982
+ [cmd_buf waitUntilCompleted];
983
+ }
984
+ }
985
+
572
986
  void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) {
573
987
  if (@available(macOS 10.12, iOS 16.0, *)) {
574
988
  *total = dev->mtl_device.recommendedMaxWorkingSetSize;
@@ -597,6 +1011,15 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
597
1011
  }
598
1012
 
599
1013
  switch (op->op) {
1014
+ case GGML_OP_SCALE:
1015
+ case GGML_OP_FILL:
1016
+ case GGML_OP_CLAMP:
1017
+ case GGML_OP_SQR:
1018
+ case GGML_OP_SQRT:
1019
+ case GGML_OP_SIN:
1020
+ case GGML_OP_COS:
1021
+ case GGML_OP_LOG:
1022
+ return ggml_is_contiguous_rows(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
600
1023
  case GGML_OP_UNARY:
601
1024
  switch (ggml_get_unary_op(op)) {
602
1025
  case GGML_UNARY_OP_TANH:
@@ -614,7 +1037,9 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
614
1037
  case GGML_UNARY_OP_HARDSWISH:
615
1038
  case GGML_UNARY_OP_HARDSIGMOID:
616
1039
  case GGML_UNARY_OP_EXP:
617
- return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
1040
+ case GGML_UNARY_OP_SOFTPLUS:
1041
+ case GGML_UNARY_OP_EXPM1:
1042
+ return ggml_is_contiguous_rows(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
618
1043
  default:
619
1044
  return false;
620
1045
  }
@@ -642,27 +1067,32 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
642
1067
  case GGML_OP_MUL:
643
1068
  case GGML_OP_DIV:
644
1069
  case GGML_OP_ADD_ID:
645
- return op->src[0]->type == GGML_TYPE_F32;
646
1070
  case GGML_OP_ACC:
1071
+ return ggml_is_contiguous_rows(op->src[0]) && ggml_is_contiguous_rows(op->src[1]) && op->src[0]->type == GGML_TYPE_F32;
647
1072
  case GGML_OP_REPEAT:
648
- case GGML_OP_SCALE:
649
1073
  case GGML_OP_CONV_TRANSPOSE_1D:
650
1074
  return true;
651
- case GGML_OP_CLAMP:
652
- return op->src[0]->type == GGML_TYPE_F32;
653
- case GGML_OP_SQR:
654
- case GGML_OP_SQRT:
655
- case GGML_OP_SIN:
656
- case GGML_OP_COS:
657
- case GGML_OP_LOG:
658
- return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
1075
+ case GGML_OP_CONV_TRANSPOSE_2D:
1076
+ return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]) &&
1077
+ (op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32) &&
1078
+ op->src[1]->type == GGML_TYPE_F32 &&
1079
+ op->type == GGML_TYPE_F32;
1080
+ case GGML_OP_SUM:
1081
+ return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]);
1082
+ case GGML_OP_TRI:
1083
+ return ggml_is_contiguous_rows(op->src[0]);
659
1084
  case GGML_OP_SUM_ROWS:
1085
+ case GGML_OP_CUMSUM:
660
1086
  case GGML_OP_MEAN:
661
1087
  case GGML_OP_SOFT_MAX:
662
1088
  case GGML_OP_GROUP_NORM:
663
- return has_simdgroup_reduction && ggml_is_contiguous_rows(op->src[0]);
664
1089
  case GGML_OP_L2_NORM:
665
- return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
1090
+ return has_simdgroup_reduction && ggml_is_contiguous_rows(op->src[0]);
1091
+ case GGML_OP_COUNT_EQUAL:
1092
+ return has_simdgroup_reduction &&
1093
+ op->src[0]->type == GGML_TYPE_I32 &&
1094
+ op->src[1]->type == GGML_TYPE_I32 &&
1095
+ op->type == GGML_TYPE_I64;
666
1096
  case GGML_OP_ARGMAX:
667
1097
  return has_simdgroup_reduction;
668
1098
  case GGML_OP_NORM:
@@ -672,13 +1102,23 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
672
1102
  return true;
673
1103
  case GGML_OP_IM2COL:
674
1104
  return ggml_is_contiguous(op->src[1]) && op->src[1]->type == GGML_TYPE_F32 && (op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32);
675
- case GGML_OP_POOL_1D:
676
- return false;
1105
+ case GGML_OP_CONV_2D:
1106
+ return ggml_is_contiguous(op->src[0]) &&
1107
+ op->src[1]->type == GGML_TYPE_F32 &&
1108
+ op->type == GGML_TYPE_F32 &&
1109
+ (op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32);
677
1110
  case GGML_OP_UPSCALE:
678
- return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
1111
+ return op->src[0]->type == GGML_TYPE_F32;
1112
+ case GGML_OP_POOL_1D:
1113
+ return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
679
1114
  case GGML_OP_POOL_2D:
680
1115
  return op->src[0]->type == GGML_TYPE_F32;
681
1116
  case GGML_OP_PAD:
1117
+ // TODO: add circular padding support for metal, see https://github.com/ggml-org/llama.cpp/pull/16985
1118
+ if (ggml_get_op_params_i32(op, 8) != 0) {
1119
+ return false;
1120
+ }
1121
+
682
1122
  return (ggml_get_op_params_i32(op, 0) == 0) && (ggml_get_op_params_i32(op, 2) == 0) &&
683
1123
  (ggml_get_op_params_i32(op, 4) == 0) && (ggml_get_op_params_i32(op, 6) == 0);
684
1124
  case GGML_OP_PAD_REFLECT_1D:
@@ -686,25 +1126,24 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
686
1126
  case GGML_OP_LEAKY_RELU:
687
1127
  return op->src[0]->type == GGML_TYPE_F32;
688
1128
  case GGML_OP_ARGSORT:
689
- // TODO: Support arbitrary column width
690
- return op->src[0]->ne[0] <= 1024;
1129
+ case GGML_OP_TOP_K:
691
1130
  case GGML_OP_ARANGE:
692
1131
  return true;
693
1132
  case GGML_OP_FLASH_ATTN_EXT:
694
1133
  // for new head sizes, add checks here
695
- if (op->src[0]->ne[0] != 40 &&
1134
+ if (op->src[0]->ne[0] != 32 &&
1135
+ op->src[0]->ne[0] != 40 &&
1136
+ op->src[0]->ne[0] != 48 &&
696
1137
  op->src[0]->ne[0] != 64 &&
1138
+ op->src[0]->ne[0] != 72 &&
697
1139
  op->src[0]->ne[0] != 80 &&
698
1140
  op->src[0]->ne[0] != 96 &&
699
1141
  op->src[0]->ne[0] != 112 &&
700
1142
  op->src[0]->ne[0] != 128 &&
701
1143
  op->src[0]->ne[0] != 192 &&
702
- op->src[0]->ne[0] != 256) {
703
- return false;
704
- }
705
- if (op->src[0]->ne[0] == 576) {
706
- // DeepSeek sizes
707
- // TODO: disabled for now, until optmized
1144
+ op->src[0]->ne[0] != 256 &&
1145
+ op->src[0]->ne[0] != 320 &&
1146
+ op->src[0]->ne[0] != 576) {
708
1147
  return false;
709
1148
  }
710
1149
  if (op->src[1]->type != op->src[2]->type) {
@@ -717,9 +1156,13 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
717
1156
  case GGML_OP_RWKV_WKV6:
718
1157
  case GGML_OP_RWKV_WKV7:
719
1158
  return true;
1159
+ case GGML_OP_GATED_DELTA_NET:
1160
+ return has_simdgroup_reduction && op->src[2]->ne[0] % 32 == 0;
1161
+ case GGML_OP_SOLVE_TRI:
720
1162
  case GGML_OP_MUL_MAT:
721
1163
  case GGML_OP_MUL_MAT_ID:
722
- return has_simdgroup_reduction;
1164
+ return has_simdgroup_reduction && op->src[0]->type != GGML_TYPE_NVFP4;
1165
+ case GGML_OP_SET:
723
1166
  case GGML_OP_CPY:
724
1167
  case GGML_OP_DUP:
725
1168
  case GGML_OP_CONT:
@@ -770,15 +1213,13 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
770
1213
  return false;
771
1214
  }
772
1215
  case GGML_TYPE_I32:
773
- return op->type == GGML_TYPE_F32;
1216
+ return op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_I32;
774
1217
  default:
775
1218
  return false;
776
1219
  };
777
1220
  }
778
1221
  case GGML_OP_GET_ROWS:
779
- {
780
- return op->ne[3] == 1;
781
- }
1222
+ return op->src[0]->type != GGML_TYPE_NVFP4;
782
1223
  case GGML_OP_SET_ROWS:
783
1224
  {
784
1225
  if (op->src[0]->type != GGML_TYPE_F32) {
@@ -800,6 +1241,11 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
800
1241
  return false;
801
1242
  };
802
1243
  }
1244
+ case GGML_OP_DIAG:
1245
+ return true;
1246
+ case GGML_OP_OPT_STEP_ADAMW:
1247
+ case GGML_OP_OPT_STEP_SGD:
1248
+ return has_simdgroup_reduction;
803
1249
  default:
804
1250
  return false;
805
1251
  }
@@ -824,7 +1270,7 @@ struct ggml_metal_buffer_wrapper {
824
1270
  };
825
1271
 
826
1272
  struct ggml_metal_buffer {
827
- void * all_data; // TODO: https://github.com/ggml-org/llama.cpp/pull/15985
1273
+ void * all_data;
828
1274
  size_t all_size;
829
1275
 
830
1276
  // if false, the Metal buffer data is allocated in private GPU memory and is not shared with the host
@@ -838,12 +1284,11 @@ struct ggml_metal_buffer {
838
1284
  bool use_residency_sets;
839
1285
 
840
1286
  // optional MTLResidencySet
841
- // note: cannot use explicity "id<MTLResidencySet>" here because it is not available on certain OSes
1287
+ // note: cannot use explicitly "id<MTLResidencySet>" here because it is not available on certain OSes
842
1288
  id rset;
843
1289
 
844
- // pointers to global device objects
845
- id<MTLDevice> device;
846
- id<MTLCommandQueue> queue;
1290
+ // pointers to global device
1291
+ ggml_metal_device_t dev;
847
1292
  };
848
1293
 
849
1294
  static void ggml_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
@@ -886,7 +1331,7 @@ static bool ggml_metal_buffer_rset_init(ggml_metal_buffer_t buf) {
886
1331
  desc.initialCapacity = buf->n_buffers;
887
1332
 
888
1333
  NSError * error;
889
- buf->rset = [buf->device newResidencySetWithDescriptor:desc error:&error];
1334
+ buf->rset = [buf->dev->mtl_device newResidencySetWithDescriptor:desc error:&error];
890
1335
  if (error) {
891
1336
  GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
892
1337
  [desc release];
@@ -947,6 +1392,8 @@ static void * ggml_metal_host_malloc(size_t n) {
947
1392
  ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size, bool shared) {
948
1393
  ggml_metal_buffer_t res = calloc(1, sizeof(struct ggml_metal_buffer));
949
1394
 
1395
+ res->dev = dev;
1396
+
950
1397
  const size_t size_page = sysconf(_SC_PAGESIZE);
951
1398
 
952
1399
  size_t size_aligned = size;
@@ -962,16 +1409,14 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
962
1409
  if (shared) {
963
1410
  res->all_data = ggml_metal_host_malloc(size_aligned);
964
1411
  res->is_shared = true;
965
- res->owned = true;
966
1412
  } else {
967
- // dummy, non-NULL value - we'll populate this after creating the Metal buffer below
968
- res->all_data = (void *) 0x000000400ULL;
1413
+ // use virtual address
1414
+ res->all_data = (void *) atomic_fetch_add_explicit(&dev->addr_virt, size_aligned, memory_order_relaxed);
969
1415
  res->is_shared = false;
970
1416
  }
971
1417
  res->all_size = size_aligned;
972
1418
 
973
- res->device = ggml_metal_device_get_obj(dev);
974
- res->queue = ggml_metal_device_get_queue(dev);
1419
+ res->owned = true;
975
1420
 
976
1421
  res->n_buffers = 1;
977
1422
 
@@ -980,15 +1425,13 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
980
1425
  res->buffers[0].metal = nil;
981
1426
 
982
1427
  if (size_aligned > 0) {
983
- if (props_dev->use_shared_buffers &&shared) {
984
- res->buffers[0].metal = [res->device newBufferWithBytesNoCopy:res->all_data
1428
+ if (props_dev->use_shared_buffers && shared) {
1429
+ res->buffers[0].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:res->all_data
985
1430
  length:size_aligned
986
1431
  options:MTLResourceStorageModeShared
987
1432
  deallocator:nil];
988
1433
  } else {
989
- res->buffers[0].metal = [res->device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
990
-
991
- res->all_data = (void *) (res->buffers[0].metal.gpuAddress);
1434
+ res->buffers[0].metal = [res->dev->mtl_device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
992
1435
  }
993
1436
  }
994
1437
 
@@ -1009,6 +1452,8 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
1009
1452
  return NULL;
1010
1453
  }
1011
1454
 
1455
+ ggml_metal_device_rsets_add(dev, res->rset);
1456
+
1012
1457
  //ggml_metal_log_allocated_size(device, size_aligned);
1013
1458
 
1014
1459
  return res;
@@ -1017,6 +1462,8 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
1017
1462
  ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, size_t size, size_t max_tensor_size) {
1018
1463
  ggml_metal_buffer_t res = calloc(1, sizeof(struct ggml_metal_buffer));
1019
1464
 
1465
+ res->dev = dev;
1466
+
1020
1467
  res->all_data = ptr;
1021
1468
  res->all_size = size;
1022
1469
 
@@ -1039,9 +1486,6 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
1039
1486
  size_aligned += (size_page - (size_aligned % size_page));
1040
1487
  }
1041
1488
 
1042
- res->device = ggml_metal_device_get_obj(dev);
1043
- res->queue = ggml_metal_device_get_queue(dev);
1044
-
1045
1489
  const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
1046
1490
 
1047
1491
  // the buffer fits into the max buffer size allowed by the device
@@ -1051,7 +1495,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
1051
1495
  res->buffers[res->n_buffers].metal = nil;
1052
1496
 
1053
1497
  if (size_aligned > 0) {
1054
- res->buffers[res->n_buffers].metal = [res->device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
1498
+ res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
1055
1499
 
1056
1500
  if (res->buffers[res->n_buffers].metal == nil) {
1057
1501
  GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
@@ -1060,7 +1504,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
1060
1504
  }
1061
1505
  }
1062
1506
 
1063
- ggml_metal_log_allocated_size(res->device, size_aligned);
1507
+ ggml_metal_log_allocated_size(res->dev->mtl_device, size_aligned);
1064
1508
 
1065
1509
  ++res->n_buffers;
1066
1510
  } else {
@@ -1078,7 +1522,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
1078
1522
  res->buffers[res->n_buffers].metal = nil;
1079
1523
 
1080
1524
  if (size_step_aligned > 0) {
1081
- res->buffers[res->n_buffers].metal = [res->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
1525
+ res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
1082
1526
 
1083
1527
  if (res->buffers[res->n_buffers].metal == nil) {
1084
1528
  GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
@@ -1087,7 +1531,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
1087
1531
  }
1088
1532
  }
1089
1533
 
1090
- ggml_metal_log_allocated_size(res->device, size_step_aligned);
1534
+ ggml_metal_log_allocated_size(res->dev->mtl_device, size_step_aligned);
1091
1535
 
1092
1536
  if (i + size_step < size) {
1093
1537
  GGML_LOG_INFO("\n");
@@ -1105,10 +1549,14 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
1105
1549
  return NULL;
1106
1550
  }
1107
1551
 
1552
+ ggml_metal_device_rsets_add(dev, res->rset);
1553
+
1108
1554
  return res;
1109
1555
  }
1110
1556
 
1111
1557
  void ggml_metal_buffer_free(ggml_metal_buffer_t buf) {
1558
+ ggml_metal_device_rsets_rm(buf->dev, buf->rset);
1559
+
1112
1560
  for (int i = 0; i < buf->n_buffers; i++) {
1113
1561
  [buf->buffers[i].metal release];
1114
1562
  }
@@ -1136,7 +1584,7 @@ bool ggml_metal_buffer_is_shared(ggml_metal_buffer_t buf) {
1136
1584
 
1137
1585
  void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
1138
1586
  if (buf->is_shared) {
1139
- memset((char *)tensor->data + offset, value, size);
1587
+ memset((char *) tensor->data + offset, value, size);
1140
1588
  return;
1141
1589
  }
1142
1590
 
@@ -1145,8 +1593,7 @@ void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor
1145
1593
  struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf, tensor);
1146
1594
  bid_dst.offs += offset;
1147
1595
 
1148
- id<MTLCommandQueue> queue = buf->queue;
1149
- id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
1596
+ id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
1150
1597
 
1151
1598
  {
1152
1599
  id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
@@ -1165,14 +1612,14 @@ void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor
1165
1612
 
1166
1613
  void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
1167
1614
  if (buf->is_shared) {
1168
- memcpy((char *)tensor->data + offset, data, size);
1615
+ memcpy((char *) tensor->data + offset, data, size);
1169
1616
  return;
1170
1617
  }
1171
1618
 
1172
1619
  @autoreleasepool {
1173
1620
  // src
1174
1621
  void * data_ptr = (void *)(uintptr_t) data; // "const cast" the src data
1175
- id<MTLBuffer> buf_src = [buf->device newBufferWithBytesNoCopy:data_ptr
1622
+ id<MTLBuffer> buf_src = [buf->dev->mtl_device newBufferWithBytesNoCopy:data_ptr
1176
1623
  length:size
1177
1624
  options:MTLResourceStorageModeShared
1178
1625
  deallocator:nil];
@@ -1187,8 +1634,7 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *
1187
1634
  // this is alternative to waitUntilCompleted, which should be faster, but don't seem to make much difference
1188
1635
  dispatch_semaphore_t completion_semaphore = dispatch_semaphore_create(0);
1189
1636
 
1190
- id<MTLCommandQueue> queue = buf->queue;
1191
- id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
1637
+ id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
1192
1638
 
1193
1639
  {
1194
1640
  id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
@@ -1220,7 +1666,7 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *
1220
1666
 
1221
1667
  void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
1222
1668
  if (buf->is_shared) {
1223
- memcpy(data, (const char *)tensor->data + offset, size);
1669
+ memcpy(data, (const char *) tensor->data + offset, size);
1224
1670
  return;
1225
1671
  }
1226
1672
 
@@ -1230,15 +1676,14 @@ void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_ten
1230
1676
  bid_src.offs += offset;
1231
1677
 
1232
1678
  // dst
1233
- id<MTLBuffer> buf_dst = [buf->device newBufferWithBytesNoCopy:data
1679
+ id<MTLBuffer> buf_dst = [buf->dev->mtl_device newBufferWithBytesNoCopy:data
1234
1680
  length:size
1235
1681
  options:MTLResourceStorageModeShared
1236
1682
  deallocator:nil];
1237
1683
 
1238
1684
  GGML_ASSERT(buf_dst);
1239
1685
 
1240
- id<MTLCommandQueue> queue = buf->queue;
1241
- id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
1686
+ id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
1242
1687
 
1243
1688
  {
1244
1689
  id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
@@ -1264,8 +1709,7 @@ void ggml_metal_buffer_clear(ggml_metal_buffer_t buf, uint8_t value) {
1264
1709
  }
1265
1710
 
1266
1711
  @autoreleasepool {
1267
- id<MTLCommandQueue> queue = buf->queue;
1268
- id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
1712
+ id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
1269
1713
 
1270
1714
  {
1271
1715
  id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];