whispercpp 1.3.5 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (610)
  1. checksums.yaml +4 -4
  2. data/LICENSE +1 -1
  3. data/README.md +99 -2
  4. data/ext/extconf.rb +1 -0
  5. data/ext/ruby_whisper.c +20 -4
  6. data/ext/ruby_whisper.h +30 -2
  7. data/ext/ruby_whisper_context.c +216 -124
  8. data/ext/ruby_whisper_context_params.c +163 -0
  9. data/ext/ruby_whisper_model.c +0 -1
  10. data/ext/ruby_whisper_params.c +0 -1
  11. data/ext/ruby_whisper_segment.c +0 -1
  12. data/ext/ruby_whisper_token.c +29 -9
  13. data/ext/ruby_whisper_transcribe.cpp +4 -1
  14. data/ext/ruby_whisper_vad_context.c +48 -1
  15. data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
  16. data/ext/ruby_whisper_vad_params.c +0 -1
  17. data/ext/ruby_whisper_vad_segment.c +0 -1
  18. data/ext/ruby_whisper_vad_segments.c +0 -1
  19. data/ext/sources/CMakeLists.txt +1 -1
  20. data/ext/sources/bindings/javascript/package.json +1 -1
  21. data/ext/sources/cmake/whisper-config.cmake.in +5 -40
  22. data/ext/sources/examples/bench/bench.cpp +23 -18
  23. data/ext/sources/examples/cli/cli.cpp +8 -0
  24. data/ext/sources/examples/common-ggml.cpp +2 -0
  25. data/ext/sources/examples/miniaudio.h +4507 -2131
  26. data/ext/sources/examples/server/server.cpp +18 -4
  27. data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -2
  28. data/ext/sources/examples/talk-llama/llama-adapter.cpp +7 -13
  29. data/ext/sources/examples/talk-llama/llama-adapter.h +4 -3
  30. data/ext/sources/examples/talk-llama/llama-arch.cpp +335 -17
  31. data/ext/sources/examples/talk-llama/llama-arch.h +42 -0
  32. data/ext/sources/examples/talk-llama/llama-batch.cpp +3 -1
  33. data/ext/sources/examples/talk-llama/llama-chat.cpp +21 -1
  34. data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
  35. data/ext/sources/examples/talk-llama/llama-context.cpp +508 -520
  36. data/ext/sources/examples/talk-llama/llama-context.h +27 -28
  37. data/ext/sources/examples/talk-llama/llama-cparams.h +5 -0
  38. data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
  39. data/ext/sources/examples/talk-llama/llama-grammar.cpp +8 -8
  40. data/ext/sources/examples/talk-llama/llama-graph.cpp +583 -130
  41. data/ext/sources/examples/talk-llama/llama-graph.h +131 -10
  42. data/ext/sources/examples/talk-llama/llama-hparams.cpp +57 -40
  43. data/ext/sources/examples/talk-llama/llama-hparams.h +79 -10
  44. data/ext/sources/examples/talk-llama/llama-impl.cpp +4 -4
  45. data/ext/sources/examples/talk-llama/llama-impl.h +13 -1
  46. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +3 -1
  47. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +274 -89
  48. data/ext/sources/examples/talk-llama/llama-kv-cache.h +2 -3
  49. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
  50. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
  51. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +11 -13
  52. data/ext/sources/examples/talk-llama/llama-mmap.cpp +28 -11
  53. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +527 -119
  54. data/ext/sources/examples/talk-llama/llama-model-loader.h +35 -5
  55. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +60 -46
  56. data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
  57. data/ext/sources/examples/talk-llama/llama-model.cpp +1365 -647
  58. data/ext/sources/examples/talk-llama/llama-model.h +72 -19
  59. data/ext/sources/examples/talk-llama/llama-quant.cpp +578 -346
  60. data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +190 -76
  61. data/ext/sources/examples/talk-llama/{llama-sampling.h → llama-sampler.h} +0 -2
  62. data/ext/sources/examples/talk-llama/llama-vocab.cpp +118 -48
  63. data/ext/sources/examples/talk-llama/llama-vocab.h +5 -0
  64. data/ext/sources/examples/talk-llama/llama.cpp +76 -22
  65. data/ext/sources/examples/talk-llama/llama.h +63 -30
  66. data/ext/sources/examples/talk-llama/models/afmoe.cpp +2 -3
  67. data/ext/sources/examples/talk-llama/models/apertus.cpp +3 -3
  68. data/ext/sources/examples/talk-llama/models/arcee.cpp +3 -3
  69. data/ext/sources/examples/talk-llama/models/arctic.cpp +4 -5
  70. data/ext/sources/examples/talk-llama/models/baichuan.cpp +4 -3
  71. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +1 -2
  72. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +3 -5
  73. data/ext/sources/examples/talk-llama/models/bert.cpp +13 -7
  74. data/ext/sources/examples/talk-llama/models/bitnet.cpp +9 -24
  75. data/ext/sources/examples/talk-llama/models/bloom.cpp +2 -2
  76. data/ext/sources/examples/talk-llama/models/chameleon.cpp +3 -3
  77. data/ext/sources/examples/talk-llama/models/chatglm.cpp +2 -2
  78. data/ext/sources/examples/talk-llama/models/codeshell.cpp +3 -3
  79. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +3 -3
  80. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +2 -2
  81. data/ext/sources/examples/talk-llama/models/command-r.cpp +2 -2
  82. data/ext/sources/examples/talk-llama/models/dbrx.cpp +4 -5
  83. data/ext/sources/examples/talk-llama/models/deci.cpp +3 -3
  84. data/ext/sources/examples/talk-llama/models/deepseek.cpp +4 -6
  85. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +24 -21
  86. data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
  87. data/ext/sources/examples/talk-llama/models/dots1.cpp +4 -6
  88. data/ext/sources/examples/talk-llama/models/dream.cpp +3 -3
  89. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +4 -6
  90. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +3 -3
  91. data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
  92. data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
  93. data/ext/sources/examples/talk-llama/models/exaone.cpp +3 -3
  94. data/ext/sources/examples/talk-llama/models/exaone4.cpp +3 -3
  95. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +2 -4
  96. data/ext/sources/examples/talk-llama/models/falcon.cpp +3 -3
  97. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +1 -1
  98. data/ext/sources/examples/talk-llama/models/gemma.cpp +1 -1
  99. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +1 -1
  100. data/ext/sources/examples/talk-llama/models/gemma3.cpp +1 -1
  101. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +7 -7
  102. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +3 -3
  103. data/ext/sources/examples/talk-llama/models/glm4.cpp +14 -7
  104. data/ext/sources/examples/talk-llama/models/gpt2.cpp +2 -2
  105. data/ext/sources/examples/talk-llama/models/gptneox.cpp +2 -2
  106. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +4 -5
  107. data/ext/sources/examples/talk-llama/models/granite.cpp +4 -5
  108. data/ext/sources/examples/talk-llama/models/grok.cpp +4 -4
  109. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +5 -7
  110. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +3 -3
  111. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +4 -5
  112. data/ext/sources/examples/talk-llama/models/internlm2.cpp +3 -3
  113. data/ext/sources/examples/talk-llama/models/jais.cpp +2 -2
  114. data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
  115. data/ext/sources/examples/talk-llama/models/jamba.cpp +3 -3
  116. data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
  117. data/ext/sources/examples/talk-llama/models/lfm2.cpp +145 -124
  118. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +4 -4
  119. data/ext/sources/examples/talk-llama/models/llada.cpp +3 -3
  120. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +4 -4
  121. data/ext/sources/examples/talk-llama/models/llama.cpp +18 -11
  122. data/ext/sources/examples/talk-llama/models/maincoder.cpp +3 -3
  123. data/ext/sources/examples/talk-llama/models/{graph-context-mamba.cpp → mamba-base.cpp} +9 -3
  124. data/ext/sources/examples/talk-llama/models/mamba.cpp +1 -2
  125. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +11 -5
  126. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +14 -13
  127. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +4 -5
  128. data/ext/sources/examples/talk-llama/models/mistral3.cpp +4 -4
  129. data/ext/sources/examples/talk-llama/models/models.h +181 -46
  130. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +2 -9
  131. data/ext/sources/examples/talk-llama/models/mpt.cpp +2 -2
  132. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +26 -14
  133. data/ext/sources/examples/talk-llama/models/nemotron.cpp +3 -3
  134. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +2 -2
  135. data/ext/sources/examples/talk-llama/models/olmo.cpp +3 -3
  136. data/ext/sources/examples/talk-llama/models/olmo2.cpp +3 -3
  137. data/ext/sources/examples/talk-llama/models/olmoe.cpp +4 -4
  138. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +1 -1
  139. data/ext/sources/examples/talk-llama/models/openelm.cpp +3 -3
  140. data/ext/sources/examples/talk-llama/models/orion.cpp +3 -3
  141. data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
  142. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +3 -3
  143. data/ext/sources/examples/talk-llama/models/phi2.cpp +2 -2
  144. data/ext/sources/examples/talk-llama/models/phi3.cpp +3 -3
  145. data/ext/sources/examples/talk-llama/models/plamo.cpp +3 -3
  146. data/ext/sources/examples/talk-llama/models/plamo2.cpp +9 -5
  147. data/ext/sources/examples/talk-llama/models/plamo3.cpp +2 -2
  148. data/ext/sources/examples/talk-llama/models/plm.cpp +15 -14
  149. data/ext/sources/examples/talk-llama/models/qwen.cpp +2 -2
  150. data/ext/sources/examples/talk-llama/models/qwen2.cpp +3 -3
  151. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +4 -4
  152. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +3 -3
  153. data/ext/sources/examples/talk-llama/models/qwen3.cpp +12 -9
  154. data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
  155. data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
  156. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +15 -8
  157. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +84 -432
  158. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +9 -18
  159. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +8 -17
  160. data/ext/sources/examples/talk-llama/models/refact.cpp +2 -2
  161. data/ext/sources/examples/talk-llama/models/rnd1.cpp +4 -4
  162. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +2 -0
  163. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +2 -0
  164. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +3 -3
  165. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +4 -4
  166. data/ext/sources/examples/talk-llama/models/smollm3.cpp +3 -3
  167. data/ext/sources/examples/talk-llama/models/stablelm.cpp +2 -2
  168. data/ext/sources/examples/talk-llama/models/starcoder.cpp +2 -2
  169. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +3 -3
  170. data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
  171. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +2 -2
  172. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +2 -2
  173. data/ext/sources/examples/talk-llama/models/xverse.cpp +3 -3
  174. data/ext/sources/examples/talk-llama/unicode.cpp +21 -65
  175. data/ext/sources/ggml/CMakeLists.txt +9 -3
  176. data/ext/sources/ggml/include/ggml-backend.h +1 -1
  177. data/ext/sources/ggml/include/ggml-cann.h +1 -1
  178. data/ext/sources/ggml/include/ggml-cpu.h +5 -0
  179. data/ext/sources/ggml/include/ggml-openvino.h +37 -0
  180. data/ext/sources/ggml/include/ggml-opt.h +1 -1
  181. data/ext/sources/ggml/include/ggml-rpc.h +6 -1
  182. data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
  183. data/ext/sources/ggml/include/ggml.h +56 -9
  184. data/ext/sources/ggml/src/CMakeLists.txt +3 -0
  185. data/ext/sources/ggml/src/ggml-alloc.c +4 -9
  186. data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
  187. data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
  188. data/ext/sources/ggml/src/ggml-backend-reg.cpp +28 -86
  189. data/ext/sources/ggml/src/ggml-backend.cpp +5 -2
  190. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  191. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +6 -2
  192. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
  193. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
  194. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +348 -189
  195. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +40 -85
  196. data/ext/sources/ggml/src/ggml-cann/common.h +3 -4
  197. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +44 -62
  198. data/ext/sources/ggml/src/ggml-common.h +11 -0
  199. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +16 -11
  200. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -19
  201. data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
  202. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
  203. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +85 -1
  204. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2744 -548
  205. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1653 -0
  206. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
  207. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
  208. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
  209. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +118 -18
  210. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +107 -26
  211. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
  212. data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
  213. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -0
  214. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +59 -12
  215. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +15 -0
  216. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
  217. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +965 -252
  218. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +584 -197
  219. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +903 -188
  220. data/ext/sources/ggml/src/ggml-cpu/ops.h +1 -0
  221. data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
  222. data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
  223. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2890 -679
  224. data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
  225. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  226. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +111 -3
  227. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
  228. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +17 -0
  229. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
  230. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +19 -10
  231. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +32 -30
  232. data/ext/sources/ggml/src/ggml-cuda/common.cuh +134 -18
  233. data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
  234. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +6 -3
  235. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +78 -64
  236. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +384 -143
  237. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +36 -22
  238. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +3 -3
  239. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +26 -5
  240. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
  241. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +127 -12
  242. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
  243. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  244. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +595 -200
  245. data/ext/sources/ggml/src/ggml-cuda/mean.cu +9 -8
  246. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +173 -6
  247. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
  248. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +158 -85
  249. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +34 -22
  250. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +127 -67
  251. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
  252. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +157 -65
  253. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -0
  254. data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
  255. data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
  256. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +1 -1
  257. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
  258. data/ext/sources/ggml/src/ggml-cuda/rope.cu +233 -133
  259. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
  260. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
  261. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +56 -32
  262. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  263. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
  264. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
  265. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
  266. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
  267. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
  268. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
  269. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +3 -3
  270. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +0 -1
  271. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +199 -135
  272. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
  273. data/ext/sources/ggml/src/ggml-cuda/unary.cu +55 -0
  274. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
  275. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
  276. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +10 -0
  277. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +82 -45
  278. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +334 -160
  279. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +7 -5
  280. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +328 -197
  281. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  282. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +765 -234
  283. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
  284. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +412 -265
  285. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +23 -23
  286. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +1 -1
  287. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.h → hex-dma.h} +28 -3
  288. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  289. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  290. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  291. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +1 -1
  292. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +27 -37
  293. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +6 -35
  294. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  295. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
  296. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  297. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
  298. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  299. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  300. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  301. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  302. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  303. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  304. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  305. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  306. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  307. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +20 -1347
  308. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +211 -13
  309. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +1119 -952
  310. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +254 -244
  311. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +36 -36
  312. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +155 -138
  313. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
  314. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  315. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +209 -114
  316. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
  317. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  318. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  319. data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
  320. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  321. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +6 -0
  322. data/ext/sources/ggml/src/ggml-impl.h +62 -0
  323. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
  324. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
  325. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
  326. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
  327. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +274 -73
  328. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +22 -4
  329. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +102 -36
  330. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +174 -23
  331. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +580 -280
  332. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +5 -4
  333. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +320 -107
  334. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1068 -825
  335. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +19 -1
  336. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +3108 -636
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +204 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
  344. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  345. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  346. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  347. data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  348. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
  349. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  350. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  351. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  352. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  353. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  354. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  355. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  356. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
  357. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  358. data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  359. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
  360. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
  361. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
  362. data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  363. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
  364. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
  365. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
  366. data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  367. data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
  368. data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  369. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
  370. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  371. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
  372. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  373. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
  374. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
  375. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  376. data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  377. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  378. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  379. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  380. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  381. data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  382. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  383. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  384. data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  385. data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  386. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  387. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  388. data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  389. data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  390. data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  391. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  392. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
  393. data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  394. data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  395. data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  396. data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  397. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  398. data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  399. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
  400. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
  401. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
  402. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
  403. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  404. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  405. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  406. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  407. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  408. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
  409. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  410. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
  411. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
  412. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
  413. data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
  414. data/ext/sources/ggml/src/ggml-quants.c +96 -5
  415. data/ext/sources/ggml/src/ggml-quants.h +3 -0
  416. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +15 -88
  417. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
  418. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +1 -0
  419. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
  420. data/ext/sources/ggml/src/ggml-sycl/common.hpp +315 -10
  421. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +69 -1
  422. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
  423. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
  424. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
  425. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +78 -68
  426. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  427. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
  428. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
  429. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
  430. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
  431. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
  432. data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
  433. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
  434. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
  435. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +316 -51
  436. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
  437. data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
  438. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
  439. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
  440. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
  441. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
  442. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
  443. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  444. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  445. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  446. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  447. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  448. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  449. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  450. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  451. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  452. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
  453. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
  454. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
  455. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
  456. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
  457. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
  458. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
  459. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
  460. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
  461. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
  462. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
  463. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
  464. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
  465. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
  466. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
  467. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
  468. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
  469. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
  470. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
  471. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
  472. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
  473. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
  474. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
  475. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
  476. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
  477. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
  478. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
  479. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
  480. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
  481. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
  482. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
  483. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
  484. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
  485. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
  486. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
  487. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
  488. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +13 -0
  489. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
  490. data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  491. data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  492. data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  493. data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  494. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  495. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  496. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  497. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  498. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  499. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  500. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  501. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  502. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  503. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  504. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  505. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  506. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  507. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  508. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  509. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  510. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  511. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  512. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  513. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  514. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  515. data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  516. data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  517. data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  518. data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  519. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  520. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  521. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  522. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  523. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  524. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  525. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  526. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  527. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  528. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  529. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  530. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  531. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  532. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +1 -1
  533. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1250 -465
  534. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
  535. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  536. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +374 -170
  537. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +66 -22
  538. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +389 -201
  539. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +106 -58
  540. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  541. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
  542. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
  543. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
  544. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
  545. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -3
  546. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +8 -4
  547. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  548. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
  549. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +3 -3
  550. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
  551. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +36 -63
  552. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
  553. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
  554. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
  555. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +10 -5
  556. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
  557. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  558. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +16 -10
  559. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +55 -35
  560. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1314 -109
  561. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1660 -1371
  562. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  563. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  564. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  565. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
  566. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
  567. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
  568. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +6 -0
  569. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  570. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +40 -5
  571. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +105 -60
  572. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
  573. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +68 -257
  574. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +692 -23
  575. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_reg_tile.tmpl.wgsl → mul_mat_reg_tile.wgsl} +28 -128
  576. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +31 -137
  577. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
  578. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  579. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  580. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +9 -36
  581. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
  582. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  583. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  584. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
  585. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
  586. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +9 -6
  587. data/ext/sources/ggml/src/ggml.c +167 -33
  588. data/ext/sources/ggml/src/gguf.cpp +229 -44
  589. data/ext/sources/src/whisper.cpp +6 -28
  590. data/sig/whisper.rbs +43 -2
  591. data/test/test_context_params.rb +82 -0
  592. data/test/test_token.rb +11 -0
  593. data/test/test_vad_context.rb +58 -8
  594. data/test/test_whisper.rb +20 -0
  595. data/whispercpp.gemspec +1 -1
  596. metadata +240 -28
  597. data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
  598. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
  599. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
  600. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
  601. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
  602. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
  603. data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
  604. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
  605. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
  606. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
  607. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
  608. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
  609. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
  610. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
@@ -46,9 +46,6 @@ size_t ggml_metal_op_flash_attn_ext_extra_tmp(const struct ggml_tensor * op);
46
46
  int ggml_metal_op_concat (ggml_metal_op_t ctx, int idx);
47
47
  int ggml_metal_op_repeat (ggml_metal_op_t ctx, int idx);
48
48
  int ggml_metal_op_acc (ggml_metal_op_t ctx, int idx);
49
- int ggml_metal_op_scale (ggml_metal_op_t ctx, int idx);
50
- int ggml_metal_op_fill (ggml_metal_op_t ctx, int idx);
51
- int ggml_metal_op_clamp (ggml_metal_op_t ctx, int idx);
52
49
  int ggml_metal_op_unary (ggml_metal_op_t ctx, int idx);
53
50
  int ggml_metal_op_glu (ggml_metal_op_t ctx, int idx);
54
51
  int ggml_metal_op_sum (ggml_metal_op_t ctx, int idx);
@@ -56,11 +53,16 @@ int ggml_metal_op_sum_rows (ggml_metal_op_t ctx, int idx);
56
53
  int ggml_metal_op_cumsum (ggml_metal_op_t ctx, int idx);
57
54
  int ggml_metal_op_get_rows (ggml_metal_op_t ctx, int idx);
58
55
  int ggml_metal_op_set_rows (ggml_metal_op_t ctx, int idx);
56
+ int ggml_metal_op_diag (ggml_metal_op_t ctx, int idx);
59
57
  int ggml_metal_op_soft_max (ggml_metal_op_t ctx, int idx);
60
58
  int ggml_metal_op_ssm_conv (ggml_metal_op_t ctx, int idx);
61
59
  int ggml_metal_op_ssm_scan (ggml_metal_op_t ctx, int idx);
62
60
  int ggml_metal_op_rwkv (ggml_metal_op_t ctx, int idx);
61
+ int ggml_metal_op_gated_delta_net (ggml_metal_op_t ctx, int idx);
62
+ int ggml_metal_op_solve_tri (ggml_metal_op_t ctx, int idx);
63
+ int ggml_metal_op_set (ggml_metal_op_t ctx, int idx);
63
64
  int ggml_metal_op_cpy (ggml_metal_op_t ctx, int idx);
65
+ int ggml_metal_op_pool_1d (ggml_metal_op_t ctx, int idx);
64
66
  int ggml_metal_op_pool_2d (ggml_metal_op_t ctx, int idx);
65
67
  int ggml_metal_op_mul_mat (ggml_metal_op_t ctx, int idx);
66
68
  int ggml_metal_op_mul_mat_id (ggml_metal_op_t ctx, int idx);
@@ -83,7 +85,6 @@ int ggml_metal_op_timestep_embedding(ggml_metal_op_t ctx, int idx);
83
85
  int ggml_metal_op_argmax (ggml_metal_op_t ctx, int idx);
84
86
  int ggml_metal_op_argsort (ggml_metal_op_t ctx, int idx);
85
87
  int ggml_metal_op_top_k (ggml_metal_op_t ctx, int idx);
86
- int ggml_metal_op_leaky_relu (ggml_metal_op_t ctx, int idx);
87
88
  int ggml_metal_op_tri (ggml_metal_op_t ctx, int idx);
88
89
  int ggml_metal_op_opt_step_adamw (ggml_metal_op_t ctx, int idx);
89
90
  int ggml_metal_op_opt_step_sgd (ggml_metal_op_t ctx, int idx);
@@ -7,11 +7,15 @@
7
7
  #include "ggml-metal-context.h"
8
8
  #include "ggml-metal-ops.h"
9
9
 
10
- // globals
10
+ #include <mutex>
11
+ #include <string>
11
12
 
12
- // initialized in ggml_backend_metal_reg
13
- static ggml_backend_reg g_ggml_metal_reg;
14
- static ggml_backend_device g_ggml_metal_device;
13
+ #define GGML_METAL_NAME "MTL"
14
+ #define GGML_METAL_MAX_DEVICES 16
15
+
16
+ // number of Metal devices
17
+ // note: can be overridden with GGML_METAL_DEVICES env to simulate virtual devices
18
+ static int g_devices = 1;
15
19
 
16
20
  ////////////////////////////////////////////////////////////////////////////////
17
21
  // backend interface
@@ -165,10 +169,28 @@ static ggml_backend_buffer_i ggml_backend_metal_buffer_private_i = {
165
169
  /* .reset = */ NULL,
166
170
  };
167
171
 
172
+ static bool ggml_backend_buffer_is_metal(ggml_backend_buffer_t buffer) {
173
+ return buffer->iface.free_buffer == ggml_backend_metal_buffer_shared_free_buffer ||
174
+ buffer->iface.free_buffer == ggml_backend_metal_buffer_private_free_buffer;
175
+ }
176
+
168
177
  //
169
178
  // buffer types
170
179
  //
171
180
 
181
+ struct ggml_backend_metal_buffer_type {
182
+ int device;
183
+ std::string name;
184
+ };
185
+
186
+ struct ggml_backend_metal_buffer_type_deleter {
187
+ void operator()(ggml_backend_metal_buffer_type * ctx) const {
188
+ delete ctx;
189
+ }
190
+ };
191
+
192
+ typedef std::unique_ptr<ggml_backend_metal_buffer_type, ggml_backend_metal_buffer_type_deleter> ggml_backend_metal_buffer_type_ptr;
193
+
172
194
  // common method for allocating shread or private Metal buffers
173
195
  static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size, bool shared) {
174
196
  ggml_metal_device_t ctx_dev = (ggml_metal_device_t)buft->device->context;
@@ -218,9 +240,9 @@ static size_t ggml_backend_metal_buffer_type_get_alloc_size(ggml_backend_buffer_
218
240
  // default (shared) buffer type
219
241
 
220
242
  static const char * ggml_backend_metal_buffer_type_shared_get_name(ggml_backend_buffer_type_t buft) {
221
- return "Metal";
243
+ ggml_backend_metal_buffer_type * ctx = (ggml_backend_metal_buffer_type *)buft->context;
222
244
 
223
- GGML_UNUSED(buft);
245
+ return ctx->name.c_str();
224
246
  }
225
247
 
226
248
  static ggml_backend_buffer_t ggml_backend_metal_buffer_type_shared_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@@ -249,29 +271,54 @@ static bool ggml_backend_metal_buffer_type_shared_is_host(ggml_backend_buffer_ty
249
271
  GGML_UNUSED(buft);
250
272
  }
251
273
 
252
- static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_shared(void) {
253
- static ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
254
- /* .iface = */ {
255
- /* .get_name = */ ggml_backend_metal_buffer_type_shared_get_name,
256
- /* .alloc_buffer = */ ggml_backend_metal_buffer_type_shared_alloc_buffer,
257
- /* .get_alignment = */ ggml_backend_metal_buffer_type_shared_get_alignment,
258
- /* .get_max_size = */ ggml_backend_metal_buffer_type_shared_get_max_size,
259
- /* .get_alloc_size = */ ggml_backend_metal_buffer_type_shared_get_alloc_size,
260
- /* .is_host = */ ggml_backend_metal_buffer_type_shared_is_host,
261
- },
262
- /* .device = */ &g_ggml_metal_device,
263
- /* .context = */ NULL,
264
- };
274
+ static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_shared(int device) {
275
+ static std::mutex mutex;
276
+ std::lock_guard<std::mutex> lock(mutex);
277
+
278
+ static std::vector<ggml_backend_buffer_type> bufts;
279
+ static std::vector<ggml_backend_metal_buffer_type_ptr> ctxs;
280
+
281
+ static bool initialized = false;
282
+ if (!initialized) {
283
+ bufts.reserve(g_devices);
284
+ ctxs.reserve(g_devices);
285
+
286
+ for (int i = 0; i < g_devices; ++i) {
287
+ ggml_backend_metal_buffer_type * raw_ctx =
288
+ new ggml_backend_metal_buffer_type {
289
+ /* .device = */ i,
290
+ /* .name = */ GGML_METAL_NAME + std::to_string(i),
291
+ };
292
+ ctxs.emplace_back(raw_ctx);
293
+
294
+ ggml_backend_buffer_type buft = {
295
+ /* .iface = */ {
296
+ /* .get_name = */ ggml_backend_metal_buffer_type_shared_get_name,
297
+ /* .alloc_buffer = */ ggml_backend_metal_buffer_type_shared_alloc_buffer,
298
+ /* .get_alignment = */ ggml_backend_metal_buffer_type_shared_get_alignment,
299
+ /* .get_max_size = */ ggml_backend_metal_buffer_type_shared_get_max_size,
300
+ /* .get_alloc_size = */ ggml_backend_metal_buffer_type_shared_get_alloc_size,
301
+ /* .is_host = */ ggml_backend_metal_buffer_type_shared_is_host,
302
+ },
303
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_metal_reg(), i),
304
+ /* .context = */ raw_ctx,
305
+ };
306
+
307
+ bufts.emplace_back(buft);
308
+ }
309
+
310
+ initialized = true;
311
+ }
265
312
 
266
- return &ggml_backend_buffer_type_metal;
313
+ return &bufts[device];
267
314
  }
268
315
 
269
316
  // default (private) buffer type
270
317
 
271
318
  static const char * ggml_backend_metal_buffer_type_private_get_name(ggml_backend_buffer_type_t buft) {
272
- return "Metal_Private";
319
+ ggml_backend_metal_buffer_type * ctx = (ggml_backend_metal_buffer_type *)buft->context;
273
320
 
274
- GGML_UNUSED(buft);
321
+ return ctx->name.c_str();
275
322
  }
276
323
 
277
324
  static ggml_backend_buffer_t ggml_backend_metal_buffer_type_private_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@@ -300,29 +347,53 @@ static bool ggml_backend_metal_buffer_type_private_is_host(ggml_backend_buffer_t
300
347
  GGML_UNUSED(buft);
301
348
  }
302
349
 
303
- static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_private(void) {
304
- static ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
305
- /* .iface = */ {
306
- /* .get_name = */ ggml_backend_metal_buffer_type_private_get_name,
307
- /* .alloc_buffer = */ ggml_backend_metal_buffer_type_private_alloc_buffer,
308
- /* .get_alignment = */ ggml_backend_metal_buffer_type_private_get_alignment,
309
- /* .get_max_size = */ ggml_backend_metal_buffer_type_private_get_max_size,
310
- /* .get_alloc_size = */ ggml_backend_metal_buffer_type_private_get_alloc_size,
311
- /* .is_host = */ ggml_backend_metal_buffer_type_private_is_host,
312
- },
313
- /* .device = */ &g_ggml_metal_device,
314
- /* .context = */ NULL,
315
- };
350
+ static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_private(int device) {
351
+ static std::mutex mutex;
352
+ std::lock_guard<std::mutex> lock(mutex);
353
+
354
+ static std::vector<ggml_backend_buffer_type> bufts;
355
+ static std::vector<ggml_backend_metal_buffer_type_ptr> ctxs;
356
+
357
+ static bool initialized = false;
358
+ if (!initialized) {
359
+ bufts.reserve(g_devices);
360
+ ctxs.reserve(g_devices);
361
+
362
+ for (int i = 0; i < g_devices; ++i) {
363
+ ggml_backend_metal_buffer_type * raw_ctx = new ggml_backend_metal_buffer_type{
364
+ /* .device = */ i,
365
+ /* .name = */ GGML_METAL_NAME + std::to_string(i) + "_Private"
366
+ };
367
+ ctxs.emplace_back(raw_ctx);
368
+
369
+ ggml_backend_buffer_type buft = {
370
+ /* .iface = */ {
371
+ /* .get_name = */ ggml_backend_metal_buffer_type_private_get_name,
372
+ /* .alloc_buffer = */ ggml_backend_metal_buffer_type_private_alloc_buffer,
373
+ /* .get_alignment = */ ggml_backend_metal_buffer_type_private_get_alignment,
374
+ /* .get_max_size = */ ggml_backend_metal_buffer_type_private_get_max_size,
375
+ /* .get_alloc_size = */ ggml_backend_metal_buffer_type_private_get_alloc_size,
376
+ /* .is_host = */ ggml_backend_metal_buffer_type_private_is_host,
377
+ },
378
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_metal_reg(), i),
379
+ /* .context = */ raw_ctx,
380
+ };
381
+
382
+ bufts.emplace_back(buft);
383
+ }
384
+
385
+ initialized = true;
386
+ }
316
387
 
317
- return &ggml_backend_buffer_type_metal;
388
+ return &bufts[device];
318
389
  }
319
390
 
320
391
  // mapped buffer type
321
392
 
322
393
  static const char * ggml_backend_metal_buffer_type_mapped_get_name(ggml_backend_buffer_type_t buft) {
323
- return "Metal_Mapped";
394
+ ggml_backend_metal_buffer_type * ctx = (ggml_backend_metal_buffer_type *)buft->context;
324
395
 
325
- GGML_UNUSED(buft);
396
+ return ctx->name.c_str();
326
397
  }
327
398
 
328
399
  static ggml_backend_buffer_t ggml_backend_metal_buffer_type_mapped_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@@ -352,31 +423,55 @@ static bool ggml_backend_metal_buffer_type_mapped_is_host(ggml_backend_buffer_ty
352
423
  GGML_UNUSED(buft);
353
424
  }
354
425
 
355
- static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_mapped(void) {
356
- // note: not obvious, but this buffer type still needs to implement .alloc_buffer:
357
- // https://github.com/ggml-org/llama.cpp/pull/15832#discussion_r2333177099
358
- static ggml_backend_buffer_type ggml_backend_buffer_type_mapped_metal = {
359
- /* .iface = */ {
360
- /* .get_name = */ ggml_backend_metal_buffer_type_mapped_get_name,
361
- /* .alloc_buffer = */ ggml_backend_metal_buffer_type_mapped_alloc_buffer,
362
- /* .get_alignment = */ ggml_backend_metal_buffer_type_mapped_get_alignment,
363
- /* .get_max_size = */ ggml_backend_metal_buffer_type_mapped_get_max_size,
364
- /* .get_alloc_size = */ ggml_backend_metal_buffer_type_mapped_get_alloc_size,
365
- /* .is_host = */ ggml_backend_metal_buffer_type_mapped_is_host,
366
- },
367
- /* .device = */ &g_ggml_metal_device,
368
- /* .context = */ NULL,
369
- };
426
+ static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_mapped(int device) {
427
+ static std::mutex mutex;
428
+ std::lock_guard<std::mutex> lock(mutex);
429
+
430
+ static std::vector<ggml_backend_buffer_type> bufts;
431
+ static std::vector<ggml_backend_metal_buffer_type_ptr> ctxs;
432
+
433
+ static bool initialized = false;
434
+ if (!initialized) {
435
+ bufts.reserve(g_devices);
436
+ ctxs.reserve(g_devices);
437
+
438
+ for (int i = 0; i < g_devices; ++i) {
439
+ ggml_backend_metal_buffer_type * raw_ctx = new ggml_backend_metal_buffer_type{
440
+ /* .device = */ i,
441
+ /* .name = */ GGML_METAL_NAME + std::to_string(i) + "_Mapped"
442
+ };
443
+ ctxs.emplace_back(raw_ctx);
444
+
445
+ // note: not obvious, but this buffer type still needs to implement .alloc_buffer:
446
+ // https://github.com/ggml-org/llama.cpp/pull/15832#discussion_r2333177099
447
+ ggml_backend_buffer_type buft = {
448
+ /* .iface = */ {
449
+ /* .get_name = */ ggml_backend_metal_buffer_type_mapped_get_name,
450
+ /* .alloc_buffer = */ ggml_backend_metal_buffer_type_mapped_alloc_buffer,
451
+ /* .get_alignment = */ ggml_backend_metal_buffer_type_mapped_get_alignment,
452
+ /* .get_max_size = */ ggml_backend_metal_buffer_type_mapped_get_max_size,
453
+ /* .get_alloc_size = */ ggml_backend_metal_buffer_type_mapped_get_alloc_size,
454
+ /* .is_host = */ ggml_backend_metal_buffer_type_mapped_is_host,
455
+ },
456
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_metal_reg(), i),
457
+ /* .context = */ raw_ctx,
458
+ };
459
+
460
+ bufts.emplace_back(buft);
461
+ }
462
+
463
+ initialized = true;
464
+ }
370
465
 
371
- return &ggml_backend_buffer_type_mapped_metal;
466
+ return &bufts[device];
372
467
  }
373
468
 
374
469
  // backend
375
470
 
376
471
  static const char * ggml_backend_metal_name(ggml_backend_t backend) {
377
- return "Metal";
472
+ ggml_metal_t ctx = (ggml_metal_t)backend->context;
378
473
 
379
- GGML_UNUSED(backend);
474
+ return ggml_metal_get_name(ctx);
380
475
  }
381
476
 
382
477
  static void ggml_backend_metal_free(ggml_backend_t backend) {
@@ -409,12 +504,24 @@ static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const gg
409
504
  }
410
505
 
411
506
  static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
412
- return false;
507
+ if (!ggml_backend_is_metal(backend_src) || !ggml_backend_is_metal(backend_dst)) {
508
+ return false;
509
+ }
413
510
 
414
- GGML_UNUSED(backend_src);
415
- GGML_UNUSED(backend_dst);
416
- GGML_UNUSED(src);
417
- GGML_UNUSED(dst);
511
+ if (!ggml_backend_buffer_is_metal(src->buffer) || !ggml_backend_buffer_is_metal(dst->buffer)) {
512
+ return false;
513
+ }
514
+
515
+ ggml_metal_t ctx_src = (ggml_metal_t)backend_src->context;
516
+ ggml_metal_t ctx_dst = (ggml_metal_t)backend_dst->context;
517
+
518
+ //ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
519
+ //ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
520
+
521
+ //ggml_metal_buffer_t buf_ctx_src = (ggml_metal_buffer_t)buf_src->context;
522
+ //ggml_metal_buffer_t buf_ctx_dst = (ggml_metal_buffer_t)buf_dst->context;
523
+
524
+ return ggml_metal_cpy_tensor_async(ctx_src, ctx_dst, src, dst);
418
525
  }
419
526
 
420
527
  static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
@@ -423,6 +530,20 @@ static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend,
423
530
  return ggml_metal_graph_compute(ctx, cgraph);
424
531
  }
425
532
 
533
+ static void ggml_backend_metal_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
534
+ ggml_metal_t ctx = (ggml_metal_t)backend->context;
535
+ ggml_metal_event_t ev = (ggml_metal_event_t)event->context;
536
+
537
+ ggml_metal_event_record(ctx, ev);
538
+ }
539
+
540
+ static void ggml_backend_metal_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
541
+ ggml_metal_t ctx = (ggml_metal_t)backend->context;
542
+ ggml_metal_event_t ev = (ggml_metal_event_t)event->context;
543
+
544
+ ggml_metal_event_wait(ctx, ev);
545
+ }
546
+
426
547
  static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
427
548
  ggml_metal_t ctx = (ggml_metal_t)backend->context;
428
549
 
@@ -435,7 +556,6 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
435
556
  ggml_metal_t ctx = (ggml_metal_t)backend->context;
436
557
 
437
558
  ggml_metal_set_n_cb(ctx, n_cb);
438
-
439
559
  }
440
560
 
441
561
  static ggml_backend_i ggml_backend_metal_i = {
@@ -450,12 +570,8 @@ static ggml_backend_i ggml_backend_metal_i = {
450
570
  /* .graph_plan_update = */ NULL,
451
571
  /* .graph_plan_compute = */ NULL,
452
572
  /* .graph_compute = */ ggml_backend_metal_graph_compute,
453
-
454
- // the events API is needed only for multi-GPU setups, so likely no need to implement it for Metal
455
- // in any case, these docs seem relevant if we ever decide to implement it:
456
- // https://developer.apple.com/documentation/metal/mtlcommandbuffer#Synchronizing-Passes-with-Events
457
- /* .event_record = */ NULL,
458
- /* .event_wait = */ NULL,
573
+ /* .event_record = */ ggml_backend_metal_event_record,
574
+ /* .event_wait = */ ggml_backend_metal_event_wait,
459
575
  /* .graph_optimize = */ ggml_backend_metal_graph_optimize,
460
576
  };
461
577
 
@@ -519,15 +635,17 @@ void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
519
635
  // backend device
520
636
 
521
637
  static const char * ggml_backend_metal_device_get_name(ggml_backend_dev_t dev) {
522
- return "Metal";
638
+ ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
523
639
 
524
- GGML_UNUSED(dev);
640
+ const ggml_metal_device_props * props_dev = ggml_metal_device_get_props(ctx_dev);
641
+
642
+ return props_dev->name;
525
643
  }
526
644
 
527
645
  static const char * ggml_backend_metal_device_get_description(ggml_backend_dev_t dev) {
528
646
  ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
529
647
 
530
- return ggml_metal_device_get_props(ctx_dev)->name;
648
+ return ggml_metal_device_get_props(ctx_dev)->desc;
531
649
  }
532
650
 
533
651
  static void ggml_backend_metal_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
@@ -550,14 +668,14 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_bac
550
668
  ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
551
669
 
552
670
  props->caps = {
553
- /* .async = */ true,
554
- /* .host_buffer = */ false,
555
- /* .buffer_from_host_ptr = */ true,
556
- /* .events = */ false,
671
+ /* .async = */ true,
672
+ /* .host_buffer = */ false,
673
+ /* .buffer_from_host_ptr = */ true,
674
+ /* .events = */ true,
557
675
  };
558
676
  }
559
677
 
560
- static ggml_backend_t ggml_backend_metal_device_init(ggml_backend_dev_t dev, const char * params) {
678
+ static ggml_backend_t ggml_backend_metal_device_init_backend(ggml_backend_dev_t dev, const char * params) {
561
679
  ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
562
680
 
563
681
  ggml_metal_t ctx = ggml_metal_init(ctx_dev);
@@ -587,7 +705,7 @@ static ggml_backend_buffer_type_t ggml_backend_metal_device_get_buffer_type(ggml
587
705
 
588
706
  const ggml_metal_device_props * props_dev = ggml_metal_device_get_props(ctx_dev);
589
707
 
590
- return props_dev->use_shared_buffers ? ggml_backend_metal_buffer_type_shared() : ggml_backend_metal_buffer_type_private();
708
+ return props_dev->use_shared_buffers ? ggml_backend_metal_buffer_type_shared(props_dev->device) : ggml_backend_metal_buffer_type_private(props_dev->device);
591
709
  }
592
710
 
593
711
  static ggml_backend_buffer_t ggml_backend_metal_device_buffer_mapped(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
@@ -595,7 +713,9 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_mapped(ggml_backen
595
713
 
596
714
  ggml_metal_buffer_t res = ggml_metal_buffer_map(ctx_dev, ptr, size, max_tensor_size);
597
715
 
598
- return ggml_backend_buffer_init(ggml_backend_metal_buffer_type_mapped(), ggml_backend_metal_buffer_shared_i, res, size);
716
+ const ggml_metal_device_props * props_dev = ggml_metal_device_get_props(ctx_dev);
717
+
718
+ return ggml_backend_buffer_init(ggml_backend_metal_buffer_type_mapped(props_dev->device), ggml_backend_metal_buffer_shared_i, res, size);
599
719
  }
600
720
 
601
721
  static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
@@ -606,9 +726,10 @@ static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const
606
726
 
607
727
  static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
608
728
  return
729
+ buft->device == dev && (
609
730
  buft->iface.get_name == ggml_backend_metal_buffer_type_shared_get_name ||
610
731
  buft->iface.get_name == ggml_backend_metal_buffer_type_private_get_name ||
611
- buft->iface.get_name == ggml_backend_metal_buffer_type_mapped_get_name;
732
+ buft->iface.get_name == ggml_backend_metal_buffer_type_mapped_get_name);
612
733
 
613
734
  GGML_UNUSED(dev);
614
735
  }
@@ -632,45 +753,97 @@ static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const g
632
753
  get_op_batch_size(op) >= ggml_metal_device_get_props(ctx_dev)->op_offload_min_batch_size;
633
754
  }
634
755
 
756
+ static ggml_backend_event_t ggml_backend_metal_device_event_new(ggml_backend_dev_t dev) {
757
+ ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
758
+
759
+ ggml_metal_event_t event = ggml_metal_device_event_init(ctx_dev);
760
+ GGML_ASSERT(event);
761
+
762
+ ggml_backend_event_t ev = new ggml_backend_event {
763
+ /* .device = */ dev,
764
+ /* .context = */ event,
765
+ };
766
+
767
+ return ev;
768
+ }
769
+
770
+ static void ggml_backend_metal_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
771
+ ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
772
+
773
+ ggml_metal_event_t ev = (ggml_metal_event_t)event->context;
774
+
775
+ ggml_metal_device_event_free(ctx_dev, ev);
776
+
777
+ delete event;
778
+ }
779
+
780
+ static void ggml_backend_metal_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
781
+ ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
782
+
783
+ ggml_metal_event_t evt = (ggml_metal_event_t)event->context;
784
+
785
+ ggml_metal_device_event_synchronize(ctx_dev, evt);
786
+ }
787
+
635
788
  static ggml_backend_device_i ggml_backend_metal_device_i = {
636
789
  /* .get_name = */ ggml_backend_metal_device_get_name,
637
790
  /* .get_description = */ ggml_backend_metal_device_get_description,
638
791
  /* .get_memory = */ ggml_backend_metal_device_get_memory,
639
792
  /* .get_type = */ ggml_backend_metal_device_get_type,
640
793
  /* .get_props = */ ggml_backend_metal_device_get_props,
641
- /* .init_backend = */ ggml_backend_metal_device_init,
794
+ /* .init_backend = */ ggml_backend_metal_device_init_backend,
642
795
  /* .get_buffer_type = */ ggml_backend_metal_device_get_buffer_type,
643
796
  /* .get_host_buffer_type = */ NULL,
644
797
  /* .buffer_from_host_ptr = */ ggml_backend_metal_device_buffer_mapped,
645
798
  /* .supports_op = */ ggml_backend_metal_device_supports_op,
646
799
  /* .supports_buft = */ ggml_backend_metal_device_supports_buft,
647
800
  /* .offload_op = */ ggml_backend_metal_device_offload_op,
648
- /* .event_new = */ NULL,
649
- /* .event_free = */ NULL,
650
- /* .event_synchronize = */ NULL,
801
+ /* .event_new = */ ggml_backend_metal_device_event_new,
802
+ /* .event_free = */ ggml_backend_metal_device_event_free,
803
+ /* .event_synchronize = */ ggml_backend_metal_device_event_synchronize,
651
804
  };
652
805
 
653
806
  // backend registry
654
807
 
808
+ struct ggml_backend_metal_reg {
809
+ std::vector<ggml_backend_dev_t> devices;
810
+ };
811
+
812
+ typedef struct ggml_backend_metal_reg * ggml_backend_metal_reg_t;
813
+
814
+ static ggml_backend_metal_reg_t ggml_backend_metal_reg_init(void) {
815
+ ggml_backend_metal_reg_t ctx = new struct ggml_backend_metal_reg;
816
+
817
+ return ctx;
818
+ }
819
+
820
+ static void ggml_backend_metal_reg_free(ggml_backend_metal_reg_t ctx) {
821
+ delete ctx;
822
+ }
823
+
824
+ struct ggml_backend_metal_reg_deleter {
825
+ void operator()(ggml_backend_metal_reg_t ctx) {
826
+ ggml_backend_metal_reg_free(ctx);
827
+ }
828
+ };
829
+
830
+ typedef std::unique_ptr<struct ggml_backend_metal_reg, ggml_backend_metal_reg_deleter> ggml_backend_metal_reg_ptr;
831
+
655
832
  static const char * ggml_backend_metal_reg_get_name(ggml_backend_reg_t reg) {
656
- return "Metal";
833
+ return GGML_METAL_NAME;
657
834
 
658
835
  GGML_UNUSED(reg);
659
836
  }
660
837
 
661
838
  static size_t ggml_backend_metal_reg_device_count(ggml_backend_reg_t reg) {
662
- return 1;
663
-
664
- GGML_UNUSED(reg);
839
+ ggml_backend_metal_reg_t ctx = (ggml_backend_metal_reg_t)reg->context;
840
+ return ctx->devices.size();
665
841
  }
666
842
 
667
843
  static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t reg, size_t index) {
668
- GGML_ASSERT(index == 0);
669
-
670
- return &g_ggml_metal_device;
671
-
672
- GGML_UNUSED(reg);
673
- GGML_UNUSED(index);
844
+ ggml_backend_metal_reg_t ctx = (ggml_backend_metal_reg_t)reg->context;
845
+ GGML_ASSERT(index < ctx->devices.size());
846
+ return ctx->devices[index];
674
847
  }
675
848
 
676
849
  static ggml_backend_feature g_ggml_backend_metal_features[] = {
@@ -698,27 +871,67 @@ static void * ggml_backend_metal_get_proc_address(ggml_backend_reg_t reg, const
698
871
 
699
872
  static ggml_backend_reg_i ggml_backend_metal_reg_i = {
700
873
  /* .get_name = */ ggml_backend_metal_reg_get_name,
701
- /* .device_count = */ ggml_backend_metal_reg_device_count,
702
- /* .device_get = */ ggml_backend_metal_reg_device_get,
874
+ /* .get_device_count = */ ggml_backend_metal_reg_device_count,
875
+ /* .get_device = */ ggml_backend_metal_reg_device_get,
703
876
  /* .get_proc_address = */ ggml_backend_metal_get_proc_address,
704
877
  };
705
878
 
879
+ static ggml_backend_dev_t ggml_backend_metal_device_init(ggml_backend_reg_t reg, int device) {
880
+ return new ggml_backend_device {
881
+ /* .iface = */ ggml_backend_metal_device_i,
882
+ /* .reg = */ reg,
883
+ /* .context = */ ggml_metal_device_get(device),
884
+ };
885
+ }
886
+
887
+ static void ggml_backend_metal_device_free(ggml_backend_dev_t dev) {
888
+ delete dev;
889
+ }
890
+
891
+ struct ggml_backend_device_deleter {
892
+ void operator()(ggml_backend_dev_t ctx) {
893
+ ggml_backend_metal_device_free(ctx);
894
+ }
895
+ };
896
+
897
+ typedef std::unique_ptr<ggml_backend_device, ggml_backend_device_deleter> ggml_backend_device_ptr;
898
+
706
899
  ggml_backend_reg_t ggml_backend_metal_reg(void) {
900
+ static ggml_backend_reg reg;
901
+ static bool initialized = false;
902
+
707
903
  {
708
- g_ggml_metal_reg = {
709
- /* .api_version = */ GGML_BACKEND_API_VERSION,
710
- /* .iface = */ ggml_backend_metal_reg_i,
711
- /* .context = */ NULL,
712
- };
713
-
714
- g_ggml_metal_device = {
715
- /* .iface = */ ggml_backend_metal_device_i,
716
- /* .reg = */ &g_ggml_metal_reg,
717
- /* .context = */ ggml_metal_device_get(),
718
- };
904
+ static std::mutex mutex;
905
+ std::lock_guard<std::mutex> lock(mutex);
906
+
907
+ const char * env = getenv("GGML_METAL_DEVICES");
908
+ if (env) {
909
+ g_devices = atoi(env);
910
+ }
911
+
912
+ static std::vector<ggml_backend_device_ptr> devs;
913
+
914
+ if (!initialized) {
915
+ static ggml_backend_metal_reg_ptr reg_ctx(ggml_backend_metal_reg_init());
916
+
917
+ for (int i = 0; i < g_devices; ++i) {
918
+ auto * dev = ggml_backend_metal_device_init(&reg, i);
919
+ devs.emplace_back(dev);
920
+
921
+ reg_ctx->devices.push_back(dev);
922
+ }
923
+
924
+ reg = {
925
+ /* .api_version = */ GGML_BACKEND_API_VERSION,
926
+ /* .iface = */ ggml_backend_metal_reg_i,
927
+ /* .context = */ reg_ctx.get(),
928
+ };
929
+ }
930
+
931
+ initialized = true;
719
932
  }
720
933
 
721
- return &g_ggml_metal_reg;
934
+ return &reg;
722
935
  }
723
936
 
724
937
  GGML_BACKEND_DL_IMPL(ggml_backend_metal_reg)