whispercpp 1.3.2 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (664)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -3
  3. data/README.md +71 -14
  4. data/Rakefile +20 -7
  5. data/ext/.gitignore +4 -6
  6. data/ext/dependencies.rb +36 -24
  7. data/ext/extconf.rb +1 -1
  8. data/ext/options.rb +48 -184
  9. data/ext/ruby_whisper.c +18 -0
  10. data/ext/ruby_whisper_context.c +43 -12
  11. data/ext/ruby_whisper_model.c +1 -1
  12. data/ext/ruby_whisper_params.c +59 -27
  13. data/ext/ruby_whisper_segment.c +81 -4
  14. data/ext/ruby_whisper_transcribe.cpp +13 -7
  15. data/ext/ruby_whisper_vad_params.c +1 -1
  16. data/ext/sources/CMakeLists.txt +5 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/build-xcframework.sh +24 -0
  19. data/ext/sources/examples/CMakeLists.txt +1 -0
  20. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
  21. data/ext/sources/examples/addon.node/addon.cpp +154 -35
  22. data/ext/sources/examples/addon.node/index.js +10 -5
  23. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  24. data/ext/sources/examples/bench/bench.cpp +29 -18
  25. data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
  26. data/ext/sources/examples/cli/cli.cpp +7 -4
  27. data/ext/sources/examples/command/command.cpp +58 -32
  28. data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
  29. data/ext/sources/examples/common-ggml.cpp +2 -0
  30. data/ext/sources/examples/common-whisper.cpp +14 -7
  31. data/ext/sources/examples/lsp/lsp.cpp +21 -17
  32. data/ext/sources/examples/quantize/quantize.cpp +3 -0
  33. data/ext/sources/examples/server/CMakeLists.txt +3 -0
  34. data/ext/sources/examples/server/server.cpp +193 -35
  35. data/ext/sources/examples/server.py +6 -1
  36. data/ext/sources/examples/stream/stream.cpp +10 -2
  37. data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
  38. data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
  39. data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -0
  40. data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
  41. data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
  42. data/ext/sources/examples/talk-llama/llama-arch.cpp +756 -15
  43. data/ext/sources/examples/talk-llama/llama-arch.h +85 -1
  44. data/ext/sources/examples/talk-llama/llama-batch.cpp +773 -272
  45. data/ext/sources/examples/talk-llama/llama-batch.h +126 -55
  46. data/ext/sources/examples/talk-llama/llama-chat.cpp +150 -13
  47. data/ext/sources/examples/talk-llama/llama-chat.h +8 -0
  48. data/ext/sources/examples/talk-llama/llama-context.cpp +814 -542
  49. data/ext/sources/examples/talk-llama/llama-context.h +68 -32
  50. data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
  51. data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
  52. data/ext/sources/examples/talk-llama/llama-graph.cpp +787 -440
  53. data/ext/sources/examples/talk-llama/llama-graph.h +333 -153
  54. data/ext/sources/examples/talk-llama/llama-hparams.cpp +128 -6
  55. data/ext/sources/examples/talk-llama/llama-hparams.h +80 -17
  56. data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
  57. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
  58. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +137 -0
  59. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +1248 -1967
  60. data/ext/sources/examples/talk-llama/llama-kv-cache.h +218 -345
  61. data/ext/sources/examples/talk-llama/llama-kv-cells.h +164 -52
  62. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +266 -0
  63. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +139 -0
  64. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1154 -0
  65. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +182 -0
  66. data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
  67. data/ext/sources/examples/talk-llama/llama-memory.h +94 -4
  68. data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
  69. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +44 -17
  70. data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
  71. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
  72. data/ext/sources/examples/talk-llama/llama-model.cpp +11377 -5248
  73. data/ext/sources/examples/talk-llama/llama-model.h +87 -9
  74. data/ext/sources/examples/talk-llama/llama-quant.cpp +137 -16
  75. data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
  76. data/ext/sources/examples/talk-llama/llama-vocab.cpp +502 -38
  77. data/ext/sources/examples/talk-llama/llama-vocab.h +46 -0
  78. data/ext/sources/examples/talk-llama/llama.cpp +76 -17
  79. data/ext/sources/examples/talk-llama/llama.h +176 -151
  80. data/ext/sources/examples/talk-llama/talk-llama.cpp +11 -6
  81. data/ext/sources/examples/talk-llama/unicode.cpp +212 -0
  82. data/ext/sources/examples/talk-llama/unicode.h +45 -0
  83. data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
  84. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +6 -2
  85. data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
  86. data/ext/sources/ggml/CMakeLists.txt +106 -33
  87. data/ext/sources/ggml/cmake/common.cmake +24 -0
  88. data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
  89. data/ext/sources/ggml/include/ggml-backend.h +18 -2
  90. data/ext/sources/ggml/include/ggml-cpu.h +2 -0
  91. data/ext/sources/ggml/include/ggml-metal.h +1 -6
  92. data/ext/sources/ggml/include/ggml-opt.h +25 -6
  93. data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
  94. data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
  95. data/ext/sources/ggml/include/ggml.h +365 -21
  96. data/ext/sources/ggml/src/CMakeLists.txt +98 -25
  97. data/ext/sources/ggml/src/ggml-alloc.c +265 -141
  98. data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
  99. data/ext/sources/ggml/src/ggml-backend-reg.cpp +35 -13
  100. data/ext/sources/ggml/src/ggml-backend.cpp +266 -60
  101. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +4 -4
  102. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
  103. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +15 -0
  104. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  105. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
  106. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
  107. data/ext/sources/ggml/src/ggml-cann/common.h +149 -2
  108. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +521 -78
  109. data/ext/sources/ggml/src/ggml-common.h +21 -0
  110. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +165 -50
  111. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +5 -3
  112. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  113. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  114. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +3650 -0
  115. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1891 -0
  116. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2160 -0
  117. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  118. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
  119. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1897 -0
  120. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
  121. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
  122. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
  123. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
  124. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
  125. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +214 -0
  126. data/ext/sources/ggml/src/ggml-cpu/common.h +18 -3
  127. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +23 -7
  128. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +179 -110
  129. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +44 -33
  130. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  131. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
  132. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
  133. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +228 -98
  134. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +532 -1124
  135. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  136. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +3374 -2081
  137. data/ext/sources/ggml/src/ggml-cpu/ops.h +13 -8
  138. data/ext/sources/ggml/src/ggml-cpu/quants.c +1193 -0
  139. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +34 -0
  140. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1982 -0
  141. data/ext/sources/ggml/src/ggml-cpu/repack.h +120 -0
  142. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +367 -46
  143. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  144. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  145. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  146. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  147. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +3 -3
  148. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +1 -1
  149. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +272 -35
  150. data/ext/sources/ggml/src/ggml-cpu/vec.h +794 -142
  151. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
  152. data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
  153. data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
  154. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
  155. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  156. data/ext/sources/ggml/src/ggml-cuda/common.cuh +291 -81
  157. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  158. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  159. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  160. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  161. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  162. data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
  163. data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  164. data/ext/sources/ggml/src/ggml-cuda/convert.cu +117 -22
  165. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +20 -0
  166. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  167. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
  168. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  169. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  170. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +499 -368
  171. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +142 -93
  172. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
  173. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
  174. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
  175. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +90 -50
  176. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
  177. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  178. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
  179. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +636 -222
  180. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
  181. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
  182. data/ext/sources/ggml/src/ggml-cuda/mean.cu +73 -0
  183. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  184. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
  185. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
  186. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
  187. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
  188. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
  189. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +506 -0
  190. data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +4 -5
  191. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
  192. data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
  193. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
  194. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  195. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  196. data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
  197. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  198. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  199. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
  200. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  201. data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
  202. data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
  203. data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
  204. data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
  205. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
  206. data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  207. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
  208. data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
  209. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
  210. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  211. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -98
  212. data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
  213. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +23 -19
  214. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  225. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  226. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  227. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  228. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  229. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  230. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  231. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  232. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  233. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  234. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  235. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  236. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  237. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  238. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  239. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  240. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  241. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  242. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  243. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  244. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  245. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  246. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  247. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  248. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  249. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  250. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  251. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
  252. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  253. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  254. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  255. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  256. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  257. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  258. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  259. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  260. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  261. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  262. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  263. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  264. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  265. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  266. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  267. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  268. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  269. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
  270. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
  271. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
  272. data/ext/sources/ggml/src/ggml-cuda/unary.cu +179 -0
  273. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +15 -0
  274. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
  275. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  276. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  277. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
  278. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
  279. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +14 -2
  280. data/ext/sources/ggml/src/ggml-impl.h +229 -175
  281. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +21 -17
  282. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  283. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  284. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  285. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
  286. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
  287. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
  288. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
  289. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +163 -63
  290. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
  291. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
  292. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
  293. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +3208 -1575
  294. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
  295. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
  296. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +32 -0
  297. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4430 -792
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  300. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  301. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  302. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  303. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  304. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
  305. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  306. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  307. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
  308. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  309. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  310. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  311. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  312. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  313. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  314. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  315. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  316. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
  317. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
  318. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  322. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  323. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  324. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  325. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  326. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  327. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  328. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  329. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  330. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  331. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  332. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
  333. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  334. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
  335. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
  336. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  344. data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
  345. data/ext/sources/ggml/src/ggml-quants.c +117 -24
  346. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  347. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +85 -62
  348. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  349. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
  350. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +9 -0
  351. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
  352. data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
  353. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +13 -17
  354. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +21 -2
  355. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +116 -211
  356. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
  357. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  358. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +700 -1041
  359. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +20 -9
  360. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +17 -26
  361. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +2 -96
  362. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +393 -250
  363. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
  364. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +32 -8
  365. data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
  366. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -11
  367. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +125 -21
  368. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
  369. data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  370. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  371. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  372. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +4 -3
  373. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +105 -17
  374. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
  375. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4198 -1145
  376. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  377. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
  378. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  379. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
  380. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
  381. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
  382. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  383. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
  384. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
  385. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
  386. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
  387. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  388. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
  389. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
  390. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
  391. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  392. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  393. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  394. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  395. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  396. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  397. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
  398. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
  399. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
  400. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
  401. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
  402. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
  403. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  404. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  405. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  406. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  407. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
  408. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
  409. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
  410. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +19 -0
  411. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  412. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  413. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  414. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
  415. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
  416. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
  417. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
  418. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  419. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
  420. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
  421. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
  422. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
  423. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
  424. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
  425. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  426. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
  427. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
  428. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  429. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +64 -11
  430. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  431. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  432. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  433. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  434. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  435. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  436. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  437. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  438. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
  439. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
  440. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  441. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
  442. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  443. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  444. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
  445. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
  446. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  447. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
  448. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +338 -71
  449. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  450. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
  451. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
  452. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
  453. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  454. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  455. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  456. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
  457. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  458. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  459. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
  460. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
  461. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
  462. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
  463. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
  464. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
  465. data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  466. data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
  467. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
  468. data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  469. data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  470. data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
  471. data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
  472. data/ext/sources/ggml/src/ggml.c +802 -142
  473. data/ext/sources/ggml/src/ggml.cpp +26 -0
  474. data/ext/sources/ggml/src/gguf.cpp +32 -4
  475. data/ext/sources/include/whisper.h +2 -0
  476. data/ext/sources/src/CMakeLists.txt +2 -0
  477. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  478. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  479. data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
  480. data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
  481. data/ext/sources/src/whisper.cpp +241 -215
  482. data/ext/sources/tests/CMakeLists.txt +8 -1
  483. data/ext/sources/tests/test-vad-full.cpp +3 -3
  484. data/ext/sources/tests/test-vad.cpp +2 -2
  485. data/extsources.rb +15 -9
  486. data/lib/whisper/context.rb +15 -0
  487. data/lib/whisper/model/uri.rb +57 -2
  488. data/lib/whisper/segment.rb +58 -0
  489. data/sig/whisper.rbs +75 -38
  490. data/{tests → test}/helper.rb +1 -12
  491. data/{tests → test}/test_model.rb +9 -0
  492. data/test/test_package.rb +51 -0
  493. data/{tests → test}/test_params.rb +8 -0
  494. data/test/test_segment.rb +146 -0
  495. data/{tests → test}/test_whisper.rb +70 -0
  496. data/whispercpp.gemspec +2 -3
  497. metadata +246 -191
  498. data/ext/sources/.dockerignore +0 -3
  499. data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
  500. data/ext/sources/ci/run.sh +0 -336
  501. data/ext/sources/close-issue.yml +0 -28
  502. data/ext/sources/ggml/include/ggml-kompute.h +0 -50
  503. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  504. data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
  505. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
  506. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
  507. data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
  508. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  509. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  510. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  511. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  512. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  513. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  514. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  515. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  516. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  517. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  518. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -6431
  519. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  520. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
  521. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
  522. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  523. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
  524. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  525. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
  526. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
  527. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +0 -336
  528. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  529. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  530. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  531. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  532. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  533. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  534. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  535. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  536. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  537. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  538. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  539. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  540. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  541. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  542. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  543. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  544. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  545. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  546. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  547. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  548. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  549. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  550. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  551. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  552. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  553. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  554. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  555. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  556. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  557. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  558. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  559. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  560. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  561. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  562. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  563. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  564. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  565. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  566. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  567. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  568. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  569. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  570. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  571. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  572. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  573. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  574. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  575. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  576. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  577. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  578. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  579. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  580. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  581. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  582. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  583. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  584. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  585. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  586. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  587. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  588. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  589. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  590. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  591. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  592. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  593. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  594. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  595. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  596. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  597. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  598. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  599. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  600. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  601. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  602. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  603. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  604. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  605. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  606. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  607. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  608. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  609. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  610. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  611. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  612. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  613. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  614. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  615. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  616. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  617. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  618. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  619. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  620. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  621. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  622. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  623. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  624. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  625. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  626. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  627. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  628. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  629. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  630. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  631. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  632. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  633. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  634. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  635. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  636. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  637. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  638. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  639. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  640. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  641. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  642. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  643. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  644. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  645. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  646. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  647. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  648. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  649. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  650. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  651. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  652. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
  653. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -5998
  654. data/tests/test_package.rb +0 -46
  655. data/tests/test_segment.rb +0 -74
  656. /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  657. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  658. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  659. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  660. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
  661. /data/{tests → test}/test_callback.rb +0 -0
  662. /data/{tests → test}/test_error.rb +0 -0
  663. /data/{tests → test}/test_vad.rb +0 -0
  664. /data/{tests → test}/test_vad_params.rb +0 -0
@@ -19,9 +19,8 @@
19
19
  #include <stdio.h>
20
20
  #include <stdlib.h>
21
21
  #include <string.h>
22
- #include <string>
23
- #include <vector>
24
22
  #include <algorithm>
23
+ #include <vector>
25
24
 
26
25
  #ifdef __APPLE__
27
26
  #include <sys/types.h>
@@ -32,6 +31,7 @@
32
31
  // backend buffer type
33
32
 
34
33
  const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
34
+ GGML_ASSERT(buft);
35
35
  return buft->iface.get_name(buft);
36
36
  }
37
37
 
@@ -41,14 +41,17 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
41
41
  return ggml_backend_buffer_init(buft, {}, NULL, 0);
42
42
  }
43
43
 
44
+ GGML_ASSERT(buft);
44
45
  return buft->iface.alloc_buffer(buft, size);
45
46
  }
46
47
 
47
48
  size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
49
+ GGML_ASSERT(buft);
48
50
  return buft->iface.get_alignment(buft);
49
51
  }
50
52
 
51
53
  size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
54
+ GGML_ASSERT(buft);
52
55
  // get_max_size is optional, defaults to SIZE_MAX
53
56
  if (buft->iface.get_max_size) {
54
57
  return buft->iface.get_max_size(buft);
@@ -57,6 +60,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
57
60
  }
58
61
 
59
62
  size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
63
+ GGML_ASSERT(buft);
60
64
  // get_alloc_size is optional, defaults to ggml_nbytes
61
65
  if (buft->iface.get_alloc_size) {
62
66
  size_t size = buft->iface.get_alloc_size(buft, tensor);
@@ -67,6 +71,7 @@ size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const s
67
71
  }
68
72
 
69
73
  bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
74
+ GGML_ASSERT(buft);
70
75
  if (buft->iface.is_host) {
71
76
  return buft->iface.is_host(buft);
72
77
  }
@@ -74,6 +79,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
74
79
  }
75
80
 
76
81
  ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft) {
82
+ GGML_ASSERT(buft);
77
83
  return buft->device;
78
84
  }
79
85
 
@@ -111,10 +117,12 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
111
117
  }
112
118
 
113
119
  size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
120
+ GGML_ASSERT(buffer);
114
121
  return buffer->size;
115
122
  }
116
123
 
117
124
  void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
125
+ GGML_ASSERT(buffer);
118
126
  // get_base is optional if the buffer is zero-sized
119
127
  if (buffer->size == 0) {
120
128
  return NULL;
@@ -128,6 +136,7 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
128
136
  }
129
137
 
130
138
  enum ggml_status ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
139
+ GGML_ASSERT(buffer);
131
140
  // init_tensor is optional
132
141
  if (buffer->iface.init_tensor) {
133
142
  return buffer->iface.init_tensor(buffer, tensor);
@@ -136,6 +145,7 @@ enum ggml_status ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, s
136
145
  }
137
146
 
138
147
  void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
148
+ GGML_ASSERT(buffer);
139
149
  // clear is optional if the buffer is zero-sized
140
150
  if (buffer->size == 0) {
141
151
  return;
@@ -161,6 +171,7 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
161
171
  }
162
172
 
163
173
  void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
174
+ GGML_ASSERT(buffer);
164
175
  buffer->usage = usage;
165
176
 
166
177
  // FIXME: add a generic callback to the buffer interface
@@ -170,14 +181,17 @@ void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backe
170
181
  }
171
182
 
172
183
  enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
184
+ GGML_ASSERT(buffer);
173
185
  return buffer->usage;
174
186
  }
175
187
 
176
188
  ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
189
+ GGML_ASSERT(buffer);
177
190
  return buffer->buft;
178
191
  }
179
192
 
180
193
  void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
194
+ GGML_ASSERT(buffer);
181
195
  if (buffer->iface.reset) {
182
196
  buffer->iface.reset(buffer);
183
197
  }
@@ -216,6 +230,7 @@ void ggml_backend_free(ggml_backend_t backend) {
216
230
  }
217
231
 
218
232
  ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
233
+ GGML_ASSERT(backend);
219
234
  return ggml_backend_dev_buffer_type(backend->device);
220
235
  }
221
236
 
@@ -232,6 +247,8 @@ size_t ggml_backend_get_max_size(ggml_backend_t backend) {
232
247
  }
233
248
 
234
249
  void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
250
+ GGML_ASSERT(backend);
251
+ GGML_ASSERT(tensor);
235
252
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
236
253
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
237
254
 
@@ -243,6 +260,8 @@ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor *
243
260
  }
244
261
 
245
262
  void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
263
+ GGML_ASSERT(backend);
264
+ GGML_ASSERT(tensor);
246
265
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
247
266
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
248
267
 
@@ -284,6 +303,7 @@ void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, siz
284
303
  }
285
304
 
286
305
  void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
306
+ GGML_ASSERT(tensor);
287
307
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
288
308
 
289
309
  if (size == 0) {
@@ -299,6 +319,7 @@ void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size
299
319
  }
300
320
 
301
321
  void ggml_backend_synchronize(ggml_backend_t backend) {
322
+ GGML_ASSERT(backend);
302
323
  if (backend->iface.synchronize == NULL) {
303
324
  return;
304
325
  }
@@ -307,18 +328,21 @@ void ggml_backend_synchronize(ggml_backend_t backend) {
307
328
  }
308
329
 
309
330
  ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
331
+ GGML_ASSERT(backend);
310
332
  GGML_ASSERT(backend->iface.graph_plan_create != NULL);
311
333
 
312
334
  return backend->iface.graph_plan_create(backend, cgraph);
313
335
  }
314
336
 
315
337
  void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
338
+ GGML_ASSERT(backend);
316
339
  GGML_ASSERT(backend->iface.graph_plan_free != NULL);
317
340
 
318
341
  backend->iface.graph_plan_free(backend, plan);
319
342
  }
320
343
 
321
344
  enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
345
+ GGML_ASSERT(backend);
322
346
  GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
323
347
 
324
348
  return backend->iface.graph_plan_compute(backend, plan);
@@ -331,42 +355,32 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
331
355
  }
332
356
 
333
357
  enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
358
+ GGML_ASSERT(backend);
334
359
  return backend->iface.graph_compute(backend, cgraph);
335
360
  }
336
361
 
337
362
  bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
363
+ GGML_ASSERT(backend);
338
364
  return ggml_backend_dev_supports_op(backend->device, op);
339
365
  }
340
366
 
341
367
  bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
368
+ GGML_ASSERT(backend);
342
369
  return ggml_backend_dev_supports_buft(backend->device, buft);
343
370
  }
344
371
 
345
372
  bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
373
+ GGML_ASSERT(backend);
346
374
  return ggml_backend_dev_offload_op(backend->device, op);
347
375
  }
348
376
 
349
377
  ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
378
+ GGML_ASSERT(backend);
350
379
  return backend->device;
351
380
  }
352
381
 
353
382
  // backend copy
354
383
 
355
- static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
356
- if (a->type != b->type) {
357
- return false;
358
- }
359
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
360
- if (a->ne[i] != b->ne[i]) {
361
- return false;
362
- }
363
- if (a->nb[i] != b->nb[i]) {
364
- return false;
365
- }
366
- }
367
- return true;
368
- }
369
-
370
384
  void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
371
385
  GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
372
386
 
@@ -397,6 +411,7 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
397
411
  return;
398
412
  }
399
413
 
414
+ GGML_ASSERT(backend_dst);
400
415
  if (backend_dst->iface.cpy_tensor_async != NULL) {
401
416
  if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
402
417
  return;
@@ -428,38 +443,52 @@ void ggml_backend_event_free(ggml_backend_event_t event) {
428
443
  }
429
444
 
430
445
  void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
446
+ GGML_ASSERT(backend);
431
447
  GGML_ASSERT(backend->iface.event_record != NULL);
432
448
 
433
449
  backend->iface.event_record(backend, event);
434
450
  }
435
451
 
436
452
  void ggml_backend_event_synchronize(ggml_backend_event_t event) {
453
+ GGML_ASSERT(event);
437
454
  GGML_ASSERT(event->device->iface.event_synchronize);
438
455
 
439
456
  event->device->iface.event_synchronize(event->device, event);
440
457
  }
441
458
 
442
459
  void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
460
+ GGML_ASSERT(backend);
443
461
  GGML_ASSERT(backend->iface.event_wait != NULL);
444
462
 
445
463
  backend->iface.event_wait(backend, event);
446
464
  }
447
465
 
466
+ static void ggml_backend_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
467
+ GGML_ASSERT(backend);
468
+ if (backend->iface.graph_optimize != NULL) {
469
+ backend->iface.graph_optimize(backend, cgraph);
470
+ }
471
+ }
472
+
448
473
  // Backend device
449
474
 
450
475
  const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
476
+ GGML_ASSERT(device);
451
477
  return device->iface.get_name(device);
452
478
  }
453
479
 
454
480
  const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
481
+ GGML_ASSERT(device);
455
482
  return device->iface.get_description(device);
456
483
  }
457
484
 
458
485
  void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
486
+ GGML_ASSERT(device);
459
487
  device->iface.get_memory(device, free, total);
460
488
  }
461
489
 
462
490
  enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
491
+ GGML_ASSERT(device);
463
492
  return device->iface.get_type(device);
464
493
  }
465
494
 
@@ -469,18 +498,22 @@ void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_d
469
498
  }
470
499
 
471
500
  ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
501
+ GGML_ASSERT(device);
472
502
  return device->reg;
473
503
  }
474
504
 
475
505
  ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
506
+ GGML_ASSERT(device);
476
507
  return device->iface.init_backend(device, params);
477
508
  }
478
509
 
479
510
  ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
511
+ GGML_ASSERT(device);
480
512
  return device->iface.get_buffer_type(device);
481
513
  }
482
514
 
483
515
  ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
516
+ GGML_ASSERT(device);
484
517
  if (device->iface.get_host_buffer_type == NULL) {
485
518
  return NULL;
486
519
  }
@@ -489,18 +522,22 @@ ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t
489
522
  }
490
523
 
491
524
  ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
525
+ GGML_ASSERT(device);
492
526
  return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
493
527
  }
494
528
 
495
529
  bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
530
+ GGML_ASSERT(device);
496
531
  return device->iface.supports_op(device, op);
497
532
  }
498
533
 
499
534
  bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
535
+ GGML_ASSERT(device);
500
536
  return device->iface.supports_buft(device, buft);
501
537
  }
502
538
 
503
539
  bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
540
+ GGML_ASSERT(device);
504
541
  if (device->iface.offload_op != NULL) {
505
542
  return device->iface.offload_op(device, op);
506
543
  }
@@ -511,18 +548,22 @@ bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_te
511
548
  // Backend (reg)
512
549
 
513
550
  const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
551
+ GGML_ASSERT(reg);
514
552
  return reg->iface.get_name(reg);
515
553
  }
516
554
 
517
555
  size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
556
+ GGML_ASSERT(reg);
518
557
  return reg->iface.get_device_count(reg);
519
558
  }
520
559
 
521
560
  ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
561
+ GGML_ASSERT(reg);
522
562
  return reg->iface.get_device(reg, index);
523
563
  }
524
564
 
525
565
  void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
566
+ GGML_ASSERT(reg);
526
567
  if (!reg->iface.get_proc_address) {
527
568
  return NULL;
528
569
  }
@@ -537,6 +578,7 @@ struct ggml_backend_multi_buffer_context {
537
578
  };
538
579
 
539
580
  static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
581
+ GGML_ASSERT(buffer);
540
582
  ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
541
583
  for (size_t i = 0; i < ctx->n_buffers; i++) {
542
584
  ggml_backend_buffer_free(ctx->buffers[i]);
@@ -547,6 +589,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
547
589
  }
548
590
 
549
591
  static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
592
+ GGML_ASSERT(buffer);
550
593
  ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
551
594
  for (size_t i = 0; i < ctx->n_buffers; i++) {
552
595
  ggml_backend_buffer_clear(ctx->buffers[i], value);
@@ -582,10 +625,12 @@ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer
582
625
  }
583
626
 
584
627
  bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
628
+ GGML_ASSERT(buffer);
585
629
  return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
586
630
  }
587
631
 
588
632
  void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
633
+ GGML_ASSERT(buffer);
589
634
  GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
590
635
  ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
591
636
  for (size_t i = 0; i < ctx->n_buffers; i++) {
@@ -613,7 +658,7 @@ static bool ggml_is_view_op(enum ggml_op op) {
613
658
  #endif
614
659
 
615
660
  #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
616
- #define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
661
+ #define GGML_SCHED_MAX_SPLIT_INPUTS 30
617
662
  #endif
618
663
 
619
664
  #ifndef GGML_SCHED_MAX_COPIES
@@ -662,6 +707,7 @@ struct ggml_backend_sched {
662
707
  // pipeline parallelism support
663
708
  int n_copies;
664
709
  int cur_copy;
710
+ int next_copy;
665
711
  ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
666
712
  struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
667
713
  int n_graph_inputs;
@@ -817,8 +863,9 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
817
863
  }
818
864
  if (sched->debug > 1) {
819
865
  ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
820
- GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
821
- fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
866
+ GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d:", i, ggml_op_name(node->op), node->name,
867
+ fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node),
868
+ graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)]);
822
869
  for (int j = 0; j < GGML_MAX_SRC; j++) {
823
870
  struct ggml_tensor * src = node->src[j];
824
871
  if (src == NULL) {
@@ -862,7 +909,7 @@ static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, stru
862
909
  }
863
910
 
864
911
  // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
865
- static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
912
+ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
866
913
  // reset splits
867
914
  sched->n_splits = 0;
868
915
  sched->n_graph_inputs = 0;
@@ -1084,6 +1131,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1084
1131
  }
1085
1132
  }
1086
1133
  }
1134
+ // if the node is still unassigned, assign it to the first backend that supports it
1135
+ for (int b = 0; b < sched->n_backends && *cur_backend_id == -1; b++) {
1136
+ ggml_backend_sched_set_if_supported(sched, node, b, cur_backend_id);
1137
+ }
1138
+ GGML_ASSERT(*cur_backend_id != -1);
1087
1139
  }
1088
1140
 
1089
1141
  // pass 5: split graph, find tensors that need to be copied
@@ -1111,7 +1163,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1111
1163
 
1112
1164
  const int node_backend_id = tensor_backend_id(node);
1113
1165
 
1114
- assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
1166
+ GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
1115
1167
 
1116
1168
  // check if we should start a new split based on the sources of the current node
1117
1169
  bool need_new_split = false;
@@ -1169,7 +1221,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1169
1221
 
1170
1222
  size_t src_id = hash_id(src);
1171
1223
  const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
1172
- assert(src_backend_id != -1); // all inputs should be assigned by now
1224
+ GGML_ASSERT(src_backend_id != -1); // all inputs should be assigned by now
1173
1225
 
1174
1226
  if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
1175
1227
  if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
@@ -1253,6 +1305,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1253
1305
  struct ggml_backend_sched_split * split = &sched->splits[i];
1254
1306
  split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
1255
1307
 
1308
+ // Optimize this split of the graph. This needs to happen before we make graph_copy,
1309
+ // so they are in sync.
1310
+ ggml_backend_graph_optimize(sched->backends[split->backend_id], &split->graph);
1311
+
1256
1312
  // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
1257
1313
  for (int j = 0; j < split->n_inputs; j++) {
1258
1314
  assert(graph_copy->size > (graph_copy->n_nodes + 1));
@@ -1340,7 +1396,10 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
1340
1396
  // allocate graph
1341
1397
  if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
1342
1398
  // the re-allocation may cause the split inputs to be moved to a different address
1343
- ggml_backend_sched_synchronize(sched);
1399
+ // synchronize without ggml_backend_sched_synchronize to avoid changing cur_copy
1400
+ for (int i = 0; i < sched->n_backends; i++) {
1401
+ ggml_backend_synchronize(sched->backends[i]);
1402
+ }
1344
1403
  #ifndef NDEBUG
1345
1404
  GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
1346
1405
  #endif
@@ -1355,17 +1414,22 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
1355
1414
  }
1356
1415
 
1357
1416
  static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
1417
+ GGML_ASSERT(sched);
1358
1418
  struct ggml_backend_sched_split * splits = sched->splits;
1359
1419
 
1360
- for (int i = 0; i < sched->n_splits; i++) {
1361
- struct ggml_backend_sched_split * split = &splits[i];
1420
+ ggml_tensor * prev_ids_tensor = nullptr;
1421
+ std::vector<int32_t> ids;
1422
+ std::vector<ggml_bitset_t> used_ids;
1423
+
1424
+ for (int split_id = 0; split_id < sched->n_splits; split_id++) {
1425
+ struct ggml_backend_sched_split * split = &splits[split_id];
1362
1426
  int split_backend_id = split->backend_id;
1363
1427
  ggml_backend_t split_backend = sched->backends[split_backend_id];
1364
1428
 
1365
1429
  // copy the input tensors to the split backend
1366
- for (int j = 0; j < split->n_inputs; j++) {
1367
- ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
1368
- struct ggml_tensor * input = split->inputs[j];
1430
+ for (int input_id = 0; input_id < split->n_inputs; input_id++) {
1431
+ ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
1432
+ struct ggml_tensor * input = split->inputs[input_id];
1369
1433
  struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
1370
1434
 
1371
1435
  if (input->flags & GGML_TENSOR_FLAG_INPUT) {
@@ -1383,16 +1447,104 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
1383
1447
  } else {
1384
1448
  ggml_backend_synchronize(split_backend);
1385
1449
  }
1386
- // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
1387
- // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
1388
- if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
1450
+
1451
+ // when offloading MoE weights, we can reduce the amount of data copied by copying only the experts that are used
1452
+ ggml_tensor * node = split->graph.nodes[0];
1453
+ if (split->graph.n_nodes > 0 &&
1454
+ ggml_backend_buffer_get_usage(input->buffer) == GGML_BACKEND_BUFFER_USAGE_WEIGHTS &&
1455
+ ggml_backend_buffer_is_host(input->buffer) && (
1456
+ (node->src[0] == input_cpy && node->op == GGML_OP_MUL_MAT_ID)
1457
+ //|| (node->src[1] == input_cpy && node->op == GGML_OP_ADD_ID) /* GGML_OP_ADD_ID weights are small and not worth splitting */
1458
+ )) {
1459
+
1460
+ const int64_t n_expert = node->op == GGML_OP_MUL_MAT_ID ? input->ne[2] : input->ne[1];
1461
+ const size_t expert_size = node->op == GGML_OP_MUL_MAT_ID ? input->nb[2] : input->nb[1];
1462
+
1389
1463
  ggml_backend_synchronize(input_backend);
1390
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1391
- ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
1392
- } else {
1393
- ggml_backend_synchronize(split_backend);
1464
+
1465
+ // get the ids
1466
+ ggml_tensor * ids_tensor = node->src[2];
1467
+ ggml_backend_t ids_backend = split_backend;
1468
+
1469
+ // if the ids tensor is also an input of the split, it may not have been copied yet to the split backend
1470
+ // in that case, we use the original ids tensor
1471
+ for (int i = input_id + 1; i < split->n_inputs; i++) {
1472
+ if (ids_tensor == tensor_copy(split->inputs[i], split_backend_id, sched->cur_copy)) {
1473
+ ids_tensor = split->inputs[i];
1474
+ ids_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[i]);
1475
+ break;
1476
+ }
1477
+ }
1478
+
1479
+ if (ids_tensor != prev_ids_tensor) {
1480
+ ids.resize(ggml_nbytes(ids_tensor) / sizeof(int32_t));
1481
+ ggml_backend_tensor_get_async(ids_backend, ids_tensor, ids.data(), 0, ggml_nbytes(ids_tensor));
1482
+ ggml_backend_synchronize(ids_backend);
1483
+
1484
+ // find the used experts
1485
+ used_ids.clear();
1486
+ used_ids.resize(ggml_bitset_size(n_expert));
1487
+ for (int64_t i1 = 0; i1 < ids_tensor->ne[1]; i1++) {
1488
+ for (int64_t i0 = 0; i0 < ids_tensor->ne[0]; i0++) {
1489
+ int32_t id = ids[i1 * ids_tensor->nb[1]/sizeof(int32_t) + i0 * ids_tensor->nb[0]/sizeof(int32_t)];
1490
+ GGML_ASSERT(id >= 0 && id < n_expert);
1491
+ ggml_bitset_set(used_ids.data(), id);
1492
+ }
1493
+ }
1494
+
1495
+ prev_ids_tensor = ids_tensor;
1496
+ }
1497
+
1498
+ // group consecutive experts and copy them together
1499
+ auto copy_experts = [&](int32_t first_id, int32_t last_id) {
1500
+ const size_t expert_offset = first_id * expert_size;
1501
+ const size_t expert_size_copy = (last_id - first_id + 1) * expert_size;
1502
+ const size_t padding = std::min<size_t>(expert_size, 512);
1503
+ const size_t padding_end = last_id < n_expert - 1 ? padding : 0;
1504
+
1505
+ ggml_backend_tensor_set_async(split_backend,
1506
+ input_cpy,
1507
+ (const uint8_t *)input->data + expert_offset, expert_offset,
1508
+ // copy a bit extra at the to ensure there are no NaNs in the padding of the last expert
1509
+ // this is necessary for MMQ in the CUDA backend
1510
+ expert_size_copy + padding_end);
1511
+ };
1512
+
1513
+ int id = 0;
1514
+ while (!ggml_bitset_get(used_ids.data(), id)) {
1515
+ id++;
1516
+ }
1517
+ int32_t first_id = id;
1518
+ int32_t last_id = first_id;
1519
+
1520
+ for (++id; id < n_expert; ++id) {
1521
+ if (!ggml_bitset_get(used_ids.data(), id)) {
1522
+ continue;
1523
+ }
1524
+
1525
+ if (id == last_id + 1) {
1526
+ last_id = id;
1527
+ continue;
1528
+ }
1529
+
1530
+ copy_experts(first_id, last_id);
1531
+
1532
+ first_id = id;
1533
+ last_id = id;
1534
+ }
1535
+ copy_experts(first_id, last_id);
1536
+ } else {
1537
+ // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
1538
+ // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
1539
+ if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
1540
+ ggml_backend_synchronize(input_backend);
1541
+ if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1542
+ ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
1543
+ } else {
1544
+ ggml_backend_synchronize(split_backend);
1545
+ }
1546
+ ggml_backend_tensor_copy(input, input_cpy);
1394
1547
  }
1395
- ggml_backend_tensor_copy(input, input_cpy);
1396
1548
  }
1397
1549
  }
1398
1550
  }
@@ -1444,8 +1596,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
1444
1596
  }
1445
1597
  }
1446
1598
 
1447
- sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
1448
-
1449
1599
  return GGML_STATUS_SUCCESS;
1450
1600
  }
1451
1601
 
@@ -1533,6 +1683,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
1533
1683
  }
1534
1684
 
1535
1685
  void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
1686
+ GGML_ASSERT(sched);
1536
1687
  // reset state for the next run
1537
1688
  if (!sched->is_reset) {
1538
1689
  ggml_hash_set_reset(&sched->hash_set);
@@ -1544,12 +1695,15 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
1544
1695
  }
1545
1696
 
1546
1697
  bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
1698
+ GGML_ASSERT(sched);
1547
1699
  GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
1548
1700
 
1549
- ggml_backend_sched_split_graph(sched, measure_graph);
1701
+ ggml_backend_sched_reset(sched);
1550
1702
 
1551
1703
  ggml_backend_sched_synchronize(sched);
1552
1704
 
1705
+ ggml_backend_sched_split_graph(sched, measure_graph);
1706
+
1553
1707
  if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
1554
1708
  return false;
1555
1709
  }
@@ -1560,10 +1714,14 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
1560
1714
  }
1561
1715
 
1562
1716
  bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1717
+ GGML_ASSERT(sched);
1563
1718
  GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
1719
+ GGML_ASSERT(!sched->is_alloc);
1564
1720
 
1565
- ggml_backend_sched_split_graph(sched, graph);
1721
+ sched->cur_copy = sched->next_copy;
1722
+ sched->next_copy = (sched->next_copy + 1) % sched->n_copies;
1566
1723
 
1724
+ ggml_backend_sched_split_graph(sched, graph);
1567
1725
 
1568
1726
  if (!ggml_backend_sched_alloc_splits(sched)) {
1569
1727
  return false;
@@ -1581,6 +1739,7 @@ enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, st
1581
1739
  }
1582
1740
 
1583
1741
  enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1742
+ GGML_ASSERT(sched);
1584
1743
  if (!sched->is_reset && !sched->is_alloc) {
1585
1744
  ggml_backend_sched_reset(sched);
1586
1745
  }
@@ -1595,37 +1754,55 @@ enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sch
1595
1754
  }
1596
1755
 
1597
1756
  void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
1757
+ GGML_ASSERT(sched);
1598
1758
  for (int i = 0; i < sched->n_backends; i++) {
1599
1759
  ggml_backend_synchronize(sched->backends[i]);
1600
1760
  }
1601
- // reset the current copy to 0 so that the graphs will be similar during generation
1602
- // necessary for CUDA graphs
1603
- sched->cur_copy = 0;
1761
+ if (!sched->is_alloc) {
1762
+ // if the graph is not already allocated, always use copy 0 after a synchronization
1763
+ // this ensures that during generation the same copy is used every time,
1764
+ // which avoids changes in the graph that could cause CUDA or other graphs to be disabled
1765
+ sched->next_copy = 0;
1766
+ }
1604
1767
  }
1605
1768
 
1606
1769
  void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
1770
+ GGML_ASSERT(sched);
1607
1771
  sched->callback_eval = callback;
1608
1772
  sched->callback_eval_user_data = user_data;
1609
1773
  }
1610
1774
 
1611
1775
  int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
1776
+ GGML_ASSERT(sched);
1612
1777
  return sched->n_splits;
1613
1778
  }
1614
1779
 
1615
1780
  int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
1781
+ GGML_ASSERT(sched);
1616
1782
  return sched->n_copies;
1617
1783
  }
1618
1784
 
1619
1785
  int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
1786
+ GGML_ASSERT(sched);
1620
1787
  return sched->n_backends;
1621
1788
  }
1622
1789
 
1623
1790
  ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
1791
+ GGML_ASSERT(sched);
1624
1792
  GGML_ASSERT(i >= 0 && i < sched->n_backends);
1625
1793
  return sched->backends[i];
1626
1794
  }
1627
1795
 
1796
+ ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend) {
1797
+ GGML_ASSERT(sched);
1798
+ int backend_index = ggml_backend_sched_backend_id(sched, backend);
1799
+ GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1800
+
1801
+ return sched->bufts[backend_index];
1802
+ }
1803
+
1628
1804
  size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
1805
+ GGML_ASSERT(sched);
1629
1806
  int backend_index = ggml_backend_sched_backend_id(sched, backend);
1630
1807
  GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1631
1808
 
@@ -1633,6 +1810,7 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
1633
1810
  }
1634
1811
 
1635
1812
  void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
1813
+ GGML_ASSERT(sched);
1636
1814
  int backend_index = ggml_backend_sched_backend_id(sched, backend);
1637
1815
  GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1638
1816
  tensor_backend_id(node) = backend_index;
@@ -1641,6 +1819,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
1641
1819
  }
1642
1820
 
1643
1821
  ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
1822
+ GGML_ASSERT(sched);
1644
1823
  int backend_index = tensor_backend_id(node);
1645
1824
  if (backend_index == -1) {
1646
1825
  return NULL;
@@ -1651,6 +1830,7 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,
1651
1830
  // utils
1652
1831
 
1653
1832
  enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) {
1833
+ GGML_ASSERT(tensor);
1654
1834
  GGML_ASSERT(tensor->buffer == NULL);
1655
1835
  GGML_ASSERT(tensor->view_src != NULL);
1656
1836
  GGML_ASSERT(tensor->view_src->buffer != NULL);
@@ -1662,6 +1842,7 @@ enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) {
1662
1842
  }
1663
1843
 
1664
1844
  enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
1845
+ GGML_ASSERT(tensor);
1665
1846
  GGML_ASSERT(tensor->buffer == NULL);
1666
1847
  GGML_ASSERT(tensor->data == NULL);
1667
1848
  GGML_ASSERT(tensor->view_src == NULL);
@@ -1735,6 +1916,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_
1735
1916
  }
1736
1917
 
1737
1918
  struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
1919
+ GGML_ASSERT(graph);
1738
1920
  struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
1739
1921
  struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
1740
1922
  bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
@@ -1821,7 +2003,7 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
1821
2003
  ggml_free(copy.ctx_unallocated);
1822
2004
  }
1823
2005
 
1824
- bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
2006
+ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node) {
1825
2007
  struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
1826
2008
  if (copy.buffer == NULL) {
1827
2009
  return false;
@@ -1832,28 +2014,45 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
1832
2014
 
1833
2015
  assert(g1->n_nodes == g2->n_nodes);
1834
2016
 
1835
- for (int i = 0; i < g1->n_nodes; i++) {
1836
- struct ggml_tensor * t1 = g1->nodes[i];
1837
- struct ggml_tensor * t2 = g2->nodes[i];
2017
+ if (test_node != nullptr) {
2018
+ // Compute the whole graph and only test the output for a specific tensor
2019
+ ggml_backend_graph_compute(backend1, g1);
2020
+ ggml_backend_graph_compute(backend2, g2);
1838
2021
 
1839
- assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
2022
+ int test_node_idx = -1;
2023
+ for (int i = 0; i < g1->n_nodes; i++) {
2024
+ struct ggml_tensor * t1 = g1->nodes[i];
2025
+ if (t1 == test_node) {
2026
+ test_node_idx = i;
2027
+ break;
2028
+ }
2029
+ }
2030
+ GGML_ASSERT(test_node_idx != -1);
1840
2031
 
1841
- struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
1842
- struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
2032
+ callback(test_node_idx, g1->nodes[test_node_idx], g2->nodes[test_node_idx], user_data);
2033
+ } else {
2034
+ for (int i = 0; i < g1->n_nodes; i++) {
2035
+ struct ggml_tensor * t1 = g1->nodes[i];
2036
+ struct ggml_tensor * t2 = g2->nodes[i];
1843
2037
 
1844
- ggml_backend_graph_compute(backend1, &g1v);
1845
- ggml_backend_graph_compute(backend2, &g2v);
2038
+ assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
1846
2039
 
1847
- if (ggml_is_view_op(t1->op)) {
1848
- continue;
1849
- }
2040
+ struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
2041
+ struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
1850
2042
 
1851
- // compare results, calculate rms etc
1852
- if (!callback(i, t1, t2, user_data)) {
1853
- break;
2043
+ ggml_backend_graph_compute(backend1, &g1v);
2044
+ ggml_backend_graph_compute(backend2, &g2v);
2045
+
2046
+ if (ggml_is_view_op(t1->op)) {
2047
+ continue;
2048
+ }
2049
+
2050
+ // compare results, calculate rms etc
2051
+ if (!callback(i, t1, t2, user_data)) {
2052
+ break;
2053
+ }
1854
2054
  }
1855
2055
  }
1856
-
1857
2056
  ggml_backend_graph_copy_free(copy);
1858
2057
 
1859
2058
  return true;
@@ -1862,6 +2061,7 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
1862
2061
  // CPU backend - buffer
1863
2062
 
1864
2063
  static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
2064
+ GGML_ASSERT(buffer);
1865
2065
  uintptr_t data = (uintptr_t)buffer->context;
1866
2066
 
1867
2067
  // align the buffer
@@ -1873,28 +2073,33 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
1873
2073
  }
1874
2074
 
1875
2075
  static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
2076
+ GGML_ASSERT(buffer);
1876
2077
  ggml_aligned_free(buffer->context, buffer->size);
1877
2078
  }
1878
2079
 
1879
2080
  static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
2081
+ GGML_ASSERT(tensor);
1880
2082
  memset((char *)tensor->data + offset, value, size);
1881
2083
 
1882
2084
  GGML_UNUSED(buffer);
1883
2085
  }
1884
2086
 
1885
2087
  static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
2088
+ GGML_ASSERT(tensor);
1886
2089
  memcpy((char *)tensor->data + offset, data, size);
1887
2090
 
1888
2091
  GGML_UNUSED(buffer);
1889
2092
  }
1890
2093
 
1891
2094
  static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
2095
+ GGML_ASSERT(tensor);
1892
2096
  memcpy(data, (const char *)tensor->data + offset, size);
1893
2097
 
1894
2098
  GGML_UNUSED(buffer);
1895
2099
  }
1896
2100
 
1897
2101
  static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
2102
+ GGML_ASSERT(src);
1898
2103
  if (ggml_backend_buffer_is_host(src->buffer)) {
1899
2104
  memcpy(dst->data, src->data, ggml_nbytes(src));
1900
2105
  return true;
@@ -1905,6 +2110,7 @@ static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
1905
2110
  }
1906
2111
 
1907
2112
  static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
2113
+ GGML_ASSERT(buffer);
1908
2114
  memset(buffer->context, value, buffer->size);
1909
2115
  }
1910
2116