whispercpp 1.3.6 → 1.3.7

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (828)
  1. checksums.yaml +4 -4
  2. data/.document +3 -0
  3. data/.rdoc_options +2 -0
  4. data/README.md +38 -5
  5. data/Rakefile +18 -3
  6. data/ext/dependencies.rb +10 -4
  7. data/ext/dependencies_for_windows.rb +17 -0
  8. data/ext/extconf.rb +20 -8
  9. data/ext/options.rb +54 -14
  10. data/ext/options_for_windows.rb +51 -0
  11. data/ext/ruby_whisper.c +36 -42
  12. data/ext/ruby_whisper.h +135 -0
  13. data/ext/ruby_whisper_context.c +107 -28
  14. data/ext/ruby_whisper_log_queue.c +180 -0
  15. data/ext/ruby_whisper_log_settable.h +47 -0
  16. data/ext/ruby_whisper_parakeet.c +49 -0
  17. data/ext/ruby_whisper_parakeet_context.c +304 -0
  18. data/ext/ruby_whisper_parakeet_context_params.c +117 -0
  19. data/ext/ruby_whisper_parakeet_model.c +84 -0
  20. data/ext/ruby_whisper_parakeet_params.c +548 -0
  21. data/ext/ruby_whisper_parakeet_segment.c +157 -0
  22. data/ext/ruby_whisper_parakeet_token.c +188 -0
  23. data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
  24. data/ext/ruby_whisper_params.c +256 -65
  25. data/ext/ruby_whisper_segment.c +6 -6
  26. data/ext/ruby_whisper_transcribe.cpp +42 -15
  27. data/ext/sources/CMakeLists.txt +41 -3
  28. data/ext/sources/CMakePresets.json +95 -0
  29. data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
  30. data/ext/sources/cmake/parakeet.pc.in +10 -0
  31. data/ext/sources/cmake/whisper.pc.in +1 -1
  32. data/ext/sources/examples/CMakeLists.txt +4 -2
  33. data/ext/sources/examples/bench/bench.cpp +1 -1
  34. data/ext/sources/examples/cli/cli.cpp +43 -9
  35. data/ext/sources/examples/common-ggml.cpp +2 -0
  36. data/ext/sources/examples/common-whisper.cpp +139 -67
  37. data/ext/sources/examples/common-whisper.h +11 -0
  38. data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
  39. data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
  40. data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
  41. data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
  42. data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
  43. data/ext/sources/examples/server/server.cpp +199 -163
  44. data/ext/sources/ggml/CMakeLists.txt +21 -13
  45. data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
  46. data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
  47. data/ext/sources/ggml/include/ggml-alloc.h +1 -0
  48. data/ext/sources/ggml/include/ggml-backend.h +72 -10
  49. data/ext/sources/ggml/include/ggml-cuda.h +3 -0
  50. data/ext/sources/ggml/include/ggml-rpc.h +3 -3
  51. data/ext/sources/ggml/include/ggml.h +101 -9
  52. data/ext/sources/ggml/include/gguf.h +10 -2
  53. data/ext/sources/ggml/src/CMakeLists.txt +22 -5
  54. data/ext/sources/ggml/src/ggml-alloc.c +5 -1
  55. data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
  56. data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
  57. data/ext/sources/ggml/src/ggml-backend-reg.cpp +12 -0
  58. data/ext/sources/ggml/src/ggml-backend.cpp +110 -9
  59. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +4 -0
  60. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +672 -257
  61. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +71 -0
  62. data/ext/sources/ggml/src/ggml-cann/common.h +20 -10
  63. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +211 -30
  64. data/ext/sources/ggml/src/ggml-common.h +11 -0
  65. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +58 -29
  66. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +2 -0
  67. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +16 -16
  68. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +116 -7
  69. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +65 -0
  70. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
  71. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
  72. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +4279 -1292
  73. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +5 -35
  74. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +0 -1
  75. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
  76. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +177 -27
  77. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +1 -1
  78. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +5 -0
  79. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
  80. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -0
  81. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +95 -5
  82. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  83. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +146 -134
  84. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +88 -70
  85. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +372 -73
  86. data/ext/sources/ggml/src/ggml-cpu/ops.h +3 -0
  87. data/ext/sources/ggml/src/ggml-cpu/quants.c +55 -0
  88. data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
  89. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +3 -0
  90. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +90 -0
  91. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +3 -16
  92. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
  93. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
  94. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
  95. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
  96. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
  97. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
  98. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
  99. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
  100. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
  101. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
  102. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
  103. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
  104. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
  105. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
  106. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
  107. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +37 -53
  108. data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
  109. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +17 -7
  110. data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
  111. data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
  112. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +62 -26
  113. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +44 -18
  114. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
  115. data/ext/sources/ggml/src/ggml-cuda/common.cuh +242 -28
  116. data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
  117. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
  118. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
  119. data/ext/sources/ggml/src/ggml-cuda/convert.cu +53 -0
  120. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
  121. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +14 -6
  122. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
  123. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +278 -44
  124. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +331 -130
  125. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
  126. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +126 -27
  127. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +40 -15
  128. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +18 -9
  129. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +152 -49
  130. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  131. data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
  132. data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
  133. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +84 -35
  134. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
  135. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1069 -609
  136. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
  137. data/ext/sources/ggml/src/ggml-cuda/mean.cu +4 -2
  138. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +242 -195
  139. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +3 -3
  140. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
  141. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +502 -423
  142. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +19 -12
  143. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +485 -57
  144. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -1
  145. data/ext/sources/ggml/src/ggml-cuda/norm.cu +36 -10
  146. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
  147. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +133 -26
  148. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
  149. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +5 -1
  150. data/ext/sources/ggml/src/ggml-cuda/rope.cu +11 -4
  151. data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
  152. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
  153. data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
  154. data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
  155. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
  156. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +45 -13
  157. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  158. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
  159. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
  160. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
  161. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +1 -0
  162. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
  163. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
  164. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
  165. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +1 -0
  166. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
  167. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
  168. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
  169. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
  170. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
  171. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
  172. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
  173. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
  174. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
  175. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
  176. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
  177. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
  178. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
  179. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
  180. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
  181. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
  182. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
  183. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
  184. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
  185. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
  186. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
  187. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
  188. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
  189. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
  190. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
  191. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -4
  192. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +26 -23
  193. data/ext/sources/ggml/src/ggml-cuda/unary.cu +31 -2
  194. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
  195. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +80 -0
  196. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
  197. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +22 -4
  198. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  199. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +2 -1
  200. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1428 -743
  201. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -7
  202. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +53 -84
  203. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +25 -12
  204. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +165 -184
  205. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
  206. data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
  207. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +170 -127
  208. data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
  209. data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
  210. data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
  211. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +125 -97
  212. data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
  213. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +148 -42
  214. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.c +2 -2
  215. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +252 -62
  216. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +9 -0
  217. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +87 -1
  218. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
  219. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
  220. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
  221. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
  222. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
  223. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
  224. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
  225. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
  226. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +96 -13
  227. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +182 -57
  228. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
  229. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +71 -3
  230. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +27 -10
  231. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +63 -23
  232. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +9 -8
  233. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
  234. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
  235. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
  236. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
  237. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +1 -0
  238. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
  239. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +5 -8
  240. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +529 -815
  241. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2522 -234
  242. data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
  243. data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
  244. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +291 -95
  245. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +59 -37
  246. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +121 -133
  247. data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
  248. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +244 -151
  249. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +6 -6
  250. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +719 -45
  251. data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
  252. data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
  253. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +3 -1
  254. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +22 -9
  255. data/ext/sources/ggml/src/ggml-impl.h +6 -1
  256. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +138 -13
  257. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +32 -1
  258. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +164 -28
  259. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +80 -0
  260. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +190 -19
  261. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +2 -0
  262. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +39 -26
  263. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +823 -322
  264. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
  265. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +54 -5
  266. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +12248 -5907
  267. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +67 -0
  268. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +59 -0
  269. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1819 -112
  270. data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
  271. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
  272. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
  273. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
  274. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
  275. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
  276. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
  277. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
  278. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
  279. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
  280. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
  281. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
  282. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
  283. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
  284. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
  285. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
  286. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mm_q8_0_f32_8x4.cl → gemm_noshuffle_q8_0_f32.cl} +1 -1
  287. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
  288. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
  289. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
  290. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
  291. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
  292. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
  293. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
  294. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
  295. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
  296. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
  297. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
  300. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
  301. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
  302. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
  303. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
  304. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
  305. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
  306. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
  307. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
  308. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
  309. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
  310. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
  311. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
  312. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
  313. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
  314. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
  315. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
  316. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
  317. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
  318. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +48 -64
  322. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +15 -5
  323. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +18 -11
  324. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +35 -13
  325. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +264 -192
  326. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +33 -7
  327. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
  328. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +1 -0
  329. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +1 -0
  330. data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
  331. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +27 -3
  332. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +67 -36
  333. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +1 -0
  334. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +101 -44
  335. data/ext/sources/ggml/src/ggml-openvino/utils.h +23 -3
  336. data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
  337. data/ext/sources/ggml/src/ggml-quants.c +289 -114
  338. data/ext/sources/ggml/src/ggml-quants.h +3 -0
  339. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
  340. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
  341. data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
  342. data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
  343. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +50 -4
  344. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +1 -1
  345. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +3 -1
  346. data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
  347. data/ext/sources/ggml/src/ggml-sycl/common.hpp +41 -1
  348. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +115 -13
  349. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +9 -0
  350. data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
  351. data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
  352. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
  353. data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
  354. data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
  355. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
  356. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1 -90
  357. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +0 -2
  358. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
  359. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
  360. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +7 -5
  361. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +4 -0
  362. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +76 -168
  363. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +7 -0
  364. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +3 -1
  365. data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
  366. data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
  367. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +69 -31
  368. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +1 -0
  369. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  370. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
  371. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +823 -190
  372. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
  373. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
  374. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
  375. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
  376. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
  377. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +71 -0
  378. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
  379. data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
  380. data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
  381. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
  382. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
  383. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
  384. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
  385. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
  386. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
  387. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +1 -0
  388. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +1 -0
  389. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +1 -0
  390. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +1 -0
  391. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +1 -0
  392. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +1 -0
  393. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +1 -0
  394. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +1 -0
  395. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +1 -0
  396. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +1 -0
  397. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +1 -0
  398. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +1 -0
  399. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +1 -0
  400. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +1 -0
  401. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +1 -0
  402. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +1 -0
  403. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +1 -0
  404. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +1 -0
  405. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +1 -0
  406. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +1 -0
  407. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +1 -0
  408. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +1 -0
  409. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +1 -0
  410. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +1 -0
  411. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +1 -0
  412. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +1 -0
  413. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +1 -0
  414. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +1 -0
  415. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +1 -0
  416. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +1 -0
  417. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +1 -0
  418. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +1 -0
  419. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +1 -0
  420. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +1 -0
  421. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +1 -0
  422. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +1 -0
  423. data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
  424. data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
  425. data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
  426. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +215 -53
  427. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +4 -0
  428. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +2 -0
  429. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +2 -0
  430. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +1 -0
  431. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +1 -0
  432. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +0 -2
  433. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +11 -0
  434. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +2060 -535
  435. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  436. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
  437. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
  438. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
  439. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
  440. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
  441. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
  442. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
  443. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
  444. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
  445. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
  446. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
  447. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
  448. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
  449. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +197 -48
  450. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +60 -59
  451. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +115 -113
  452. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +122 -31
  453. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
  454. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
  455. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
  456. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +125 -64
  457. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
  458. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
  459. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
  460. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
  461. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
  462. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
  463. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
  464. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
  465. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
  466. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
  467. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
  468. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
  469. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
  470. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -17
  471. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +43 -10
  472. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
  473. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
  474. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
  475. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
  476. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +5 -2
  477. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
  478. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +3 -2
  479. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
  480. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +11 -1
  481. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
  482. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
  483. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +171 -147
  484. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
  485. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +2202 -283
  486. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +2610 -1403
  487. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
  488. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
  489. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +8 -7
  490. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +76 -95
  491. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +19 -1
  492. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
  493. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -50
  494. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +107 -184
  495. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
  496. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
  497. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
  498. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
  499. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
  500. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
  501. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +183 -78
  502. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
  503. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
  504. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +655 -495
  505. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
  506. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
  507. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
  508. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +8 -6
  509. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +5 -1
  510. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +80 -409
  511. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
  512. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
  513. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
  514. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
  515. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
  516. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
  517. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
  518. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +6 -4
  519. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
  520. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +2 -3
  521. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
  522. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
  523. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
  524. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
  525. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
  526. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +68 -48
  527. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
  528. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +18 -14
  529. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +1 -1
  530. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +244 -10
  531. data/ext/sources/ggml/src/ggml.c +110 -28
  532. data/ext/sources/ggml/src/gguf.cpp +173 -28
  533. data/ext/sources/include/parakeet.h +342 -0
  534. data/ext/sources/include/whisper.h +10 -0
  535. data/ext/sources/media/matmul.png +0 -0
  536. data/ext/sources/src/CMakeLists.txt +23 -0
  537. data/ext/sources/src/parakeet-arch.h +188 -0
  538. data/ext/sources/src/parakeet.cpp +3838 -0
  539. data/ext/sources/src/whisper.cpp +56 -12
  540. data/extsources.rb +26 -10
  541. data/lib/whisper/log_settable.rb +36 -0
  542. data/lib/whisper/model/uri.rb +13 -1
  543. data/lib/whisper/output.rb +74 -0
  544. data/sig/whisper.rbs +411 -62
  545. data/test/helper.rb +2 -0
  546. data/test/jfk_reader/jfk_reader.c +50 -7
  547. data/test/test_callback.rb +1 -0
  548. data/test/test_package.rb +6 -5
  549. data/test/test_parakeet.rb +28 -0
  550. data/test/test_parakeet_callback.rb +107 -0
  551. data/test/test_parakeet_context.rb +116 -0
  552. data/test/test_parakeet_context_params.rb +24 -0
  553. data/test/test_parakeet_model.rb +21 -0
  554. data/test/test_parakeet_params.rb +78 -0
  555. data/test/test_parakeet_segment.rb +42 -0
  556. data/test/test_parakeet_token.rb +73 -0
  557. data/test/test_params.rb +2 -0
  558. data/test/test_vad_segment.rb +1 -1
  559. data/test/test_whisper.rb +24 -6
  560. data/whispercpp.gemspec +2 -2
  561. metadata +215 -281
  562. data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
  563. data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
  564. data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
  565. data/ext/sources/bindings/javascript/package.json +0 -26
  566. data/ext/sources/bindings/javascript/whisper.js +0 -19
  567. data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
  568. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
  569. data/ext/sources/examples/addon.node/addon.cpp +0 -557
  570. data/ext/sources/examples/addon.node/index.js +0 -59
  571. data/ext/sources/examples/addon.node/package.json +0 -16
  572. data/ext/sources/examples/addon.node/vad-example.js +0 -132
  573. data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
  574. data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
  575. data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
  576. data/ext/sources/examples/coi-serviceworker.js +0 -146
  577. data/ext/sources/examples/command/CMakeLists.txt +0 -10
  578. data/ext/sources/examples/command/command.cpp +0 -802
  579. data/ext/sources/examples/command/commands.txt +0 -9
  580. data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
  581. data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
  582. data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
  583. data/ext/sources/examples/generate-karaoke.sh +0 -57
  584. data/ext/sources/examples/helpers.js +0 -191
  585. data/ext/sources/examples/livestream.sh +0 -112
  586. data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
  587. data/ext/sources/examples/lsp/lsp.cpp +0 -471
  588. data/ext/sources/examples/lsp/whisper.vim +0 -362
  589. data/ext/sources/examples/python/test_whisper_processor.py +0 -7
  590. data/ext/sources/examples/python/whisper_processor.py +0 -54
  591. data/ext/sources/examples/server/bench.js +0 -29
  592. data/ext/sources/examples/server.py +0 -120
  593. data/ext/sources/examples/stream/CMakeLists.txt +0 -10
  594. data/ext/sources/examples/stream/stream.cpp +0 -437
  595. data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
  596. data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
  597. data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
  598. data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
  599. data/ext/sources/examples/sycl/build.sh +0 -22
  600. data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
  601. data/ext/sources/examples/sycl/run-whisper.sh +0 -17
  602. data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -48
  603. data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
  604. data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -488
  605. data/ext/sources/examples/talk-llama/llama-adapter.h +0 -89
  606. data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2877
  607. data/ext/sources/examples/talk-llama/llama-arch.h +0 -628
  608. data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -919
  609. data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
  610. data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -896
  611. data/ext/sources/examples/talk-llama/llama-chat.h +0 -71
  612. data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3633
  613. data/ext/sources/examples/talk-llama/llama-context.h +0 -359
  614. data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
  615. data/ext/sources/examples/talk-llama/llama-cparams.h +0 -47
  616. data/ext/sources/examples/talk-llama/llama-ext.h +0 -12
  617. data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
  618. data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
  619. data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2735
  620. data/ext/sources/examples/talk-llama/llama-graph.h +0 -1031
  621. data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -258
  622. data/ext/sources/examples/talk-llama/llama-hparams.h +0 -353
  623. data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
  624. data/ext/sources/examples/talk-llama/llama-impl.h +0 -75
  625. data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
  626. data/ext/sources/examples/talk-llama/llama-io.h +0 -35
  627. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -330
  628. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
  629. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2285
  630. data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -389
  631. data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
  632. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +0 -275
  633. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +0 -140
  634. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
  635. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
  636. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1165
  637. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
  638. data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
  639. data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
  640. data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -752
  641. data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
  642. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1655
  643. data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -206
  644. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -299
  645. data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -40
  646. data/ext/sources/examples/talk-llama/llama-model.cpp +0 -9056
  647. data/ext/sources/examples/talk-llama/llama-model.h +0 -597
  648. data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1304
  649. data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
  650. data/ext/sources/examples/talk-llama/llama-sampler.cpp +0 -3885
  651. data/ext/sources/examples/talk-llama/llama-sampler.h +0 -42
  652. data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3970
  653. data/ext/sources/examples/talk-llama/llama-vocab.h +0 -187
  654. data/ext/sources/examples/talk-llama/llama.cpp +0 -1194
  655. data/ext/sources/examples/talk-llama/llama.h +0 -1573
  656. data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -190
  657. data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
  658. data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
  659. data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -137
  660. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
  661. data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -123
  662. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -143
  663. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -133
  664. data/ext/sources/examples/talk-llama/models/bert.cpp +0 -184
  665. data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -145
  666. data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
  667. data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
  668. data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
  669. data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
  670. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
  671. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
  672. data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
  673. data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -122
  674. data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
  675. data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -142
  676. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -262
  677. data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +0 -445
  678. data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -132
  679. data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
  680. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -148
  681. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
  682. data/ext/sources/examples/talk-llama/models/eurobert.cpp +0 -97
  683. data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +0 -145
  684. data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
  685. data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
  686. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -111
  687. data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
  688. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
  689. data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
  690. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
  691. data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
  692. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
  693. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
  694. data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -157
  695. data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
  696. data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
  697. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -195
  698. data/ext/sources/examples/talk-llama/models/granite.cpp +0 -210
  699. data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
  700. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -139
  701. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
  702. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -153
  703. data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
  704. data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
  705. data/ext/sources/examples/talk-llama/models/jais2.cpp +0 -123
  706. data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
  707. data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +0 -381
  708. data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -196
  709. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
  710. data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
  711. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
  712. data/ext/sources/examples/talk-llama/models/llama.cpp +0 -175
  713. data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
  714. data/ext/sources/examples/talk-llama/models/mamba-base.cpp +0 -289
  715. data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -54
  716. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -129
  717. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -200
  718. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -123
  719. data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
  720. data/ext/sources/examples/talk-llama/models/models.h +0 -704
  721. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -109
  722. data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
  723. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -162
  724. data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
  725. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
  726. data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
  727. data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
  728. data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
  729. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
  730. data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
  731. data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
  732. data/ext/sources/examples/talk-llama/models/paddleocr.cpp +0 -122
  733. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
  734. data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
  735. data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
  736. data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
  737. data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -320
  738. data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
  739. data/ext/sources/examples/talk-llama/models/plm.cpp +0 -169
  740. data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
  741. data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
  742. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
  743. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
  744. data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -120
  745. data/ext/sources/examples/talk-llama/models/qwen35.cpp +0 -381
  746. data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +0 -422
  747. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -131
  748. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -525
  749. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -140
  750. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -132
  751. data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
  752. data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
  753. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -164
  754. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
  755. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
  756. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -137
  757. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
  758. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
  759. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
  760. data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
  761. data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
  762. data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
  763. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
  764. data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +0 -165
  765. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
  766. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
  767. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
  768. data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
  769. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
  770. data/ext/sources/examples/talk-llama/speak +0 -40
  771. data/ext/sources/examples/talk-llama/speak.bat +0 -1
  772. data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
  773. data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
  774. data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
  775. data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
  776. data/ext/sources/examples/talk-llama/unicode.cpp +0 -1103
  777. data/ext/sources/examples/talk-llama/unicode.h +0 -111
  778. data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
  779. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
  780. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
  781. data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
  782. data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
  783. data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
  784. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
  785. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
  786. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
  787. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
  788. data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
  789. data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
  790. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
  791. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -155
  792. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
  793. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
  794. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +0 -123
  795. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +0 -17
  796. data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +0 -333
  797. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
  798. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -182
  799. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
  800. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +0 -718
  801. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
  802. data/ext/sources/tests/CMakeLists.txt +0 -112
  803. data/ext/sources/tests/earnings21/eval.mk +0 -58
  804. data/ext/sources/tests/earnings21/eval.py +0 -68
  805. data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
  806. data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
  807. data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
  808. data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
  809. data/ext/sources/tests/earnings21/requirements.txt +0 -6
  810. data/ext/sources/tests/en-0-ref.txt +0 -1
  811. data/ext/sources/tests/en-1-ref.txt +0 -1
  812. data/ext/sources/tests/en-2-ref.txt +0 -1
  813. data/ext/sources/tests/es-0-ref.txt +0 -1
  814. data/ext/sources/tests/librispeech/eval.mk +0 -39
  815. data/ext/sources/tests/librispeech/eval.py +0 -47
  816. data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
  817. data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
  818. data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
  819. data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
  820. data/ext/sources/tests/librispeech/requirements.txt +0 -6
  821. data/ext/sources/tests/run-tests.sh +0 -130
  822. data/ext/sources/tests/test-c.c +0 -3
  823. data/ext/sources/tests/test-vad-full.cpp +0 -56
  824. data/ext/sources/tests/test-vad.cpp +0 -83
  825. data/ext/sources/tests/test-whisper.js +0 -58
  826. data/lib/whisper/context.rb +0 -15
  827. data/lib/whisper/segment.rb +0 -58
  828. /data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general_q8_0_f32.cl → gemv_noshuffle_q8_0_f32.cl} +0 -0
@@ -0,0 +1,971 @@
1
+ #include "allreduce.cuh"
2
+
3
+ #if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
4
+
5
+ #include "convert.cuh"
6
+ #include "ggml-impl.h"
7
+
8
+ #include <algorithm>
9
+ #include <cstdlib>
10
+ #include <cstring>
11
+ #include <limits>
12
+
13
+ // ---------------------------------------------------------------------------
14
+ // CUDA AllReduce for tensor-parallel inference across two GPUs.
15
+ //
16
+ // Provides an in-place sum reduction over matching tensors on two CUDA
17
+ // devices in the same process. Used by the tensor-split path alongside
18
+ // NCCL; targets setups without NVLink, where data is exchanged between the
19
+ // GPUs by staging it through pinned host memory over PCIe.
20
+ //
21
+ // Two reduction strategies are selected per call by tensor size:
22
+ //
23
+ // * Chunked kernel path (small reductions): a single CUDA kernel both
24
+ // stages data through pinned host memory and performs the local sum.
25
+ // Cross-GPU synchronization happens *inside the kernel* (busy-wait on
26
+ // a host-memory flag), which keeps launch overhead low for the
27
+ // latency-sensitive token-generation case.
28
+ //
29
+ // * Copy-engine path (large reductions): the transfer is split into
30
+ // D2H + H2D cudaMemcpyAsync chunks driven by the GPU's copy engine,
31
+ // followed by a small device-side add kernel. Cross-GPU
32
+ // synchronization happens *outside the kernel*, via CUDA events
33
+ // between streams. This keeps the compute engine free while large
34
+ // transfers are in flight, which matters for prefill-sized tensors.
35
+ // Reductions larger than the per-call inner cap are processed by an
36
+ // outer chunker that issues sequential inner calls.
37
+ // ---------------------------------------------------------------------------
38
+
39
+ // ---------------------------------------------------------------------------
40
+ // Cross-GPU signal mechanism
41
+ //
42
+ // One int per (slot, rank) pair in pinned host memory. Each AR call writes a
43
+ // strictly increasing token (= the AR call number) into its own arrival int.
44
+ // The peer spins until its read of the other's arrival int equals the token
45
+ // it expects for this call -- a mismatch means the peer hasn't arrived yet.
46
+ // Tokens never repeat over realistic call rates (32-bit int wraps in tens of
47
+ // days at thousands of ARs/sec), so arrival ints don't need to be reset
48
+ // between calls; we initialize once at pipeline init and let the values
49
+ // accumulate.
50
+ //
51
+ // There is exactly one writer (the owning GPU) and one reader (the peer), so
52
+ // we don't need atomics. A volatile store paired with __threadfence_system()
53
+ // provides the release ordering that makes the D2H writes visible system-wide
54
+ // before the arrival token is observed.
55
+ //
56
+ // atomicAdd_system() requires hostNativeAtomicSupported, which is unavailable
57
+ // on PCIe-attached consumer GPUs without NVLink, so the volatile path is the
58
+ // portable choice.
59
+ // ---------------------------------------------------------------------------
60
+
61
+ static __device__ __forceinline__ void ggml_cuda_ar_signal_set(int * p, int token) {
62
+ *(volatile int *)p = token;
63
+ }
64
+ static __device__ __forceinline__ int ggml_cuda_ar_signal_get(const int * p) {
65
+ return *(const volatile int *)p;
66
+ }
67
+
68
// Distance in bytes between consecutive arrival ints. One 64-byte cache line
// per slot keeps each GPU/block's arrival word on a line of its own, so the
// polling GPU is not stalled by false sharing.
static constexpr size_t GGML_CUDA_AR_ARRIVAL_STRIDE{64};

// Grid size used when launching the chunked kernel. Every block handles a
// disjoint stripe of the data and synchronizes through its own arrival-token
// slot, letting several SMs issue PCIe stores concurrently.
static constexpr int GGML_CUDA_AR_KERNEL_BLOCKS{8};
77
+
78
+ // ---------------------------------------------------------------------------
79
+ // Chunked kernel AllReduce -- 2 GPUs, supports float, half, and bfloat16.
80
+ //
81
+ // Both GPUs run this kernel simultaneously on independent streams. sendbuf
82
+ // and recvbuf live in T_dst (the caller's tensor type); host_mine / host_other
83
+ // carry data in T_wire (the on-wire type, possibly narrower than T_dst -- e.g.
84
+ // T_dst=F32 with T_wire=BF16 halves the bytes pushed across PCIe). When
85
+ // T_dst == T_wire the casts below are no-ops.
86
+ //
87
+ // Each GPU runs three phases:
88
+ //
89
+ // Phase 1 (all threads): cast sendbuf (T_dst) -> T_wire and store as
90
+ // single-instruction-width vectors into host_mine.
91
+ // __threadfence_system() commits these writes to host
92
+ // memory.
93
+ // Phase 2 (thread 0): write token to arrival_mine; spin until
94
+ // arrival_other == token.
95
+ // Phase 3 (all threads): read T_wire vectors from host_other, cast
96
+ // each element to T_dst, and sum with the local
97
+ // sendbuf value (also rounded through T_wire so that
98
+ // both GPUs truncate identically -- this guarantees
99
+ // bit-equivalent results across the two devices).
100
+ //
101
+ // Multi-block: blocks stripe vectors across (gridDim.x * blockDim.x) global
102
+ // threads to keep multiple SMs issuing PCIe stores in parallel. Each block
103
+ // has its own arrival-token slot (offset by blockIdx.x * ARRIVAL_STRIDE);
104
+ // thread 0 of each block signals/spins on that slot independently of other
105
+ // blocks. Tail elements (the leftover < ELEMS_PER_VEC at the end) are
106
+ // handled only by block 0 to avoid cross-block writes to the same slots.
107
+ // ---------------------------------------------------------------------------
108
+ template <typename T_dst, typename T_wire>
109
+ static __global__ void ggml_cuda_ar_kernel(
110
+ const T_dst * sendbuf,
111
+ T_dst * recvbuf,
112
+ T_wire * __restrict__ host_mine,
113
+ const T_wire * __restrict__ host_other,
114
+ int count,
115
+ int * arrival_mine,
116
+ int * arrival_other,
117
+ int token) {
118
+
119
+ // Vector unit for the wire type, sized to the arch's widest single-instruction
120
+ // copy (16 B on Volta+). Each phase-1 iter writes one vector to host memory;
121
+ // each phase-3 iter reads one and produces ELEMS_PER_VEC sums.
122
+ constexpr int ELEMS_PER_VEC = ggml_cuda_get_max_cpy_bytes() / sizeof(T_wire);
123
+ constexpr int ARRIVAL_INTS = (int)(GGML_CUDA_AR_ARRIVAL_STRIDE / sizeof(int));
124
+
125
+ const int tid = threadIdx.x;
126
+ const int nt = blockDim.x;
127
+ const int bid = blockIdx.x;
128
+ const int gtid = bid * nt + tid;
129
+ const int gnt = gridDim.x * nt;
130
+ const int count_vec = count / ELEMS_PER_VEC;
131
+ const int tail = count_vec * ELEMS_PER_VEC;
132
+
133
+ // Phase 1: cast sendbuf (T_dst) -> host_mine (T_wire) and store as vectors.
134
+ {
135
+ for (int i = gtid; i < count_vec; i += gnt) {
136
+ const int off = i * ELEMS_PER_VEC;
137
+ T_wire wire[ELEMS_PER_VEC];
138
+ #pragma unroll
139
+ for (int k = 0; k < ELEMS_PER_VEC; ++k) {
140
+ wire[k] = ggml_cuda_cast<T_wire>(sendbuf[off + k]);
141
+ }
142
+ ggml_cuda_memcpy_1<sizeof(wire)>(&host_mine[off], wire);
143
+ }
144
+ if (bid == 0 && tid < count - tail) {
145
+ host_mine[tail + tid] = ggml_cuda_cast<T_wire>(sendbuf[tail + tid]);
146
+ }
147
+ }
148
+
149
+ // Commit this block's host writes before signalling.
150
+ __threadfence_system();
151
+ __syncthreads();
152
+
153
+ // Phase 2: thread 0 of each block signals on its own arrival slot, then
154
+ // spins for the matching slot from peer. Per-block tokens mean blocks
155
+ // proceed independently -- no inter-block barrier needed.
156
+ if (tid == 0) {
157
+ int * my_slot = arrival_mine + bid * ARRIVAL_INTS;
158
+ const int * other_slot = arrival_other + bid * ARRIVAL_INTS;
159
+
160
+ ggml_cuda_ar_signal_set(my_slot, token);
161
+ __threadfence_system(); // make our signal visible system-wide
162
+
163
+ while (ggml_cuda_ar_signal_get(other_slot) != token) {
164
+ #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
165
+ __nanosleep(100);
166
+ #else
167
+ NO_DEVICE_CODE;
168
+ #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
169
+ }
170
+ }
171
+
172
+ __syncthreads();
173
+
174
+ // Acquire peer's host_other writes (this block's stripe of them).
175
+ __threadfence_system();
176
+
177
+ // Phase 3: read peer's T_wire vector, cast both sides through T_wire for
178
+ // bit-equivalence, sum in T_dst precision, and write back to recvbuf.
179
+ {
180
+ for (int i = gtid; i < count_vec; i += gnt) {
181
+ const int off = i * ELEMS_PER_VEC;
182
+ T_wire wire[ELEMS_PER_VEC];
183
+ ggml_cuda_memcpy_1<sizeof(wire)>(wire, &host_other[off]);
184
+ #pragma unroll
185
+ for (int k = 0; k < ELEMS_PER_VEC; ++k) {
186
+ const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[off + k]);
187
+ recvbuf[off + k] = ggml_cuda_cast<T_dst>(
188
+ ggml_cuda_cast<float>(d_low) + ggml_cuda_cast<float>(wire[k]));
189
+ }
190
+ }
191
+ if (bid == 0 && tid < count - tail) {
192
+ const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[tail + tid]);
193
+ recvbuf[tail + tid] = ggml_cuda_cast<T_dst>(
194
+ ggml_cuda_cast<float>(d_low) +
195
+ ggml_cuda_cast<float>(host_other[tail + tid]));
196
+ }
197
+ }
198
+ }
199
+
200
+ // Combined load-convert-add kernel. The peer's contribution arrives as T_src
201
+ // (which may be a lower-precision type than T_dst when the BF16 round-trip is
202
+ // active). For bit-equivalence between the two GPUs, dst is first rounded
203
+ // through T_src's precision via ggml_cuda_cast -- peer already truncated its
204
+ // own value the same way before sending -- so both sides perform identical
205
+ // arithmetic. When T_dst == T_src the round-trip cast is a no-op.
206
+ template <typename T_dst, typename T_src>
207
+ static __global__ void ggml_cuda_ar_add_kernel(
208
+ T_dst * __restrict__ dst,
209
+ const T_src * __restrict__ src,
210
+ int count) {
211
+ const int tid = blockIdx.x * blockDim.x + threadIdx.x;
212
+ const int nt = gridDim.x * blockDim.x;
213
+ for (int i = tid; i < count; i += nt) {
214
+ const T_src d_low = ggml_cuda_cast<T_src>(dst[i]);
215
+ dst[i] = ggml_cuda_cast<T_dst>(
216
+ ggml_cuda_cast<float>(d_low) + ggml_cuda_cast<float>(src[i]));
217
+ }
218
+ }
219
+
220
+ // ---------------------------------------------------------------------------
221
+ // Pipeline structure
222
+ // ---------------------------------------------------------------------------
223
+
224
// Depth of the event / arrival ring. Two is enough because the GPUs run in
// lockstep and are never more than one AR (or chunk) apart: by the time AR N
// wants slot[N%2], the peer is done with that slot's use in AR N-2.
// acquire_slot makes this explicit by synchronizing on ev.ker for both devices
// before host_buf[slot] is overwritten for the new AR.
static constexpr int GGML_CUDA_AR_POOL_SIZE = 2;

// Upper bound on the bytes (per GPU) a single chunked-kernel launch handles.
// Bigger tensors are reduced with several chunked launches.
static constexpr size_t GGML_CUDA_AR_MAX_BYTES = size_t{1} << 20; // 1 MB

// Copy-engine path: largest tensor accepted per call; also the allocation
// size of host_large / dev_tmp.
static constexpr size_t GGML_CUDA_AR_COPY_MAX_BYTES = size_t{32} << 20; // 32 MB

// Wire size from which the copy-engine path replaces the chunked-kernel path.
// Can be overridden with GGML_CUDA_AR_COPY_THRESHOLD.
static constexpr size_t GGML_CUDA_AR_COPY_THRESHOLD_DEFAULT = size_t{1} << 20; // 1 MB

// Copy-engine chunk-size heuristic, evaluated per call:
//   chunk_bytes = clamp(nbytes / 4, MIN, MAX)
// Dividing by 4 aims for about four chunks in flight (good D2H/H2D overlap
// with the peer). The lower clamp avoids chunks so small that the fixed
// per-memcpy cost dominates; the upper clamp avoids chunks so large that
// chunk-level pipelining stalls. GGML_CUDA_AR_COPY_CHUNK_BYTES replaces the
// heuristic with a fixed value.
static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MIN = size_t{512} << 10; // 512 KB
static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MAX = size_t{2} << 20;   // 2 MB

// Smallest chunk size an env-var override may select. It bounds the length of
// the per-slot copy-event array: 256 KB chunks -> at most 128 chunks for a
// 32 MB tensor.
static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN = size_t{256} << 10; // 256 KB

// ceil(COPY_MAX_BYTES / COPY_CHUNK_BYTES_MIN)
static constexpr int GGML_CUDA_AR_COPY_MAX_CHUNKS = static_cast<int>(
    (GGML_CUDA_AR_COPY_MAX_BYTES + GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN - 1) /
    GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN);
256
+
257
+ struct ggml_cuda_ar_event_slot {
258
+ cudaEvent_t app = nullptr; // upstream computation complete
259
+ cudaEvent_t cpy[GGML_CUDA_AR_COPY_MAX_CHUNKS] = {}; // copy-engine D2H chunks complete
260
+ cudaEvent_t h2d = nullptr; // copy-engine H2Ds complete (handoff AR stream -> compute stream)
261
+ cudaEvent_t ker = nullptr; // AllReduce kernel complete
262
+ };
263
+
264
+ // Mapped pinned host allocation: cudaHostAlloc + cudaHostGetDevicePointer
265
+ // in one place, with the host handle preserved for cudaFreeHost. Used where
266
+ // the CPU never touches the buffer -- only the device reads/writes via the
267
+ // mapped device pointer. Required on systems where cudaDevAttrCanUseHost-
268
+ // PointerForRegisteredMem is 0 and the host pointer can't be used as a
269
+ // device pointer.
270
+ struct ggml_cuda_ar_host_mapping {
271
+ uint8_t * host = nullptr; // cudaFreeHost handle; also the H-side ptr for cudaMemcpyAsync
272
+ uint8_t * dev = nullptr; // device-side pointer for kernels / cudaMemset
273
+
274
+ cudaError_t alloc(size_t bytes) {
275
+ cudaError_t rc = cudaHostAlloc(reinterpret_cast<void **>(&host), bytes,
276
+ cudaHostAllocPortable | cudaHostAllocMapped);
277
+ if (rc != cudaSuccess) {
278
+ host = nullptr;
279
+ return rc;
280
+ }
281
+ rc = cudaHostGetDevicePointer(reinterpret_cast<void **>(&dev), host, 0);
282
+ if (rc != cudaSuccess) {
283
+ cudaFreeHost(host);
284
+ host = nullptr;
285
+ dev = nullptr;
286
+ }
287
+ return rc;
288
+ }
289
+
290
+ void free() {
291
+ if (host) {
292
+ cudaFreeHost(host);
293
+ host = nullptr;
294
+ dev = nullptr;
295
+ }
296
+ }
297
+ };
298
+
299
+ struct ggml_cuda_ar_pipeline {
300
+ int n_devices;
301
+ int devices[GGML_CUDA_MAX_DEVICES];
302
+ size_t buf_bytes; // bytes per device in host_buf[]
303
+ size_t copy_bytes; // bytes per device in host_large[] / dev_tmp[]
304
+ size_t copy_threshold;
305
+ size_t copy_chunk_bytes;
306
+ size_t bf16_threshold; // tensors >= this size (bytes) are reduced via FP32->BF16 round-trip; 0 disables
307
+ uint64_t call_count;
308
+
309
+ // Per-device resources.
310
+ ggml_cuda_ar_host_mapping host_buf[GGML_CUDA_MAX_DEVICES]; // pinned staging (chunked kernel)
311
+ ggml_cuda_ar_host_mapping host_large[GGML_CUDA_MAX_DEVICES]; // pinned staging (copy-engine)
312
+ char * dev_tmp[GGML_CUDA_MAX_DEVICES]; // device scratch for copy-engine path
313
+ cudaStream_t streams[GGML_CUDA_MAX_DEVICES]; // non-blocking
314
+ ggml_cuda_ar_event_slot ev_pool[GGML_CUDA_MAX_DEVICES][GGML_CUDA_AR_POOL_SIZE];
315
+
316
+ // Copy-engine: per-device "I finished reading my peer's host_large"
317
+ // event. Indexed by RECORDER device. Recorded same-device on streams[i]
318
+ // after stage 2's last H2D from host_large[peer]. Waited cross-device
319
+ // by peer's stage-1 stream before the next AR overwrites host_large[peer].
320
+ cudaEvent_t host_large_read_done[GGML_CUDA_MAX_DEVICES];
321
+ bool host_large_read_done_valid;
322
+
323
+ // Copy-engine: per-device "my add_kernel is done with dev_tmp" event.
324
+ // Recorded on the compute stream after each add_kernel; the AR stream
325
+ // waits on it before the next copy_impl's H2D overwrites dev_tmp. Lets us
326
+ // single-buffer dev_tmp despite add_kernel running on a separate stream.
327
+ cudaEvent_t dev_tmp_kernel_done[GGML_CUDA_MAX_DEVICES];
328
+ bool dev_tmp_kernel_done_valid;
329
+
330
+ // Arrival ring: ARRIVAL_STRIDE bytes between adjacent ints. Mapped pinned
331
+ // memory; CPU never reads/writes -- only the kernel and cudaMemset.
332
+ // Use ggml_cuda_ar_arrival_ptr() to index.
333
+ ggml_cuda_ar_host_mapping arrival;
334
+ };
335
+
336
+ // Base pointer for the (slot, rank) per-block token block. The kernel adds
337
+ // blockIdx.x * (ARRIVAL_STRIDE/sizeof(int)) internally to land on its own slot.
338
+ static int * ggml_cuda_ar_arrival_ptr(const ggml_cuda_ar_pipeline * p, int slot, int rank) {
339
+ const size_t offset = ((size_t)slot * p->n_devices + rank) *
340
+ GGML_CUDA_AR_KERNEL_BLOCKS * GGML_CUDA_AR_ARRIVAL_STRIDE;
341
+ return reinterpret_cast<int *>(p->arrival.dev + offset);
342
+ }
343
+
344
// Reads environment variable `name` as an unsigned base-10 integer.
//
// Returns `default_value` when the variable is unset, empty, does not start
// with a number, is negative, or has anything other than blanks after the
// number (e.g. "1M" or "12abc"). strtoull on its own would accept those last
// two cases: a leading '-' is parsed and negated (wrapping to a huge value),
// and parsing simply stops at the first unexpected character.
//
// Values too large for unsigned long long are not detected here; strtoull
// saturates them to ULLONG_MAX.
static uint64_t ggml_cuda_ar_env_u64(const char * name, uint64_t default_value) {
    const char * value = getenv(name);
    if (value == nullptr || value[0] == '\0') {
        return default_value;
    }

    // Find the first non-blank character to reject an explicit minus sign.
    const char * first = value;
    while (*first == ' ' || *first == '\t') {
        ++first;
    }
    if (*first == '-') {
        return default_value;
    }

    char * end = nullptr;
    const unsigned long long parsed = strtoull(value, &end, 10);
    if (end == value) {
        return default_value; // no digits consumed
    }

    // Trailing blanks are tolerated; anything else means a malformed value.
    while (*end == ' ' || *end == '\t') {
        ++end;
    }
    if (*end != '\0') {
        return default_value;
    }

    return (uint64_t) parsed;
}
354
+
355
// Result of ggml_cuda_ar_acquire_slot().
struct ggml_cuda_ar_slot_info {
    int slot;  // ring index, in [0, GGML_CUDA_AR_POOL_SIZE)
    int token; // call counter after the increment, truncated to int
};
359
+
360
+ static ggml_cuda_ar_slot_info ggml_cuda_ar_acquire_slot(ggml_cuda_ar_pipeline * p) {
361
+ const int slot = static_cast<int>(p->call_count % GGML_CUDA_AR_POOL_SIZE);
362
+ const bool pool_lapped = p->call_count >= GGML_CUDA_AR_POOL_SIZE;
363
+ p->call_count++;
364
+
365
+ if (pool_lapped) {
366
+ for (int i = 0; i < p->n_devices; ++i) {
367
+ ggml_cuda_set_device(p->devices[i]);
368
+ CUDA_CHECK(cudaEventSynchronize(p->ev_pool[i][slot].ker));
369
+ }
370
+ }
371
+
372
+ return { slot, (int) p->call_count };
373
+ }
374
+
375
+ // Per-AR copy-engine chunk size: env-var override if set, else heuristic
376
+ // (clamp(nbytes/4, HEURISTIC_MIN, HEURISTIC_MAX)).
377
+ static size_t ggml_cuda_ar_chunk_bytes(const ggml_cuda_ar_pipeline * p, size_t nbytes) {
378
+ if (p->copy_chunk_bytes > 0) {
379
+ return p->copy_chunk_bytes;
380
+ }
381
+ return std::min(GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MAX,
382
+ std::max(GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MIN, nbytes / 4));
383
+ }
384
+
385
+ static void ggml_cuda_ar_wait_for_compute(
386
+ ggml_cuda_ar_pipeline * p, ggml_backend_cuda_context * cuda_ctx, int rank, int slot) {
387
+ ggml_cuda_ar_event_slot & ev = p->ev_pool[rank][slot];
388
+ CUDA_CHECK(cudaEventRecord(ev.app, cuda_ctx->stream()));
389
+ CUDA_CHECK(cudaStreamWaitEvent(p->streams[rank], ev.app));
390
+ }
391
+
392
+ // ---------------------------------------------------------------------------
393
+ // Init / free
394
+ // ---------------------------------------------------------------------------
395
+
396
+ ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(const int * devices, size_t n_devices) {
397
+
398
+ if (n_devices != 2) {
399
+ GGML_LOG_DEBUG("%s: internal AllReduce only supports n_devices=2 (got %zu); "
400
+ "falling back\n", __func__, n_devices);
401
+ return nullptr;
402
+ }
403
+
404
+ // The chunked kernel uses __nanosleep, which is sm70+ (Volta+).
405
+ for (size_t i = 0; i < n_devices; ++i) {
406
+ const int cc = ggml_cuda_info().devices[devices[i]].cc;
407
+ if (cc < GGML_CUDA_CC_VOLTA) {
408
+ GGML_LOG_DEBUG("%s: internal AllReduce requires compute capability >= %d "
409
+ "(device %d has cc=%d); falling back\n",
410
+ __func__, GGML_CUDA_CC_VOLTA, devices[i], cc);
411
+ return nullptr;
412
+ }
413
+ }
414
+
415
+ auto * p = new ggml_cuda_ar_pipeline{};
416
+ p->n_devices = n_devices;
417
+ p->copy_bytes = GGML_CUDA_AR_COPY_MAX_BYTES;
418
+ p->copy_threshold = ggml_cuda_ar_env_u64("GGML_CUDA_AR_COPY_THRESHOLD", GGML_CUDA_AR_COPY_THRESHOLD_DEFAULT);
419
+ // 0 = use the per-call heuristic (default). Non-zero env value forces a
420
+ // fixed chunk size for diagnostics, with a floor at COPY_CHUNK_BYTES_MIN.
421
+ p->copy_chunk_bytes = ggml_cuda_ar_env_u64("GGML_CUDA_AR_COPY_CHUNK_BYTES", 0);
422
+ if (p->copy_chunk_bytes > 0 && p->copy_chunk_bytes < GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN) {
423
+ GGML_LOG_WARN("%s: GGML_CUDA_AR_COPY_CHUNK_BYTES=%zu below minimum %zu; clamping\n",
424
+ __func__, p->copy_chunk_bytes, GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN);
425
+ p->copy_chunk_bytes = GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN;
426
+ }
427
+ // Default 1: BF16 round-trip is always on for F32 inputs (any non-zero
428
+ // ne). Set GGML_CUDA_AR_BF16_THRESHOLD=0 to disable, or to a larger
429
+ // byte threshold to opt out for small tensors.
430
+ p->bf16_threshold = ggml_cuda_ar_env_u64("GGML_CUDA_AR_BF16_THRESHOLD", 1);
431
+ for (size_t i = 0; i < n_devices; ++i) {
432
+ p->devices[i] = devices[i];
433
+ }
434
+
435
+ // Per-device streams and event pools.
436
+ for (size_t i = 0; i < n_devices; ++i) {
437
+ ggml_cuda_set_device(p->devices[i]);
438
+
439
+ cudaStream_t stream = nullptr;
440
+ if (cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) != cudaSuccess) {
441
+ GGML_LOG_ERROR("%s: cudaStreamCreateWithFlags failed for device %d\n",
442
+ __func__, p->devices[i]);
443
+ ggml_cuda_ar_pipeline_free(p);
444
+ return nullptr;
445
+ }
446
+ p->streams[i] = stream;
447
+
448
+ for (int s = 0; s < GGML_CUDA_AR_POOL_SIZE; ++s) {
449
+ bool ok =
450
+ cudaEventCreateWithFlags(&p->ev_pool[i][s].app, cudaEventDisableTiming) == cudaSuccess &&
451
+ cudaEventCreateWithFlags(&p->ev_pool[i][s].h2d, cudaEventDisableTiming) == cudaSuccess &&
452
+ cudaEventCreateWithFlags(&p->ev_pool[i][s].ker, cudaEventDisableTiming) == cudaSuccess;
453
+ for (int c = 0; ok && c < GGML_CUDA_AR_COPY_MAX_CHUNKS; ++c) {
454
+ ok = cudaEventCreateWithFlags(&p->ev_pool[i][s].cpy[c], cudaEventDisableTiming) == cudaSuccess;
455
+ }
456
+ if (!ok) {
457
+ GGML_LOG_ERROR("%s: cudaEventCreate failed for device %d slot %d\n",
458
+ __func__, p->devices[i], s);
459
+ ggml_cuda_ar_pipeline_free(p);
460
+ return nullptr;
461
+ }
462
+ }
463
+
464
+ if (cudaEventCreateWithFlags(&p->host_large_read_done[i], cudaEventDisableTiming) != cudaSuccess) {
465
+ GGML_LOG_ERROR("%s: cudaEventCreate for host_large_read_done failed for device %d\n",
466
+ __func__, p->devices[i]);
467
+ ggml_cuda_ar_pipeline_free(p);
468
+ return nullptr;
469
+ }
470
+ if (cudaEventCreateWithFlags(&p->dev_tmp_kernel_done[i], cudaEventDisableTiming) != cudaSuccess) {
471
+ GGML_LOG_ERROR("%s: cudaEventCreate for dev_tmp_kernel_done failed for device %d\n",
472
+ __func__, p->devices[i]);
473
+ ggml_cuda_ar_pipeline_free(p);
474
+ return nullptr;
475
+ }
476
+ }
477
+
478
+ // Arrival ring: cache-line padded so each GPU's int is on its own line.
479
+ const size_t arrival_bytes =
480
+ (size_t)GGML_CUDA_AR_POOL_SIZE * n_devices *
481
+ GGML_CUDA_AR_KERNEL_BLOCKS * GGML_CUDA_AR_ARRIVAL_STRIDE;
482
+ if (p->arrival.alloc(arrival_bytes) != cudaSuccess) {
483
+ GGML_LOG_ERROR("%s: alloc for arrival ring failed (%zu bytes)\n",
484
+ __func__, arrival_bytes);
485
+ ggml_cuda_ar_pipeline_free(p);
486
+ return nullptr;
487
+ }
488
+ ggml_cuda_set_device(p->devices[0]);
489
+ if (cudaMemset(p->arrival.dev, 0, arrival_bytes) != cudaSuccess) {
490
+ GGML_LOG_ERROR("%s: cudaMemset for arrival ring failed (%zu bytes)\n",
491
+ __func__, arrival_bytes);
492
+ ggml_cuda_ar_pipeline_free(p);
493
+ return nullptr;
494
+ }
495
+
496
+ // Per-device pinned staging buffers -- POOL_SIZE-deep ring so the chunked-
497
+ // kernel can write the next slot's data while the peer is still reading
498
+ // the previous slot's. Indexed by (slot * buf_bytes) at the call site.
499
+ p->buf_bytes = GGML_CUDA_AR_MAX_BYTES;
500
+ const size_t host_buf_total = (size_t) GGML_CUDA_AR_POOL_SIZE * p->buf_bytes;
501
+ for (size_t i = 0; i < n_devices; ++i) {
502
+ if (p->host_buf[i].alloc(host_buf_total) != cudaSuccess) {
503
+ GGML_LOG_ERROR("%s: alloc for staging failed (%zu bytes)\n",
504
+ __func__, host_buf_total);
505
+ ggml_cuda_ar_pipeline_free(p);
506
+ return nullptr;
507
+ }
508
+ }
509
+
510
+ // Copy-engine path: pinned host staging + device scratch, sized for the
511
+ // largest tensor we accept on this path (GGML_CUDA_AR_COPY_MAX_BYTES).
512
+ // dev_tmp is single-buffered; cross-AR safety is enforced by an explicit
513
+ // cross-stream wait in copy_impl on the prior AR's add_kernel-done event.
514
+ for (size_t i = 0; i < n_devices; ++i) {
515
+ ggml_cuda_set_device(p->devices[i]);
516
+ if (p->host_large[i].alloc(p->copy_bytes) != cudaSuccess) {
517
+ GGML_LOG_ERROR("%s: alloc for large staging failed (%zu bytes)\n",
518
+ __func__, p->copy_bytes);
519
+ ggml_cuda_ar_pipeline_free(p);
520
+ return nullptr;
521
+ }
522
+ if (cudaMalloc(reinterpret_cast<void **>(&p->dev_tmp[i]), p->copy_bytes) != cudaSuccess) {
523
+ GGML_LOG_ERROR("%s: cudaMalloc for copy scratch failed (%zu bytes) on device %d\n",
524
+ __func__, p->copy_bytes, p->devices[i]);
525
+ ggml_cuda_ar_pipeline_free(p);
526
+ return nullptr;
527
+ }
528
+ }
529
+
530
+ GGML_LOG_INFO("%s: initialized AllReduce pipeline: %zu GPUs, "
531
+ "%zu KB chunked kernel staging + %zu MB copy-engine staging per GPU\n",
532
+ __func__, n_devices, p->buf_bytes >> 10, p->copy_bytes >> 20);
533
+
534
+ return p;
535
+ }
536
+
537
+ void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * p) {
538
+ if (!p) {
539
+ return;
540
+ }
541
+
542
+ // Drain all in-flight kernels before tearing down resources.
543
+ for (int i = 0; i < p->n_devices; ++i) {
544
+ if (p->streams[i]) {
545
+ ggml_cuda_set_device(p->devices[i]);
546
+ cudaStreamSynchronize(p->streams[i]);
547
+ }
548
+ }
549
+
550
+ for (int i = 0; i < p->n_devices; ++i) {
551
+ p->host_buf[i].free();
552
+ p->host_large[i].free();
553
+ if (p->dev_tmp[i]) {
554
+ ggml_cuda_set_device(p->devices[i]);
555
+ cudaFree(p->dev_tmp[i]);
556
+ }
557
+ ggml_cuda_set_device(p->devices[i]);
558
+ for (int s = 0; s < GGML_CUDA_AR_POOL_SIZE; ++s) {
559
+ if (p->ev_pool[i][s].app) { cudaEventDestroy(p->ev_pool[i][s].app); }
560
+ for (int c = 0; c < GGML_CUDA_AR_COPY_MAX_CHUNKS; ++c) {
561
+ if (p->ev_pool[i][s].cpy[c]) { cudaEventDestroy(p->ev_pool[i][s].cpy[c]); }
562
+ }
563
+ if (p->ev_pool[i][s].h2d) { cudaEventDestroy(p->ev_pool[i][s].h2d); }
564
+ if (p->ev_pool[i][s].ker) { cudaEventDestroy(p->ev_pool[i][s].ker); }
565
+ }
566
+ if (p->host_large_read_done[i]) {
567
+ ggml_cuda_set_device(p->devices[i]);
568
+ cudaEventDestroy(p->host_large_read_done[i]);
569
+ }
570
+ if (p->dev_tmp_kernel_done[i]) {
571
+ ggml_cuda_set_device(p->devices[i]);
572
+ cudaEventDestroy(p->dev_tmp_kernel_done[i]);
573
+ }
574
+ if (p->streams[i]) {
575
+ ggml_cuda_set_device(p->devices[i]);
576
+ cudaStreamDestroy(p->streams[i]);
577
+ }
578
+ }
579
+ p->arrival.free();
580
+ delete p;
581
+ }
582
+
583
+ // ---------------------------------------------------------------------------
584
+ // Dispatch
585
+ // ---------------------------------------------------------------------------
586
+
587
+ // Asymmetric copy_impl: data sent over PCIe in T_src precision (one element of
588
+ // nbytes per ne element); accumulated locally into a T_dst buffer. When
589
+ // T_src == T_dst this is the original homogeneous reduction. When they differ
590
+ // (e.g. BF16 wire / F32 accumulator) the add kernel rounds dst through T_src
591
+ // for bit-equivalence between GPUs and we skip the otherwise-needed
592
+ // post-conversion entirely.
593
+ template <typename T_src, typename T_dst>
594
+ static bool ggml_cuda_ar_allreduce_copy_impl(
595
+ ggml_cuda_ar_pipeline * p,
596
+ ggml_backend_t * backends,
597
+ T_src * const src_buf[GGML_CUDA_MAX_DEVICES],
598
+ T_dst * const dst_buf[GGML_CUDA_MAX_DEVICES],
599
+ const bool compute[GGML_CUDA_MAX_DEVICES],
600
+ int64_t ne,
601
+ size_t nbytes) {
602
+ GGML_ASSERT(p->n_devices == 2);
603
+ GGML_ASSERT(nbytes <= p->copy_bytes);
604
+ GGML_ASSERT(ne <= std::numeric_limits<int>::max());
605
+
606
+ const size_t chunk_bytes = ggml_cuda_ar_chunk_bytes(p, nbytes);
607
+ GGML_ASSERT(chunk_bytes > 0);
608
+
609
+ const int slot = ggml_cuda_ar_acquire_slot(p).slot;
610
+ const size_t copy_chunks = (nbytes + chunk_bytes - 1) / chunk_bytes;
611
+ GGML_ASSERT(copy_chunks <= GGML_CUDA_AR_COPY_MAX_CHUNKS);
612
+
613
+ ggml_backend_cuda_context * cuda_ctx[2] = {};
614
+
615
+ // Stage 1: both GPUs copy their local contribution to pinned host memory.
616
+ for (int i = 0; i < 2; ++i) {
617
+ ggml_cuda_set_device(p->devices[i]);
618
+ cuda_ctx[i] = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
619
+ GGML_ASSERT(cuda_ctx[i]->device == p->devices[i]);
620
+
621
+ ggml_cuda_ar_wait_for_compute(p, cuda_ctx[i], i, slot);
622
+
623
+ // Wait for peer's H2D from our host_large[i] (recorded in the
624
+ // previous AR's stage 2) to complete before we overwrite host_large[i].
625
+ // host_large_read_done[peer] = peer finished reading host_large[i].
626
+ // No-op on the first AR -- no prior record exists.
627
+ if (p->host_large_read_done_valid) {
628
+ const int peer = 1 - i;
629
+ CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->host_large_read_done[peer]));
630
+ }
631
+
632
+ if (!compute[i]) {
633
+ CUDA_CHECK(cudaMemsetAsync(src_buf[i], 0, nbytes, p->streams[i]));
634
+ }
635
+
636
+ for (size_t c = 0; c < copy_chunks; ++c) {
637
+ const size_t offset = c * chunk_bytes;
638
+ const size_t this_bytes = (nbytes - offset) < chunk_bytes ?
639
+ (nbytes - offset) : chunk_bytes;
640
+
641
+ CUDA_CHECK(cudaMemcpyAsync(
642
+ p->host_large[i].host + offset, reinterpret_cast<char *>(src_buf[i]) + offset, this_bytes,
643
+ cudaMemcpyDeviceToHost, p->streams[i]));
644
+ CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].cpy[c], p->streams[i]));
645
+ }
646
+ }
647
+
648
+ // Stage 2: each GPU waits for each peer D2H chunk, pulls that chunk back to
649
+ // local device scratch (dev_tmp), then performs one device-local add over
650
+ // the assembled peer tensor. The H2Ds run on the AR stream (copy engine)
651
+ // and the add_kernel runs on the caller's compute stream, so the AR stream
652
+ // stays pure-copy and avoids an in-stream copy->compute engine switch every
653
+ // AR. dev_tmp is single-buffered: the AR stream waits cross-stream on the
654
+ // prior AR's add_kernel-done event before overwriting it.
655
+ for (int i = 0; i < 2; ++i) {
656
+ const int peer = 1 - i;
657
+ ggml_cuda_set_device(p->devices[i]);
658
+
659
+ // Wait for the previous AR's add_kernel (on the compute stream) to
660
+ // finish reading dev_tmp before our H2D overwrites it. No-op on the
661
+ // first copy_impl call.
662
+ if (p->dev_tmp_kernel_done_valid) {
663
+ CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->dev_tmp_kernel_done[i]));
664
+ }
665
+
666
+ for (size_t c = 0; c < copy_chunks; ++c) {
667
+ const size_t offset = c * chunk_bytes;
668
+ const size_t this_bytes = (nbytes - offset) < chunk_bytes ?
669
+ (nbytes - offset) : chunk_bytes;
670
+
671
+ CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->ev_pool[peer][slot].cpy[c]));
672
+ CUDA_CHECK(cudaMemcpyAsync(
673
+ p->dev_tmp[i] + offset, p->host_large[peer].host + offset, this_bytes,
674
+ cudaMemcpyHostToDevice, p->streams[i]));
675
+ }
676
+
677
+ // Mark our reads of host_large[peer] complete so peer's next AR can
678
+ // safely overwrite it.
679
+ CUDA_CHECK(cudaEventRecord(p->host_large_read_done[i], p->streams[i]));
680
+
681
+ // Hand off from AR stream (copy engine) to compute stream: compute
682
+ // stream waits for all H2Ds to finish, then runs the add_kernel.
683
+ CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].h2d, p->streams[i]));
684
+ CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx[i]->stream(), p->ev_pool[i][slot].h2d));
685
+
686
+ const int block_size = 256;
687
+ int n_blocks = (int) ((ne + block_size - 1) / block_size);
688
+ if (n_blocks > 1024) {
689
+ n_blocks = 1024;
690
+ }
691
+ ggml_cuda_ar_add_kernel<T_dst, T_src><<<n_blocks, block_size, 0, cuda_ctx[i]->stream()>>>(
692
+ dst_buf[i],
693
+ reinterpret_cast<const T_src *>(p->dev_tmp[i]),
694
+ (int) ne);
695
+ CUDA_CHECK(cudaGetLastError());
696
+
697
+ // Record dev_tmp-released on the compute stream so the next copy_impl
698
+ // can wait for the kernel to finish before overwriting dev_tmp. Also
699
+ // record AR-done as ev.ker for acquire_slot's pool-wraparound sync.
700
+ CUDA_CHECK(cudaEventRecord(p->dev_tmp_kernel_done[i], cuda_ctx[i]->stream()));
701
+ CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].ker, cuda_ctx[i]->stream()));
702
+ }
703
+ p->host_large_read_done_valid = true;
704
+ p->dev_tmp_kernel_done_valid = true;
705
+
706
+ return true;
707
+ }
708
+
709
+ // Outer-level chunker: copy_impl handles up to copy_bytes per call (limited by
710
+ // the host_large / dev_tmp allocation size). When the full AR exceeds that,
711
+ // slice the tensor into copy_bytes-sized pieces and call copy_impl repeatedly.
712
+ // Each slice goes through its own stage 1 -> stage 2 cycle and acquires its own
713
+ // slot, so cross-AR fences and pool wraparound work the same way as for any
714
+ // other sequence of small ARs.
715
+ template <typename T_src, typename T_dst>
716
+ static bool ggml_cuda_ar_allreduce_copy_outer(
717
+ ggml_cuda_ar_pipeline * p,
718
+ ggml_backend_t * backends,
719
+ T_src * const src_buf[GGML_CUDA_MAX_DEVICES],
720
+ T_dst * const dst_buf[GGML_CUDA_MAX_DEVICES],
721
+ const bool compute[GGML_CUDA_MAX_DEVICES],
722
+ int64_t ne) {
723
+ const int64_t outer_max_elems = (int64_t) (p->copy_bytes / sizeof(T_src));
724
+ GGML_ASSERT(outer_max_elems > 0);
725
+
726
+ bool ok = true;
727
+ for (int64_t outer_start = 0; outer_start < ne && ok; outer_start += outer_max_elems) {
728
+ const int64_t outer_ne = std::min(outer_max_elems, ne - outer_start);
729
+ const size_t outer_nbytes = (size_t) outer_ne * sizeof(T_src);
730
+
731
+ T_src * src[GGML_CUDA_MAX_DEVICES] = {};
732
+ T_dst * dst[GGML_CUDA_MAX_DEVICES] = {};
733
+ for (int i = 0; i < p->n_devices; ++i) {
734
+ src[i] = src_buf[i] + outer_start;
735
+ dst[i] = dst_buf[i] + outer_start;
736
+ }
737
+ ok = ggml_cuda_ar_allreduce_copy_impl<T_src, T_dst>(
738
+ p, backends, src, dst, compute, outer_ne, outer_nbytes);
739
+ }
740
+ return ok;
741
+ }
742
+
743
+ bool ggml_cuda_ar_allreduce(
744
+ ggml_cuda_ar_pipeline * p,
745
+ ggml_backend_t * backends,
746
+ ggml_tensor ** tensors) {
747
+ GGML_ASSERT(p != nullptr);
748
+
749
+ const int n = p->n_devices;
750
+ GGML_ASSERT(n == 2);
751
+
752
+ const ggml_type input_type = tensors[0]->type;
753
+ GGML_ASSERT(input_type == GGML_TYPE_F32 || input_type == GGML_TYPE_F16 || input_type == GGML_TYPE_BF16);
754
+
755
+ const int64_t ne = ggml_nelements(tensors[0]);
756
+ GGML_ASSERT(ne > 0);
757
+
758
+ const size_t input_nbytes = ggml_nbytes(tensors[0]);
759
+
760
+ // BF16 round-trip: F32 inputs >= bf16_threshold are converted to BF16 for
761
+ // the reduction (chunked or copy-engine), halving on-wire bytes. Matches
762
+ // NCCL's behaviour. The pre-conversion zeroes inactive shards so the
763
+ // inner paths see them as already-prepared compute tensors.
764
+ const bool use_bf16 =
765
+ input_type == GGML_TYPE_F32 &&
766
+ p->bf16_threshold > 0 &&
767
+ input_nbytes >= p->bf16_threshold;
768
+
769
+ const ggml_type kernel_type = use_bf16 ? GGML_TYPE_BF16 : input_type;
770
+ const size_t type_size = ggml_type_size(kernel_type);
771
+ GGML_ASSERT(p->buf_bytes >= type_size);
772
+ const size_t nbytes = (size_t) ne * type_size;
773
+
774
+ bool compute_flag[GGML_CUDA_MAX_DEVICES] = {};
775
+ for (int i = 0; i < n; ++i) {
776
+ compute_flag[i] = (tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) != 0;
777
+ }
778
+
779
+ // Decide between copy-engine and chunked kernel paths based on the working
780
+ // type's actual byte count. No upper bound: copy_outer slices reductions
781
+ // larger than copy_bytes into copy_bytes-sized pieces.
782
+ const bool use_copy_engine =
783
+ p->copy_threshold > 0 &&
784
+ nbytes >= p->copy_threshold;
785
+
786
+ // BF16 inactive-shard zeroing: when use_bf16 is on, the combined kernel
787
+ // (chunked kernel path) and the combined add kernel (copy_engine path)
788
+ // both accumulate into the F32 tensor data directly, so an inactive
789
+ // shard's accumulator must start at zero.
790
+ if (use_bf16) {
791
+ for (int i = 0; i < n; ++i) {
792
+ if (!compute_flag[i]) {
793
+ auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
794
+ GGML_ASSERT(cuda_ctx->device == p->devices[i]);
795
+ ggml_cuda_set_device(p->devices[i]);
796
+ CUDA_CHECK(cudaMemsetAsync(tensors[i]->data, 0, (size_t) ne * sizeof(float), cuda_ctx->stream()));
797
+ }
798
+ }
799
+ }
800
+
801
+ // Pre-convert F32 -> BF16 into bf16_tmp ONLY for the copy_engine + use_bf16
802
+ // path; the chunked kernel path's combined kernel does the conversion
803
+ // inline as it writes to host_buf.
804
+ ggml_cuda_pool_alloc<nv_bfloat16> bf16_tmp[GGML_CUDA_MAX_DEVICES];
805
+ void * copy_src_ptr[GGML_CUDA_MAX_DEVICES] = {};
806
+
807
+ if (use_copy_engine && use_bf16) {
808
+ to_bf16_cuda_t to_bf16 = ggml_get_to_bf16_cuda(GGML_TYPE_F32);
809
+ for (int i = 0; i < n; ++i) {
810
+ auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
811
+ GGML_ASSERT(cuda_ctx->device == p->devices[i]);
812
+ bf16_tmp[i].pool = &cuda_ctx->pool();
813
+ bf16_tmp[i].alloc(ne);
814
+ ggml_cuda_set_device(p->devices[i]);
815
+ if (compute_flag[i]) {
816
+ to_bf16(tensors[i]->data, bf16_tmp[i].get(), ne, cuda_ctx->stream());
817
+ CUDA_CHECK(cudaGetLastError());
818
+ } else {
819
+ CUDA_CHECK(cudaMemsetAsync(bf16_tmp[i].get(), 0, nbytes, cuda_ctx->stream()));
820
+ }
821
+ copy_src_ptr[i] = bf16_tmp[i].get();
822
+ }
823
+ }
824
+
825
+ bool ok = true;
826
+ if (use_copy_engine) {
827
+ // After up-front BF16 conversion, the tmp buffers already hold the
828
+ // (possibly zeroed-for-inactive) data, so the inner path can treat
829
+ // every shard as compute.
830
+ bool inner_compute[GGML_CUDA_MAX_DEVICES];
831
+ for (int i = 0; i < n; ++i) {
832
+ inner_compute[i] = use_bf16 ? true : compute_flag[i];
833
+ }
834
+
835
+ // Dispatch into copy_impl with explicit src/dst types. When use_bf16
836
+ // is on, the wire type is BF16 (src = bf16_tmp) and the accumulator
837
+ // is F32 (dst = tensors[i]->data); the combined add kernel rounds dst
838
+ // through BF16 for bit-equivalence and writes F32 directly, so no
839
+ // post-conversion is needed. Otherwise src == dst (same native type).
840
+ if (use_bf16) {
841
+ GGML_ASSERT(kernel_type == GGML_TYPE_BF16);
842
+ nv_bfloat16 * src[GGML_CUDA_MAX_DEVICES] = {};
843
+ float * dst[GGML_CUDA_MAX_DEVICES] = {};
844
+ for (int i = 0; i < n; ++i) {
845
+ src[i] = static_cast<nv_bfloat16 *>(copy_src_ptr[i]);
846
+ dst[i] = static_cast<float *>(tensors[i]->data);
847
+ }
848
+ ok = ggml_cuda_ar_allreduce_copy_outer<nv_bfloat16, float>(
849
+ p, backends, src, dst, inner_compute, ne);
850
+ } else {
851
+ switch (kernel_type) {
852
+ case GGML_TYPE_F32: {
853
+ float * buf[GGML_CUDA_MAX_DEVICES] = {};
854
+ for (int i = 0; i < n; ++i) {
855
+ buf[i] = static_cast<float *>(tensors[i]->data);
856
+ }
857
+ ok = ggml_cuda_ar_allreduce_copy_outer<float, float>(
858
+ p, backends, buf, buf, inner_compute, ne);
859
+ break;
860
+ }
861
+ case GGML_TYPE_BF16: {
862
+ nv_bfloat16 * buf[GGML_CUDA_MAX_DEVICES] = {};
863
+ for (int i = 0; i < n; ++i) {
864
+ buf[i] = static_cast<nv_bfloat16 *>(tensors[i]->data);
865
+ }
866
+ ok = ggml_cuda_ar_allreduce_copy_outer<nv_bfloat16, nv_bfloat16>(
867
+ p, backends, buf, buf, inner_compute, ne);
868
+ break;
869
+ }
870
+ case GGML_TYPE_F16: {
871
+ half * buf[GGML_CUDA_MAX_DEVICES] = {};
872
+ for (int i = 0; i < n; ++i) {
873
+ buf[i] = static_cast<half *>(tensors[i]->data);
874
+ }
875
+ ok = ggml_cuda_ar_allreduce_copy_outer<half, half>(
876
+ p, backends, buf, buf, inner_compute, ne);
877
+ break;
878
+ }
879
+ default:
880
+ GGML_ASSERT(false);
881
+ }
882
+ }
883
+ } else {
884
+ // host_buf carries T_wire-typed data; max_chunk_elems is the count that
885
+ // fits in one host_buf at the wire size.
886
+ const size_t max_chunk_elems = p->buf_bytes / type_size;
887
+ const size_t input_type_size = ggml_type_size(input_type);
888
+
889
+ // Chunked kernel path runs entirely on the caller's compute stream:
890
+ // since AR is a barrier here, same-stream ordering subsumes any
891
+ // cross-stream event handshake that the copy-engine path needs, and
892
+ // skips the cross-stream scheduling overhead that was hurting the
893
+ // small-tensor (tg) latency on the AR-stream variant. Only ev.ker is
894
+ // still recorded at end-of-AR for acquire_slot's pool-wraparound check.
895
+ for (int64_t chunk_start = 0; chunk_start < ne; chunk_start += (int64_t) max_chunk_elems) {
896
+ const size_t remaining_elems = (size_t) (ne - chunk_start);
897
+ const size_t chunk_elems = remaining_elems < max_chunk_elems ? remaining_elems : max_chunk_elems;
898
+ const size_t chunk_dst_bytes = chunk_elems * input_type_size;
899
+
900
+ const auto [slot, token] = ggml_cuda_ar_acquire_slot(p);
901
+ const bool last_chunk = chunk_start + (int64_t) chunk_elems == ne;
902
+
903
+ for (int i = 0; i < n; ++i) {
904
+ const int peer = 1 - i; // valid for n == 2 only
905
+ ggml_cuda_set_device(p->devices[i]);
906
+ auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
907
+ GGML_ASSERT(cuda_ctx->device == p->devices[i]);
908
+ cudaStream_t stream = cuda_ctx->stream();
909
+
910
+ char * data = static_cast<char *>(tensors[i]->data) + chunk_start * (int64_t) input_type_size;
911
+
912
+ // Match NCCL/meta-backend semantics: inactive shards contribute
913
+ // zeros. On the BF16 path the F32 tensor data was already
914
+ // zeroed up-front (above), so per-chunk zeroing isn't needed.
915
+ if (!compute_flag[i] && !use_bf16) {
916
+ CUDA_CHECK(cudaMemsetAsync(data, 0, chunk_dst_bytes, stream));
917
+ }
918
+
919
+ #define LAUNCH_AR_KERNEL(T_dst, T_wire) \
920
+ ggml_cuda_ar_kernel<T_dst, T_wire><<<dim3(GGML_CUDA_AR_KERNEL_BLOCKS), dim3(256), 0, stream>>>( \
921
+ reinterpret_cast<const T_dst *>(data), \
922
+ reinterpret_cast<T_dst *>(data), \
923
+ reinterpret_cast<T_wire *>(p->host_buf[i].dev + (size_t) slot * p->buf_bytes), \
924
+ reinterpret_cast<const T_wire *>(p->host_buf[peer].dev + (size_t) slot * p->buf_bytes), \
925
+ static_cast<int>(chunk_elems), \
926
+ ggml_cuda_ar_arrival_ptr(p, slot, i), \
927
+ ggml_cuda_ar_arrival_ptr(p, slot, peer), \
928
+ token)
929
+
930
+ if (use_bf16) {
931
+ GGML_ASSERT(input_type == GGML_TYPE_F32);
932
+ LAUNCH_AR_KERNEL(float, nv_bfloat16);
933
+ } else {
934
+ switch (input_type) {
935
+ case GGML_TYPE_F32: LAUNCH_AR_KERNEL(float, float); break;
936
+ case GGML_TYPE_F16: LAUNCH_AR_KERNEL(half, half); break;
937
+ case GGML_TYPE_BF16: LAUNCH_AR_KERNEL(nv_bfloat16, nv_bfloat16); break;
938
+ default: GGML_ASSERT(false);
939
+ }
940
+ }
941
+
942
+ #undef LAUNCH_AR_KERNEL
943
+ CUDA_CHECK(cudaGetLastError());
944
+
945
+ if (last_chunk) {
946
+ CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].ker, stream));
947
+ }
948
+ }
949
+ }
950
+ }
951
+
952
+ return ok;
953
+ }
954
+
955
+ #else // defined(GGML_USE_HIP) || defined(GGML_USE_MUSA)
956
+
957
+ // HIP and MUSA lack the host-mapped pinned-memory APIs (cudaHostAllocPortable
958
+ // / cudaHostAllocMapped / cudaHostGetDevicePointer) and __nanosleep that this
959
+ // implementation relies on, so the internal AllReduce is a CUDA-only feature.
960
+ // The dispatcher in ggml-cuda.cu treats a nullptr pipeline as "init failed"
961
+ // and silently falls back to the meta backend's generic AllReduce.
962
+ ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(const int *, size_t) {
963
+ return nullptr;
964
+ }
965
+ void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline *) {
966
+ }
967
+ bool ggml_cuda_ar_allreduce(ggml_cuda_ar_pipeline *, ggml_backend_t *, ggml_tensor **) {
968
+ return false;
969
+ }
970
+
971
+ #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)