whispercpp 1.3.6 → 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (828)
  1. checksums.yaml +4 -4
  2. data/.document +3 -0
  3. data/.rdoc_options +2 -0
  4. data/README.md +38 -5
  5. data/Rakefile +18 -3
  6. data/ext/dependencies.rb +10 -4
  7. data/ext/dependencies_for_windows.rb +17 -0
  8. data/ext/extconf.rb +20 -8
  9. data/ext/options.rb +54 -14
  10. data/ext/options_for_windows.rb +51 -0
  11. data/ext/ruby_whisper.c +36 -42
  12. data/ext/ruby_whisper.h +135 -0
  13. data/ext/ruby_whisper_context.c +107 -28
  14. data/ext/ruby_whisper_log_queue.c +180 -0
  15. data/ext/ruby_whisper_log_settable.h +47 -0
  16. data/ext/ruby_whisper_parakeet.c +49 -0
  17. data/ext/ruby_whisper_parakeet_context.c +304 -0
  18. data/ext/ruby_whisper_parakeet_context_params.c +117 -0
  19. data/ext/ruby_whisper_parakeet_model.c +84 -0
  20. data/ext/ruby_whisper_parakeet_params.c +548 -0
  21. data/ext/ruby_whisper_parakeet_segment.c +157 -0
  22. data/ext/ruby_whisper_parakeet_token.c +188 -0
  23. data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
  24. data/ext/ruby_whisper_params.c +256 -65
  25. data/ext/ruby_whisper_segment.c +6 -6
  26. data/ext/ruby_whisper_transcribe.cpp +42 -15
  27. data/ext/sources/CMakeLists.txt +41 -3
  28. data/ext/sources/CMakePresets.json +95 -0
  29. data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
  30. data/ext/sources/cmake/parakeet.pc.in +10 -0
  31. data/ext/sources/cmake/whisper.pc.in +1 -1
  32. data/ext/sources/examples/CMakeLists.txt +4 -2
  33. data/ext/sources/examples/bench/bench.cpp +1 -1
  34. data/ext/sources/examples/cli/cli.cpp +43 -9
  35. data/ext/sources/examples/common-ggml.cpp +2 -0
  36. data/ext/sources/examples/common-whisper.cpp +139 -67
  37. data/ext/sources/examples/common-whisper.h +11 -0
  38. data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
  39. data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
  40. data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
  41. data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
  42. data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
  43. data/ext/sources/examples/server/server.cpp +199 -163
  44. data/ext/sources/ggml/CMakeLists.txt +21 -13
  45. data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
  46. data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
  47. data/ext/sources/ggml/include/ggml-alloc.h +1 -0
  48. data/ext/sources/ggml/include/ggml-backend.h +72 -10
  49. data/ext/sources/ggml/include/ggml-cuda.h +3 -0
  50. data/ext/sources/ggml/include/ggml-rpc.h +3 -3
  51. data/ext/sources/ggml/include/ggml.h +101 -9
  52. data/ext/sources/ggml/include/gguf.h +10 -2
  53. data/ext/sources/ggml/src/CMakeLists.txt +22 -5
  54. data/ext/sources/ggml/src/ggml-alloc.c +5 -1
  55. data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
  56. data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
  57. data/ext/sources/ggml/src/ggml-backend-reg.cpp +12 -0
  58. data/ext/sources/ggml/src/ggml-backend.cpp +110 -9
  59. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +4 -0
  60. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +672 -257
  61. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +71 -0
  62. data/ext/sources/ggml/src/ggml-cann/common.h +20 -10
  63. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +211 -30
  64. data/ext/sources/ggml/src/ggml-common.h +11 -0
  65. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +58 -29
  66. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +2 -0
  67. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +16 -16
  68. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +116 -7
  69. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +65 -0
  70. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
  71. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
  72. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +4279 -1292
  73. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +5 -35
  74. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +0 -1
  75. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
  76. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +177 -27
  77. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +1 -1
  78. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +5 -0
  79. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
  80. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -0
  81. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +95 -5
  82. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  83. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +146 -134
  84. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +88 -70
  85. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +372 -73
  86. data/ext/sources/ggml/src/ggml-cpu/ops.h +3 -0
  87. data/ext/sources/ggml/src/ggml-cpu/quants.c +55 -0
  88. data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
  89. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +3 -0
  90. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +90 -0
  91. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +3 -16
  92. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
  93. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
  94. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
  95. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
  96. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
  97. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
  98. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
  99. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
  100. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
  101. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
  102. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
  103. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
  104. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
  105. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
  106. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
  107. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +37 -53
  108. data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
  109. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +17 -7
  110. data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
  111. data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
  112. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +62 -26
  113. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +44 -18
  114. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
  115. data/ext/sources/ggml/src/ggml-cuda/common.cuh +242 -28
  116. data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
  117. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
  118. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
  119. data/ext/sources/ggml/src/ggml-cuda/convert.cu +53 -0
  120. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
  121. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +14 -6
  122. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
  123. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +278 -44
  124. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +331 -130
  125. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
  126. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +126 -27
  127. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +40 -15
  128. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +18 -9
  129. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +152 -49
  130. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  131. data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
  132. data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
  133. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +84 -35
  134. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
  135. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1069 -609
  136. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
  137. data/ext/sources/ggml/src/ggml-cuda/mean.cu +4 -2
  138. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +242 -195
  139. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +3 -3
  140. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
  141. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +502 -423
  142. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +19 -12
  143. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +485 -57
  144. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -1
  145. data/ext/sources/ggml/src/ggml-cuda/norm.cu +36 -10
  146. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
  147. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +133 -26
  148. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
  149. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +5 -1
  150. data/ext/sources/ggml/src/ggml-cuda/rope.cu +11 -4
  151. data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
  152. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
  153. data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
  154. data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
  155. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
  156. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +45 -13
  157. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  158. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
  159. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
  160. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
  161. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +1 -0
  162. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
  163. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
  164. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
  165. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +1 -0
  166. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
  167. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
  168. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
  169. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
  170. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
  171. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
  172. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
  173. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
  174. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
  175. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
  176. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
  177. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
  178. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
  179. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
  180. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
  181. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
  182. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
  183. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
  184. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
  185. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
  186. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
  187. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
  188. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
  189. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
  190. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
  191. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -4
  192. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +26 -23
  193. data/ext/sources/ggml/src/ggml-cuda/unary.cu +31 -2
  194. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
  195. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +80 -0
  196. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
  197. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +22 -4
  198. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  199. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +2 -1
  200. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1428 -743
  201. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -7
  202. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +53 -84
  203. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +25 -12
  204. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +165 -184
  205. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
  206. data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
  207. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +170 -127
  208. data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
  209. data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
  210. data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
  211. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +125 -97
  212. data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
  213. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +148 -42
  214. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.c +2 -2
  215. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +252 -62
  216. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +9 -0
  217. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +87 -1
  218. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
  219. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
  220. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
  221. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
  222. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
  223. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
  224. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
  225. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
  226. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +96 -13
  227. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +182 -57
  228. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
  229. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +71 -3
  230. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +27 -10
  231. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +63 -23
  232. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +9 -8
  233. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
  234. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
  235. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
  236. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
  237. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +1 -0
  238. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
  239. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +5 -8
  240. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +529 -815
  241. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2522 -234
  242. data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
  243. data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
  244. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +291 -95
  245. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +59 -37
  246. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +121 -133
  247. data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
  248. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +244 -151
  249. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +6 -6
  250. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +719 -45
  251. data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
  252. data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
  253. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +3 -1
  254. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +22 -9
  255. data/ext/sources/ggml/src/ggml-impl.h +6 -1
  256. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +138 -13
  257. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +32 -1
  258. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +164 -28
  259. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +80 -0
  260. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +190 -19
  261. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +2 -0
  262. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +39 -26
  263. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +823 -322
  264. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
  265. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +54 -5
  266. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +12248 -5907
  267. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +67 -0
  268. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +59 -0
  269. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1819 -112
  270. data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
  271. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
  272. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
  273. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
  274. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
  275. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
  276. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
  277. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
  278. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
  279. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
  280. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
  281. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
  282. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
  283. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
  284. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
  285. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
  286. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mm_q8_0_f32_8x4.cl → gemm_noshuffle_q8_0_f32.cl} +1 -1
  287. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
  288. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
  289. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
  290. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
  291. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
  292. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
  293. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
  294. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
  295. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
  296. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
  297. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
  300. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
  301. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
  302. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
  303. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
  304. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
  305. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
  306. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
  307. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
  308. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
  309. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
  310. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
  311. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
  312. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
  313. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
  314. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
  315. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
  316. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
  317. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
  318. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +48 -64
  322. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +15 -5
  323. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +18 -11
  324. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +35 -13
  325. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +264 -192
  326. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +33 -7
  327. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
  328. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +1 -0
  329. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +1 -0
  330. data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
  331. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +27 -3
  332. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +67 -36
  333. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +1 -0
  334. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +101 -44
  335. data/ext/sources/ggml/src/ggml-openvino/utils.h +23 -3
  336. data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
  337. data/ext/sources/ggml/src/ggml-quants.c +289 -114
  338. data/ext/sources/ggml/src/ggml-quants.h +3 -0
  339. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
  340. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
  341. data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
  342. data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
  343. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +50 -4
  344. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +1 -1
  345. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +3 -1
  346. data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
  347. data/ext/sources/ggml/src/ggml-sycl/common.hpp +41 -1
  348. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +115 -13
  349. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +9 -0
  350. data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
  351. data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
  352. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
  353. data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
  354. data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
  355. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
  356. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1 -90
  357. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +0 -2
  358. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
  359. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
  360. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +7 -5
  361. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +4 -0
  362. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +76 -168
  363. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +7 -0
  364. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +3 -1
  365. data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
  366. data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
  367. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +69 -31
  368. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +1 -0
  369. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  370. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
  371. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +823 -190
  372. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
  373. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
  374. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
  375. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
  376. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
  377. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +71 -0
  378. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
  379. data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
  380. data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
  381. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
  382. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
  383. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
  384. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
  385. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
  386. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
  387. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +1 -0
  388. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +1 -0
  389. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +1 -0
  390. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +1 -0
  391. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +1 -0
  392. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +1 -0
  393. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +1 -0
  394. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +1 -0
  395. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +1 -0
  396. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +1 -0
  397. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +1 -0
  398. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +1 -0
  399. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +1 -0
  400. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +1 -0
  401. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +1 -0
  402. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +1 -0
  403. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +1 -0
  404. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +1 -0
  405. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +1 -0
  406. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +1 -0
  407. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +1 -0
  408. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +1 -0
  409. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +1 -0
  410. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +1 -0
  411. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +1 -0
  412. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +1 -0
  413. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +1 -0
  414. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +1 -0
  415. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +1 -0
  416. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +1 -0
  417. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +1 -0
  418. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +1 -0
  419. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +1 -0
  420. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +1 -0
  421. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +1 -0
  422. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +1 -0
  423. data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
  424. data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
  425. data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
  426. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +215 -53
  427. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +4 -0
  428. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +2 -0
  429. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +2 -0
  430. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +1 -0
  431. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +1 -0
  432. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +0 -2
  433. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +11 -0
  434. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +2060 -535
  435. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  436. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
  437. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
  438. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
  439. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
  440. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
  441. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
  442. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
  443. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
  444. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
  445. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
  446. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
  447. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
  448. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
  449. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +197 -48
  450. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +60 -59
  451. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +115 -113
  452. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +122 -31
  453. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
  454. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
  455. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
  456. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +125 -64
  457. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
  458. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
  459. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
  460. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
  461. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
  462. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
  463. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
  464. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
  465. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
  466. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
  467. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
  468. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
  469. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
  470. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -17
  471. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +43 -10
  472. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
  473. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
  474. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
  475. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
  476. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +5 -2
  477. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
  478. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +3 -2
  479. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
  480. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +11 -1
  481. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
  482. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
  483. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +171 -147
  484. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
  485. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +2202 -283
  486. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +2610 -1403
  487. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
  488. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
  489. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +8 -7
  490. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +76 -95
  491. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +19 -1
  492. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
  493. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -50
  494. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +107 -184
  495. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
  496. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
  497. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
  498. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
  499. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
  500. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
  501. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +183 -78
  502. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
  503. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
  504. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +655 -495
  505. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
  506. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
  507. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
  508. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +8 -6
  509. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +5 -1
  510. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +80 -409
  511. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
  512. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
  513. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
  514. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
  515. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
  516. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
  517. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
  518. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +6 -4
  519. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
  520. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +2 -3
  521. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
  522. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
  523. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
  524. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
  525. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
  526. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +68 -48
  527. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
  528. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +18 -14
  529. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +1 -1
  530. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +244 -10
  531. data/ext/sources/ggml/src/ggml.c +110 -28
  532. data/ext/sources/ggml/src/gguf.cpp +173 -28
  533. data/ext/sources/include/parakeet.h +342 -0
  534. data/ext/sources/include/whisper.h +10 -0
  535. data/ext/sources/media/matmul.png +0 -0
  536. data/ext/sources/src/CMakeLists.txt +23 -0
  537. data/ext/sources/src/parakeet-arch.h +188 -0
  538. data/ext/sources/src/parakeet.cpp +3838 -0
  539. data/ext/sources/src/whisper.cpp +56 -12
  540. data/extsources.rb +26 -10
  541. data/lib/whisper/log_settable.rb +36 -0
  542. data/lib/whisper/model/uri.rb +13 -1
  543. data/lib/whisper/output.rb +74 -0
  544. data/sig/whisper.rbs +411 -62
  545. data/test/helper.rb +2 -0
  546. data/test/jfk_reader/jfk_reader.c +50 -7
  547. data/test/test_callback.rb +1 -0
  548. data/test/test_package.rb +6 -5
  549. data/test/test_parakeet.rb +28 -0
  550. data/test/test_parakeet_callback.rb +107 -0
  551. data/test/test_parakeet_context.rb +116 -0
  552. data/test/test_parakeet_context_params.rb +24 -0
  553. data/test/test_parakeet_model.rb +21 -0
  554. data/test/test_parakeet_params.rb +78 -0
  555. data/test/test_parakeet_segment.rb +42 -0
  556. data/test/test_parakeet_token.rb +73 -0
  557. data/test/test_params.rb +2 -0
  558. data/test/test_vad_segment.rb +1 -1
  559. data/test/test_whisper.rb +24 -6
  560. data/whispercpp.gemspec +2 -2
  561. metadata +215 -281
  562. data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
  563. data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
  564. data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
  565. data/ext/sources/bindings/javascript/package.json +0 -26
  566. data/ext/sources/bindings/javascript/whisper.js +0 -19
  567. data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
  568. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
  569. data/ext/sources/examples/addon.node/addon.cpp +0 -557
  570. data/ext/sources/examples/addon.node/index.js +0 -59
  571. data/ext/sources/examples/addon.node/package.json +0 -16
  572. data/ext/sources/examples/addon.node/vad-example.js +0 -132
  573. data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
  574. data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
  575. data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
  576. data/ext/sources/examples/coi-serviceworker.js +0 -146
  577. data/ext/sources/examples/command/CMakeLists.txt +0 -10
  578. data/ext/sources/examples/command/command.cpp +0 -802
  579. data/ext/sources/examples/command/commands.txt +0 -9
  580. data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
  581. data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
  582. data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
  583. data/ext/sources/examples/generate-karaoke.sh +0 -57
  584. data/ext/sources/examples/helpers.js +0 -191
  585. data/ext/sources/examples/livestream.sh +0 -112
  586. data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
  587. data/ext/sources/examples/lsp/lsp.cpp +0 -471
  588. data/ext/sources/examples/lsp/whisper.vim +0 -362
  589. data/ext/sources/examples/python/test_whisper_processor.py +0 -7
  590. data/ext/sources/examples/python/whisper_processor.py +0 -54
  591. data/ext/sources/examples/server/bench.js +0 -29
  592. data/ext/sources/examples/server.py +0 -120
  593. data/ext/sources/examples/stream/CMakeLists.txt +0 -10
  594. data/ext/sources/examples/stream/stream.cpp +0 -437
  595. data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
  596. data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
  597. data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
  598. data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
  599. data/ext/sources/examples/sycl/build.sh +0 -22
  600. data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
  601. data/ext/sources/examples/sycl/run-whisper.sh +0 -17
  602. data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -48
  603. data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
  604. data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -488
  605. data/ext/sources/examples/talk-llama/llama-adapter.h +0 -89
  606. data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2877
  607. data/ext/sources/examples/talk-llama/llama-arch.h +0 -628
  608. data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -919
  609. data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
  610. data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -896
  611. data/ext/sources/examples/talk-llama/llama-chat.h +0 -71
  612. data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3633
  613. data/ext/sources/examples/talk-llama/llama-context.h +0 -359
  614. data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
  615. data/ext/sources/examples/talk-llama/llama-cparams.h +0 -47
  616. data/ext/sources/examples/talk-llama/llama-ext.h +0 -12
  617. data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
  618. data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
  619. data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2735
  620. data/ext/sources/examples/talk-llama/llama-graph.h +0 -1031
  621. data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -258
  622. data/ext/sources/examples/talk-llama/llama-hparams.h +0 -353
  623. data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
  624. data/ext/sources/examples/talk-llama/llama-impl.h +0 -75
  625. data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
  626. data/ext/sources/examples/talk-llama/llama-io.h +0 -35
  627. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -330
  628. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
  629. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2285
  630. data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -389
  631. data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
  632. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +0 -275
  633. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +0 -140
  634. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
  635. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
  636. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1165
  637. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
  638. data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
  639. data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
  640. data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -752
  641. data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
  642. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1655
  643. data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -206
  644. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -299
  645. data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -40
  646. data/ext/sources/examples/talk-llama/llama-model.cpp +0 -9056
  647. data/ext/sources/examples/talk-llama/llama-model.h +0 -597
  648. data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1304
  649. data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
  650. data/ext/sources/examples/talk-llama/llama-sampler.cpp +0 -3885
  651. data/ext/sources/examples/talk-llama/llama-sampler.h +0 -42
  652. data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3970
  653. data/ext/sources/examples/talk-llama/llama-vocab.h +0 -187
  654. data/ext/sources/examples/talk-llama/llama.cpp +0 -1194
  655. data/ext/sources/examples/talk-llama/llama.h +0 -1573
  656. data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -190
  657. data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
  658. data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
  659. data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -137
  660. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
  661. data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -123
  662. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -143
  663. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -133
  664. data/ext/sources/examples/talk-llama/models/bert.cpp +0 -184
  665. data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -145
  666. data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
  667. data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
  668. data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
  669. data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
  670. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
  671. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
  672. data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
  673. data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -122
  674. data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
  675. data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -142
  676. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -262
  677. data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +0 -445
  678. data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -132
  679. data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
  680. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -148
  681. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
  682. data/ext/sources/examples/talk-llama/models/eurobert.cpp +0 -97
  683. data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +0 -145
  684. data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
  685. data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
  686. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -111
  687. data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
  688. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
  689. data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
  690. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
  691. data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
  692. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
  693. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
  694. data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -157
  695. data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
  696. data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
  697. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -195
  698. data/ext/sources/examples/talk-llama/models/granite.cpp +0 -210
  699. data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
  700. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -139
  701. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
  702. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -153
  703. data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
  704. data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
  705. data/ext/sources/examples/talk-llama/models/jais2.cpp +0 -123
  706. data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
  707. data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +0 -381
  708. data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -196
  709. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
  710. data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
  711. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
  712. data/ext/sources/examples/talk-llama/models/llama.cpp +0 -175
  713. data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
  714. data/ext/sources/examples/talk-llama/models/mamba-base.cpp +0 -289
  715. data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -54
  716. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -129
  717. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -200
  718. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -123
  719. data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
  720. data/ext/sources/examples/talk-llama/models/models.h +0 -704
  721. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -109
  722. data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
  723. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -162
  724. data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
  725. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
  726. data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
  727. data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
  728. data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
  729. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
  730. data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
  731. data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
  732. data/ext/sources/examples/talk-llama/models/paddleocr.cpp +0 -122
  733. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
  734. data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
  735. data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
  736. data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
  737. data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -320
  738. data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
  739. data/ext/sources/examples/talk-llama/models/plm.cpp +0 -169
  740. data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
  741. data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
  742. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
  743. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
  744. data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -120
  745. data/ext/sources/examples/talk-llama/models/qwen35.cpp +0 -381
  746. data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +0 -422
  747. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -131
  748. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -525
  749. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -140
  750. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -132
  751. data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
  752. data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
  753. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -164
  754. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
  755. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
  756. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -137
  757. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
  758. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
  759. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
  760. data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
  761. data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
  762. data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
  763. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
  764. data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +0 -165
  765. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
  766. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
  767. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
  768. data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
  769. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
  770. data/ext/sources/examples/talk-llama/speak +0 -40
  771. data/ext/sources/examples/talk-llama/speak.bat +0 -1
  772. data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
  773. data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
  774. data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
  775. data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
  776. data/ext/sources/examples/talk-llama/unicode.cpp +0 -1103
  777. data/ext/sources/examples/talk-llama/unicode.h +0 -111
  778. data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
  779. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
  780. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
  781. data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
  782. data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
  783. data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
  784. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
  785. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
  786. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
  787. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
  788. data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
  789. data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
  790. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
  791. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -155
  792. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
  793. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
  794. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +0 -123
  795. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +0 -17
  796. data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +0 -333
  797. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
  798. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -182
  799. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
  800. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +0 -718
  801. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
  802. data/ext/sources/tests/CMakeLists.txt +0 -112
  803. data/ext/sources/tests/earnings21/eval.mk +0 -58
  804. data/ext/sources/tests/earnings21/eval.py +0 -68
  805. data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
  806. data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
  807. data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
  808. data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
  809. data/ext/sources/tests/earnings21/requirements.txt +0 -6
  810. data/ext/sources/tests/en-0-ref.txt +0 -1
  811. data/ext/sources/tests/en-1-ref.txt +0 -1
  812. data/ext/sources/tests/en-2-ref.txt +0 -1
  813. data/ext/sources/tests/es-0-ref.txt +0 -1
  814. data/ext/sources/tests/librispeech/eval.mk +0 -39
  815. data/ext/sources/tests/librispeech/eval.py +0 -47
  816. data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
  817. data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
  818. data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
  819. data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
  820. data/ext/sources/tests/librispeech/requirements.txt +0 -6
  821. data/ext/sources/tests/run-tests.sh +0 -130
  822. data/ext/sources/tests/test-c.c +0 -3
  823. data/ext/sources/tests/test-vad-full.cpp +0 -56
  824. data/ext/sources/tests/test-vad.cpp +0 -83
  825. data/ext/sources/tests/test-whisper.js +0 -58
  826. data/lib/whisper/context.rb +0 -15
  827. data/lib/whisper/segment.rb +0 -58
  828. /data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general_q8_0_f32.cl → gemv_noshuffle_q8_0_f32.cl} +0 -0
@@ -2,30 +2,17 @@
2
2
  #include "ggml-impl.h"
3
3
  #include "ggml-backend-impl.h"
4
4
  #include "ggml-cpp.h"
5
+ #include "transport.h"
5
6
 
7
+ #include <array>
6
8
  #include <cinttypes>
9
+ #include <optional>
7
10
  #include <string>
8
11
  #include <vector>
9
12
  #include <memory>
10
13
  #include <mutex>
11
14
  #include <unordered_map>
12
15
  #include <unordered_set>
13
- #ifdef _WIN32
14
- # define WIN32_LEAN_AND_MEAN
15
- # ifndef NOMINMAX
16
- # define NOMINMAX
17
- # endif
18
- # include <windows.h>
19
- # include <winsock2.h>
20
- #else
21
- # include <arpa/inet.h>
22
- # include <sys/socket.h>
23
- # include <sys/types.h>
24
- # include <netinet/in.h>
25
- # include <netinet/tcp.h>
26
- # include <netdb.h>
27
- # include <unistd.h>
28
- #endif
29
16
  #include <cstring>
30
17
  #include <fstream>
31
18
  #include <filesystem>
@@ -39,29 +26,6 @@ static const char * RPC_DEBUG = std::getenv("GGML_RPC_DEBUG");
39
26
 
40
27
  namespace fs = std::filesystem;
41
28
 
42
- static constexpr size_t MAX_CHUNK_SIZE = 1024ull * 1024ull * 1024ull; // 1 GiB
43
-
44
- #ifdef _WIN32
45
- typedef SOCKET sockfd_t;
46
- using ssize_t = __int64;
47
- #else
48
- typedef int sockfd_t;
49
- #endif
50
-
51
- // cross-platform socket
52
- struct socket_t {
53
- sockfd_t fd;
54
- socket_t(sockfd_t fd) : fd(fd) {}
55
- ~socket_t() {
56
- LOG_DBG("[%s] closing socket %d\n", __func__, this->fd);
57
- #ifdef _WIN32
58
- closesocket(this->fd);
59
- #else
60
- close(this->fd);
61
- #endif
62
- }
63
- };
64
-
65
29
  // macro for nicer error messages on server crash
66
30
  #define RPC_STATUS_ASSERT(x) if (!(x)) GGML_ABORT("Remote RPC server crashed or returned malformed response")
67
31
 
@@ -115,10 +79,16 @@ static_assert(RPC_CMD_HELLO == 14, "RPC_CMD_HELLO must be always 14");
115
79
  // Try RPC_CMD_SET_TENSOR_HASH first when data size is larger than this threshold
116
80
  const size_t HASH_THRESHOLD = 10 * 1024 * 1024;
117
81
 
82
+ struct rpc_msg_hello_req {
83
+ uint8_t conn_caps[RPC_CONN_CAPS_SIZE];
84
+ };
85
+
118
86
  struct rpc_msg_hello_rsp {
119
87
  uint8_t major;
120
88
  uint8_t minor;
121
89
  uint8_t patch;
90
+ uint8_t padding;
91
+ uint8_t conn_caps[RPC_CONN_CAPS_SIZE];
122
92
  };
123
93
 
124
94
  struct rpc_msg_device_count_rsp {
@@ -229,6 +199,14 @@ static ggml_guid_t ggml_backend_rpc_guid() {
229
199
  return &guid;
230
200
  }
231
201
 
202
+ struct ggml_backend_rpc_device_context {
203
+ std::string endpoint;
204
+ uint32_t device;
205
+ std::string name;
206
+ std::string description;
207
+ uint64_t last_graph_uid;
208
+ };
209
+
232
210
  struct ggml_backend_rpc_buffer_type_context {
233
211
  std::string endpoint;
234
212
  uint32_t device;
@@ -237,35 +215,10 @@ struct ggml_backend_rpc_buffer_type_context {
237
215
  size_t max_size;
238
216
  };
239
217
 
240
- struct graph_cache {
241
-
242
- bool is_cached(const ggml_cgraph * cgraph) {
243
- if ((int)last_graph.size() != cgraph->n_nodes) {
244
- return false;
245
- }
246
- for (int i = 0; i < cgraph->n_nodes; i++) {
247
- if (memcmp(&last_graph[i], cgraph->nodes[i], sizeof(ggml_tensor)) != 0) {
248
- return false;
249
- }
250
- }
251
- return true;
252
- }
253
-
254
- void add(const ggml_cgraph * cgraph) {
255
- last_graph.resize(cgraph->n_nodes);
256
- for (int i = 0; i < cgraph->n_nodes; i++) {
257
- memcpy(&last_graph[i], cgraph->nodes[i], sizeof(ggml_tensor));
258
- }
259
- }
260
-
261
- std::vector<ggml_tensor> last_graph;
262
- };
263
-
264
218
  struct ggml_backend_rpc_context {
265
219
  std::string endpoint;
266
220
  uint32_t device;
267
221
  std::string name;
268
- graph_cache gc;
269
222
  };
270
223
 
271
224
  struct ggml_backend_rpc_buffer_context {
@@ -288,153 +241,27 @@ static uint64_t fnv_hash(const uint8_t * data, size_t len) {
288
241
  return hash;
289
242
  }
290
243
 
291
- static std::shared_ptr<socket_t> make_socket(sockfd_t fd) {
292
- #ifdef _WIN32
293
- if (fd == INVALID_SOCKET) {
294
- return nullptr;
295
- }
296
- #else
297
- if (fd < 0) {
298
- return nullptr;
299
- }
300
- #endif
301
- return std::make_shared<socket_t>(fd);
302
- }
303
-
304
- static bool set_no_delay(sockfd_t sockfd) {
305
- int flag = 1;
306
- // set TCP_NODELAY to disable Nagle's algorithm
307
- int ret = setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, (char *)&flag, sizeof(int));
308
- return ret == 0;
309
- }
310
-
311
- static bool set_reuse_addr(sockfd_t sockfd) {
312
- int flag = 1;
313
- int ret = setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, (char *)&flag, sizeof(int));
314
- return ret == 0;
315
- }
316
-
317
- static std::shared_ptr<socket_t> socket_connect(const char * host, int port) {
318
- struct sockaddr_in addr;
319
- auto sockfd = socket(AF_INET, SOCK_STREAM, 0);
320
- auto sock_ptr = make_socket(sockfd);
321
- if (sock_ptr == nullptr) {
322
- return nullptr;
323
- }
324
- if (!set_no_delay(sockfd)) {
325
- GGML_LOG_ERROR("Failed to set TCP_NODELAY\n");
326
- return nullptr;
327
- }
328
- addr.sin_family = AF_INET;
329
- addr.sin_port = htons(port);
330
- struct hostent * server = gethostbyname(host);
331
- if (server == NULL) {
332
- GGML_LOG_ERROR("Cannot resolve host '%s'\n", host);
333
- return nullptr;
334
- }
335
- memcpy(&addr.sin_addr.s_addr, server->h_addr, server->h_length);
336
- if (connect(sock_ptr->fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
337
- return nullptr;
338
- }
339
- return sock_ptr;
340
- }
341
-
342
- static std::shared_ptr<socket_t> socket_accept(sockfd_t srv_sockfd) {
343
- auto client_socket_fd = accept(srv_sockfd, NULL, NULL);
344
- auto client_socket = make_socket(client_socket_fd);
345
- if (client_socket == nullptr) {
346
- return nullptr;
347
- }
348
- if (!set_no_delay(client_socket_fd)) {
349
- GGML_LOG_ERROR("Failed to set TCP_NODELAY\n");
350
- return nullptr;
351
- }
352
- return client_socket;
353
- }
354
-
355
- static std::shared_ptr<socket_t> create_server_socket(const char * host, int port) {
356
- auto sockfd = socket(AF_INET, SOCK_STREAM, 0);
357
- auto sock = make_socket(sockfd);
358
- if (sock == nullptr) {
359
- return nullptr;
360
- }
361
- if (!set_reuse_addr(sockfd)) {
362
- GGML_LOG_ERROR("Failed to set SO_REUSEADDR\n");
363
- return nullptr;
364
- }
365
- if (inet_addr(host) == INADDR_NONE) {
366
- GGML_LOG_ERROR("Invalid host address: %s\n", host);
367
- return nullptr;
368
- }
369
- struct sockaddr_in serv_addr;
370
- serv_addr.sin_family = AF_INET;
371
- serv_addr.sin_addr.s_addr = inet_addr(host);
372
- serv_addr.sin_port = htons(port);
373
-
374
- if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) {
375
- return nullptr;
376
- }
377
- if (listen(sockfd, 1) < 0) {
378
- return nullptr;
379
- }
380
- return sock;
381
- }
382
-
383
- static bool send_data(sockfd_t sockfd, const void * data, size_t size) {
384
- size_t bytes_sent = 0;
385
- while (bytes_sent < size) {
386
- size_t size_to_send = std::min(size - bytes_sent, MAX_CHUNK_SIZE);
387
- ssize_t n = send(sockfd, (const char *)data + bytes_sent, size_to_send, 0);
388
- if (n < 0) {
389
- GGML_LOG_ERROR("send failed (bytes_sent=%zu, size_to_send=%zu)\n",
390
- bytes_sent, size_to_send);
391
- return false;
392
- }
393
- bytes_sent += (size_t)n;
394
- }
395
- return true;
396
- }
397
-
398
- static bool recv_data(sockfd_t sockfd, void * data, size_t size) {
399
- size_t bytes_recv = 0;
400
- while (bytes_recv < size) {
401
- size_t size_to_recv = std::min(size - bytes_recv, MAX_CHUNK_SIZE);
402
- ssize_t n = recv(sockfd, (char *)data + bytes_recv, size_to_recv, 0);
403
- if (n < 0) {
404
- GGML_LOG_ERROR("recv failed (bytes_recv=%zu, size_to_recv=%zu)\n",
405
- bytes_recv, size_to_recv);
406
- return false;
407
- }
408
- if (n == 0) {
409
- LOG_DBG("recv returned 0 (peer closed?)\n");
410
- return false;
411
- }
412
- bytes_recv += (size_t)n;
413
- }
414
- return true;
415
- }
416
-
417
- static bool send_msg(sockfd_t sockfd, const void * msg, size_t msg_size) {
418
- if (!send_data(sockfd, &msg_size, sizeof(msg_size))) {
244
+ static bool send_msg(socket_ptr sock, const void * msg, size_t msg_size) {
245
+ if (!sock->send_data(&msg_size, sizeof(msg_size))) {
419
246
  return false;
420
247
  }
421
- return send_data(sockfd, msg, msg_size);
248
+ return sock->send_data(msg, msg_size);
422
249
  }
423
250
 
424
- static bool recv_msg(sockfd_t sockfd, void * msg, size_t msg_size) {
251
+ static bool recv_msg(socket_ptr sock, void * msg, size_t msg_size) {
425
252
  uint64_t size;
426
- if (!recv_data(sockfd, &size, sizeof(size))) {
253
+ if (!sock->recv_data(&size, sizeof(size))) {
427
254
  return false;
428
255
  }
429
256
  if (size != msg_size) {
430
257
  return false;
431
258
  }
432
- return recv_data(sockfd, msg, msg_size);
259
+ return sock->recv_data(msg, msg_size);
433
260
  }
434
261
 
435
- static bool recv_msg(sockfd_t sockfd, std::vector<uint8_t> & input) {
262
+ static bool recv_msg(socket_ptr sock, std::vector<uint8_t> & input) {
436
263
  uint64_t size;
437
- if (!recv_data(sockfd, &size, sizeof(size))) {
264
+ if (!sock->recv_data(&size, sizeof(size))) {
438
265
  return false;
439
266
  }
440
267
  try {
@@ -443,7 +270,7 @@ static bool recv_msg(sockfd_t sockfd, std::vector<uint8_t> & input) {
443
270
  GGML_LOG_ERROR("Failed to allocate input buffer of size %" PRIu64 "\n", size);
444
271
  return false;
445
272
  }
446
- return recv_data(sockfd, input.data(), size);
273
+ return sock->recv_data(input.data(), size);
447
274
  }
448
275
 
449
276
  static bool parse_endpoint(const std::string & endpoint, std::string & host, int & port) {
@@ -452,21 +279,25 @@ static bool parse_endpoint(const std::string & endpoint, std::string & host, int
452
279
  return false;
453
280
  }
454
281
  host = endpoint.substr(0, pos);
455
- port = std::stoi(endpoint.substr(pos + 1));
282
+ try {
283
+ port = std::stoi(endpoint.substr(pos + 1));
284
+ } catch (...) {
285
+ return false;
286
+ }
456
287
  return true;
457
288
  }
458
289
 
459
290
  // RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
460
291
  // No response
461
- static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size) {
292
+ static bool send_rpc_cmd(socket_ptr sock, enum rpc_cmd cmd, const void * input, size_t input_size) {
462
293
  uint8_t cmd_byte = cmd;
463
- if (!send_data(sock->fd, &cmd_byte, sizeof(cmd_byte))) {
294
+ if (!sock->send_data(&cmd_byte, sizeof(cmd_byte))) {
464
295
  return false;
465
296
  }
466
- if (!send_data(sock->fd, &input_size, sizeof(input_size))) {
297
+ if (!sock->send_data(&input_size, sizeof(input_size))) {
467
298
  return false;
468
299
  }
469
- if (!send_data(sock->fd, input, input_size)) {
300
+ if (!sock->send_data(input, input_size)) {
470
301
  return false;
471
302
  }
472
303
  return true;
@@ -474,20 +305,18 @@ static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cm
474
305
 
475
306
  // RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
476
307
  // RPC response: | response_size (8 bytes) | response_data (response_size bytes) |
477
- static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) {
308
+ static bool send_rpc_cmd(socket_ptr sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) {
478
309
  if (!send_rpc_cmd(sock, cmd, input, input_size)) {
479
310
  return false;
480
311
  }
481
- // TODO: currently the output_size is always known, do we need support for commands with variable output size?
482
- // even if we do, we can skip sending output_size from the server for commands with known output size
483
312
  uint64_t out_size;
484
- if (!recv_data(sock->fd, &out_size, sizeof(out_size))) {
313
+ if (!sock->recv_data(&out_size, sizeof(out_size))) {
485
314
  return false;
486
315
  }
487
316
  if (out_size != output_size) {
488
317
  return false;
489
318
  }
490
- if (!recv_data(sock->fd, output, output_size)) {
319
+ if (!sock->recv_data(output, output_size)) {
491
320
  return false;
492
321
  }
493
322
  return true;
@@ -495,17 +324,25 @@ static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cm
495
324
 
496
325
  // RPC client-side implementation
497
326
 
498
- static bool check_server_version(const std::shared_ptr<socket_t> & sock) {
499
- rpc_msg_hello_rsp response;
500
- bool status = send_rpc_cmd(sock, RPC_CMD_HELLO, nullptr, 0, &response, sizeof(response));
327
+ // Performs HELLO handshake with transport auto-negotiation.
328
+ // Advertises local capabilities via conn_caps; if the server responds with
329
+ // matching capabilities, the socket is upgraded transparently.
330
+ static bool negotiate_hello(const std::shared_ptr<socket_t> & sock) {
331
+ rpc_msg_hello_req request = {};
332
+ rpc_msg_hello_rsp response = {};
333
+
334
+ sock->get_caps(request.conn_caps);
335
+
336
+ bool status = send_rpc_cmd(sock, RPC_CMD_HELLO, &request, sizeof(request), &response, sizeof(response));
501
337
  RPC_STATUS_ASSERT(status);
338
+
502
339
  if (response.major != RPC_PROTO_MAJOR_VERSION || response.minor > RPC_PROTO_MINOR_VERSION) {
503
- GGML_LOG_ERROR("RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
340
+ GGML_LOG_ERROR("RPC server version mismatch: %d.%d.%d\n",
341
+ response.major, response.minor, response.patch);
504
342
  return false;
505
343
  }
506
- if (response.minor != RPC_PROTO_MINOR_VERSION || response.patch != RPC_PROTO_PATCH_VERSION) {
507
- GGML_LOG_INFO("WARNING: RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
508
- }
344
+
345
+ sock->update_caps(response.conn_caps);
509
346
  return true;
510
347
  }
511
348
 
@@ -513,7 +350,6 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
513
350
  static std::mutex mutex;
514
351
  std::lock_guard<std::mutex> lock(mutex);
515
352
  static std::unordered_map<std::string, std::weak_ptr<socket_t>> sockets;
516
- static bool initialized = false;
517
353
 
518
354
  auto it = sockets.find(endpoint);
519
355
  if (it != sockets.end()) {
@@ -527,26 +363,18 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
527
363
  GGML_LOG_ERROR("Failed to parse endpoint: %s\n", endpoint.c_str());
528
364
  return nullptr;
529
365
  }
530
- #ifdef _WIN32
531
- if (!initialized) {
532
- WSADATA wsaData;
533
- int res = WSAStartup(MAKEWORD(2, 2), &wsaData);
534
- if (res != 0) {
535
- return nullptr;
536
- }
537
- initialized = true;
366
+
367
+ if (!rpc_transport_init()) {
368
+ return nullptr;
538
369
  }
539
- #else
540
- GGML_UNUSED(initialized);
541
- #endif
542
- auto sock = socket_connect(host.c_str(), port);
370
+ auto sock = socket_t::connect(host.c_str(), port);
543
371
  if (sock == nullptr) {
544
372
  return nullptr;
545
373
  }
546
- if (!check_server_version(sock)) {
374
+ if (!negotiate_hello(sock)) {
547
375
  return nullptr;
548
376
  }
549
- LOG_DBG("[%s] connected to %s, sockfd=%d\n", __func__, endpoint.c_str(), sock->fd);
377
+ LOG_DBG("[%s] connected to %s\n", __func__, endpoint.c_str());
550
378
  sockets[endpoint] = sock;
551
379
  return sock;
552
380
  }
@@ -589,8 +417,10 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
589
417
  ggml_backend_buffer_t buffer = tensor->buffer;
590
418
  ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
591
419
  result.buffer = ctx != nullptr ? ctx->remote_ptr : 0;
420
+ result.data = reinterpret_cast<uint64_t>(tensor->data);
592
421
  } else {
593
422
  result.buffer = 0;
423
+ result.data = 0;
594
424
  }
595
425
  for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
596
426
  result.ne[i] = tensor->ne[i];
@@ -606,7 +436,6 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
606
436
  }
607
437
  result.view_src = reinterpret_cast<uint64_t>(tensor->view_src);
608
438
  result.view_offs = tensor->view_offs;
609
- result.data = reinterpret_cast<uint64_t>(tensor->data);
610
439
 
611
440
  // Avoid sending uninitialized data over the wire
612
441
  memset(result.name, 0, sizeof(result.name));
@@ -705,6 +534,8 @@ static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = {
705
534
  /* .memset_tensor = */ NULL,
706
535
  /* .set_tensor = */ ggml_backend_rpc_buffer_set_tensor,
707
536
  /* .get_tensor = */ ggml_backend_rpc_buffer_get_tensor,
537
+ /* .set_tensor_2d = */ NULL,
538
+ /* .get_tensor_2d = */ NULL,
708
539
  /* .cpy_tensor = */ ggml_backend_rpc_buffer_cpy_tensor,
709
540
  /* .clear = */ ggml_backend_rpc_buffer_clear,
710
541
  /* .reset = */ NULL,
@@ -867,9 +698,11 @@ static void serialize_graph(uint32_t device, const ggml_cgraph * cgraph, std::ve
867
698
 
868
699
  static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
869
700
  ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
701
+ ggml_backend_dev_t rpc_dev = ggml_backend_get_device(backend);
702
+ ggml_backend_rpc_device_context * rpc_dev_ctx = (ggml_backend_rpc_device_context *)rpc_dev->context;
870
703
 
871
704
  GGML_ASSERT(cgraph->n_nodes > 0);
872
- bool reuse = rpc_ctx->gc.is_cached(cgraph);
705
+ bool reuse = cgraph->uid != 0 && rpc_dev_ctx->last_graph_uid == cgraph->uid;
873
706
  if (reuse) {
874
707
  rpc_msg_graph_recompute_req request;
875
708
  request.device = rpc_ctx->device;
@@ -877,7 +710,7 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g
877
710
  bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_RECOMPUTE, &request, sizeof(request));
878
711
  RPC_STATUS_ASSERT(status);
879
712
  } else {
880
- rpc_ctx->gc.add(cgraph);
713
+ rpc_dev_ctx->last_graph_uid = cgraph->uid;
881
714
  std::vector<uint8_t> input;
882
715
  serialize_graph(rpc_ctx->device, cgraph, input);
883
716
  auto sock = get_socket(rpc_ctx->endpoint);
@@ -892,6 +725,8 @@ static ggml_backend_i ggml_backend_rpc_interface = {
892
725
  /* .free = */ ggml_backend_rpc_free,
893
726
  /* .set_tensor_async = */ NULL,
894
727
  /* .get_tensor_async = */ NULL,
728
+ /* .set_tensor_2d_async = */ NULL,
729
+ /* .get_tensor_2d_async = */ NULL,
895
730
  /* .cpy_tensor_async = */ NULL,
896
731
  /* .synchronize = */ ggml_backend_rpc_synchronize,
897
732
  /* .graph_plan_create = */ NULL,
@@ -941,10 +776,9 @@ ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint, u
941
776
  ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device) {
942
777
  std::string dev_name = "RPC" + std::to_string(device) + "[" + std::string(endpoint) + "]";
943
778
  ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
944
- /* .endpoint = */ endpoint,
945
- /* .device = */ device,
946
- /* .name = */ dev_name,
947
- /* .gc = */ {},
779
+ /* .endpoint = */ endpoint,
780
+ /* .device = */ device,
781
+ /* .name = */ dev_name,
948
782
  };
949
783
  auto reg = ggml_backend_rpc_add_server(endpoint);
950
784
  ggml_backend_t backend = new ggml_backend {
@@ -1008,8 +842,8 @@ public:
1008
842
  bool get_device_memory(const rpc_msg_get_device_memory_req & request, rpc_msg_get_device_memory_rsp & response);
1009
843
 
1010
844
  struct stored_graph {
1011
- ggml_context_ptr ctx_ptr;
1012
- ggml_cgraph * graph;
845
+ std::vector<uint8_t> buffer;
846
+ ggml_cgraph * graph;
1013
847
  };
1014
848
 
1015
849
  private:
@@ -1162,12 +996,18 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp
1162
996
  return nullptr;
1163
997
  }
1164
998
 
999
+ // Fix: Prevent division by zero if blck_size is 0 (e.g., deprecated types)
1000
+ if (ggml_blck_size((enum ggml_type)tensor->type) == 0) {
1001
+ GGML_LOG_ERROR("[%s] invalid tensor type received (blck_size is 0): %u\n", __func__, tensor->type);
1002
+ return nullptr;
1003
+ }
1004
+
1165
1005
  ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type,
1166
1006
  tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
1167
1007
 
1168
1008
  // ggml_new_tensor_4d might fail if dimensions are invalid, although less likely to crash than invalid type
1169
1009
  if (result == nullptr) {
1170
- GGML_LOG_ERROR("[%s] ggml_new_tensor_4d failed for type %u\\n", __func__, tensor->type);
1010
+ GGML_LOG_ERROR("[%s] ggml_new_tensor_4d failed for type %u\n", __func__, tensor->type);
1171
1011
  return nullptr;
1172
1012
  }
1173
1013
 
@@ -1245,7 +1085,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
1245
1085
  fs::path cache_file = fs::path(cache_dir) / hash_str;
1246
1086
  std::ofstream ofs(cache_file, std::ios::binary);
1247
1087
  ofs.write((const char *)data, size);
1248
- GGML_LOG_INFO("[%s] saved to '%s'\n", __func__, cache_file.c_str());
1088
+ GGML_LOG_INFO("[%s] saved to '%s'\n", __func__, cache_file.string().c_str());
1249
1089
  }
1250
1090
  ggml_backend_tensor_set(tensor, data, offset, size);
1251
1091
  return true;
@@ -1333,7 +1173,9 @@ bool rpc_server::init_tensor(const rpc_msg_init_tensor_req & request) {
1333
1173
  if (buffer && buffer->iface.init_tensor) {
1334
1174
  buffer->iface.init_tensor(buffer, tensor);
1335
1175
  } else {
1336
- GGML_LOG_ERROR("Null buffer for tensor passed to init_tensor function\n");
1176
+ if (!buffer) {
1177
+ GGML_LOG_ERROR("Tensor with null buffer passed to init_tensor function\n");
1178
+ }
1337
1179
  }
1338
1180
 
1339
1181
  if (tensor->extra != nullptr) {
@@ -1440,6 +1282,10 @@ ggml_tensor * rpc_server::create_node(uint64_t id,
1440
1282
  if (result == nullptr) {
1441
1283
  return nullptr;
1442
1284
  }
1285
+ if (result->buffer == nullptr && result->data != nullptr) {
1286
+ GGML_LOG_ERROR("[%s] invalid data ptr", __func__);
1287
+ return nullptr;
1288
+ }
1443
1289
  tensor_map[id] = result;
1444
1290
  for (int i = 0; i < GGML_MAX_SRC; i++) {
1445
1291
  // Check if the source ID is 0 before calling create_node recursively
@@ -1505,10 +1351,12 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input) {
1505
1351
  LOG_DBG("[%s] device: %u, n_nodes: %u, n_tensors: %u\n", __func__, device, n_nodes, n_tensors);
1506
1352
 
1507
1353
  size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
1508
-
1354
+ if (stored_graphs[device].buffer.size() < buf_size) {
1355
+ stored_graphs[device].buffer.resize(buf_size);
1356
+ }
1509
1357
  struct ggml_init_params params = {
1510
1358
  /*.mem_size =*/ buf_size,
1511
- /*.mem_buffer =*/ NULL,
1359
+ /*.mem_buffer =*/ stored_graphs[device].buffer.data(),
1512
1360
  /*.no_alloc =*/ true,
1513
1361
  };
1514
1362
  ggml_context_ptr ctx_ptr { ggml_init(params) };
@@ -1538,7 +1386,6 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input) {
1538
1386
  }
1539
1387
  ggml_status status = ggml_backend_graph_compute(backends[device], graph);
1540
1388
  GGML_ASSERT(status == GGML_STATUS_SUCCESS && "Unsuccessful graph computations are not supported with RPC");
1541
- stored_graphs[device].ctx_ptr.swap(ctx_ptr);
1542
1389
  stored_graphs[device].graph = graph;
1543
1390
  return true;
1544
1391
  }
@@ -1579,27 +1426,46 @@ rpc_server::~rpc_server() {
1579
1426
  }
1580
1427
 
1581
1428
  static void rpc_serve_client(const std::vector<ggml_backend_t> & backends, const char * cache_dir,
1582
- sockfd_t sockfd) {
1429
+ socket_ptr sock) {
1583
1430
  rpc_server server(backends, cache_dir);
1584
1431
  uint8_t cmd;
1585
- if (!recv_data(sockfd, &cmd, 1)) {
1432
+ if (!sock->recv_data(&cmd, 1)) {
1586
1433
  return;
1587
1434
  }
1588
- // the first command sent by the client must be HELLO
1589
1435
  if (cmd != RPC_CMD_HELLO) {
1590
1436
  GGML_LOG_ERROR("Expected HELLO command, update client\n");
1591
1437
  return;
1592
1438
  }
1593
- if (!recv_msg(sockfd, nullptr, 0)) {
1439
+
1440
+ // Read input_size and validate protocol version
1441
+ uint64_t hello_input_size;
1442
+ if (!sock->recv_data(&hello_input_size, sizeof(hello_input_size))) {
1443
+ return;
1444
+ }
1445
+
1446
+ if (hello_input_size != sizeof(rpc_msg_hello_req)) {
1447
+ GGML_LOG_ERROR("HELLO request size mismatch (%zu vs %zu) — client needs upgrade to protocol v%d.x\n",
1448
+ (size_t)hello_input_size, sizeof(rpc_msg_hello_req), RPC_PROTO_MAJOR_VERSION);
1449
+ return;
1450
+ }
1451
+
1452
+ rpc_msg_hello_req req = {};
1453
+ if (!sock->recv_data(&req, sizeof(req))) {
1594
1454
  return;
1595
1455
  }
1596
- rpc_msg_hello_rsp response;
1597
- server.hello(response);
1598
- if (!send_msg(sockfd, &response, sizeof(response))) {
1456
+
1457
+ rpc_msg_hello_rsp rsp = {};
1458
+ server.hello(rsp);
1459
+ // Advertise server transport capabilities based on client's caps
1460
+ sock->get_caps(rsp.conn_caps);
1461
+ if (!send_msg(sock, &rsp, sizeof(rsp))) {
1599
1462
  return;
1600
1463
  }
1464
+
1465
+ // Activate transport upgrade using client's caps
1466
+ sock->update_caps(req.conn_caps);
1601
1467
  while (true) {
1602
- if (!recv_data(sockfd, &cmd, 1)) {
1468
+ if (!sock->recv_data(&cmd, 1)) {
1603
1469
  break;
1604
1470
  }
1605
1471
  if (cmd >= RPC_CMD_COUNT) {
@@ -1613,115 +1479,115 @@ static void rpc_serve_client(const std::vector<ggml_backend_t> & backends, const
1613
1479
  return;
1614
1480
  }
1615
1481
  case RPC_CMD_DEVICE_COUNT: {
1616
- if (!recv_msg(sockfd, nullptr, 0)) {
1482
+ if (!recv_msg(sock, nullptr, 0)) {
1617
1483
  return;
1618
1484
  }
1619
1485
  rpc_msg_device_count_rsp response;
1620
1486
  response.device_count = backends.size();
1621
- if (!send_msg(sockfd, &response, sizeof(response))) {
1487
+ if (!send_msg(sock, &response, sizeof(response))) {
1622
1488
  return;
1623
1489
  }
1624
1490
  break;
1625
1491
  }
1626
1492
  case RPC_CMD_ALLOC_BUFFER: {
1627
1493
  rpc_msg_alloc_buffer_req request;
1628
- if (!recv_msg(sockfd, &request, sizeof(request))) {
1494
+ if (!recv_msg(sock, &request, sizeof(request))) {
1629
1495
  return;
1630
1496
  }
1631
1497
  rpc_msg_alloc_buffer_rsp response;
1632
1498
  if (!server.alloc_buffer(request, response)) {
1633
1499
  return;
1634
1500
  }
1635
- if (!send_msg(sockfd, &response, sizeof(response))) {
1501
+ if (!send_msg(sock, &response, sizeof(response))) {
1636
1502
  return;
1637
1503
  }
1638
1504
  break;
1639
1505
  }
1640
1506
  case RPC_CMD_GET_ALLOC_SIZE: {
1641
1507
  rpc_msg_get_alloc_size_req request;
1642
- if (!recv_msg(sockfd, &request, sizeof(request))) {
1508
+ if (!recv_msg(sock, &request, sizeof(request))) {
1643
1509
  return;
1644
1510
  }
1645
1511
  rpc_msg_get_alloc_size_rsp response;
1646
1512
  if (!server.get_alloc_size(request, response)) {
1647
1513
  return;
1648
1514
  }
1649
- if (!send_msg(sockfd, &response, sizeof(response))) {
1515
+ if (!send_msg(sock, &response, sizeof(response))) {
1650
1516
  return;
1651
1517
  }
1652
1518
  break;
1653
1519
  }
1654
1520
  case RPC_CMD_GET_ALIGNMENT: {
1655
1521
  rpc_msg_get_alignment_req request;
1656
- if (!recv_msg(sockfd, &request, sizeof(request))) {
1522
+ if (!recv_msg(sock, &request, sizeof(request))) {
1657
1523
  return;
1658
1524
  }
1659
1525
  rpc_msg_get_alignment_rsp response;
1660
1526
  if (!server.get_alignment(request, response)) {
1661
1527
  return;
1662
1528
  }
1663
- if (!send_msg(sockfd, &response, sizeof(response))) {
1529
+ if (!send_msg(sock, &response, sizeof(response))) {
1664
1530
  return;
1665
1531
  }
1666
1532
  break;
1667
1533
  }
1668
1534
  case RPC_CMD_GET_MAX_SIZE: {
1669
1535
  rpc_msg_get_max_size_req request;
1670
- if (!recv_msg(sockfd, &request, sizeof(request))) {
1536
+ if (!recv_msg(sock, &request, sizeof(request))) {
1671
1537
  return;
1672
1538
  }
1673
1539
  rpc_msg_get_max_size_rsp response;
1674
1540
  if (!server.get_max_size(request, response)) {
1675
1541
  return;
1676
1542
  }
1677
- if (!send_msg(sockfd, &response, sizeof(response))) {
1543
+ if (!send_msg(sock, &response, sizeof(response))) {
1678
1544
  return;
1679
1545
  }
1680
1546
  break;
1681
1547
  }
1682
1548
  case RPC_CMD_BUFFER_GET_BASE: {
1683
1549
  rpc_msg_buffer_get_base_req request;
1684
- if (!recv_msg(sockfd, &request, sizeof(request))) {
1550
+ if (!recv_msg(sock, &request, sizeof(request))) {
1685
1551
  return;
1686
1552
  }
1687
1553
  rpc_msg_buffer_get_base_rsp response;
1688
1554
  if (!server.buffer_get_base(request, response)) {
1689
1555
  return;
1690
1556
  }
1691
- if (!send_msg(sockfd, &response, sizeof(response))) {
1557
+ if (!send_msg(sock, &response, sizeof(response))) {
1692
1558
  return;
1693
1559
  }
1694
1560
  break;
1695
1561
  }
1696
1562
  case RPC_CMD_FREE_BUFFER: {
1697
1563
  rpc_msg_free_buffer_req request;
1698
- if (!recv_msg(sockfd, &request, sizeof(request))) {
1564
+ if (!recv_msg(sock, &request, sizeof(request))) {
1699
1565
  return;
1700
1566
  }
1701
1567
  if (!server.free_buffer(request)) {
1702
1568
  return;
1703
1569
  }
1704
- if (!send_msg(sockfd, nullptr, 0)) {
1570
+ if (!send_msg(sock, nullptr, 0)) {
1705
1571
  return;
1706
1572
  }
1707
1573
  break;
1708
1574
  }
1709
1575
  case RPC_CMD_BUFFER_CLEAR: {
1710
1576
  rpc_msg_buffer_clear_req request;
1711
- if (!recv_msg(sockfd, &request, sizeof(request))) {
1577
+ if (!recv_msg(sock, &request, sizeof(request))) {
1712
1578
  return;
1713
1579
  }
1714
1580
  if (!server.buffer_clear(request)) {
1715
1581
  return;
1716
1582
  }
1717
- if (!send_msg(sockfd, nullptr, 0)) {
1583
+ if (!send_msg(sock, nullptr, 0)) {
1718
1584
  return;
1719
1585
  }
1720
1586
  break;
1721
1587
  }
1722
1588
  case RPC_CMD_SET_TENSOR: {
1723
1589
  std::vector<uint8_t> input;
1724
- if (!recv_msg(sockfd, input)) {
1590
+ if (!recv_msg(sock, input)) {
1725
1591
  return;
1726
1592
  }
1727
1593
  if (!server.set_tensor(input)) {
@@ -1731,62 +1597,62 @@ static void rpc_serve_client(const std::vector<ggml_backend_t> & backends, const
1731
1597
  }
1732
1598
  case RPC_CMD_SET_TENSOR_HASH: {
1733
1599
  rpc_msg_set_tensor_hash_req request;
1734
- if (!recv_msg(sockfd, &request, sizeof(request))) {
1600
+ if (!recv_msg(sock, &request, sizeof(request))) {
1735
1601
  return;
1736
1602
  }
1737
1603
  rpc_msg_set_tensor_hash_rsp response;
1738
1604
  if (!server.set_tensor_hash(request, response)) {
1739
1605
  return;
1740
1606
  }
1741
- if (!send_msg(sockfd, &response, sizeof(response))) {
1607
+ if (!send_msg(sock, &response, sizeof(response))) {
1742
1608
  return;
1743
1609
  }
1744
1610
  break;
1745
1611
  }
1746
1612
  case RPC_CMD_INIT_TENSOR: {
1747
1613
  rpc_msg_init_tensor_req request;
1748
- if (!recv_msg(sockfd, &request,sizeof(request))) {
1614
+ if (!recv_msg(sock, &request,sizeof(request))) {
1749
1615
  return;
1750
1616
  }
1751
1617
  if (!server.init_tensor(request)) {
1752
1618
  return;
1753
1619
  }
1754
- if (!send_msg(sockfd, nullptr, 0)) {
1620
+ if (!send_msg(sock, nullptr, 0)) {
1755
1621
  return;
1756
1622
  }
1757
1623
  break;
1758
1624
  }
1759
1625
  case RPC_CMD_GET_TENSOR: {
1760
1626
  rpc_msg_get_tensor_req request;
1761
- if (!recv_msg(sockfd, &request, sizeof(request))) {
1627
+ if (!recv_msg(sock, &request, sizeof(request))) {
1762
1628
  return;
1763
1629
  }
1764
1630
  std::vector<uint8_t> response;
1765
1631
  if (!server.get_tensor(request, response)) {
1766
1632
  return;
1767
1633
  }
1768
- if (!send_msg(sockfd, response.data(), response.size())) {
1634
+ if (!send_msg(sock, response.data(), response.size())) {
1769
1635
  return;
1770
1636
  }
1771
1637
  break;
1772
1638
  }
1773
1639
  case RPC_CMD_COPY_TENSOR: {
1774
1640
  rpc_msg_copy_tensor_req request;
1775
- if (!recv_msg(sockfd, &request, sizeof(request))) {
1641
+ if (!recv_msg(sock, &request, sizeof(request))) {
1776
1642
  return;
1777
1643
  }
1778
1644
  rpc_msg_copy_tensor_rsp response;
1779
1645
  if (!server.copy_tensor(request, response)) {
1780
1646
  return;
1781
1647
  }
1782
- if (!send_msg(sockfd, &response, sizeof(response))) {
1648
+ if (!send_msg(sock, &response, sizeof(response))) {
1783
1649
  return;
1784
1650
  }
1785
1651
  break;
1786
1652
  }
1787
1653
  case RPC_CMD_GRAPH_COMPUTE: {
1788
1654
  std::vector<uint8_t> input;
1789
- if (!recv_msg(sockfd, input)) {
1655
+ if (!recv_msg(sock, input)) {
1790
1656
  return;
1791
1657
  }
1792
1658
  if (!server.graph_compute(input)) {
@@ -1796,7 +1662,7 @@ static void rpc_serve_client(const std::vector<ggml_backend_t> & backends, const
1796
1662
  }
1797
1663
  case RPC_CMD_GRAPH_RECOMPUTE: {
1798
1664
  rpc_msg_graph_recompute_req request;
1799
- if (!recv_msg(sockfd, &request, sizeof(request))) {
1665
+ if (!recv_msg(sock, &request, sizeof(request))) {
1800
1666
  return;
1801
1667
  }
1802
1668
  if (!server.graph_recompute(request)) {
@@ -1806,14 +1672,14 @@ static void rpc_serve_client(const std::vector<ggml_backend_t> & backends, const
1806
1672
  }
1807
1673
  case RPC_CMD_GET_DEVICE_MEMORY: {
1808
1674
  rpc_msg_get_device_memory_req request;
1809
- if (!recv_msg(sockfd, &request, sizeof(request))) {
1675
+ if (!recv_msg(sock, &request, sizeof(request))) {
1810
1676
  return;
1811
1677
  }
1812
1678
  rpc_msg_get_device_memory_rsp response;
1813
1679
  if (!server.get_device_memory(request, response)) {
1814
1680
  return;
1815
1681
  }
1816
- if (!send_msg(sockfd, &response, sizeof(response))) {
1682
+ if (!send_msg(sock, &response, sizeof(response))) {
1817
1683
  return;
1818
1684
  }
1819
1685
  break;
@@ -1866,50 +1732,39 @@ void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir
1866
1732
  if (!parse_endpoint(endpoint, host, port)) {
1867
1733
  return;
1868
1734
  }
1869
- #ifdef _WIN32
1870
- {
1871
- WSADATA wsaData;
1872
- int res = WSAStartup(MAKEWORD(2, 2), &wsaData);
1873
- if (res != 0) {
1874
- fprintf(stderr, "WSAStartup failed: %d\n", res);
1875
- return;
1876
- }
1735
+
1736
+ #ifdef GGML_RPC_RDMA
1737
+ printf(" transport : TCP (RDMA auto-negotiate enabled)\n");
1738
+ #else
1739
+ printf(" transport : TCP\n");
1740
+ #endif // GGML_RPC_RDMA
1741
+ if (!rpc_transport_init()) {
1742
+ fprintf(stderr, "Failed to initialize RPC transport\n");
1743
+ return;
1877
1744
  }
1878
- #endif
1879
- auto server_socket = create_server_socket(host.c_str(), port);
1745
+ auto server_socket = socket_t::create_server(host.c_str(), port);
1880
1746
  if (server_socket == nullptr) {
1881
1747
  fprintf(stderr, "Failed to create server socket\n");
1882
1748
  return;
1883
1749
  }
1884
1750
  while (true) {
1885
- auto client_socket = socket_accept(server_socket->fd);
1751
+ auto client_socket = server_socket->accept();
1886
1752
  if (client_socket == nullptr) {
1887
1753
  fprintf(stderr, "Failed to accept client connection\n");
1888
1754
  return;
1889
1755
  }
1890
1756
  printf("Accepted client connection\n");
1891
1757
  fflush(stdout);
1892
- rpc_serve_client(backends, cache_dir, client_socket->fd);
1758
+ rpc_serve_client(backends, cache_dir, client_socket);
1893
1759
  printf("Client connection closed\n");
1894
1760
  fflush(stdout);
1895
1761
  }
1896
- #ifdef _WIN32
1897
- WSACleanup();
1898
- #endif
1762
+ rpc_transport_shutdown();
1899
1763
  for (auto backend : backends) {
1900
1764
  ggml_backend_free(backend);
1901
1765
  }
1902
1766
  }
1903
1767
 
1904
- // device interface
1905
-
1906
- struct ggml_backend_rpc_device_context {
1907
- std::string endpoint;
1908
- uint32_t device;
1909
- std::string name;
1910
- std::string description;
1911
- };
1912
-
1913
1768
  static const char * ggml_backend_rpc_device_get_name(ggml_backend_dev_t dev) {
1914
1769
  ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
1915
1770
 
@@ -2091,10 +1946,11 @@ ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint) {
2091
1946
  std::string dev_name = "RPC" + std::to_string(dev_id);
2092
1947
  std::string dev_desc = std::string(endpoint);
2093
1948
  ggml_backend_rpc_device_context * dev_ctx = new ggml_backend_rpc_device_context {
2094
- /* .endpoint = */ endpoint,
2095
- /* .device = */ ind,
2096
- /* .name = */ dev_name,
2097
- /* .description = */ dev_desc
1949
+ /* .endpoint = */ endpoint,
1950
+ /* .device = */ ind,
1951
+ /* .name = */ dev_name,
1952
+ /* .description = */ dev_desc,
1953
+ /* .last_graph_uid = */ 0,
2098
1954
  };
2099
1955
 
2100
1956
  ggml_backend_dev_t dev = new ggml_backend_device {