whispercpp 1.3.2 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (664)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -3
  3. data/README.md +71 -14
  4. data/Rakefile +20 -7
  5. data/ext/.gitignore +4 -6
  6. data/ext/dependencies.rb +36 -24
  7. data/ext/extconf.rb +1 -1
  8. data/ext/options.rb +48 -184
  9. data/ext/ruby_whisper.c +18 -0
  10. data/ext/ruby_whisper_context.c +43 -12
  11. data/ext/ruby_whisper_model.c +1 -1
  12. data/ext/ruby_whisper_params.c +59 -27
  13. data/ext/ruby_whisper_segment.c +81 -4
  14. data/ext/ruby_whisper_transcribe.cpp +13 -7
  15. data/ext/ruby_whisper_vad_params.c +1 -1
  16. data/ext/sources/CMakeLists.txt +5 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/build-xcframework.sh +24 -0
  19. data/ext/sources/examples/CMakeLists.txt +1 -0
  20. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
  21. data/ext/sources/examples/addon.node/addon.cpp +154 -35
  22. data/ext/sources/examples/addon.node/index.js +10 -5
  23. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  24. data/ext/sources/examples/bench/bench.cpp +29 -18
  25. data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
  26. data/ext/sources/examples/cli/cli.cpp +7 -4
  27. data/ext/sources/examples/command/command.cpp +58 -32
  28. data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
  29. data/ext/sources/examples/common-ggml.cpp +2 -0
  30. data/ext/sources/examples/common-whisper.cpp +14 -7
  31. data/ext/sources/examples/lsp/lsp.cpp +21 -17
  32. data/ext/sources/examples/quantize/quantize.cpp +3 -0
  33. data/ext/sources/examples/server/CMakeLists.txt +3 -0
  34. data/ext/sources/examples/server/server.cpp +193 -35
  35. data/ext/sources/examples/server.py +6 -1
  36. data/ext/sources/examples/stream/stream.cpp +10 -2
  37. data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
  38. data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
  39. data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -0
  40. data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
  41. data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
  42. data/ext/sources/examples/talk-llama/llama-arch.cpp +756 -15
  43. data/ext/sources/examples/talk-llama/llama-arch.h +85 -1
  44. data/ext/sources/examples/talk-llama/llama-batch.cpp +773 -272
  45. data/ext/sources/examples/talk-llama/llama-batch.h +126 -55
  46. data/ext/sources/examples/talk-llama/llama-chat.cpp +150 -13
  47. data/ext/sources/examples/talk-llama/llama-chat.h +8 -0
  48. data/ext/sources/examples/talk-llama/llama-context.cpp +814 -542
  49. data/ext/sources/examples/talk-llama/llama-context.h +68 -32
  50. data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
  51. data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
  52. data/ext/sources/examples/talk-llama/llama-graph.cpp +787 -440
  53. data/ext/sources/examples/talk-llama/llama-graph.h +333 -153
  54. data/ext/sources/examples/talk-llama/llama-hparams.cpp +128 -6
  55. data/ext/sources/examples/talk-llama/llama-hparams.h +80 -17
  56. data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
  57. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
  58. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +137 -0
  59. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +1248 -1967
  60. data/ext/sources/examples/talk-llama/llama-kv-cache.h +218 -345
  61. data/ext/sources/examples/talk-llama/llama-kv-cells.h +164 -52
  62. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +266 -0
  63. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +139 -0
  64. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1154 -0
  65. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +182 -0
  66. data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
  67. data/ext/sources/examples/talk-llama/llama-memory.h +94 -4
  68. data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
  69. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +44 -17
  70. data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
  71. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
  72. data/ext/sources/examples/talk-llama/llama-model.cpp +11377 -5248
  73. data/ext/sources/examples/talk-llama/llama-model.h +87 -9
  74. data/ext/sources/examples/talk-llama/llama-quant.cpp +137 -16
  75. data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
  76. data/ext/sources/examples/talk-llama/llama-vocab.cpp +502 -38
  77. data/ext/sources/examples/talk-llama/llama-vocab.h +46 -0
  78. data/ext/sources/examples/talk-llama/llama.cpp +76 -17
  79. data/ext/sources/examples/talk-llama/llama.h +176 -151
  80. data/ext/sources/examples/talk-llama/talk-llama.cpp +11 -6
  81. data/ext/sources/examples/talk-llama/unicode.cpp +212 -0
  82. data/ext/sources/examples/talk-llama/unicode.h +45 -0
  83. data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
  84. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +6 -2
  85. data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
  86. data/ext/sources/ggml/CMakeLists.txt +106 -33
  87. data/ext/sources/ggml/cmake/common.cmake +24 -0
  88. data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
  89. data/ext/sources/ggml/include/ggml-backend.h +18 -2
  90. data/ext/sources/ggml/include/ggml-cpu.h +2 -0
  91. data/ext/sources/ggml/include/ggml-metal.h +1 -6
  92. data/ext/sources/ggml/include/ggml-opt.h +25 -6
  93. data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
  94. data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
  95. data/ext/sources/ggml/include/ggml.h +365 -21
  96. data/ext/sources/ggml/src/CMakeLists.txt +98 -25
  97. data/ext/sources/ggml/src/ggml-alloc.c +265 -141
  98. data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
  99. data/ext/sources/ggml/src/ggml-backend-reg.cpp +35 -13
  100. data/ext/sources/ggml/src/ggml-backend.cpp +266 -60
  101. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +4 -4
  102. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
  103. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +15 -0
  104. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  105. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
  106. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
  107. data/ext/sources/ggml/src/ggml-cann/common.h +149 -2
  108. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +521 -78
  109. data/ext/sources/ggml/src/ggml-common.h +21 -0
  110. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +165 -50
  111. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +5 -3
  112. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  113. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  114. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +3650 -0
  115. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1891 -0
  116. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2160 -0
  117. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  118. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
  119. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1897 -0
  120. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
  121. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
  122. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
  123. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
  124. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
  125. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +214 -0
  126. data/ext/sources/ggml/src/ggml-cpu/common.h +18 -3
  127. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +23 -7
  128. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +179 -110
  129. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +44 -33
  130. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  131. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
  132. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
  133. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +228 -98
  134. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +532 -1124
  135. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  136. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +3374 -2081
  137. data/ext/sources/ggml/src/ggml-cpu/ops.h +13 -8
  138. data/ext/sources/ggml/src/ggml-cpu/quants.c +1193 -0
  139. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +34 -0
  140. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1982 -0
  141. data/ext/sources/ggml/src/ggml-cpu/repack.h +120 -0
  142. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +367 -46
  143. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  144. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  145. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  146. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  147. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +3 -3
  148. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +1 -1
  149. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +272 -35
  150. data/ext/sources/ggml/src/ggml-cpu/vec.h +794 -142
  151. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
  152. data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
  153. data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
  154. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
  155. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  156. data/ext/sources/ggml/src/ggml-cuda/common.cuh +291 -81
  157. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  158. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  159. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  160. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  161. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  162. data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
  163. data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  164. data/ext/sources/ggml/src/ggml-cuda/convert.cu +117 -22
  165. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +20 -0
  166. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  167. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
  168. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  169. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  170. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +499 -368
  171. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +142 -93
  172. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
  173. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
  174. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
  175. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +90 -50
  176. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
  177. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  178. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
  179. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +636 -222
  180. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
  181. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
  182. data/ext/sources/ggml/src/ggml-cuda/mean.cu +73 -0
  183. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  184. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
  185. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
  186. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
  187. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
  188. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
  189. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +506 -0
  190. data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +4 -5
  191. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
  192. data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
  193. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
  194. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  195. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  196. data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
  197. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  198. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  199. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
  200. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  201. data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
  202. data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
  203. data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
  204. data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
  205. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
  206. data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  207. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
  208. data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
  209. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
  210. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  211. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -98
  212. data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
  213. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +23 -19
  214. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  225. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  226. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  227. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  228. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  229. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  230. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  231. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  232. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  233. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  234. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  235. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  236. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  237. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  238. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  239. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  240. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  241. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  242. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  243. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  244. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  245. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  246. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  247. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  248. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  249. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  250. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  251. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
  252. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  253. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  254. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  255. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  256. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  257. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  258. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  259. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  260. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  261. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  262. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  263. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  264. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  265. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  266. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  267. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  268. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  269. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
  270. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
  271. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
  272. data/ext/sources/ggml/src/ggml-cuda/unary.cu +179 -0
  273. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +15 -0
  274. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
  275. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  276. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  277. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
  278. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
  279. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +14 -2
  280. data/ext/sources/ggml/src/ggml-impl.h +229 -175
  281. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +21 -17
  282. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  283. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  284. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  285. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
  286. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
  287. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
  288. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
  289. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +163 -63
  290. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
  291. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
  292. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
  293. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +3208 -1575
  294. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
  295. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
  296. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +32 -0
  297. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4430 -792
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  300. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  301. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  302. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  303. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  304. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
  305. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  306. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  307. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
  308. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  309. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  310. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  311. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  312. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  313. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  314. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  315. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  316. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
  317. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
  318. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  322. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  323. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  324. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  325. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  326. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  327. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  328. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  329. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  330. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  331. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  332. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
  333. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  334. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
  335. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
  336. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  344. data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
  345. data/ext/sources/ggml/src/ggml-quants.c +117 -24
  346. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  347. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +85 -62
  348. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  349. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
  350. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +9 -0
  351. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
  352. data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
  353. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +13 -17
  354. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +21 -2
  355. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +116 -211
  356. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
  357. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  358. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +700 -1041
  359. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +20 -9
  360. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +17 -26
  361. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +2 -96
  362. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +393 -250
  363. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
  364. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +32 -8
  365. data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
  366. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -11
  367. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +125 -21
  368. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
  369. data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  370. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  371. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  372. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +4 -3
  373. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +105 -17
  374. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
  375. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4198 -1145
  376. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  377. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
  378. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  379. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
  380. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
  381. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
  382. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  383. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
  384. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
  385. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
  386. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
  387. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  388. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
  389. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
  390. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
  391. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  392. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  393. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  394. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  395. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  396. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  397. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
  398. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
  399. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
  400. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
  401. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
  402. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
  403. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  404. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  405. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  406. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  407. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
  408. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
  409. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
  410. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +19 -0
  411. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  412. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  413. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  414. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
  415. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
  416. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
  417. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
  418. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  419. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
  420. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
  421. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
  422. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
  423. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
  424. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
  425. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  426. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
  427. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
  428. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  429. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +64 -11
  430. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  431. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  432. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  433. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  434. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  435. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  436. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  437. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  438. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
  439. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
  440. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  441. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
  442. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  443. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  444. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
  445. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
  446. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  447. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
  448. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +338 -71
  449. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  450. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
  451. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
  452. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
  453. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  454. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  455. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  456. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
  457. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  458. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  459. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
  460. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
  461. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
  462. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
  463. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
  464. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
  465. data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  466. data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
  467. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
  468. data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  469. data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  470. data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
  471. data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
  472. data/ext/sources/ggml/src/ggml.c +802 -142
  473. data/ext/sources/ggml/src/ggml.cpp +26 -0
  474. data/ext/sources/ggml/src/gguf.cpp +32 -4
  475. data/ext/sources/include/whisper.h +2 -0
  476. data/ext/sources/src/CMakeLists.txt +2 -0
  477. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  478. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  479. data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
  480. data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
  481. data/ext/sources/src/whisper.cpp +241 -215
  482. data/ext/sources/tests/CMakeLists.txt +8 -1
  483. data/ext/sources/tests/test-vad-full.cpp +3 -3
  484. data/ext/sources/tests/test-vad.cpp +2 -2
  485. data/extsources.rb +15 -9
  486. data/lib/whisper/context.rb +15 -0
  487. data/lib/whisper/model/uri.rb +57 -2
  488. data/lib/whisper/segment.rb +58 -0
  489. data/sig/whisper.rbs +75 -38
  490. data/{tests → test}/helper.rb +1 -12
  491. data/{tests → test}/test_model.rb +9 -0
  492. data/test/test_package.rb +51 -0
  493. data/{tests → test}/test_params.rb +8 -0
  494. data/test/test_segment.rb +146 -0
  495. data/{tests → test}/test_whisper.rb +70 -0
  496. data/whispercpp.gemspec +2 -3
  497. metadata +246 -191
  498. data/ext/sources/.dockerignore +0 -3
  499. data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
  500. data/ext/sources/ci/run.sh +0 -336
  501. data/ext/sources/close-issue.yml +0 -28
  502. data/ext/sources/ggml/include/ggml-kompute.h +0 -50
  503. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  504. data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
  505. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
  506. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
  507. data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
  508. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  509. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  510. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  511. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  512. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  513. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  514. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  515. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  516. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  517. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  518. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -6431
  519. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  520. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
  521. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
  522. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  523. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
  524. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  525. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
  526. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
  527. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +0 -336
  528. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  529. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  530. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  531. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  532. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  533. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  534. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  535. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  536. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  537. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  538. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  539. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  540. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  541. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  542. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  543. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  544. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  545. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  546. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  547. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  548. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  549. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  550. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  551. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  552. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  553. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  554. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  555. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  556. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  557. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  558. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  559. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  560. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  561. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  562. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  563. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  564. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  565. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  566. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  567. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  568. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  569. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  570. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  571. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  572. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  573. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  574. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  575. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  576. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  577. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  578. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  579. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  580. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  581. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  582. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  583. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  584. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  585. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  586. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  587. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  588. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  589. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  590. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  591. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  592. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  593. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  594. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  595. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  596. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  597. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  598. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  599. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  600. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  601. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  602. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  603. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  604. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  605. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  606. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  607. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  608. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  609. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  610. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  611. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  612. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  613. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  614. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  615. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  616. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  617. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  618. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  619. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  620. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  621. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  622. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  623. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  624. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  625. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  626. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  627. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  628. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  629. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  630. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  631. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  632. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  633. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  634. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  635. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  636. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  637. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  638. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  639. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  640. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  641. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  642. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  643. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  644. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  645. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  646. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  647. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  648. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  649. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  650. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  651. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  652. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
  653. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -5998
  654. data/tests/test_package.rb +0 -46
  655. data/tests/test_segment.rb +0 -74
  656. /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  657. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  658. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  659. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  660. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
  661. /data/{tests → test}/test_callback.rb +0 -0
  662. /data/{tests → test}/test_error.rb +0 -0
  663. /data/{tests → test}/test_vad.rb +0 -0
  664. /data/{tests → test}/test_vad_params.rb +0 -0
@@ -206,15 +206,6 @@ static bool ggml_graph_compute_helper(
206
206
  return t;
207
207
  }
208
208
 
209
- static void whisper_load_backends() {
210
- #ifdef GGML_BACKEND_DL
211
- static std::once_flag flag;
212
- std::call_once(flag, []() {
213
- ggml_backend_load_all();
214
- });
215
- #endif
216
- }
217
-
218
209
  // TODO: move these functions to ggml-base with support for ggml-backend?
219
210
 
220
211
  static ggml_tensor * whisper_set_f32(struct ggml_tensor * t, float v) {
@@ -261,45 +252,6 @@ static void whisper_set_i32_nd(struct ggml_tensor * t, int64_t i0, int64_t i1, i
261
252
  *(int32_t *) data = v;
262
253
  }
263
254
 
264
- // faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
265
- // the idea is to represent the original matrix multiplication:
266
- //
267
- // Z = X @ Y
268
- //
269
- // with the sum of two matrix multiplications:
270
- //
271
- // Z = (X_0 @ Y_0) + (X_1 @ Y_1)
272
- //
273
- // here X_0 and Y_0 are views of X and Y that have dimension 0 divisible by "pad"
274
- // and X_1 and Y_1 are the remaining views. X_1 and Y_1 end up being small matrices that can be processed with more
275
- // general-purpose kernels
276
- //
277
- static struct ggml_tensor * ggml_mul_mat_pad(struct ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * y, int pad = 32) {
278
- // use padding only if dimension 0 is at least 8 times larger than the padding
279
- // else we won't get much benefit from the optimization
280
- const int n_pad_req = 8;
281
-
282
- if (x->ne[0] % pad == 0 || x->ne[0] / pad < n_pad_req) {
283
- return ggml_mul_mat(ctx, x, y);
284
- }
285
-
286
- struct ggml_tensor * x_0 = ggml_view_3d(ctx, x, (x->ne[0]/pad)*pad, x->ne[1], x->ne[2], x->nb[1], x->nb[2], 0);
287
- struct ggml_tensor * x_1 = ggml_view_3d(ctx, x, x->ne[0]%pad, x->ne[1], x->ne[2], x->nb[1], x->nb[2], x_0->ne[0]*x_0->nb[0]);
288
-
289
- struct ggml_tensor * y_0 = ggml_view_3d(ctx, y, (y->ne[0]/pad)*pad, y->ne[1], y->ne[2], y->nb[1], y->nb[2], 0);
290
- struct ggml_tensor * y_1 = ggml_view_3d(ctx, y, y->ne[0]%pad, y->ne[1], y->ne[2], y->nb[1], y->nb[2], y_0->ne[0]*y_0->nb[0]);
291
-
292
- return ggml_add(ctx,
293
- ggml_mul_mat(ctx, x_0, y_0),
294
- ggml_mul_mat(ctx, x_1, y_1));
295
- }
296
-
297
- // TODO: check if other platforms can benefit from this optimization
298
- // TODO: CUDA is currently broken - seems ggml_mul_mat does not handle views correctly
299
- #if defined(GGML_USE_METAL)
300
- #define ggml_mul_mat ggml_mul_mat_pad
301
- #endif
302
-
303
255
  // available whisper models
304
256
  enum e_model {
305
257
  MODEL_UNKNOWN,
@@ -868,6 +820,11 @@ struct whisper_aheads_masks {
868
820
  ggml_backend_buffer_t buffer = nullptr;
869
821
  };
870
822
 
823
+ struct vad_time_mapping {
824
+ int64_t processed_time; // Time in processed (VAD) audio
825
+ int64_t original_time; // Corresponding time in original audio
826
+ };
827
+
871
828
  struct whisper_state {
872
829
  int64_t t_sample_us = 0;
873
830
  int64_t t_encode_us = 0;
@@ -957,13 +914,15 @@ struct whisper_state {
957
914
  whisper_vad_context * vad_context = nullptr;
958
915
 
959
916
  struct vad_segment_info {
960
- float orig_start;
961
- float orig_end;
962
- float vad_start;
963
- float vad_end;
917
+ int64_t orig_start;
918
+ int64_t orig_end;
919
+ int64_t vad_start;
920
+ int64_t vad_end;
964
921
  };
965
922
  std::vector<vad_segment_info> vad_segments;
966
923
  bool has_vad_segments = false;
924
+
925
+ std::vector<vad_time_mapping> vad_mapping_table;
967
926
  };
968
927
 
969
928
  struct whisper_context {
@@ -1322,8 +1281,6 @@ static size_t aheads_masks_nbytes(struct whisper_aheads_masks & aheads_masks) {
1322
1281
  static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & params) {
1323
1282
  ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);
1324
1283
 
1325
- whisper_load_backends();
1326
-
1327
1284
  ggml_backend_dev_t dev = nullptr;
1328
1285
 
1329
1286
  int cnt = 0;
@@ -1331,7 +1288,7 @@ static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & pa
1331
1288
  for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
1332
1289
  ggml_backend_dev_t dev_cur = ggml_backend_dev_get(i);
1333
1290
  if (ggml_backend_dev_type(dev_cur) == GGML_BACKEND_DEVICE_TYPE_GPU) {
1334
- if (cnt == 0 || cnt == params.gpu_device) {
1291
+ if (cnt == params.gpu_device) {
1335
1292
  dev = dev_cur;
1336
1293
  }
1337
1294
 
@@ -1400,7 +1357,7 @@ static buft_list_t make_buft_list(whisper_context_params & params) {
1400
1357
  for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
1401
1358
  ggml_backend_dev_t dev = ggml_backend_dev_get(i);
1402
1359
  if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
1403
- if (cnt == 0 || cnt == params.gpu_device) {
1360
+ if (cnt == params.gpu_device) {
1404
1361
  auto * buft = ggml_backend_dev_buffer_type(dev);
1405
1362
  if (buft) {
1406
1363
  buft_list.emplace_back(dev, buft);
@@ -1442,7 +1399,8 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor *
1442
1399
  op_supported = true;
1443
1400
  } else {
1444
1401
  switch (op) {
1445
- // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT
1402
+ // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT and GGML_OP_GET_ROWS
1403
+ case GGML_OP_GET_ROWS:
1446
1404
  case GGML_OP_MUL_MAT: {
1447
1405
  ggml_init_params params = {
1448
1406
  /*.mem_size =*/ 2 * ggml_tensor_overhead(),
@@ -1458,9 +1416,15 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor *
1458
1416
 
1459
1417
  ggml_tensor * op_tensor = nullptr;
1460
1418
 
1461
- int64_t n_ctx = hparams.n_audio_ctx;
1462
- ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]);
1463
- op_tensor = ggml_mul_mat(ctx, w, b);
1419
+ if (op == GGML_OP_MUL_MAT) {
1420
+ int64_t n_ctx = hparams.n_audio_ctx;
1421
+ ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]);
1422
+ op_tensor = ggml_mul_mat(ctx, w, b);
1423
+ } else if (op == GGML_OP_GET_ROWS) {
1424
+ int64_t num_indices = 8;
1425
+ ggml_tensor * indices = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, num_indices);
1426
+ op_tensor = ggml_get_rows(ctx, w, indices);
1427
+ }
1464
1428
 
1465
1429
  // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
1466
1430
  GGML_ASSERT(w->buffer == nullptr);
@@ -2429,6 +2393,8 @@ static bool whisper_encode_internal(
2429
2393
  return false;
2430
2394
  }
2431
2395
  } else {
2396
+ ggml_backend_sched_reset(sched);
2397
+
2432
2398
  #if defined(WHISPER_USE_COREML)
2433
2399
  whisper_coreml_encode(wstate.ctx_coreml, mel->ne[0], mel->ne[1], (float *) mel->data, (float *) wstate.embd_enc->data);
2434
2400
  #elif defined(WHISPER_USE_OPENVINO)
@@ -3626,7 +3592,7 @@ int whisper_ctx_init_openvino_encoder(
3626
3592
  struct whisper_context_params whisper_context_default_params() {
3627
3593
  struct whisper_context_params result = {
3628
3594
  /*.use_gpu =*/ true,
3629
- /*.flash_attn =*/ false,
3595
+ /*.flash_attn =*/ true,
3630
3596
  /*.gpu_device =*/ 0,
3631
3597
 
3632
3598
  /*.dtw_token_timestamps =*/ false,
@@ -4335,8 +4301,6 @@ static int whisper_has_openvino(void) {
4335
4301
  const char * whisper_print_system_info(void) {
4336
4302
  static std::string s;
4337
4303
 
4338
- whisper_load_backends();
4339
-
4340
4304
  s = "";
4341
4305
  s += "WHISPER : ";
4342
4306
  s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
@@ -4420,8 +4384,8 @@ struct whisper_vad_model {
4420
4384
  };
4421
4385
 
4422
4386
  struct whisper_vad_segment {
4423
- float start; // Start time in seconds
4424
- float end; // End time in seconds
4387
+ int64_t start;
4388
+ int64_t end;
4425
4389
  };
4426
4390
 
4427
4391
  struct whisper_vad_segments {
@@ -4469,6 +4433,15 @@ struct whisper_vad_params whisper_vad_default_params(void) {
4469
4433
  return result;
4470
4434
  }
4471
4435
 
4436
+ // Time conversion utility functions for whisper VAD
4437
+ static int cs_to_samples(int64_t cs) {
4438
+ return (int)((cs / 100.0) * WHISPER_SAMPLE_RATE + 0.5);
4439
+ }
4440
+
4441
+ static int64_t samples_to_cs(int samples) {
4442
+ return (int64_t)((samples / (double)WHISPER_SAMPLE_RATE) * 100.0 + 0.5);
4443
+ }
4444
+
4472
4445
  static bool weight_buft_supported(const whisper_vad_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
4473
4446
  bool op_supported = true;
4474
4447
 
@@ -4703,6 +4676,7 @@ static bool whisper_vad_init_context(whisper_vad_context * vctx) {
4703
4676
  ggml_set_name(vctx->c_state, "c_state");
4704
4677
 
4705
4678
  vctx->buffer = ggml_backend_alloc_ctx_tensors(ctx, vctx->backends[0]);
4679
+ ggml_free(ctx);
4706
4680
  if (!vctx->buffer) {
4707
4681
  WHISPER_LOG_ERROR("%s: failed to allocate memory for the VAD state\n", __func__);
4708
4682
  return false;
@@ -5413,12 +5387,12 @@ struct whisper_vad_segments * whisper_vad_segments_from_probs(
5413
5387
  (speeches[i].end + speech_pad_samples) : audio_length_samples;
5414
5388
  }
5415
5389
 
5416
- // Convert from samples to seconds and copy to final segments
5417
- segments[i].start = (float)speeches[i].start / sample_rate;
5418
- segments[i].end = (float)speeches[i].end / sample_rate;
5390
+ // Convert from samples to centiseconds
5391
+ segments[i].start = samples_to_cs(speeches[i].start);
5392
+ segments[i].end = samples_to_cs(speeches[i].end);
5419
5393
 
5420
5394
  WHISPER_LOG_INFO("%s: VAD segment %d: start = %.2f, end = %.2f (duration: %.2f)\n",
5421
- __func__, i, segments[i].start, segments[i].end, segments[i].end - segments[i].start);
5395
+ __func__, i, segments[i].start/100.0, segments[i].end/100.0, (segments[i].end - segments[i].start)/100.0);
5422
5396
  }
5423
5397
 
5424
5398
  whisper_vad_segments * vad_segments = new whisper_vad_segments;
@@ -5447,6 +5421,9 @@ struct whisper_vad_segments * whisper_vad_segments_from_samples(
5447
5421
 
5448
5422
  void whisper_vad_free(whisper_vad_context * ctx) {
5449
5423
  if (ctx) {
5424
+ if (ctx->buffer) {
5425
+ ggml_backend_buffer_free(ctx->buffer);
5426
+ }
5450
5427
  for (ggml_context * context : ctx->model.ctxs) {
5451
5428
  ggml_free(context);
5452
5429
  }
@@ -5461,6 +5438,9 @@ void whisper_vad_free(whisper_vad_context * ctx) {
5461
5438
  ggml_backend_free(backend);
5462
5439
  }
5463
5440
 
5441
+ delete[] ctx->model.hparams.encoder_in_channels;
5442
+ delete[] ctx->model.hparams.encoder_out_channels;
5443
+ delete[] ctx->model.hparams.kernel_sizes;
5464
5444
 
5465
5445
  delete ctx;
5466
5446
  }
@@ -6615,10 +6595,13 @@ static bool whisper_vad(
6615
6595
  struct whisper_full_params params,
6616
6596
  const float * samples,
6617
6597
  int n_samples,
6618
- std::vector<float> & filtered_samples,
6619
- int & filtered_n_samples) {
6620
- WHISPER_LOG_INFO("%s: VAD is enabled, processing speach segments only\n", __func__);
6621
- filtered_n_samples = 0;
6598
+ std::vector<float> & filtered_samples) {
6599
+ WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
6600
+ int filtered_n_samples = 0;
6601
+
6602
+ // Clear any existing mapping table
6603
+ state->vad_mapping_table.clear();
6604
+ state->has_vad_segments = false;
6622
6605
 
6623
6606
  if (state->vad_context == nullptr) {
6624
6607
  struct whisper_vad_context_params vad_ctx_params = whisper_vad_default_context_params();
@@ -6640,13 +6623,17 @@ static bool whisper_vad(
6640
6623
  ctx->state->vad_segments.clear();
6641
6624
  ctx->state->vad_segments.reserve(vad_segments->data.size());
6642
6625
 
6626
+ // Initialize the time mapping table
6627
+ state->vad_mapping_table.clear();
6628
+ state->vad_mapping_table.reserve(vad_segments->data.size() * 4);
6629
+
6643
6630
  WHISPER_LOG_INFO("%s: detected %d speech segments\n", __func__, (int)vad_segments->data.size());
6644
6631
  float overlap_seconds = vad_params.samples_overlap;
6645
6632
  int overlap_samples = overlap_seconds * WHISPER_SAMPLE_RATE;
6646
6633
 
6647
6634
  for (int i = 0; i < (int)vad_segments->data.size(); i++) {
6648
- int segment_start_samples = vad_segments->data[i].start * WHISPER_SAMPLE_RATE;
6649
- int segment_end_samples = vad_segments->data[i].end * WHISPER_SAMPLE_RATE;
6635
+ int segment_start_samples = cs_to_samples(vad_segments->data[i].start);
6636
+ int segment_end_samples = cs_to_samples(vad_segments->data[i].end);
6650
6637
 
6651
6638
  if (i < (int)vad_segments->data.size() - 1) {
6652
6639
  segment_end_samples += overlap_samples;
@@ -6655,9 +6642,9 @@ static bool whisper_vad(
6655
6642
  filtered_n_samples += (segment_end_samples - segment_start_samples);
6656
6643
 
6657
6644
  WHISPER_LOG_INFO("%s: Including segment %d: %.2f - %.2f (duration: %.2f)\n",
6658
- __func__, i, vad_segments->data[i].start,
6659
- vad_segments->data[i].end + (i < (int)vad_segments->data.size() - 1 ? overlap_seconds : 0),
6660
- (vad_segments->data[i].end - vad_segments->data[i].start) +
6645
+ __func__, i, vad_segments->data[i].start/100.0,
6646
+ (vad_segments->data[i].end/100.0 + (i < (int)vad_segments->data.size() - 1 ? overlap_seconds : 0)),
6647
+ (vad_segments->data[i].end - vad_segments->data[i].start)/100.0 +
6661
6648
  (i < (int)vad_segments->data.size() - 1 ? overlap_seconds : 0));
6662
6649
  }
6663
6650
 
@@ -6679,8 +6666,8 @@ static bool whisper_vad(
6679
6666
 
6680
6667
  int offset = 0;
6681
6668
  for (int i = 0; i < (int)vad_segments->data.size(); i++) {
6682
- int segment_start_samples = vad_segments->data[i].start * WHISPER_SAMPLE_RATE;
6683
- int segment_end_samples = vad_segments->data[i].end * WHISPER_SAMPLE_RATE;
6669
+ int segment_start_samples = cs_to_samples(vad_segments->data[i].start);
6670
+ int segment_end_samples = cs_to_samples(vad_segments->data[i].end);
6684
6671
 
6685
6672
  if (i < (int)vad_segments->data.size() - 1) {
6686
6673
  segment_end_samples += overlap_samples;
@@ -6689,18 +6676,47 @@ static bool whisper_vad(
6689
6676
  segment_start_samples = std::min(segment_start_samples, n_samples - 1);
6690
6677
  segment_end_samples = std::min(segment_end_samples, n_samples);
6691
6678
  int segment_length = segment_end_samples - segment_start_samples;
6692
-
6693
6679
  if (segment_length > 0) {
6694
6680
  whisper_state::vad_segment_info segment;
6695
6681
 
6696
6682
  segment.orig_start = vad_segments->data[i].start;
6697
6683
  segment.orig_end = vad_segments->data[i].end;
6698
6684
 
6699
- segment.vad_start = offset / (float)WHISPER_SAMPLE_RATE;
6700
- segment.vad_end = (offset + segment_length) / (float)WHISPER_SAMPLE_RATE;
6685
+ segment.vad_start = samples_to_cs(offset);
6686
+ segment.vad_end = samples_to_cs(offset + segment_length);
6687
+
6688
+ // Add segment boundaries to mapping table
6689
+ vad_time_mapping start_mapping = {segment.vad_start, segment.orig_start};
6690
+ vad_time_mapping end_mapping = {segment.vad_end, segment.orig_end};
6691
+
6692
+ state->vad_mapping_table.push_back(start_mapping);
6693
+ state->vad_mapping_table.push_back(end_mapping);
6694
+
6695
+ // Add intermediate points for longer segments to improve interpolation accuracy
6696
+ const int64_t min_segment_length = 100; // 1 second
6697
+ const int64_t point_interval = 20; // Add a point every 200ms
6698
+
6699
+ if (segment.vad_end - segment.vad_start > min_segment_length) {
6700
+ int64_t segment_duration = segment.vad_end - segment.vad_start;
6701
+ int num_points = (int)(segment_duration / point_interval) - 1;
6702
+
6703
+ for (int j = 1; j <= num_points; j++) {
6704
+ int64_t vad_time = segment.vad_start + j * point_interval;
6705
+
6706
+ if (vad_time >= segment.vad_end) continue;
6707
+
6708
+ int64_t vad_elapsed = vad_time - segment.vad_start;
6709
+ int64_t vad_total = segment.vad_end - segment.vad_start;
6710
+ int64_t orig_total = segment.orig_end - segment.orig_start;
6711
+ int64_t orig_time = segment.orig_start + (vad_elapsed * orig_total) / vad_total;
6712
+
6713
+ vad_time_mapping intermediate_mapping = {vad_time, orig_time};
6714
+ state->vad_mapping_table.push_back(intermediate_mapping);
6715
+ }
6716
+ }
6701
6717
 
6702
6718
  WHISPER_LOG_INFO("%s: vad_segment_info: orig_start: %.2f, orig_end: %.2f, vad_start: %.2f, vad_end: %.2f\n",
6703
- __func__, segment.orig_start, segment.orig_end, segment.vad_start, segment.vad_end);
6719
+ __func__, segment.orig_start/100.0, segment.orig_end/100.0, segment.vad_start/100.0, segment.vad_end/100.0);
6704
6720
  ctx->state->vad_segments.push_back(segment);
6705
6721
 
6706
6722
  // Copy this speech segment
@@ -6709,6 +6725,17 @@ static bool whisper_vad(
6709
6725
 
6710
6726
  // Add silence after this segment (except after the last segment)
6711
6727
  if (i < (int)vad_segments->data.size() - 1) {
6728
+ // Calculate the start and end time of the silence gap in processed audio
6729
+ int64_t silence_start_vad = samples_to_cs(offset);
6730
+ int64_t silence_end_vad = samples_to_cs(offset + silence_samples);
6731
+ // Calculate the corresponding original times
6732
+ int64_t orig_silence_start = segment.orig_end;
6733
+ int64_t orig_silence_end = vad_segments->data[i+1].start;
6734
+
6735
+ // Add mapping points for silence boundaries
6736
+ state->vad_mapping_table.push_back({silence_start_vad, orig_silence_start});
6737
+ state->vad_mapping_table.push_back({silence_end_vad, orig_silence_end});
6738
+
6712
6739
  // Fill with zeros (silence)
6713
6740
  memset(filtered_samples.data() + offset, 0, silence_samples * sizeof(float));
6714
6741
  offset += silence_samples;
@@ -6716,6 +6743,24 @@ static bool whisper_vad(
6716
6743
  }
6717
6744
  }
6718
6745
 
6746
+ // Sort the mapping table by processed time
6747
+ std::sort(state->vad_mapping_table.begin(), state->vad_mapping_table.end(),
6748
+ [](const vad_time_mapping& a, const vad_time_mapping& b) {
6749
+ return a.processed_time < b.processed_time;
6750
+ });
6751
+
6752
+ // Remove any duplicate processed times to ensure monotonicity which is
6753
+ // needed for binary search and interpolation later.
6754
+ if (!state->vad_mapping_table.empty()) {
6755
+ auto last = std::unique(state->vad_mapping_table.begin(), state->vad_mapping_table.end(),
6756
+ [](const vad_time_mapping& a, const vad_time_mapping& b) {
6757
+ return a.processed_time == b.processed_time;
6758
+ });
6759
+ state->vad_mapping_table.erase(last, state->vad_mapping_table.end());
6760
+ }
6761
+
6762
+ WHISPER_LOG_INFO("%s: Created time mapping table with %d points\n", __func__, (int)state->vad_mapping_table.size());
6763
+
6719
6764
  filtered_n_samples = offset;
6720
6765
  WHISPER_LOG_INFO("%s: Reduced audio from %d to %d samples (%.1f%% reduction)\n",
6721
6766
  __func__, n_samples, filtered_n_samples, 100.0f * (1.0f - (float)filtered_n_samples / n_samples));
@@ -6735,27 +6780,9 @@ int whisper_full_with_state(
6735
6780
 
6736
6781
  result_all.clear();
6737
6782
 
6738
- const float * process_samples = samples;
6739
- int n_process_samples = n_samples;
6740
- std::vector<float> vad_samples;
6741
-
6742
- if (params.vad) {
6743
- WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
6744
- int vad_n_samples;
6745
- if (!whisper_vad(ctx, state, params, samples, n_samples, vad_samples, vad_n_samples)) {
6746
- WHISPER_LOG_ERROR("%s: failed to compute VAD\n", __func__);
6747
- return -1;
6748
- }
6749
- if (vad_n_samples == 0) {
6750
- return 0;
6751
- }
6752
- process_samples = vad_samples.data();
6753
- n_process_samples = vad_n_samples;
6754
- }
6755
-
6756
- if (n_process_samples > 0) {
6783
+ if (n_samples > 0) {
6757
6784
  // compute log mel spectrogram
6758
- if (whisper_pcm_to_mel_with_state(ctx, state, process_samples, n_process_samples, params.n_threads) != 0) {
6785
+ if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
6759
6786
  WHISPER_LOG_ERROR("%s: failed to compute log mel spectrogram\n", __func__);
6760
6787
  return -2;
6761
6788
  }
@@ -7665,6 +7692,21 @@ int whisper_full(
7665
7692
  struct whisper_full_params params,
7666
7693
  const float * samples,
7667
7694
  int n_samples) {
7695
+
7696
+ std::vector<float> vad_samples;
7697
+ if (params.vad) {
7698
+ WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
7699
+ if (!whisper_vad(ctx, ctx->state, params, samples, n_samples, vad_samples)) {
7700
+ WHISPER_LOG_ERROR("%s: failed to compute VAD\n", __func__);
7701
+ return -1;
7702
+ }
7703
+ if (vad_samples.empty()) {
7704
+ ctx->state->result_all.clear();
7705
+ return 0;
7706
+ }
7707
+ samples = vad_samples.data();
7708
+ n_samples = vad_samples.size();
7709
+ }
7668
7710
  return whisper_full_with_state(ctx, ctx->state, params, samples, n_samples);
7669
7711
  }
7670
7712
 
@@ -7674,9 +7716,24 @@ int whisper_full_parallel(
7674
7716
  const float * samples,
7675
7717
  int n_samples,
7676
7718
  int n_processors) {
7719
+
7677
7720
  if (n_processors == 1) {
7678
7721
  return whisper_full(ctx, params, samples, n_samples);
7679
7722
  }
7723
+
7724
+ std::vector<float> vad_samples;
7725
+ if (params.vad) {
7726
+ WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
7727
+ if (!whisper_vad(ctx, ctx->state, params, samples, n_samples, vad_samples)) {
7728
+ WHISPER_LOG_ERROR("%s: failed to compute VAD\n", __func__);
7729
+ return -1;
7730
+ }
7731
+ if (vad_samples.empty()) {
7732
+ return 0;
7733
+ }
7734
+ samples = vad_samples.data();
7735
+ n_samples = vad_samples.size();
7736
+ }
7680
7737
  int ret = 0;
7681
7738
 
7682
7739
  // prepare separate states for each thread
@@ -7799,130 +7856,89 @@ int whisper_full_lang_id(struct whisper_context * ctx) {
7799
7856
  return ctx->state->lang_id;
7800
7857
  }
7801
7858
 
7802
- int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment) {
7803
- // If VAD wasn't used, return the original timestamp
7804
- if (!state->has_vad_segments || state->vad_segments.empty()) {
7805
- return state->result_all[i_segment].t0;
7859
+ static int64_t map_processed_to_original_time(int64_t processed_time, const std::vector<vad_time_mapping> & mapping_table) {
7860
+ if (mapping_table.empty()) {
7861
+ return processed_time;
7806
7862
  }
7807
7863
 
7808
- // Get the start timestamp produced by whisper_full. whisper_full processes
7809
- // only the speech segments in this case so we need to map these timestamps
7810
- // back to the original audio.
7811
- float t0 = state->result_all[i_segment].t0 / 100.0f;
7864
+ if (processed_time <= mapping_table.front().processed_time) {
7865
+ return mapping_table.front().original_time; // Before first mapping point
7866
+ }
7812
7867
 
7813
- // Find which VAD segment this timestamp belongs.
7814
- // TODO(danbev) This could be optimized by using a binary search if the number
7815
- // of segments exceed a certain limit. Also we might be able to assume that
7816
- // the access pattern is sequential and optimized for that too.
7817
- for (size_t i = 0; i < state->vad_segments.size(); i++) {
7818
- const auto & segment = state->vad_segments[i];
7868
+ if (processed_time >= mapping_table.back().processed_time) {
7869
+ return mapping_table.back().original_time; // After last mapping point
7870
+ }
7819
7871
 
7820
- // Check if the timestamp falls within this segment.
7821
- if (t0 >= segment.vad_start && t0 <= segment.vad_end) {
7822
- float proportion = 0.0f;
7823
- if (segment.vad_end > segment.vad_start) {
7824
- proportion = (t0 - segment.vad_start) / (segment.vad_end - segment.vad_start);
7825
- }
7826
- float orig_t0 = segment.orig_start + proportion * (segment.orig_end - segment.orig_start);
7827
- return (int64_t)(orig_t0 * 100);
7872
+ // Binary search over the time map that finds the first entry that has a
7873
+ // processed time greater than or equal to the current processed time.
7874
+ auto upper = std::lower_bound(mapping_table.begin(), mapping_table.end(), processed_time,
7875
+ [](const vad_time_mapping & entry, int64_t time) {
7876
+ return entry.processed_time < time;
7828
7877
  }
7878
+ );
7879
+
7880
+ // If exact match found
7881
+ if (upper->processed_time == processed_time) {
7882
+ return upper->original_time;
7829
7883
  }
7830
7884
 
7831
- // Check if the timestamp falls between two segments.
7832
- for (size_t i = 0; i < state->vad_segments.size() - 1; i++) {
7833
- const auto & curr = state->vad_segments[i];
7834
- const auto & next = state->vad_segments[i + 1];
7885
+ // Need to interpolate between two points
7886
+ auto lower = upper - 1;
7835
7887
 
7836
- if (t0 > curr.vad_end && t0 < next.vad_start) {
7837
- // Calculate how far we are through the gap as a proportion
7838
- float gap_proportion = 0.0f;
7839
- if (next.vad_start > curr.vad_end) {
7840
- gap_proportion = (t0 - curr.vad_end) / (next.vad_start - curr.vad_end);
7841
- }
7842
- float orig_t0 = curr.orig_end + gap_proportion * (next.orig_start - curr.orig_end);
7843
- return (int64_t)(orig_t0 * 100);
7844
- }
7845
- }
7888
+ int64_t processed_diff = upper->processed_time - lower->processed_time;
7889
+ int64_t original_diff = upper->original_time - lower->original_time;
7890
+ int64_t offset = processed_time - lower->processed_time;
7846
7891
 
7847
- // Handle the case where the timestamp is after the last segment.
7848
- if (t0 > state->vad_segments.back().vad_end) {
7849
- // For timestamps after the last segment, add the extra time to the end of the last segment
7850
- const auto& last = state->vad_segments.back();
7851
- // Calculate how far beyond the last segment
7852
- float extra_time = t0 - last.vad_end;
7853
- // Add this extra time to the original end time
7854
- float orig_t0 = last.orig_end + extra_time;
7855
- return (int64_t)(orig_t0 * 100);
7892
+ if (processed_diff == 0) {
7893
+ return lower->original_time;
7856
7894
  }
7857
7895
 
7858
- WHISPER_LOG_WARN("%s: Could not map t0 = %f to a VAD segment\n", __func__, t0);
7859
- return t0;
7896
+ // Perform linear interpolation
7897
+ return lower->original_time + (offset * original_diff) / processed_diff;
7860
7898
  }
7861
7899
 
7862
- int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) {
7863
- return whisper_full_get_segment_t0_from_state(ctx->state, i_segment);
7900
+ // Function to get the starting timestamp of a segment
7901
+ int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment) {
7902
+ // If VAD wasn't used, return the original timestamp
7903
+ if (!state->has_vad_segments || state->vad_mapping_table.empty()) {
7904
+ return state->result_all[i_segment].t0;
7905
+ }
7906
+
7907
+ // Get the processed timestamp
7908
+ int64_t t0 = state->result_all[i_segment].t0;
7909
+
7910
+ // Map to original time using the mapping table
7911
+ return map_processed_to_original_time(t0, state->vad_mapping_table);
7864
7912
  }
7865
7913
 
7914
+ // Function to get the ending timestamp of a segment
7866
7915
  int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment) {
7867
7916
  // If VAD wasn't used, return the original timestamp
7868
- if (!state->has_vad_segments || state->vad_segments.empty()) {
7917
+ if (!state->has_vad_segments || state->vad_mapping_table.empty()) {
7869
7918
  return state->result_all[i_segment].t1;
7870
7919
  }
7871
7920
 
7872
- // Get the end timestamp produced by whisper_full. whisper_full processes
7873
- // only the speech segments in this case so we need to map these timestamps
7874
- // back to the original audio.
7875
- float t1 = state->result_all[i_segment].t1 / 100.0f;
7876
-
7877
- // Find which VAD segment this timestamp belongs.
7878
- // TODO(danbev) This could be optimized by using a binary search if the number
7879
- // of segments exceed a certain limit. Also we might be able to assume that
7880
- // the access pattern is sequential and optimized for that too.
7881
- for (size_t i = 0; i < state->vad_segments.size(); i++) {
7882
- const auto& segment = state->vad_segments[i];
7883
-
7884
- // Check if the timestamp falls within this segment.
7885
- if (t1 >= segment.vad_start && t1 <= segment.vad_end) {
7886
- // Calculate the proportion through the filtered segment.
7887
- float proportion = 0.0f;
7888
- if (segment.vad_end > segment.vad_start) {
7889
- proportion = (t1 - segment.vad_start) / (segment.vad_end - segment.vad_start);
7890
- }
7891
- float orig_t1 = segment.orig_start + proportion * (segment.orig_end - segment.orig_start);
7892
- return (int64_t)(orig_t1 * 100);
7893
- }
7894
- }
7921
+ // Get the processed timestamp
7922
+ int64_t t1 = state->result_all[i_segment].t1;
7895
7923
 
7896
- // Check if the timestamp falls between two segments.
7897
- for (size_t i = 0; i < state->vad_segments.size() - 1; i++) {
7898
- const auto & curr = state->vad_segments[i];
7899
- const auto & next = state->vad_segments[i + 1];
7924
+ // Map to original time using the mapping table
7925
+ int64_t orig_t1 = map_processed_to_original_time(t1, state->vad_mapping_table);
7900
7926
 
7901
- if (t1 > curr.vad_end && t1 < next.vad_start) {
7902
- // Calculate how far we are through the gap as a proportion
7903
- float gap_proportion = 0.0f;
7904
- if (next.vad_start > curr.vad_end) {
7905
- gap_proportion = (t1 - curr.vad_end) / (next.vad_start - curr.vad_end);
7906
- }
7907
- // Map to the corresponding position in the original gap
7908
- float orig_t1 = curr.orig_end + gap_proportion * (next.orig_start - curr.orig_end);
7909
- return (int64_t)(orig_t1 * 100);
7910
- }
7911
- }
7927
+ // Get the corresponding t0 for this segment
7928
+ int64_t orig_t0 = whisper_full_get_segment_t0_from_state(state, i_segment);
7912
7929
 
7913
- // Handle the case where the timestamp is after the last segment
7914
- if (t1 > state->vad_segments.back().vad_end) {
7915
- // For the last segment, use the end of the last VAD segment
7916
- const auto& last = state->vad_segments.back();
7917
- // Calculate how far beyond the last segment
7918
- float extra_time = t1 - last.vad_end;
7919
- // Add this extra time to the original end time
7920
- float orig_t1 = last.orig_end + extra_time;
7921
- return (int64_t)(orig_t1 * 100);
7930
+ // Ensure minimum duration to prevent zero-length segments
7931
+ const int64_t min_duration = 10; // 100 ms minimum (timestamps are in 10 ms units)
7932
+ if (orig_t1 - orig_t0 < min_duration) {
7933
+ orig_t1 = orig_t0 + min_duration;
7922
7934
  }
7923
7935
 
7924
- WHISPER_LOG_WARN("%s: Could not map t1 = %f to a VAD segment\n", __func__, t1);
7925
- return t1;
7936
+ return orig_t1;
7937
+ }
7938
+
7939
+
7940
+ int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) {
7941
+ return whisper_full_get_segment_t0_from_state(ctx->state, i_segment);
7926
7942
  }
7927
7943
 
7928
7944
  int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment) {
@@ -8154,8 +8170,6 @@ WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
8154
8170
  }
8155
8171
 
8156
8172
  WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
8157
- whisper_load_backends();
8158
-
8159
8173
  static std::string s;
8160
8174
  s = "";
8161
8175
  char strbuf[256];
@@ -8289,10 +8303,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
8289
8303
  // token-level timestamps
8290
8304
  //
8291
8305
 
8292
- static int timestamp_to_sample(int64_t t, int n_samples) {
8293
- return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
8294
- }
8295
-
8296
8306
  static int64_t sample_to_timestamp(int i_sample) {
8297
8307
  return (100ll*i_sample)/WHISPER_SAMPLE_RATE;
8298
8308
  }
@@ -8342,6 +8352,18 @@ static std::vector<float> get_signal_energy(const float * signal, int n_samples,
8342
8352
  return result;
8343
8353
  }
8344
8354
 
8355
+ static int timestamp_to_sample(int64_t t, int64_t segment_t0, int n_samples) {
8356
+ // Convert absolute timestamp to segment-relative timestamp
8357
+ int64_t relative_t = t - segment_t0;
8358
+ int sample = (int)((relative_t * WHISPER_SAMPLE_RATE) / 100);
8359
+ return std::max(0, std::min(n_samples - 1, sample));
8360
+ }
8361
+
8362
+ static int64_t sample_to_timestamp(int i_sample, int64_t segment_t0) {
8363
+ int64_t relative_timestamp = (100ll * i_sample) / WHISPER_SAMPLE_RATE;
8364
+ return relative_timestamp + segment_t0;
8365
+ }
8366
+
8345
8367
  static void whisper_exp_compute_token_level_timestamps(
8346
8368
  struct whisper_context & ctx,
8347
8369
  struct whisper_state & state,
@@ -8482,8 +8504,8 @@ static void whisper_exp_compute_token_level_timestamps(
8482
8504
  continue;
8483
8505
  }
8484
8506
 
8485
- int s0 = timestamp_to_sample(tokens[j].t0, n_samples);
8486
- int s1 = timestamp_to_sample(tokens[j].t1, n_samples);
8507
+ int s0 = timestamp_to_sample(tokens[j].t0, segment.t0, n_samples);
8508
+ int s1 = timestamp_to_sample(tokens[j].t1, segment.t0, n_samples);
8487
8509
 
8488
8510
  const int ss0 = std::max(s0 - hw, 0);
8489
8511
  const int ss1 = std::min(s1 + hw, n_samples);
@@ -8504,7 +8526,7 @@ static void whisper_exp_compute_token_level_timestamps(
8504
8526
  while (k > 0 && state.energy[k] > thold) {
8505
8527
  k--;
8506
8528
  }
8507
- tokens[j].t0 = sample_to_timestamp(k);
8529
+ tokens[j].t0 = sample_to_timestamp(k, segment.t0);
8508
8530
  if (tokens[j].t0 < tokens[j - 1].t1) {
8509
8531
  tokens[j].t0 = tokens[j - 1].t1;
8510
8532
  } else {
@@ -8515,7 +8537,7 @@ static void whisper_exp_compute_token_level_timestamps(
8515
8537
  k++;
8516
8538
  }
8517
8539
  s0 = k;
8518
- tokens[j].t0 = sample_to_timestamp(k);
8540
+ tokens[j].t0 = sample_to_timestamp(k, segment.t0);
8519
8541
  }
8520
8542
  }
8521
8543
 
@@ -8525,7 +8547,7 @@ static void whisper_exp_compute_token_level_timestamps(
8525
8547
  while (k < n_samples - 1 && state.energy[k] > thold) {
8526
8548
  k++;
8527
8549
  }
8528
- tokens[j].t1 = sample_to_timestamp(k);
8550
+ tokens[j].t1 = sample_to_timestamp(k, segment.t0);
8529
8551
  if (j < n - 1 && tokens[j].t1 > tokens[j + 1].t0) {
8530
8552
  tokens[j].t1 = tokens[j + 1].t0;
8531
8553
  } else {
@@ -8536,7 +8558,7 @@ static void whisper_exp_compute_token_level_timestamps(
8536
8558
  k--;
8537
8559
  }
8538
8560
  s1 = k;
8539
- tokens[j].t1 = sample_to_timestamp(k);
8561
+ tokens[j].t1 = sample_to_timestamp(k, segment.t0);
8540
8562
  }
8541
8563
  }
8542
8564
  }
@@ -8893,6 +8915,10 @@ void whisper_log_set(ggml_log_callback log_callback, void * user_data) {
8893
8915
  ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);
8894
8916
  }
8895
8917
 
8918
+ const char * whisper_version(void) {
8919
+ return WHISPER_VERSION;
8920
+ }
8921
+
8896
8922
  GGML_ATTRIBUTE_FORMAT(2, 3)
8897
8923
  static void whisper_log_internal(ggml_log_level level, const char * format, ...) {
8898
8924
  va_list args;