whispercpp 1.3.2 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (244) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -3
  3. data/README.md +71 -14
  4. data/Rakefile +20 -7
  5. data/ext/.gitignore +4 -6
  6. data/ext/dependencies.rb +36 -24
  7. data/ext/extconf.rb +1 -1
  8. data/ext/options.rb +48 -184
  9. data/ext/ruby_whisper.c +18 -0
  10. data/ext/ruby_whisper_context.c +43 -12
  11. data/ext/ruby_whisper_model.c +1 -1
  12. data/ext/ruby_whisper_params.c +4 -2
  13. data/ext/ruby_whisper_segment.c +81 -4
  14. data/ext/ruby_whisper_transcribe.cpp +13 -7
  15. data/ext/ruby_whisper_vad_params.c +1 -1
  16. data/ext/sources/CMakeLists.txt +5 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
  19. data/ext/sources/examples/addon.node/addon.cpp +150 -31
  20. data/ext/sources/examples/addon.node/index.js +3 -0
  21. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  22. data/ext/sources/examples/bench/bench.cpp +3 -2
  23. data/ext/sources/examples/cli/cli.cpp +3 -2
  24. data/ext/sources/examples/command/command.cpp +32 -8
  25. data/ext/sources/examples/common-whisper.cpp +14 -7
  26. data/ext/sources/examples/lsp/lsp.cpp +2 -0
  27. data/ext/sources/examples/quantize/quantize.cpp +3 -0
  28. data/ext/sources/examples/server/CMakeLists.txt +3 -0
  29. data/ext/sources/examples/server/server.cpp +169 -22
  30. data/ext/sources/examples/stream/stream.cpp +6 -0
  31. data/ext/sources/examples/talk-llama/CMakeLists.txt +4 -1
  32. data/ext/sources/examples/talk-llama/llama-arch.cpp +171 -3
  33. data/ext/sources/examples/talk-llama/llama-arch.h +28 -1
  34. data/ext/sources/examples/talk-llama/llama-batch.cpp +741 -272
  35. data/ext/sources/examples/talk-llama/llama-batch.h +112 -54
  36. data/ext/sources/examples/talk-llama/llama-chat.cpp +30 -8
  37. data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
  38. data/ext/sources/examples/talk-llama/llama-context.cpp +520 -351
  39. data/ext/sources/examples/talk-llama/llama-context.h +38 -17
  40. data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
  41. data/ext/sources/examples/talk-llama/llama-cparams.h +1 -1
  42. data/ext/sources/examples/talk-llama/llama-graph.cpp +447 -372
  43. data/ext/sources/examples/talk-llama/llama-graph.h +128 -58
  44. data/ext/sources/examples/talk-llama/llama-hparams.cpp +10 -2
  45. data/ext/sources/examples/talk-llama/llama-hparams.h +19 -2
  46. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +279 -0
  47. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.h +128 -0
  48. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +1841 -0
  49. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +303 -0
  50. data/ext/sources/examples/talk-llama/llama-kv-cache.h +14 -472
  51. data/ext/sources/examples/talk-llama/llama-kv-cells.h +86 -26
  52. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +246 -0
  53. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +138 -0
  54. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1125 -0
  55. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +183 -0
  56. data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
  57. data/ext/sources/examples/talk-llama/llama-memory.h +88 -4
  58. data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
  59. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +42 -17
  60. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
  61. data/ext/sources/examples/talk-llama/llama-model.cpp +1863 -563
  62. data/ext/sources/examples/talk-llama/llama-model.h +27 -0
  63. data/ext/sources/examples/talk-llama/llama-quant.cpp +89 -6
  64. data/ext/sources/examples/talk-llama/llama-vocab.cpp +65 -28
  65. data/ext/sources/examples/talk-llama/llama-vocab.h +1 -0
  66. data/ext/sources/examples/talk-llama/llama.cpp +11 -7
  67. data/ext/sources/examples/talk-llama/llama.h +147 -40
  68. data/ext/sources/examples/talk-llama/talk-llama.cpp +2 -0
  69. data/ext/sources/examples/talk-llama/unicode.cpp +5 -0
  70. data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
  71. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +2 -0
  72. data/ext/sources/ggml/CMakeLists.txt +48 -3
  73. data/ext/sources/ggml/cmake/common.cmake +24 -0
  74. data/ext/sources/ggml/include/ggml-backend.h +1 -1
  75. data/ext/sources/ggml/include/ggml-cpu.h +2 -0
  76. data/ext/sources/ggml/include/ggml.h +144 -5
  77. data/ext/sources/ggml/src/CMakeLists.txt +82 -24
  78. data/ext/sources/ggml/src/ggml-backend-reg.cpp +5 -0
  79. data/ext/sources/ggml/src/ggml-backend.cpp +46 -23
  80. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +3 -3
  81. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  82. data/ext/sources/ggml/src/ggml-cann/common.h +6 -1
  83. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  84. data/ext/sources/ggml/src/ggml-common.h +4 -0
  85. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +133 -40
  86. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  87. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  88. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  89. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  90. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  91. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  92. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  93. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  94. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  95. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  96. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  97. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  98. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  99. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  100. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  101. data/ext/sources/ggml/src/ggml-cpu/common.h +4 -3
  102. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  103. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +146 -105
  104. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  105. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  106. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  107. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  108. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  109. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1057 -174
  110. data/ext/sources/ggml/src/ggml-cpu/ops.h +8 -0
  111. data/ext/sources/ggml/src/ggml-cpu/quants.c +1158 -0
  112. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  113. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1571 -0
  114. data/ext/sources/ggml/src/ggml-cpu/repack.h +98 -0
  115. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +330 -38
  116. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  117. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +111 -18
  118. data/ext/sources/ggml/src/ggml-cpu/vec.h +303 -94
  119. data/ext/sources/ggml/src/ggml-cuda/common.cuh +60 -37
  120. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  121. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  122. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  123. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  124. data/ext/sources/ggml/src/ggml-cuda/convert.cu +22 -0
  125. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +5 -0
  126. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +2 -2
  127. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
  128. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  129. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +265 -123
  130. data/ext/sources/ggml/src/ggml-cuda/mean.cu +19 -0
  131. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  132. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +257 -87
  133. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +2 -3
  134. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  135. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +5 -18
  136. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  137. data/ext/sources/ggml/src/ggml-cuda/unary.cu +89 -0
  138. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +7 -0
  139. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  140. data/ext/sources/ggml/src/ggml-impl.h +127 -183
  141. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  142. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +27 -0
  143. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +331 -49
  144. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +564 -282
  145. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
  146. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +14 -0
  147. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1859 -489
  148. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  149. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  150. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  151. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +201 -0
  152. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  153. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  154. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  155. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  156. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  157. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  158. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  159. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  160. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  161. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  162. data/ext/sources/ggml/src/ggml-quants.c +6 -8
  163. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  164. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  165. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  166. data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
  167. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +28 -41
  168. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +4 -10
  169. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +117 -165
  170. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +192 -53
  171. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  172. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  173. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  174. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +648 -1039
  175. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  176. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  177. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +8 -105
  178. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -100
  179. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  180. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
  181. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +60 -80
  182. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  183. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +55 -74
  184. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -10
  185. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +138 -27
  186. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
  187. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  188. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  189. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  190. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  191. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +12 -16
  192. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
  193. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +726 -282
  194. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  195. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  196. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  197. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  198. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  199. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  200. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
  201. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  202. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -1
  203. data/ext/sources/ggml/src/ggml.c +328 -48
  204. data/ext/sources/ggml/src/ggml.cpp +26 -0
  205. data/ext/sources/ggml/src/gguf.cpp +24 -3
  206. data/ext/sources/include/whisper.h +2 -0
  207. data/ext/sources/src/CMakeLists.txt +2 -0
  208. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  209. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  210. data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
  211. data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
  212. data/ext/sources/src/whisper.cpp +218 -169
  213. data/extsources.rb +15 -9
  214. data/lib/whisper/context.rb +15 -0
  215. data/lib/whisper/model/uri.rb +56 -1
  216. data/lib/whisper/segment.rb +58 -0
  217. data/sig/whisper.rbs +68 -38
  218. data/{tests → test}/helper.rb +1 -12
  219. data/{tests → test}/test_model.rb +9 -0
  220. data/test/test_package.rb +51 -0
  221. data/test/test_segment.rb +146 -0
  222. data/{tests → test}/test_whisper.rb +70 -0
  223. data/whispercpp.gemspec +2 -3
  224. metadata +91 -43
  225. data/ext/sources/.dockerignore +0 -3
  226. data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
  227. data/ext/sources/ci/run.sh +0 -336
  228. data/ext/sources/close-issue.yml +0 -28
  229. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2739
  230. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  231. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
  232. data/tests/test_package.rb +0 -46
  233. data/tests/test_segment.rb +0 -74
  234. /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  235. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  236. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  237. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  238. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  239. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
  240. /data/{tests → test}/test_callback.rb +0 -0
  241. /data/{tests → test}/test_error.rb +0 -0
  242. /data/{tests → test}/test_params.rb +0 -0
  243. /data/{tests → test}/test_vad.rb +0 -0
  244. /data/{tests → test}/test_vad_params.rb +0 -0
@@ -0,0 +1,15 @@
1
module Whisper
  class Context
    # Renders all segments as a single SubRip (SRT) string.
    #
    # Every cue is emitted as its 1-based sequence number on its own line,
    # followed by the text returned by Segment#to_srt_cue and a newline.
    #
    # @return [String] the concatenated cues; empty when there are no segments
    def to_srt
      # Unary plus yields an explicitly mutable buffer, so appending keeps
      # working when string literals are frozen (or "chilled" on Ruby 3.4+).
      each_segment.with_index(1).each_with_object(+"") do |(segment, number), srt|
        srt << "#{number}\n#{segment.to_srt_cue}\n"
      end
    end

    # Renders all segments as a single WebVTT string.
    #
    # The output starts with the "WEBVTT" header and a blank line; every cue
    # is emitted as its 1-based identifier on its own line, followed by the
    # text returned by Segment#to_webvtt_cue and a newline.
    #
    # @return [String] the header plus the concatenated cues
    def to_webvtt
      each_segment.with_index(1).each_with_object(+"WEBVTT\n\n") do |(segment, number), webvtt|
        webvtt << "#{number}\n#{segment.to_webvtt_cue}\n"
      end
    end
  end
end
@@ -130,6 +130,44 @@ module Whisper
130
130
  end
131
131
  end
132
132
 
133
# A model URI whose cached file is a zip archive. Caching also extracts the
# archive into the directory that holds the downloaded file.
class ZipURI < URI
  # Caches the archive via +super+ and extracts it unless an extracted copy
  # at least as new as the archive already exists.
  #
  # Returns the archive path on every code path, including the
  # "already extracted" early exit (which previously returned +nil+).
  # Raises when +unzip+ cannot be run or exits with a failure status
  # (+system+ is called with <tt>exception: true</tt>).
  def cache
    zip_path = super
    dest = unzipped_path
    return zip_path if dest.exist? && dest.mtime >= zip_path.mtime

    escaping dest do
      # Multi-argument form: no shell is involved, paths are passed verbatim.
      system "unzip", "-q", "-d", zip_path.dirname.to_path, zip_path.to_path, exception: true
    end
    zip_path
  end

  # Clears the cached archive via +super+ and removes the extracted tree,
  # if present.
  def clear_cache
    super
    unzipped_path.rmtree if unzipped_path.exist?
  end

  private

  # Path of the extracted content: the cache path with its extension removed.
  def unzipped_path
    cache_path.sub_ext("")
  end

  # Moves an existing +path+ aside to "<path>.removing" while the block runs.
  # Afterwards, if +path+ exists the moved-aside copy is deleted; otherwise
  # the moved-aside copy (if any) is renamed back to +path+.
  #
  # NOTE(review): if the block fails after creating a partial +path+, the
  # moved-aside copy is still deleted.
  def escaping(path)
    backup = Pathname("#{path}.removing")
    if path.exist?
      backup.rmtree if backup.exist?
      path.rename backup
    end
    yield
  ensure
    if path.exist?
      backup.rmtree if backup.exist?
    else
      backup.rename path if backup.exist?
    end
  end
end
170
+
133
171
  @pre_converted_models = %w[
134
172
  tiny
135
173
  tiny.en
@@ -171,8 +209,25 @@ module Whisper
171
209
  @pre_converted_models[name] = URI.new("https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-#{name}.bin")
172
210
  end
173
211
 
212
# Maps each pre-converted model URI to the ZipURI of the matching
# "ggml-<name>-encoder.mlmodelc.zip" archive.
@coreml_compiled_models = %w[
  tiny
  tiny.en
  base
  base.en
  small
  small.en
  medium
  medium.en
  large-v1
  large-v2
  large-v3
  large-v3-turbo
].to_h do |name|
  archive = ZipURI.new("https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-#{name}-encoder.mlmodelc.zip")
  [@pre_converted_models[name], archive]
end
228
+
174
229
  class << self
175
- attr_reader :pre_converted_models
230
+ attr_reader :pre_converted_models, :coreml_compiled_models
176
231
  end
177
232
  end
178
233
  end
@@ -0,0 +1,58 @@
1
module Whisper
  class Segment
    # Characters replaced by character references in cue text.
    # Frozen so the shared lookup table cannot be mutated at runtime.
    SRT_ESCAPES = {
      "&" => "&amp;",
      "<" => "&lt;",
      ">" => "&gt;",
    }.freeze
    # Matches any single character that has an entry in SRT_ESCAPES.
    SRT_ESCAPES_RE = Regexp.union(SRT_ESCAPES.keys).freeze
    private_constant :SRT_ESCAPES, :SRT_ESCAPES_RE

    # Formats this segment as one SRT cue body: a "start --> end" timing
    # line (HH:MM:SS,mmm) followed by the escaped text and a newline.
    #
    # @return [String]
    def to_srt_cue
      "#{srt_start_time} --> #{srt_end_time}\n#{srt_text}\n"
    end

    # Formats this segment as one WebVTT cue body: a "start --> end" timing
    # line (HH:MM:SS.mmm) followed by the escaped text and a newline.
    #
    # @return [String]
    def to_webvtt_cue
      "#{webvtt_start_time} --> #{webvtt_end_time}\n#{webvtt_text}\n"
    end

    private

    # Splits a duration in milliseconds into
    # [hours, minutes, seconds, milliseconds].
    def time_to_a(time)
      sec, decimal_part = time.divmod(1000)
      min, sec = sec.divmod(60)
      hour, min = min.divmod(60)
      [hour, min, sec, decimal_part]
    end

    # SRT timestamps use a comma before the milliseconds.
    def srt_time(time)
      "%02d:%02d:%02d,%03d" % time_to_a(time)
    end

    def srt_start_time
      srt_time(start_time)
    end

    def srt_end_time
      srt_time(end_time)
    end

    # Segment text with "&", "<" and ">" replaced per SRT_ESCAPES.
    def srt_text
      text.gsub(SRT_ESCAPES_RE, SRT_ESCAPES)
    end

    # WebVTT timestamps use a period before the milliseconds.
    def webvtt_time(time)
      "%02d:%02d:%02d.%03d" % time_to_a(time)
    end

    def webvtt_start_time
      webvtt_time(start_time)
    end

    def webvtt_end_time
      webvtt_time(end_time)
    end

    # WebVTT cue text uses the same escaping as SRT.
    alias webvtt_text srt_text
  end
end
data/sig/whisper.rbs CHANGED
@@ -10,6 +10,7 @@ module Whisper
10
10
  type encoder_begin_callback = ^(Whisper::Context, void, Object user_data) -> void
11
11
  type abort_callback = ^(Whisper::Context, void, Object user_data) -> boolish
12
12
 
13
+ VERSION: String
13
14
  LOG_LEVEL_NONE: Integer
14
15
  LOG_LEVEL_INFO: Integer
15
16
  LOG_LEVEL_WARN: Integer
@@ -22,21 +23,22 @@ module Whisper
22
23
  def self.lang_str: (Integer id) -> String
23
24
  def self.lang_str_full: (Integer id) -> String
24
25
  def self.log_set: (log_callback, Object? user_data) -> log_callback
26
+ def self.system_info_str: () -> String
25
27
 
26
28
  class Context
27
- def self.new: (path | ::URI::HTTP) -> instance
29
+ def self.new: (String | path | ::URI::HTTP) -> instance
28
30
 
29
31
  # transcribe a single file
30
32
  # can emit to a block results
31
33
  #
32
- # params = Whisper::Params.new
33
- # params.duration = 60_000
34
- # whisper.transcribe "path/to/audio.wav", params do |text|
35
- # puts text
36
- # end
34
+ # params = Whisper::Params.new
35
+ # params.duration = 60_000
36
+ # whisper.transcribe "path/to/audio.wav", params do |text|
37
+ # puts text
38
+ # end
37
39
  #
38
- def transcribe: (string, Params) -> self
39
- | (string, Params) { (String) -> void } -> self
40
+ def transcribe: (string, Params, ?n_processors: Integer) -> self
41
+ | (string, Params, ?n_processors: Integer) { (String) -> void } -> self
40
42
 
41
43
  def model_n_vocab: () -> Integer
42
44
  def model_n_audio_ctx: () -> Integer
@@ -49,16 +51,16 @@ module Whisper
49
51
 
50
52
  # Yields each Whisper::Segment:
51
53
  #
52
- # whisper.transcribe("path/to/audio.wav", params)
53
- # whisper.each_segment do |segment|
54
- # puts segment.text
55
- # end
54
+ # whisper.transcribe("path/to/audio.wav", params)
55
+ # whisper.each_segment do |segment|
56
+ # puts segment.text
57
+ # end
56
58
  #
57
59
  # Returns an Enumerator if no block given:
58
60
  #
59
- # whisper.transcribe("path/to/audio.wav", params)
60
- # enum = whisper.each_segment
61
- # enum.to_a # => [#<Whisper::Segment>, ...]
61
+ # whisper.transcribe("path/to/audio.wav", params)
62
+ # enum = whisper.each_segment
63
+ # enum.to_a # => [#<Whisper::Segment>, ...]
62
64
  #
63
65
  def each_segment: { (Segment) -> void } -> void
64
66
  | () -> Enumerator[Segment]
@@ -73,25 +75,25 @@ module Whisper
73
75
 
74
76
  # Start time of a segment indexed by +segment_index+ in centiseconds (units of 10 milliseconds).
75
77
  #
76
- # full_get_segment_t0(3) # => 1668 (16680 ms)
78
+ # full_get_segment_t0(3) # => 1668 (16680 ms)
77
79
  #
78
80
  def full_get_segment_t0: (Integer) -> Integer
79
81
 
80
82
  # End time of a segment indexed by +segment_index+ in centiseconds (units of 10 milliseconds).
81
83
  #
82
- # full_get_segment_t1(3) # => 1668 (16680 ms)
84
+ # full_get_segment_t1(3) # => 1668 (16680 ms)
83
85
  #
84
86
  def full_get_segment_t1: (Integer) -> Integer
85
87
 
86
88
  # Whether the next segment indexed by +segment_index+ is predicted as a speaker turn.
87
89
  #
88
- # full_get_segment_speacker_turn_next(3) # => true
90
+ # full_get_segment_speaker_turn_next(3) # => true
89
91
  #
90
92
  def full_get_segment_speaker_turn_next: (Integer) -> (true | false)
91
93
 
92
94
  # Text of a segment indexed by +segment_index+.
93
95
  #
94
- # full_get_segment_text(3) # => "ask not what your country can do for you, ..."
96
+ # full_get_segment_text(3) # => "ask not what your country can do for you, ..."
95
97
  #
96
98
  def full_get_segment_text: (Integer) -> String
97
99
 
@@ -115,6 +117,9 @@ module Whisper
115
117
  def full_parallel: (Params, Array[Float], ?Integer n_samples) -> self
116
118
  | (Params, _Samples, ?Integer n_samples) -> self
117
119
  | (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self
120
+
121
+ def to_srt: () -> String
122
+ def to_webvtt: () -> String
118
123
  end
119
124
 
120
125
  class Params
@@ -281,9 +286,9 @@ module Whisper
281
286
 
282
287
  # Sets new segment callback, called for every newly generated text segment.
283
288
  #
284
- # params.new_segment_callback = ->(context, _, n_new, user_data) {
285
- # # ...
286
- # }
289
+ # params.new_segment_callback = ->(context, _, n_new, user_data) {
290
+ # # ...
291
+ # }
287
292
  #
288
293
  def new_segment_callback=: (new_segment_callback) -> new_segment_callback
289
294
  def new_segment_callback: () -> (new_segment_callback | nil)
@@ -296,9 +301,9 @@ module Whisper
296
301
 
297
302
  # Sets progress callback, called on each progress update.
298
303
  #
299
- # params.new_segment_callback = ->(context, _, progress, user_data) {
300
- # # ...
301
- # }
304
+ # params.progress_callback = ->(context, _, progress, user_data) {
305
+ # # ...
306
+ # }
302
307
  #
303
308
  # +progress+ is an Integer between 0 and 100.
304
309
  #
@@ -326,9 +331,9 @@ module Whisper
326
331
 
327
332
  # Sets abort callback, called to check if the process should be aborted.
328
333
  #
329
- # params.abort_callback = ->(user_data) {
330
- # # ...
331
- # }
334
+ # params.abort_callback = ->(user_data) {
335
+ # # ...
336
+ # }
332
337
  #
333
338
  #
334
339
  def abort_callback=: (abort_callback) -> abort_callback
@@ -357,9 +362,9 @@ module Whisper
357
362
 
358
363
  # Hook called on new segment. Yields each Whisper::Segment.
359
364
  #
360
- # whisper.on_new_segment do |segment|
361
- # # ...
362
- # end
365
+ # whisper.on_new_segment do |segment|
366
+ # # ...
367
+ # end
363
368
  #
364
369
  def on_new_segment: { (Segment) -> void } -> void
365
370
 
@@ -373,19 +378,20 @@ module Whisper
373
378
 
374
379
  # Call block to determine whether abort or not. Return +true+ when you want to abort.
375
380
  #
376
- # params.abort_on do
377
- # if some_condition
378
- # true # abort
379
- # else
380
- # false # continue
381
+ # params.abort_on do
382
+ # if some_condition
383
+ # true # abort
384
+ # else
385
+ # false # continue
386
+ # end
381
387
  # end
382
- # end
383
388
  #
384
389
  def abort_on: { (Object user_data) -> boolish } -> void
385
390
  end
386
391
 
387
392
  class Model
388
393
  def self.pre_converted_models: () -> Hash[String, Model::URI]
394
+ def self.coreml_compiled_models: () -> Hash[Model::URI, Model::ZipURI]
389
395
  def self.new: () -> instance
390
396
  def n_vocab: () -> Integer
391
397
  def n_audio_ctx: () -> Integer
@@ -405,9 +411,22 @@ module Whisper
405
411
  def to_path: -> String
406
412
  def clear_cache: -> void
407
413
  end
414
+
415
+ class ZipURI < URI
416
+ def cache: () -> Pathname
417
+ def clear_cache: () -> void
418
+ end
408
419
  end
409
420
 
410
421
  class Segment
422
+ type deconstructed_keys = {
423
+ start_time: (Integer | nil),
424
+ end_time: (Integer | nil),
425
+ text: (String | nil),
426
+ no_speech_prob: (Float | nil),
427
+ speaker_turn_next: (true | false | nil)
428
+ }
429
+
411
430
  # Start time in milliseconds.
412
431
  #
413
432
  def start_time: () -> Integer
@@ -417,10 +436,21 @@ module Whisper
417
436
  def end_time: () -> Integer
418
437
 
419
438
  # Whether the next segment is predicted as a speaker turn.
420
- def speaker_next_turn?: () -> (true | false)
439
+ def speaker_turn_next?: () -> (true | false)
421
440
 
422
441
  def text: () -> String
423
442
  def no_speech_prob: () -> Float
443
+ def to_srt_cue: () -> String
444
+ def to_webvtt_cue: () -> String
445
+
446
+ # Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
447
+ #
448
+ # whisper.each_segment do |segment|
449
+ # segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
450
+ #
451
+ # puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
452
+ # end
453
+ def deconstruct_keys: (Array[:start_time | :end_time | :text | :no_speech_prob | :speaker_turn_next] | nil) -> deconstructed_keys
424
454
  end
425
455
 
426
456
  module VAD
@@ -3,7 +3,7 @@ require "whisper"
3
3
  require_relative "jfk_reader/jfk_reader"
4
4
 
5
5
  class TestBase < Test::Unit::TestCase
6
- AUDIO = File.join(__dir__, "..", "..", "..", "samples", "jfk.wav")
6
+ AUDIO = File.join(__dir__, "fixtures", "jfk.wav")
7
7
 
8
8
  class << self
9
9
  def whisper
@@ -21,15 +21,4 @@ class TestBase < Test::Unit::TestCase
21
21
  def whisper
22
22
  self.class.whisper
23
23
  end
24
-
25
- module BuildOptions
26
- load "ext/options.rb", self
27
- Options.include self
28
-
29
- def enable_config(name)
30
- end
31
-
32
- def arg_config(name)
33
- end
34
- end
35
24
  end
@@ -106,4 +106,13 @@ class TestModel < TestBase
106
106
  assert_equal 1, model.ftype
107
107
  assert_equal "base", model.type
108
108
  end
109
+
110
# Removes any extracted model for "tiny", then checks that caching its
# ZipURI makes the extracted path exist again.
def test_coreml_model_auto_download
  tiny_uri = Whisper::Model.pre_converted_models["tiny"]
  coreml_uri = Whisper::Model.coreml_compiled_models[tiny_uri]
  extracted = Pathname(coreml_uri.to_path).sub_ext("")
  extracted.rmtree if extracted.exist?

  coreml_uri.cache
  assert_path_exist(extracted)
end
109
118
  end
@@ -0,0 +1,51 @@
1
+ require_relative "helper"
2
+ require 'tempfile'
3
+ require 'tmpdir'
4
+ require 'shellwords'
5
+
6
# Packaging tests: building the gem and installing the built package.
class TestPackage < TestBase
  # Builds the gem into a temporary file and checks the output is non-empty.
  def test_build
    Tempfile.create do |file|
      # Multi-argument system bypasses the shell, so the path is passed
      # verbatim; shell-escaping it here would corrupt paths that contain
      # characters such as spaces.
      assert system("gem", "build", "whispercpp.gemspec", "--output", file.to_path, exception: true)
      assert file.size > 0
      assert_path_exist file.to_path
    end
  end

  sub_test_case "Building binary on installation" do
    # Runs `rake build` before each test in this sub test case.
    def setup
      system "rake", "build", exception: true
    end

    # Installs the built gem into a temporary directory.
    def test_install
      gemspec = Gem::Specification.load("whispercpp.gemspec")
      Dir.mktmpdir do |dir|
        system "gem", "install", "--install-dir", dir, "--no-document", File.join("pkg", gemspec.file_name), exception: true
        assert_installed dir, gemspec.version
      end
    end

    # Installs with --enable-whisper-coreml and checks that the reported
    # system info contains "COREML = 1". Omitted unless on darwin.
    def test_install_with_coreml
      omit_unless RUBY_PLATFORM.match?(/darwin/) do
        gemspec = Gem::Specification.load("whispercpp.gemspec")
        Dir.mktmpdir do |dir|
          system "gem", "install", "--install-dir", dir, "--no-document", File.join("pkg", gemspec.file_name), "--", "--enable-whisper-coreml", exception: true
          assert_installed dir, gemspec.version
          libdir = File.join(dir, "gems", "#{gemspec.name}-#{gemspec.version}", "lib")
          assert_nothing_raised do
            system "ruby", "-I", libdir, "-r", "whisper", "-e", "Whisper::Context.new('tiny')", exception: true
          end
          # Backticks do go through the shell, so escaping is required here.
          assert_match(/COREML = 1/, `ruby -I #{libdir.shellescape} -r whisper -e 'puts Whisper.system_info_str'`)
        end
      end
    end

    private

    # Checks the installed layout: extension binary and LICENSE are present,
    # ext/build is absent.
    def assert_installed(dir, version)
      assert_path_exist File.join(dir, "gems/whispercpp-#{version}/lib", "whisper.#{RbConfig::CONFIG["DLEXT"]}")
      assert_path_exist File.join(dir, "gems/whispercpp-#{version}/LICENSE")
      assert_path_not_exist File.join(dir, "gems/whispercpp-#{version}/ext/build")
    end
  end
end
@@ -0,0 +1,146 @@
1
+ require_relative "helper"
2
+
3
# Tests for Whisper::Segment as yielded by Context#each_segment and
# Params#on_new_segment.
class TestSegment < TestBase
  def test_iteration
    whisper.each_segment do |segment|
      assert_instance_of Whisper::Segment, segment
    end
  end

  def test_enumerator
    enum = whisper.each_segment
    assert_instance_of Enumerator, enum
    enum.to_a.each_with_index do |segment, index|
      assert_instance_of Whisper::Segment, segment
      assert_kind_of Integer, index
    end
  end

  # Only the first segment is checked: it must start at 0.
  def test_start_time
    whisper.each_segment.with_index do |segment, index|
      assert_equal 0, segment.start_time if index.zero?
    end
  end

  # Segment#end_time must equal full_get_segment_t1 multiplied by 10.
  def test_end_time
    whisper.each_segment.with_index do |segment, index|
      assert_equal whisper.full_get_segment_t1(index) * 10, segment.end_time
    end
  end

  # Checks the value from the last yielded segment.
  def test_no_speech_prob
    no_speech_prob = nil
    whisper.each_segment do |segment|
      no_speech_prob = segment.no_speech_prob
    end
    assert no_speech_prob > 0.0
  end

  def test_on_new_segment
    params = Whisper::Params.new
    seg = nil
    index = 0
    params.on_new_segment do |segment|
      assert_instance_of Whisper::Segment, segment
      if index == 0
        seg = segment
        assert_equal 0, segment.start_time
        assert_match(/ask not what your country can do for you, ask what you can do for your country/, segment.text)
      end
      index += 1
    end
    whisper.transcribe(AUDIO, params)
    assert_equal 0, seg.start_time
    assert_match(/ask not what your country can do for you, ask what you can do for your country/, seg.text)
  end

  # NOTE(review): `return` inside a block returns from this test method, so
  # the method ends as soon as either hook runs — confirm that the
  # assert_same in the second hook is actually reached.
  def test_on_new_segment_twice
    params = Whisper::Params.new
    seg = nil
    params.on_new_segment do |segment|
      seg = segment
      return
    end
    params.on_new_segment do |segment|
      assert_same seg, segment
      return
    end
    whisper.transcribe(AUDIO, params)
  end

  # A segment retrieved before a second transcription must reflect the new
  # result afterwards.
  def test_transcription_after_segment_retrieved
    segment = whisper.each_segment.first
    assert_match(/ask not what your country can do for you, ask what you can do for your country/, segment.text)

    whisper.transcribe(AUDIO, Whisper::Params.new(offset: 5000))
    assert_not_match(/ask not what your country can do for you, ask what you can do for your country/, segment.text)
    assert_match(/what you can do for your country/i, segment.text)
  end

  def test_pattern_matching
    segment = whisper.each_segment.first
    segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}

    assert_equal segment.start_time, start_time
    assert_equal segment.end_time, end_time
    assert_equal segment.text, text
    assert_equal segment.no_speech_prob, no_speech_prob
    assert_equal segment.speaker_turn_next?, speaker_turn_next
  end

  def test_pattern_matching_partial
    segment = whisper.each_segment.first
    segment => {start_time:, end_time:, text:}

    assert_equal segment.start_time, start_time
    assert_equal segment.end_time, end_time
    assert_equal segment.text, text
  end

  def test_deconstruct_keys
    segment = whisper.each_segment.first
    expected = {
      start_time: segment.start_time,
      end_time: segment.end_time,
      text: segment.text,
      no_speech_prob: segment.no_speech_prob,
      speaker_turn_next: segment.speaker_turn_next?
    }
    assert_equal expected, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next])
  end

  def test_deconstruct_keys_non_existent
    omit "Undefined behavior"

    segment = whisper.each_segment.first

    assert_equal({}, segment.deconstruct_keys([:non_existent]))
  end

  def test_deconstruct_keys_too_many_keys
    omit "Undefined behavior"

    segment = whisper.each_segment.first

    assert_equal({}, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next, :extra_key]))
  end

  def test_deconstruct_keys_includes_non_existent_keys_not_too_many
    omit "Undefined behavior"

    segment = whisper.each_segment.first

    expected = {
      start_time: segment.start_time,
      end_time: segment.end_time,
      text: segment.text,
      no_speech_prob: segment.no_speech_prob
    }
    assert_equal(expected, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :non_existent]))
  end
end
@@ -20,6 +20,24 @@ class TestWhisper < TestBase
20
20
  }
21
21
  end
22
22
 
23
+ def test_transcribe_non_parallel
24
+ @whisper = Whisper::Context.new("base.en")
25
+ params = Whisper::Params.new
26
+
27
+ @whisper.transcribe(AUDIO, params, n_processors: 1) {|text|
28
+ assert_match(/ask not what your country can do for you, ask what you can do for your country/, text)
29
+ }
30
+ end
31
+
32
+ def test_transcribe_n_processors
33
+ @whisper = Whisper::Context.new("base.en")
34
+ params = Whisper::Params.new
35
+
36
+ @whisper.transcribe(AUDIO, params, n_processors: 4) {|text|
37
+ assert_match(/ask not what your country can do for you[,.] ask what you can do for your country/i, text)
38
+ }
39
+ end
40
+
23
41
  sub_test_case "After transcription" do
24
42
  def test_full_n_segments
25
43
  assert_equal 1, whisper.full_n_segments
@@ -94,6 +112,14 @@ class TestWhisper < TestBase
94
112
  end
95
113
  end
96
114
 
115
+ def test_system_info_str
116
+ assert_match(/\AWHISPER : COREML = \d | OPENVINO = \d |/, Whisper.system_info_str)
117
+ end
118
+
119
+ def test_version
120
+ assert_kind_of String, Whisper::VERSION
121
+ end
122
+
97
123
  def test_log_set
98
124
  user_data = Object.new
99
125
  logs = []
@@ -223,4 +249,48 @@ class TestWhisper < TestBase
223
249
  assert_match(/for your country/i, text)
224
250
  end
225
251
  end
252
+
253
+ def test_to_srt
254
+ whisper = Whisper::Context.new("base.en")
255
+ whisper.transcribe AUDIO, @params
256
+
257
+ lines = whisper.to_srt.lines
258
+ assert_match(/\A\d+\n/, lines[0])
259
+ assert_match(/\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\n/, lines[1])
260
+ assert_match(/ask not what your country can do for you, ask what you can do for your country/, lines[2])
261
+ end
262
+
263
+ def test_to_webvtt
264
+ whisper = Whisper::Context.new("base.en")
265
+ whisper.transcribe AUDIO, @params
266
+
267
+ lines = whisper.to_webvtt.lines
268
+ assert_equal "WEBVTT\n", lines[0]
269
+ assert_equal "\n", lines[1]
270
+ assert_match(/\A\d+\n/, lines[2])
271
+ assert_match(/\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}\n/, lines[3])
272
+ assert_match(/ask not what your country can do for you, ask what you can do for your country/, lines[4])
273
+ end
274
+
275
+ sub_test_case "Format needs escape" do
276
+ def setup
277
+ @whisper = Whisper::Context.new("base.en")
278
+ @whisper.transcribe AUDIO, Whisper::Params.new
279
+ segment = @whisper.each_segment.first
280
+ segment.define_singleton_method :text do
281
+ "& so my fellow Americans --> ask not what your country can do for you <-- ask what you can do for your country."
282
+ end
283
+ @whisper.define_singleton_method :each_segment do
284
+ Enumerator.new(3) {|yielder| 3.times {yielder << segment}}
285
+ end
286
+ end
287
+
288
+ def test_to_srt_escape
289
+ assert_equal "&amp; so my fellow Americans --&gt; ask not what your country can do for you &lt;-- ask what you can do for your country.\n", @whisper.to_srt.lines[2]
290
+ end
291
+
292
+ def test_to_webvtt_escape
293
+ assert_equal "&amp; so my fellow Americans --&gt; ask not what your country can do for you &lt;-- ask what you can do for your country.\n", @whisper.to_webvtt.lines[4]
294
+ end
295
+ end
226
296
  end