whispercpp 1.3.2 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (244) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -3
  3. data/README.md +71 -14
  4. data/Rakefile +20 -7
  5. data/ext/.gitignore +4 -6
  6. data/ext/dependencies.rb +36 -24
  7. data/ext/extconf.rb +1 -1
  8. data/ext/options.rb +48 -184
  9. data/ext/ruby_whisper.c +18 -0
  10. data/ext/ruby_whisper_context.c +43 -12
  11. data/ext/ruby_whisper_model.c +1 -1
  12. data/ext/ruby_whisper_params.c +4 -2
  13. data/ext/ruby_whisper_segment.c +81 -4
  14. data/ext/ruby_whisper_transcribe.cpp +13 -7
  15. data/ext/ruby_whisper_vad_params.c +1 -1
  16. data/ext/sources/CMakeLists.txt +5 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
  19. data/ext/sources/examples/addon.node/addon.cpp +150 -31
  20. data/ext/sources/examples/addon.node/index.js +3 -0
  21. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  22. data/ext/sources/examples/bench/bench.cpp +3 -2
  23. data/ext/sources/examples/cli/cli.cpp +3 -2
  24. data/ext/sources/examples/command/command.cpp +32 -8
  25. data/ext/sources/examples/common-whisper.cpp +14 -7
  26. data/ext/sources/examples/lsp/lsp.cpp +2 -0
  27. data/ext/sources/examples/quantize/quantize.cpp +3 -0
  28. data/ext/sources/examples/server/CMakeLists.txt +3 -0
  29. data/ext/sources/examples/server/server.cpp +169 -22
  30. data/ext/sources/examples/stream/stream.cpp +6 -0
  31. data/ext/sources/examples/talk-llama/CMakeLists.txt +4 -1
  32. data/ext/sources/examples/talk-llama/llama-arch.cpp +171 -3
  33. data/ext/sources/examples/talk-llama/llama-arch.h +28 -1
  34. data/ext/sources/examples/talk-llama/llama-batch.cpp +741 -272
  35. data/ext/sources/examples/talk-llama/llama-batch.h +112 -54
  36. data/ext/sources/examples/talk-llama/llama-chat.cpp +30 -8
  37. data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
  38. data/ext/sources/examples/talk-llama/llama-context.cpp +520 -351
  39. data/ext/sources/examples/talk-llama/llama-context.h +38 -17
  40. data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
  41. data/ext/sources/examples/talk-llama/llama-cparams.h +1 -1
  42. data/ext/sources/examples/talk-llama/llama-graph.cpp +447 -372
  43. data/ext/sources/examples/talk-llama/llama-graph.h +128 -58
  44. data/ext/sources/examples/talk-llama/llama-hparams.cpp +10 -2
  45. data/ext/sources/examples/talk-llama/llama-hparams.h +19 -2
  46. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +279 -0
  47. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.h +128 -0
  48. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +1841 -0
  49. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +303 -0
  50. data/ext/sources/examples/talk-llama/llama-kv-cache.h +14 -472
  51. data/ext/sources/examples/talk-llama/llama-kv-cells.h +86 -26
  52. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +246 -0
  53. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +138 -0
  54. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1125 -0
  55. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +183 -0
  56. data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
  57. data/ext/sources/examples/talk-llama/llama-memory.h +88 -4
  58. data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
  59. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +42 -17
  60. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
  61. data/ext/sources/examples/talk-llama/llama-model.cpp +1863 -563
  62. data/ext/sources/examples/talk-llama/llama-model.h +27 -0
  63. data/ext/sources/examples/talk-llama/llama-quant.cpp +89 -6
  64. data/ext/sources/examples/talk-llama/llama-vocab.cpp +65 -28
  65. data/ext/sources/examples/talk-llama/llama-vocab.h +1 -0
  66. data/ext/sources/examples/talk-llama/llama.cpp +11 -7
  67. data/ext/sources/examples/talk-llama/llama.h +147 -40
  68. data/ext/sources/examples/talk-llama/talk-llama.cpp +2 -0
  69. data/ext/sources/examples/talk-llama/unicode.cpp +5 -0
  70. data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
  71. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +2 -0
  72. data/ext/sources/ggml/CMakeLists.txt +48 -3
  73. data/ext/sources/ggml/cmake/common.cmake +24 -0
  74. data/ext/sources/ggml/include/ggml-backend.h +1 -1
  75. data/ext/sources/ggml/include/ggml-cpu.h +2 -0
  76. data/ext/sources/ggml/include/ggml.h +144 -5
  77. data/ext/sources/ggml/src/CMakeLists.txt +82 -24
  78. data/ext/sources/ggml/src/ggml-backend-reg.cpp +5 -0
  79. data/ext/sources/ggml/src/ggml-backend.cpp +46 -23
  80. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +3 -3
  81. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  82. data/ext/sources/ggml/src/ggml-cann/common.h +6 -1
  83. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  84. data/ext/sources/ggml/src/ggml-common.h +4 -0
  85. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +133 -40
  86. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  87. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  88. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  89. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  90. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  91. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  92. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  93. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  94. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  95. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  96. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  97. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  98. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  99. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  100. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  101. data/ext/sources/ggml/src/ggml-cpu/common.h +4 -3
  102. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  103. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +146 -105
  104. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  105. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  106. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  107. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  108. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  109. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1057 -174
  110. data/ext/sources/ggml/src/ggml-cpu/ops.h +8 -0
  111. data/ext/sources/ggml/src/ggml-cpu/quants.c +1158 -0
  112. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  113. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1571 -0
  114. data/ext/sources/ggml/src/ggml-cpu/repack.h +98 -0
  115. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +330 -38
  116. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  117. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +111 -18
  118. data/ext/sources/ggml/src/ggml-cpu/vec.h +303 -94
  119. data/ext/sources/ggml/src/ggml-cuda/common.cuh +60 -37
  120. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  121. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  122. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  123. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  124. data/ext/sources/ggml/src/ggml-cuda/convert.cu +22 -0
  125. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +5 -0
  126. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +2 -2
  127. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
  128. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  129. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +265 -123
  130. data/ext/sources/ggml/src/ggml-cuda/mean.cu +19 -0
  131. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  132. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +257 -87
  133. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +2 -3
  134. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  135. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +5 -18
  136. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  137. data/ext/sources/ggml/src/ggml-cuda/unary.cu +89 -0
  138. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +7 -0
  139. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  140. data/ext/sources/ggml/src/ggml-impl.h +127 -183
  141. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  142. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +27 -0
  143. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +331 -49
  144. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +564 -282
  145. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
  146. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +14 -0
  147. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1859 -489
  148. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  149. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  150. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  151. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +201 -0
  152. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  153. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  154. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  155. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  156. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  157. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  158. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  159. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  160. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  161. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  162. data/ext/sources/ggml/src/ggml-quants.c +6 -8
  163. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  164. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  165. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  166. data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
  167. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +28 -41
  168. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +4 -10
  169. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +117 -165
  170. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +192 -53
  171. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  172. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  173. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  174. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +648 -1039
  175. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  176. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  177. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +8 -105
  178. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -100
  179. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  180. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
  181. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +60 -80
  182. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  183. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +55 -74
  184. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -10
  185. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +138 -27
  186. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
  187. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  188. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  189. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  190. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  191. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +12 -16
  192. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
  193. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +726 -282
  194. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  195. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  196. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  197. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  198. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  199. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  200. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
  201. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  202. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -1
  203. data/ext/sources/ggml/src/ggml.c +328 -48
  204. data/ext/sources/ggml/src/ggml.cpp +26 -0
  205. data/ext/sources/ggml/src/gguf.cpp +24 -3
  206. data/ext/sources/include/whisper.h +2 -0
  207. data/ext/sources/src/CMakeLists.txt +2 -0
  208. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  209. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  210. data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
  211. data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
  212. data/ext/sources/src/whisper.cpp +218 -169
  213. data/extsources.rb +15 -9
  214. data/lib/whisper/context.rb +15 -0
  215. data/lib/whisper/model/uri.rb +56 -1
  216. data/lib/whisper/segment.rb +58 -0
  217. data/sig/whisper.rbs +68 -38
  218. data/{tests → test}/helper.rb +1 -12
  219. data/{tests → test}/test_model.rb +9 -0
  220. data/test/test_package.rb +51 -0
  221. data/test/test_segment.rb +146 -0
  222. data/{tests → test}/test_whisper.rb +70 -0
  223. data/whispercpp.gemspec +2 -3
  224. metadata +91 -43
  225. data/ext/sources/.dockerignore +0 -3
  226. data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
  227. data/ext/sources/ci/run.sh +0 -336
  228. data/ext/sources/close-issue.yml +0 -28
  229. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2739
  230. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  231. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
  232. data/tests/test_package.rb +0 -46
  233. data/tests/test_segment.rb +0 -74
  234. /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  235. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  236. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  237. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  238. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  239. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
  240. /data/{tests → test}/test_callback.rb +0 -0
  241. /data/{tests → test}/test_error.rb +0 -0
  242. /data/{tests → test}/test_params.rb +0 -0
  243. /data/{tests → test}/test_vad.rb +0 -0
  244. /data/{tests → test}/test_vad_params.rb +0 -0
@@ -11,6 +11,9 @@ extern ID id_new;
11
11
  extern ID id_to_path;
12
12
  extern ID id_URI;
13
13
  extern ID id_pre_converted_models;
14
+ extern ID id_coreml_compiled_models;
15
+ extern ID id_cache;
16
+ extern ID id_n_processors;
14
17
 
15
18
  extern VALUE cContext;
16
19
  extern VALUE eError;
@@ -18,10 +21,12 @@ extern VALUE cModel;
18
21
 
19
22
  extern const rb_data_type_t ruby_whisper_params_type;
20
23
  extern VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self);
21
- extern VALUE rb_whisper_model_initialize(VALUE context);
22
- extern VALUE rb_whisper_segment_initialize(VALUE context, int index);
24
+ extern VALUE rb_whisper_model_s_new(VALUE context);
25
+ extern VALUE rb_whisper_segment_s_new(VALUE context, int index);
23
26
  extern void prepare_transcription(ruby_whisper_params *rwp, VALUE *context);
24
27
 
28
+ ID transcribe_option_names[1];
29
+
25
30
  static void
26
31
  ruby_whisper_free(ruby_whisper *rw)
27
32
  {
@@ -53,6 +58,9 @@ ruby_whisper_memsize(const void *p)
53
58
  if (!rw) {
54
59
  return 0;
55
60
  }
61
+ if (rw->context) {
62
+ size += sizeof(rw->context);
63
+ }
56
64
  return size;
57
65
  }
58
66
 
@@ -79,6 +87,13 @@ ruby_whisper_normalize_model_path(VALUE model_path)
79
87
  VALUE pre_converted_model = rb_hash_aref(pre_converted_models, model_path);
80
88
  if (!NIL_P(pre_converted_model)) {
81
89
  model_path = pre_converted_model;
90
+ #ifdef RUBY_WHISPER_USE_COREML
91
+ VALUE coreml_converted_models = rb_funcall(cModel, id_coreml_compiled_models, 0);
92
+ VALUE coreml_converted_model = rb_hash_aref(coreml_converted_models, pre_converted_model);
93
+ if (!NIL_P(coreml_converted_model)) {
94
+ rb_funcall(coreml_converted_model, id_cache, 0);
95
+ }
96
+ #endif
82
97
  }
83
98
  else if (TYPE(model_path) == T_STRING) {
84
99
  const char * model_path_str = StringValueCStr(model_path);
@@ -293,13 +308,20 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self)
293
308
  // Should check when samples.respond_to?(:length)?
294
309
  } else {
295
310
  if (TYPE(samples) == T_ARRAY) {
296
- n_samples = RARRAY_LEN(samples);
311
+ if (RARRAY_LEN(samples) > INT_MAX) {
312
+ rb_raise(rb_eArgError, "samples are too long");
313
+ }
314
+ n_samples = (int)RARRAY_LEN(samples);
297
315
  } else if (memory_view_available_p) {
298
316
  if (!rb_memory_view_get(samples, &view, RUBY_MEMORY_VIEW_SIMPLE)) {
299
317
  view.obj = Qnil;
300
318
  rb_raise(rb_eArgError, "unable to get a memory view");
301
319
  }
302
- n_samples = view.byte_size / view.item_size;
320
+ ssize_t n_samples_size = view.byte_size / view.item_size;
321
+ if (n_samples_size > INT_MAX) {
322
+ rb_raise(rb_eArgError, "samples are too long");
323
+ }
324
+ n_samples = (int)n_samples_size;
303
325
  } else if (rb_respond_to(samples, id_length)) {
304
326
  n_samples = NUM2INT(rb_funcall(samples, id_length, 0));
305
327
  } else {
@@ -387,10 +409,17 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self)
387
409
  view.obj = Qnil;
388
410
  rb_raise(rb_eArgError, "unable to get a memory view");
389
411
  }
390
- n_samples = view.byte_size / view.item_size;
412
+ ssize_t n_samples_size = view.byte_size / view.item_size;
413
+ if (n_samples_size > INT_MAX) {
414
+ rb_raise(rb_eArgError, "samples are too long");
415
+ }
416
+ n_samples = (int)n_samples_size;
391
417
  } else {
392
418
  if (TYPE(samples) == T_ARRAY) {
393
- n_samples = RARRAY_LEN(samples);
419
+ if (RARRAY_LEN(samples) > INT_MAX) {
420
+ rb_raise(rb_eArgError, "samples are too long");
421
+ }
422
+ n_samples = (int)RARRAY_LEN(samples);
394
423
  } else if (rb_respond_to(samples, id_length)) {
395
424
  n_samples = NUM2INT(rb_funcall(samples, id_length, 0));
396
425
  } else {
@@ -476,7 +505,7 @@ ruby_whisper_full_get_segment_t0(VALUE self, VALUE i_segment)
476
505
  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
477
506
  const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
478
507
  const int64_t t0 = whisper_full_get_segment_t0(rw->context, c_i_segment);
479
- return INT2NUM(t0);
508
+ return LONG2NUM(t0);
480
509
  }
481
510
 
482
511
  /*
@@ -494,7 +523,7 @@ ruby_whisper_full_get_segment_t1(VALUE self, VALUE i_segment)
494
523
  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
495
524
  const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
496
525
  const int64_t t1 = whisper_full_get_segment_t1(rw->context, c_i_segment);
497
- return INT2NUM(t1);
526
+ return LONG2NUM(t1);
498
527
  }
499
528
 
500
529
  /*
@@ -552,7 +581,7 @@ ruby_whisper_full_get_segment_no_speech_prob(VALUE self, VALUE i_segment)
552
581
  static VALUE
553
582
  ruby_whisper_full_get_segment(VALUE self, VALUE i_segment)
554
583
  {
555
- return rb_whisper_segment_initialize(self, NUM2INT(i_segment));
584
+ return rb_whisper_segment_s_new(self, NUM2INT(i_segment));
556
585
  }
557
586
 
558
587
  /*
@@ -586,7 +615,7 @@ ruby_whisper_each_segment(VALUE self)
586
615
 
587
616
  const int n_segments = whisper_full_n_segments(rw->context);
588
617
  for (int i = 0; i < n_segments; ++i) {
589
- rb_yield(rb_whisper_segment_initialize(self, i));
618
+ rb_yield(rb_whisper_segment_s_new(self, i));
590
619
  }
591
620
 
592
621
  return self;
@@ -599,7 +628,7 @@ ruby_whisper_each_segment(VALUE self)
599
628
  static VALUE
600
629
  ruby_whisper_get_model(VALUE self)
601
630
  {
602
- return rb_whisper_model_initialize(self);
631
+ return rb_whisper_model_s_new(self);
603
632
  }
604
633
 
605
634
  void
@@ -607,6 +636,8 @@ init_ruby_whisper_context(VALUE *mWhisper)
607
636
  {
608
637
  cContext = rb_define_class_under(*mWhisper, "Context", rb_cObject);
609
638
 
639
+ transcribe_option_names[0] = id_n_processors;
640
+
610
641
  rb_define_alloc_func(cContext, ruby_whisper_allocate);
611
642
  rb_define_method(cContext, "initialize", ruby_whisper_initialize, -1);
612
643
 
@@ -633,7 +664,7 @@ init_ruby_whisper_context(VALUE *mWhisper)
633
664
  rb_define_method(cContext, "full", ruby_whisper_full, -1);
634
665
  rb_define_method(cContext, "full_parallel", ruby_whisper_full_parallel, -1);
635
666
 
636
- // High leve
667
+ // High level
637
668
  rb_define_method(cContext, "full_get_segment", ruby_whisper_full_get_segment, 1);
638
669
  rb_define_method(cContext, "each_segment", ruby_whisper_each_segment, 0);
639
670
 
@@ -35,7 +35,7 @@ static VALUE ruby_whisper_model_allocate(VALUE klass) {
35
35
  return TypedData_Make_Struct(klass, ruby_whisper_model, &rb_whisper_model_type, rwm);
36
36
  }
37
37
 
38
- VALUE rb_whisper_model_initialize(VALUE context) {
38
+ VALUE rb_whisper_model_s_new(VALUE context) {
39
39
  ruby_whisper_model *rwm;
40
40
  const VALUE model = ruby_whisper_model_allocate(cModel);
41
41
  TypedData_Get_Struct(model, ruby_whisper_model, &rb_whisper_model_type, rwm);
@@ -34,7 +34,7 @@ extern VALUE cVADParams;
34
34
  extern ID id_call;
35
35
 
36
36
  extern VALUE ruby_whisper_normalize_model_path(VALUE model_path);
37
- extern VALUE rb_whisper_segment_initialize(VALUE context, int index);
37
+ extern VALUE rb_whisper_segment_s_new(VALUE context, int index);
38
38
  extern const rb_data_type_t ruby_whisper_vad_params_type;
39
39
 
40
40
  static ID param_names[RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT];
@@ -77,6 +77,8 @@ static ID id_vad_params;
77
77
  static void
78
78
  rb_whisper_callbcack_container_mark(ruby_whisper_callback_container *rwc)
79
79
  {
80
+ if (rwc == NULL) return;
81
+
80
82
  rb_gc_mark(rwc->user_data);
81
83
  rb_gc_mark(rwc->callback);
82
84
  rb_gc_mark(rwc->callbacks);
@@ -108,7 +110,7 @@ static void new_segment_callback(struct whisper_context *ctx, struct whisper_sta
108
110
  const int n_segments = whisper_full_n_segments_from_state(state);
109
111
  for (int i = n_new; i > 0; i--) {
110
112
  int i_segment = n_segments - i;
111
- VALUE segment = rb_whisper_segment_initialize(*container->context, i_segment);
113
+ VALUE segment = rb_whisper_segment_s_new(*container->context, i_segment);
112
114
  for (int j = 0; j < callbacks_len; j++) {
113
115
  VALUE cb = rb_ary_entry(container->callbacks, j);
114
116
  rb_funcall(cb, id_call, 1, segment);
@@ -1,6 +1,15 @@
1
1
  #include <ruby.h>
2
2
  #include "ruby_whisper.h"
3
3
 
4
+ #define N_KEY_NAMES 5
5
+
6
+ static VALUE sym_start_time;
7
+ static VALUE sym_end_time;
8
+ static VALUE sym_text;
9
+ static VALUE sym_no_speech_prob;
10
+ static VALUE sym_speaker_turn_next;
11
+ static VALUE key_names;
12
+
4
13
  extern const rb_data_type_t ruby_whisper_type;
5
14
 
6
15
  extern VALUE cSegment;
@@ -38,7 +47,7 @@ ruby_whisper_segment_allocate(VALUE klass)
38
47
  }
39
48
 
40
49
  VALUE
41
- rb_whisper_segment_initialize(VALUE context, int index)
50
+ rb_whisper_segment_s_new(VALUE context, int index)
42
51
  {
43
52
  ruby_whisper_segment *rws;
44
53
  const VALUE segment = ruby_whisper_segment_allocate(cSegment);
@@ -63,7 +72,7 @@ ruby_whisper_segment_get_start_time(VALUE self)
63
72
  TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
64
73
  const int64_t t0 = whisper_full_get_segment_t0(rw->context, rws->index);
65
74
  // able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
66
- return INT2NUM(t0 * 10);
75
+ return LONG2NUM(t0 * 10);
67
76
  }
68
77
 
69
78
  /*
@@ -81,7 +90,7 @@ ruby_whisper_segment_get_end_time(VALUE self)
81
90
  TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
82
91
  const int64_t t1 = whisper_full_get_segment_t1(rw->context, rws->index);
83
92
  // able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
84
- return INT2NUM(t1 * 10);
93
+ return LONG2NUM(t1 * 10);
85
94
  }
86
95
 
87
96
  /*
@@ -129,15 +138,83 @@ ruby_whisper_segment_get_no_speech_prob(VALUE self)
129
138
  return DBL2NUM(whisper_full_get_segment_no_speech_prob(rw->context, rws->index));
130
139
  }
131
140
 
141
+ /*
142
+ * call-seq:
143
+ * deconstruct_keys(keys) -> hash
144
+ *
145
+ * Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
146
+ *
147
+ * whisper.each_segment do |segment|
148
+ * segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
149
+ *
150
+ * puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
151
+ * end
152
+ */
153
+ static VALUE
154
+ ruby_whisper_segment_deconstruct_keys(VALUE self, VALUE keys)
155
+ {
156
+ ruby_whisper_segment *rws;
157
+ TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
158
+ ruby_whisper *rw;
159
+ TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
160
+
161
+ VALUE hash = rb_hash_new();
162
+ long n_keys;
163
+ if (NIL_P(keys)) {
164
+ keys = key_names;
165
+ n_keys = N_KEY_NAMES;
166
+ } else {
167
+ n_keys = RARRAY_LEN(keys);
168
+ if (n_keys > N_KEY_NAMES) {
169
+ return hash;
170
+ }
171
+ }
172
+ for (int i = 0; i < n_keys; i++) {
173
+ VALUE key = rb_ary_entry(keys, i);
174
+ if (key == sym_start_time) {
175
+ rb_hash_aset(hash, key, ruby_whisper_segment_get_start_time(self));
176
+ }
177
+ if (key == sym_end_time) {
178
+ rb_hash_aset(hash, key, ruby_whisper_segment_get_end_time(self));
179
+ }
180
+ if (key == sym_text) {
181
+ rb_hash_aset(hash, key, ruby_whisper_segment_get_text(self));
182
+ }
183
+ if (key == sym_no_speech_prob) {
184
+ rb_hash_aset(hash, key, ruby_whisper_segment_get_no_speech_prob(self));
185
+ }
186
+ if (key == sym_speaker_turn_next) {
187
+ rb_hash_aset(hash, key, ruby_whisper_segment_get_speaker_turn_next(self));
188
+ }
189
+ }
190
+
191
+ return hash;
192
+ }
193
+
132
194
  void
133
195
  init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cContext)
134
196
  {
135
197
  cSegment = rb_define_class_under(*mWhisper, "Segment", rb_cObject);
136
198
 
199
+ sym_start_time = ID2SYM(rb_intern("start_time"));
200
+ sym_end_time = ID2SYM(rb_intern("end_time"));
201
+ sym_text = ID2SYM(rb_intern("text"));
202
+ sym_no_speech_prob = ID2SYM(rb_intern("no_speech_prob"));
203
+ sym_speaker_turn_next = ID2SYM(rb_intern("speaker_turn_next"));
204
+ key_names = rb_ary_new3(
205
+ N_KEY_NAMES,
206
+ sym_start_time,
207
+ sym_end_time,
208
+ sym_text,
209
+ sym_no_speech_prob,
210
+ sym_speaker_turn_next
211
+ );
212
+
137
213
  rb_define_alloc_func(cSegment, ruby_whisper_segment_allocate);
138
214
  rb_define_method(cSegment, "start_time", ruby_whisper_segment_get_start_time, 0);
139
215
  rb_define_method(cSegment, "end_time", ruby_whisper_segment_get_end_time, 0);
140
- rb_define_method(cSegment, "speaker_next_turn?", ruby_whisper_segment_get_speaker_turn_next, 0);
216
+ rb_define_method(cSegment, "speaker_turn_next?", ruby_whisper_segment_get_speaker_turn_next, 0);
141
217
  rb_define_method(cSegment, "text", ruby_whisper_segment_get_text, 0);
142
218
  rb_define_method(cSegment, "no_speech_prob", ruby_whisper_segment_get_no_speech_prob, 0);
219
+ rb_define_method(cSegment, "deconstruct_keys", ruby_whisper_segment_deconstruct_keys, 1);
143
220
  }
@@ -13,6 +13,7 @@ extern const rb_data_type_t ruby_whisper_params_type;
13
13
 
14
14
  extern ID id_to_s;
15
15
  extern ID id_call;
16
+ extern ID transcribe_option_names[1];
16
17
 
17
18
  extern void
18
19
  prepare_transcription(ruby_whisper_params * rwp, VALUE * self);
@@ -34,9 +35,14 @@ VALUE
34
35
  ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
35
36
  ruby_whisper *rw;
36
37
  ruby_whisper_params *rwp;
37
- VALUE wave_file_path, blk, params;
38
+ VALUE wave_file_path, blk, params, kws;
39
+ VALUE opts[1];
40
+
41
+ rb_scan_args_kw(RB_SCAN_ARGS_LAST_HASH_KEYWORDS, argc, argv, "2:&", &wave_file_path, &params, &kws, &blk);
42
+ rb_get_kwargs(kws, transcribe_option_names, 0, 1, opts);
43
+
44
+ int n_processors = opts[0] == Qundef ? 1 : NUM2INT(opts[0]);
38
45
 
39
- rb_scan_args(argc, argv, "02&", &wave_file_path, &params, &blk);
40
46
  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
41
47
  TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
42
48
 
@@ -66,20 +72,20 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
66
72
 
67
73
  prepare_transcription(rwp, &self);
68
74
 
69
- if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), 1) != 0) {
75
+ if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), n_processors) != 0) {
70
76
  fprintf(stderr, "failed to process audio\n");
71
77
  return self;
72
78
  }
79
+ if (NIL_P(blk)) {
80
+ return self;
81
+ }
73
82
  const int n_segments = whisper_full_n_segments(rw->context);
74
83
  VALUE output = rb_str_new2("");
75
84
  for (int i = 0; i < n_segments; ++i) {
76
85
  const char * text = whisper_full_get_segment_text(rw->context, i);
77
86
  output = rb_str_concat(output, rb_str_new2(text));
78
87
  }
79
- VALUE idCall = id_call;
80
- if (blk != Qnil) {
81
- rb_funcall(blk, idCall, 1, output);
82
- }
88
+ rb_funcall(blk, id_call, 1, output);
83
89
  return self;
84
90
  }
85
91
  #ifdef __cplusplus
@@ -249,7 +249,7 @@ ruby_whisper_vad_params_initialize(int argc, VALUE *argv, VALUE self)
249
249
  rb_get_kwargs(kw_hash, param_names, 0, NUM_PARAMS, values);
250
250
 
251
251
  for (i = 0; i < NUM_PARAMS; i++) {
252
- id= param_names[i];
252
+ id = param_names[i];
253
253
  value = values[i];
254
254
  if (value == Qundef) {
255
255
  continue;
@@ -1,6 +1,6 @@
1
1
  cmake_minimum_required(VERSION 3.5) # for add_link_options and implicit target directories.
2
2
  project("whisper.cpp" C CXX)
3
- project("whisper.cpp" VERSION 1.7.5)
3
+ project("whisper.cpp" VERSION 1.7.6)
4
4
  include(CheckIncludeFileCXX)
5
5
 
6
6
  set(SOVERSION 1)
@@ -178,6 +178,10 @@ get_directory_property(WHISPER_TRANSIENT_DEFINES COMPILE_DEFINITIONS)
178
178
  set_target_properties(whisper PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/whisper.h)
179
179
  install(TARGETS whisper LIBRARY PUBLIC_HEADER)
180
180
 
181
+ target_compile_definitions(whisper PRIVATE
182
+ WHISPER_VERSION="${PROJECT_VERSION}"
183
+ )
184
+
181
185
  configure_package_config_file(
182
186
  ${CMAKE_CURRENT_SOURCE_DIR}/cmake/whisper-config.cmake.in
183
187
  ${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "whisper.cpp",
3
- "version": "1.7.5",
3
+ "version": "1.7.6",
4
4
  "description": "Whisper speech recognition",
5
5
  "main": "whisper.js",
6
6
  "scripts": {
@@ -1,37 +1,133 @@
1
- const path = require("path");
2
- const { whisper } = require(path.join(
3
- __dirname,
4
- "../../../build/Release/addon.node"
5
- ));
6
- const { promisify } = require("util");
1
+ const { join } = require('path');
2
+ const { whisper } = require('../../../build/Release/addon.node');
3
+ const { promisify } = require('util');
7
4
 
8
5
  const whisperAsync = promisify(whisper);
9
6
 
10
- const whisperParamsMock = {
11
- language: "en",
12
- model: path.join(__dirname, "../../../models/ggml-base.en.bin"),
13
- fname_inp: path.join(__dirname, "../../../samples/jfk.wav"),
7
+ const commonParams = {
8
+ language: 'en',
9
+ model: join(__dirname, '../../../models/ggml-base.en.bin'),
10
+ fname_inp: join(__dirname, '../../../samples/jfk.wav'),
14
11
  use_gpu: true,
15
12
  flash_attn: false,
16
13
  no_prints: true,
17
- comma_in_time: false,
18
- translate: true,
19
14
  no_timestamps: false,
15
+ detect_language: false,
20
16
  audio_ctx: 0,
21
- max_len: 0,
22
- prompt: "",
23
- print_progress: false,
24
- progress_callback: (progress) => {
25
- console.log(`Progress: ${progress}`);
26
- },
27
- max_context: -1
17
+ max_len: 0
28
18
  };
29
19
 
30
- describe("Run whisper.node", () => {
31
- test("it should receive a non-empty value", async () => {
32
- let result = await whisperAsync(whisperParamsMock);
20
+ describe('Whisper.cpp Node.js addon with VAD support', () => {
21
+ test('Basic whisper transcription without VAD', async () => {
22
+ const params = {
23
+ ...commonParams,
24
+ vad: false
25
+ };
33
26
 
34
- expect(result.length).toBeGreaterThan(0);
35
- }, 10000);
27
+ const result = await whisperAsync(params);
28
+
29
+ expect(typeof result).toBe('object');
30
+ expect(Array.isArray(result.transcription)).toBe(true);
31
+ expect(result.transcription.length).toBeGreaterThan(0);
32
+
33
+ // Check that we got some transcription text
34
+ const text = result.transcription.map(segment => segment[2]).join(' ');
35
+ expect(text.length).toBeGreaterThan(0);
36
+ expect(text.toLowerCase()).toContain('ask not');
37
+ }, 30000);
38
+
39
+ test('VAD parameters validation', async () => {
40
+ // Test with invalid VAD model - should return empty transcription
41
+ const invalidParams = {
42
+ ...commonParams,
43
+ vad: true,
44
+ vad_model: 'non-existent-model.bin',
45
+ vad_threshold: 0.5
46
+ };
47
+
48
+ // This should handle the error gracefully and return empty transcription
49
+ const result = await whisperAsync(invalidParams);
50
+ expect(typeof result).toBe('object');
51
+ expect(Array.isArray(result.transcription)).toBe(true);
52
+ // When VAD model doesn't exist, it should return empty transcription
53
+ expect(result.transcription.length).toBe(0);
54
+ }, 10000);
55
+
56
+ test('VAD parameter parsing', async () => {
57
+ // Test that VAD parameters are properly parsed (even if VAD model doesn't exist)
58
+ const vadParams = {
59
+ ...commonParams,
60
+ vad: false, // Disabled so no model required
61
+ vad_threshold: 0.7,
62
+ vad_min_speech_duration_ms: 300,
63
+ vad_min_silence_duration_ms: 150,
64
+ vad_max_speech_duration_s: 45.0,
65
+ vad_speech_pad_ms: 50,
66
+ vad_samples_overlap: 0.15
67
+ };
68
+
69
+ const result = await whisperAsync(vadParams);
70
+
71
+ expect(typeof result).toBe('object');
72
+ expect(Array.isArray(result.transcription)).toBe(true);
73
+ }, 30000);
74
+
75
+ test('Progress callback with VAD disabled', async () => {
76
+ let progressCalled = false;
77
+ let lastProgress = 0;
78
+
79
+ const params = {
80
+ ...commonParams,
81
+ vad: false,
82
+ progress_callback: (progress) => {
83
+ progressCalled = true;
84
+ lastProgress = progress;
85
+ expect(progress).toBeGreaterThanOrEqual(0);
86
+ expect(progress).toBeLessThanOrEqual(100);
87
+ }
88
+ };
89
+
90
+ const result = await whisperAsync(params);
91
+
92
+ expect(progressCalled).toBe(true);
93
+ expect(lastProgress).toBe(100);
94
+ expect(typeof result).toBe('object');
95
+ }, 30000);
96
+
97
+ test('Language detection without VAD', async () => {
98
+ const params = {
99
+ ...commonParams,
100
+ vad: false,
101
+ detect_language: true,
102
+ language: 'auto'
103
+ };
104
+
105
+ const result = await whisperAsync(params);
106
+
107
+ expect(typeof result).toBe('object');
108
+ expect(typeof result.language).toBe('string');
109
+ expect(result.language.length).toBeGreaterThan(0);
110
+ }, 30000);
111
+
112
+ test('Basic transcription with all VAD parameters set', async () => {
113
+ // Test with VAD disabled but all parameters set to ensure no crashes
114
+ const params = {
115
+ ...commonParams,
116
+ vad: false, // Disabled so it works without VAD model
117
+ vad_model: '', // Empty model path
118
+ vad_threshold: 0.6,
119
+ vad_min_speech_duration_ms: 200,
120
+ vad_min_silence_duration_ms: 80,
121
+ vad_max_speech_duration_s: 25.0,
122
+ vad_speech_pad_ms: 40,
123
+ vad_samples_overlap: 0.08
124
+ };
125
+
126
+ const result = await whisperAsync(params);
127
+
128
+ expect(typeof result).toBe('object');
129
+ expect(Array.isArray(result.transcription)).toBe(true);
130
+ expect(result.transcription.length).toBeGreaterThan(0);
131
+ }, 30000);
36
132
  });
37
133