whispercpp 1.3.2 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (244):
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -3
  3. data/README.md +71 -14
  4. data/Rakefile +20 -7
  5. data/ext/.gitignore +4 -6
  6. data/ext/dependencies.rb +36 -24
  7. data/ext/extconf.rb +1 -1
  8. data/ext/options.rb +48 -184
  9. data/ext/ruby_whisper.c +18 -0
  10. data/ext/ruby_whisper_context.c +43 -12
  11. data/ext/ruby_whisper_model.c +1 -1
  12. data/ext/ruby_whisper_params.c +4 -2
  13. data/ext/ruby_whisper_segment.c +81 -4
  14. data/ext/ruby_whisper_transcribe.cpp +13 -7
  15. data/ext/ruby_whisper_vad_params.c +1 -1
  16. data/ext/sources/CMakeLists.txt +5 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
  19. data/ext/sources/examples/addon.node/addon.cpp +150 -31
  20. data/ext/sources/examples/addon.node/index.js +3 -0
  21. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  22. data/ext/sources/examples/bench/bench.cpp +3 -2
  23. data/ext/sources/examples/cli/cli.cpp +3 -2
  24. data/ext/sources/examples/command/command.cpp +32 -8
  25. data/ext/sources/examples/common-whisper.cpp +14 -7
  26. data/ext/sources/examples/lsp/lsp.cpp +2 -0
  27. data/ext/sources/examples/quantize/quantize.cpp +3 -0
  28. data/ext/sources/examples/server/CMakeLists.txt +3 -0
  29. data/ext/sources/examples/server/server.cpp +169 -22
  30. data/ext/sources/examples/stream/stream.cpp +6 -0
  31. data/ext/sources/examples/talk-llama/CMakeLists.txt +4 -1
  32. data/ext/sources/examples/talk-llama/llama-arch.cpp +171 -3
  33. data/ext/sources/examples/talk-llama/llama-arch.h +28 -1
  34. data/ext/sources/examples/talk-llama/llama-batch.cpp +741 -272
  35. data/ext/sources/examples/talk-llama/llama-batch.h +112 -54
  36. data/ext/sources/examples/talk-llama/llama-chat.cpp +30 -8
  37. data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
  38. data/ext/sources/examples/talk-llama/llama-context.cpp +520 -351
  39. data/ext/sources/examples/talk-llama/llama-context.h +38 -17
  40. data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
  41. data/ext/sources/examples/talk-llama/llama-cparams.h +1 -1
  42. data/ext/sources/examples/talk-llama/llama-graph.cpp +447 -372
  43. data/ext/sources/examples/talk-llama/llama-graph.h +128 -58
  44. data/ext/sources/examples/talk-llama/llama-hparams.cpp +10 -2
  45. data/ext/sources/examples/talk-llama/llama-hparams.h +19 -2
  46. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +279 -0
  47. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.h +128 -0
  48. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +1841 -0
  49. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +303 -0
  50. data/ext/sources/examples/talk-llama/llama-kv-cache.h +14 -472
  51. data/ext/sources/examples/talk-llama/llama-kv-cells.h +86 -26
  52. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +246 -0
  53. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +138 -0
  54. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1125 -0
  55. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +183 -0
  56. data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
  57. data/ext/sources/examples/talk-llama/llama-memory.h +88 -4
  58. data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
  59. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +42 -17
  60. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
  61. data/ext/sources/examples/talk-llama/llama-model.cpp +1863 -563
  62. data/ext/sources/examples/talk-llama/llama-model.h +27 -0
  63. data/ext/sources/examples/talk-llama/llama-quant.cpp +89 -6
  64. data/ext/sources/examples/talk-llama/llama-vocab.cpp +65 -28
  65. data/ext/sources/examples/talk-llama/llama-vocab.h +1 -0
  66. data/ext/sources/examples/talk-llama/llama.cpp +11 -7
  67. data/ext/sources/examples/talk-llama/llama.h +147 -40
  68. data/ext/sources/examples/talk-llama/talk-llama.cpp +2 -0
  69. data/ext/sources/examples/talk-llama/unicode.cpp +5 -0
  70. data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
  71. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +2 -0
  72. data/ext/sources/ggml/CMakeLists.txt +48 -3
  73. data/ext/sources/ggml/cmake/common.cmake +24 -0
  74. data/ext/sources/ggml/include/ggml-backend.h +1 -1
  75. data/ext/sources/ggml/include/ggml-cpu.h +2 -0
  76. data/ext/sources/ggml/include/ggml.h +144 -5
  77. data/ext/sources/ggml/src/CMakeLists.txt +82 -24
  78. data/ext/sources/ggml/src/ggml-backend-reg.cpp +5 -0
  79. data/ext/sources/ggml/src/ggml-backend.cpp +46 -23
  80. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +3 -3
  81. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  82. data/ext/sources/ggml/src/ggml-cann/common.h +6 -1
  83. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  84. data/ext/sources/ggml/src/ggml-common.h +4 -0
  85. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +133 -40
  86. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  87. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  88. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  89. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  90. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  91. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  92. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  93. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  94. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  95. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  96. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  97. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  98. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  99. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  100. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  101. data/ext/sources/ggml/src/ggml-cpu/common.h +4 -3
  102. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  103. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +146 -105
  104. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  105. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  106. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  107. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  108. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  109. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1057 -174
  110. data/ext/sources/ggml/src/ggml-cpu/ops.h +8 -0
  111. data/ext/sources/ggml/src/ggml-cpu/quants.c +1158 -0
  112. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  113. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1571 -0
  114. data/ext/sources/ggml/src/ggml-cpu/repack.h +98 -0
  115. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +330 -38
  116. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  117. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +111 -18
  118. data/ext/sources/ggml/src/ggml-cpu/vec.h +303 -94
  119. data/ext/sources/ggml/src/ggml-cuda/common.cuh +60 -37
  120. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  121. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  122. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  123. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  124. data/ext/sources/ggml/src/ggml-cuda/convert.cu +22 -0
  125. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +5 -0
  126. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +2 -2
  127. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
  128. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  129. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +265 -123
  130. data/ext/sources/ggml/src/ggml-cuda/mean.cu +19 -0
  131. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  132. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +257 -87
  133. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +2 -3
  134. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  135. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +5 -18
  136. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  137. data/ext/sources/ggml/src/ggml-cuda/unary.cu +89 -0
  138. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +7 -0
  139. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  140. data/ext/sources/ggml/src/ggml-impl.h +127 -183
  141. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  142. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +27 -0
  143. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +331 -49
  144. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +564 -282
  145. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
  146. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +14 -0
  147. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1859 -489
  148. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  149. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  150. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  151. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +201 -0
  152. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  153. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  154. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  155. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  156. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  157. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  158. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  159. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  160. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  161. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  162. data/ext/sources/ggml/src/ggml-quants.c +6 -8
  163. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  164. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  165. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  166. data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
  167. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +28 -41
  168. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +4 -10
  169. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +117 -165
  170. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +192 -53
  171. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  172. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  173. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  174. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +648 -1039
  175. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  176. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  177. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +8 -105
  178. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -100
  179. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  180. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
  181. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +60 -80
  182. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  183. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +55 -74
  184. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -10
  185. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +138 -27
  186. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
  187. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  188. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  189. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  190. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  191. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +12 -16
  192. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
  193. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +726 -282
  194. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  195. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  196. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  197. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  198. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  199. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  200. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
  201. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  202. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -1
  203. data/ext/sources/ggml/src/ggml.c +328 -48
  204. data/ext/sources/ggml/src/ggml.cpp +26 -0
  205. data/ext/sources/ggml/src/gguf.cpp +24 -3
  206. data/ext/sources/include/whisper.h +2 -0
  207. data/ext/sources/src/CMakeLists.txt +2 -0
  208. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  209. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  210. data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
  211. data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
  212. data/ext/sources/src/whisper.cpp +218 -169
  213. data/extsources.rb +15 -9
  214. data/lib/whisper/context.rb +15 -0
  215. data/lib/whisper/model/uri.rb +56 -1
  216. data/lib/whisper/segment.rb +58 -0
  217. data/sig/whisper.rbs +68 -38
  218. data/{tests → test}/helper.rb +1 -12
  219. data/{tests → test}/test_model.rb +9 -0
  220. data/test/test_package.rb +51 -0
  221. data/test/test_segment.rb +146 -0
  222. data/{tests → test}/test_whisper.rb +70 -0
  223. data/whispercpp.gemspec +2 -3
  224. metadata +91 -43
  225. data/ext/sources/.dockerignore +0 -3
  226. data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
  227. data/ext/sources/ci/run.sh +0 -336
  228. data/ext/sources/close-issue.yml +0 -28
  229. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2739
  230. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  231. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
  232. data/tests/test_package.rb +0 -46
  233. data/tests/test_segment.rb +0 -74
  234. /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  235. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  236. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  237. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  238. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  239. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
  240. /data/{tests → test}/test_callback.rb +0 -0
  241. /data/{tests → test}/test_error.rb +0 -0
  242. /data/{tests → test}/test_params.rb +0 -0
  243. /data/{tests → test}/test_vad.rb +0 -0
  244. /data/{tests → test}/test_vad_params.rb +0 -0
@@ -206,15 +206,6 @@ static bool ggml_graph_compute_helper(
206
206
  return t;
207
207
  }
208
208
 
209
- static void whisper_load_backends() {
210
- #ifdef GGML_BACKEND_DL
211
- static std::once_flag flag;
212
- std::call_once(flag, []() {
213
- ggml_backend_load_all();
214
- });
215
- #endif
216
- }
217
-
218
209
  // TODO: move these functions to ggml-base with support for ggml-backend?
219
210
 
220
211
  static ggml_tensor * whisper_set_f32(struct ggml_tensor * t, float v) {
@@ -868,6 +859,11 @@ struct whisper_aheads_masks {
868
859
  ggml_backend_buffer_t buffer = nullptr;
869
860
  };
870
861
 
862
+ struct vad_time_mapping {
863
+ int64_t processed_time; // Time in processed (VAD) audio
864
+ int64_t original_time; // Corresponding time in original audio
865
+ };
866
+
871
867
  struct whisper_state {
872
868
  int64_t t_sample_us = 0;
873
869
  int64_t t_encode_us = 0;
@@ -957,13 +953,15 @@ struct whisper_state {
957
953
  whisper_vad_context * vad_context = nullptr;
958
954
 
959
955
  struct vad_segment_info {
960
- float orig_start;
961
- float orig_end;
962
- float vad_start;
963
- float vad_end;
956
+ int64_t orig_start;
957
+ int64_t orig_end;
958
+ int64_t vad_start;
959
+ int64_t vad_end;
964
960
  };
965
961
  std::vector<vad_segment_info> vad_segments;
966
962
  bool has_vad_segments = false;
963
+
964
+ std::vector<vad_time_mapping> vad_mapping_table;
967
965
  };
968
966
 
969
967
  struct whisper_context {
@@ -1322,8 +1320,6 @@ static size_t aheads_masks_nbytes(struct whisper_aheads_masks & aheads_masks) {
1322
1320
  static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & params) {
1323
1321
  ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);
1324
1322
 
1325
- whisper_load_backends();
1326
-
1327
1323
  ggml_backend_dev_t dev = nullptr;
1328
1324
 
1329
1325
  int cnt = 0;
@@ -4335,8 +4331,6 @@ static int whisper_has_openvino(void) {
4335
4331
  const char * whisper_print_system_info(void) {
4336
4332
  static std::string s;
4337
4333
 
4338
- whisper_load_backends();
4339
-
4340
4334
  s = "";
4341
4335
  s += "WHISPER : ";
4342
4336
  s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
@@ -4420,8 +4414,8 @@ struct whisper_vad_model {
4420
4414
  };
4421
4415
 
4422
4416
  struct whisper_vad_segment {
4423
- float start; // Start time in seconds
4424
- float end; // End time in seconds
4417
+ int64_t start;
4418
+ int64_t end;
4425
4419
  };
4426
4420
 
4427
4421
  struct whisper_vad_segments {
@@ -4469,6 +4463,15 @@ struct whisper_vad_params whisper_vad_default_params(void) {
4469
4463
  return result;
4470
4464
  }
4471
4465
 
4466
+ // Time conversion utility functions for whisper VAD
4467
+ static int cs_to_samples(int64_t cs) {
4468
+ return (int)((cs / 100.0) * WHISPER_SAMPLE_RATE + 0.5);
4469
+ }
4470
+
4471
+ static int64_t samples_to_cs(int samples) {
4472
+ return (int64_t)((samples / (double)WHISPER_SAMPLE_RATE) * 100.0 + 0.5);
4473
+ }
4474
+
4472
4475
  static bool weight_buft_supported(const whisper_vad_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
4473
4476
  bool op_supported = true;
4474
4477
 
@@ -5413,12 +5416,12 @@ struct whisper_vad_segments * whisper_vad_segments_from_probs(
5413
5416
  (speeches[i].end + speech_pad_samples) : audio_length_samples;
5414
5417
  }
5415
5418
 
5416
- // Convert from samples to seconds and copy to final segments
5417
- segments[i].start = (float)speeches[i].start / sample_rate;
5418
- segments[i].end = (float)speeches[i].end / sample_rate;
5419
+ // Convert from samples to centiseconds
5420
+ segments[i].start = samples_to_cs(speeches[i].start);
5421
+ segments[i].end = samples_to_cs(speeches[i].end);
5419
5422
 
5420
5423
  WHISPER_LOG_INFO("%s: VAD segment %d: start = %.2f, end = %.2f (duration: %.2f)\n",
5421
- __func__, i, segments[i].start, segments[i].end, segments[i].end - segments[i].start);
5424
+ __func__, i, segments[i].start/100.0, segments[i].end/100.0, (segments[i].end - segments[i].start)/100.0);
5422
5425
  }
5423
5426
 
5424
5427
  whisper_vad_segments * vad_segments = new whisper_vad_segments;
@@ -6615,10 +6618,13 @@ static bool whisper_vad(
6615
6618
  struct whisper_full_params params,
6616
6619
  const float * samples,
6617
6620
  int n_samples,
6618
- std::vector<float> & filtered_samples,
6619
- int & filtered_n_samples) {
6620
- WHISPER_LOG_INFO("%s: VAD is enabled, processing speach segments only\n", __func__);
6621
- filtered_n_samples = 0;
6621
+ std::vector<float> & filtered_samples) {
6622
+ WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
6623
+ int filtered_n_samples = 0;
6624
+
6625
+ // Clear any existing mapping table
6626
+ state->vad_mapping_table.clear();
6627
+ state->has_vad_segments = false;
6622
6628
 
6623
6629
  if (state->vad_context == nullptr) {
6624
6630
  struct whisper_vad_context_params vad_ctx_params = whisper_vad_default_context_params();
@@ -6640,13 +6646,17 @@ static bool whisper_vad(
6640
6646
  ctx->state->vad_segments.clear();
6641
6647
  ctx->state->vad_segments.reserve(vad_segments->data.size());
6642
6648
 
6649
+ // Initialize the time mapping table
6650
+ state->vad_mapping_table.clear();
6651
+ state->vad_mapping_table.reserve(vad_segments->data.size() * 4);
6652
+
6643
6653
  WHISPER_LOG_INFO("%s: detected %d speech segments\n", __func__, (int)vad_segments->data.size());
6644
6654
  float overlap_seconds = vad_params.samples_overlap;
6645
6655
  int overlap_samples = overlap_seconds * WHISPER_SAMPLE_RATE;
6646
6656
 
6647
6657
  for (int i = 0; i < (int)vad_segments->data.size(); i++) {
6648
- int segment_start_samples = vad_segments->data[i].start * WHISPER_SAMPLE_RATE;
6649
- int segment_end_samples = vad_segments->data[i].end * WHISPER_SAMPLE_RATE;
6658
+ int segment_start_samples = cs_to_samples(vad_segments->data[i].start);
6659
+ int segment_end_samples = cs_to_samples(vad_segments->data[i].end);
6650
6660
 
6651
6661
  if (i < (int)vad_segments->data.size() - 1) {
6652
6662
  segment_end_samples += overlap_samples;
@@ -6655,9 +6665,9 @@ static bool whisper_vad(
6655
6665
  filtered_n_samples += (segment_end_samples - segment_start_samples);
6656
6666
 
6657
6667
  WHISPER_LOG_INFO("%s: Including segment %d: %.2f - %.2f (duration: %.2f)\n",
6658
- __func__, i, vad_segments->data[i].start,
6659
- vad_segments->data[i].end + (i < (int)vad_segments->data.size() - 1 ? overlap_seconds : 0),
6660
- (vad_segments->data[i].end - vad_segments->data[i].start) +
6668
+ __func__, i, vad_segments->data[i].start/100.0,
6669
+ (vad_segments->data[i].end/100.0 + (i < (int)vad_segments->data.size() - 1 ? overlap_seconds : 0)),
6670
+ (vad_segments->data[i].end - vad_segments->data[i].start)/100.0 +
6661
6671
  (i < (int)vad_segments->data.size() - 1 ? overlap_seconds : 0));
6662
6672
  }
6663
6673
 
@@ -6679,8 +6689,8 @@ static bool whisper_vad(
6679
6689
 
6680
6690
  int offset = 0;
6681
6691
  for (int i = 0; i < (int)vad_segments->data.size(); i++) {
6682
- int segment_start_samples = vad_segments->data[i].start * WHISPER_SAMPLE_RATE;
6683
- int segment_end_samples = vad_segments->data[i].end * WHISPER_SAMPLE_RATE;
6692
+ int segment_start_samples = cs_to_samples(vad_segments->data[i].start);
6693
+ int segment_end_samples = cs_to_samples(vad_segments->data[i].end);
6684
6694
 
6685
6695
  if (i < (int)vad_segments->data.size() - 1) {
6686
6696
  segment_end_samples += overlap_samples;
@@ -6689,18 +6699,47 @@ static bool whisper_vad(
6689
6699
  segment_start_samples = std::min(segment_start_samples, n_samples - 1);
6690
6700
  segment_end_samples = std::min(segment_end_samples, n_samples);
6691
6701
  int segment_length = segment_end_samples - segment_start_samples;
6692
-
6693
6702
  if (segment_length > 0) {
6694
6703
  whisper_state::vad_segment_info segment;
6695
6704
 
6696
6705
  segment.orig_start = vad_segments->data[i].start;
6697
6706
  segment.orig_end = vad_segments->data[i].end;
6698
6707
 
6699
- segment.vad_start = offset / (float)WHISPER_SAMPLE_RATE;
6700
- segment.vad_end = (offset + segment_length) / (float)WHISPER_SAMPLE_RATE;
6708
+ segment.vad_start = samples_to_cs(offset);
6709
+ segment.vad_end = samples_to_cs(offset + segment_length);
6710
+
6711
+ // Add segment boundaries to mapping table
6712
+ vad_time_mapping start_mapping = {segment.vad_start, segment.orig_start};
6713
+ vad_time_mapping end_mapping = {segment.vad_end, segment.orig_end};
6714
+
6715
+ state->vad_mapping_table.push_back(start_mapping);
6716
+ state->vad_mapping_table.push_back(end_mapping);
6717
+
6718
+ // Add intermediate points for longer segments to improve interpolation accuracy
6719
+ const int64_t min_segment_length = 100; // 1 second
6720
+ const int64_t point_interval = 20; // Add a point every 200ms
6721
+
6722
+ if (segment.vad_end - segment.vad_start > min_segment_length) {
6723
+ int64_t segment_duration = segment.vad_end - segment.vad_start;
6724
+ int num_points = (int)(segment_duration / point_interval) - 1;
6725
+
6726
+ for (int j = 1; j <= num_points; j++) {
6727
+ int64_t vad_time = segment.vad_start + j * point_interval;
6728
+
6729
+ if (vad_time >= segment.vad_end) continue;
6730
+
6731
+ int64_t vad_elapsed = vad_time - segment.vad_start;
6732
+ int64_t vad_total = segment.vad_end - segment.vad_start;
6733
+ int64_t orig_total = segment.orig_end - segment.orig_start;
6734
+ int64_t orig_time = segment.orig_start + (vad_elapsed * orig_total) / vad_total;
6735
+
6736
+ vad_time_mapping intermediate_mapping = {vad_time, orig_time};
6737
+ state->vad_mapping_table.push_back(intermediate_mapping);
6738
+ }
6739
+ }
6701
6740
 
6702
6741
  WHISPER_LOG_INFO("%s: vad_segment_info: orig_start: %.2f, orig_end: %.2f, vad_start: %.2f, vad_end: %.2f\n",
6703
- __func__, segment.orig_start, segment.orig_end, segment.vad_start, segment.vad_end);
6742
+ __func__, segment.orig_start/100.0, segment.orig_end/100.0, segment.vad_start/100.0, segment.vad_end/100.0);
6704
6743
  ctx->state->vad_segments.push_back(segment);
6705
6744
 
6706
6745
  // Copy this speech segment
@@ -6709,6 +6748,17 @@ static bool whisper_vad(
6709
6748
 
6710
6749
  // Add silence after this segment (except after the last segment)
6711
6750
  if (i < (int)vad_segments->data.size() - 1) {
6751
+ // Calculate the start and end time of the silence gap in processed audio
6752
+ int64_t silence_start_vad = samples_to_cs(offset);
6753
+ int64_t silence_end_vad = samples_to_cs(offset + silence_samples);
6754
+ // Calculate the corresponding original times
6755
+ int64_t orig_silence_start = segment.orig_end;
6756
+ int64_t orig_silence_end = vad_segments->data[i+1].start;
6757
+
6758
+ // Add mapping points for silence boundaries
6759
+ state->vad_mapping_table.push_back({silence_start_vad, orig_silence_start});
6760
+ state->vad_mapping_table.push_back({silence_end_vad, orig_silence_end});
6761
+
6712
6762
  // Fill with zeros (silence)
6713
6763
  memset(filtered_samples.data() + offset, 0, silence_samples * sizeof(float));
6714
6764
  offset += silence_samples;
@@ -6716,6 +6766,24 @@ static bool whisper_vad(
6716
6766
  }
6717
6767
  }
6718
6768
 
6769
+ // Sort the mapping table by processed time
6770
+ std::sort(state->vad_mapping_table.begin(), state->vad_mapping_table.end(),
6771
+ [](const vad_time_mapping& a, const vad_time_mapping& b) {
6772
+ return a.processed_time < b.processed_time;
6773
+ });
6774
+
6775
+ // Remove any duplicate processed times to ensure monotonicity which is
6776
+ // needed for binary search and interpolation later.
6777
+ if (!state->vad_mapping_table.empty()) {
6778
+ auto last = std::unique(state->vad_mapping_table.begin(), state->vad_mapping_table.end(),
6779
+ [](const vad_time_mapping& a, const vad_time_mapping& b) {
6780
+ return a.processed_time == b.processed_time;
6781
+ });
6782
+ state->vad_mapping_table.erase(last, state->vad_mapping_table.end());
6783
+ }
6784
+
6785
+ WHISPER_LOG_INFO("%s: Created time mapping table with %d points\n", __func__, (int)state->vad_mapping_table.size());
6786
+
6719
6787
  filtered_n_samples = offset;
6720
6788
  WHISPER_LOG_INFO("%s: Reduced audio from %d to %d samples (%.1f%% reduction)\n",
6721
6789
  __func__, n_samples, filtered_n_samples, 100.0f * (1.0f - (float)filtered_n_samples / n_samples));
@@ -6735,27 +6803,9 @@ int whisper_full_with_state(
6735
6803
 
6736
6804
  result_all.clear();
6737
6805
 
6738
- const float * process_samples = samples;
6739
- int n_process_samples = n_samples;
6740
- std::vector<float> vad_samples;
6741
-
6742
- if (params.vad) {
6743
- WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
6744
- int vad_n_samples;
6745
- if (!whisper_vad(ctx, state, params, samples, n_samples, vad_samples, vad_n_samples)) {
6746
- WHISPER_LOG_ERROR("%s: failed to compute VAD\n", __func__);
6747
- return -1;
6748
- }
6749
- if (vad_n_samples == 0) {
6750
- return 0;
6751
- }
6752
- process_samples = vad_samples.data();
6753
- n_process_samples = vad_n_samples;
6754
- }
6755
-
6756
- if (n_process_samples > 0) {
6806
+ if (n_samples > 0) {
6757
6807
  // compute log mel spectrogram
6758
- if (whisper_pcm_to_mel_with_state(ctx, state, process_samples, n_process_samples, params.n_threads) != 0) {
6808
+ if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
6759
6809
  WHISPER_LOG_ERROR("%s: failed to compute log mel spectrogram\n", __func__);
6760
6810
  return -2;
6761
6811
  }
@@ -7665,6 +7715,21 @@ int whisper_full(
7665
7715
  struct whisper_full_params params,
7666
7716
  const float * samples,
7667
7717
  int n_samples) {
7718
+
7719
+ std::vector<float> vad_samples;
7720
+ if (params.vad) {
7721
+ WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
7722
+ if (!whisper_vad(ctx, ctx->state, params, samples, n_samples, vad_samples)) {
7723
+ WHISPER_LOG_ERROR("%s: failed to compute VAD\n", __func__);
7724
+ return -1;
7725
+ }
7726
+ if (vad_samples.empty()) {
7727
+ ctx->state->result_all.clear();
7728
+ return 0;
7729
+ }
7730
+ samples = vad_samples.data();
7731
+ n_samples = vad_samples.size();
7732
+ }
7668
7733
  return whisper_full_with_state(ctx, ctx->state, params, samples, n_samples);
7669
7734
  }
7670
7735
 
@@ -7674,9 +7739,24 @@ int whisper_full_parallel(
7674
7739
  const float * samples,
7675
7740
  int n_samples,
7676
7741
  int n_processors) {
7742
+
7677
7743
  if (n_processors == 1) {
7678
7744
  return whisper_full(ctx, params, samples, n_samples);
7679
7745
  }
7746
+
7747
+ std::vector<float> vad_samples;
7748
+ if (params.vad) {
7749
+ WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
7750
+ if (!whisper_vad(ctx, ctx->state, params, samples, n_samples, vad_samples)) {
7751
+ WHISPER_LOG_ERROR("%s: failed to compute VAD\n", __func__);
7752
+ return -1;
7753
+ }
7754
+ if (vad_samples.empty()) {
7755
+ return 0;
7756
+ }
7757
+ samples = vad_samples.data();
7758
+ n_samples = vad_samples.size();
7759
+ }
7680
7760
  int ret = 0;
7681
7761
 
7682
7762
  // prepare separate states for each thread
@@ -7799,130 +7879,89 @@ int whisper_full_lang_id(struct whisper_context * ctx) {
7799
7879
  return ctx->state->lang_id;
7800
7880
  }
7801
7881
 
7802
- int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment) {
7803
- // If VAD wasn't used, return the original timestamp
7804
- if (!state->has_vad_segments || state->vad_segments.empty()) {
7805
- return state->result_all[i_segment].t0;
7882
+ static int64_t map_processed_to_original_time(int64_t processed_time, const std::vector<vad_time_mapping> & mapping_table) {
7883
+ if (mapping_table.empty()) {
7884
+ return processed_time;
7806
7885
  }
7807
7886
 
7808
- // Get the start timestamp produced by whisper_full. whisper_full processes
7809
- // only the speech segments in this case so we need to map these timestamps
7810
- // back to the original audio.
7811
- float t0 = state->result_all[i_segment].t0 / 100.0f;
7887
+ if (processed_time <= mapping_table.front().processed_time) {
7888
+ return mapping_table.front().original_time; // Before first mapping point
7889
+ }
7812
7890
 
7813
- // Find which VAD segment this timestamp belongs.
7814
- // TODO(danbev) This could be optimized by using a binary search if the number
7815
- // of segments exceed a certain limit. Also we might be able to assume that
7816
- // the access pattern is sequential and optimized for that too.
7817
- for (size_t i = 0; i < state->vad_segments.size(); i++) {
7818
- const auto & segment = state->vad_segments[i];
7891
+ if (processed_time >= mapping_table.back().processed_time) {
7892
+ return mapping_table.back().original_time; // After last mapping point
7893
+ }
7819
7894
 
7820
- // Check if the timestamp falls within this segment.
7821
- if (t0 >= segment.vad_start && t0 <= segment.vad_end) {
7822
- float proportion = 0.0f;
7823
- if (segment.vad_end > segment.vad_start) {
7824
- proportion = (t0 - segment.vad_start) / (segment.vad_end - segment.vad_start);
7825
- }
7826
- float orig_t0 = segment.orig_start + proportion * (segment.orig_end - segment.orig_start);
7827
- return (int64_t)(orig_t0 * 100);
7895
+ // Binary search over the time map that finds the first entry that has a
7896
+ // processed time greater than or equal to the current processed time.
7897
+ auto upper = std::lower_bound(mapping_table.begin(), mapping_table.end(), processed_time,
7898
+ [](const vad_time_mapping & entry, int64_t time) {
7899
+ return entry.processed_time < time;
7828
7900
  }
7901
+ );
7902
+
7903
+ // If exact match found
7904
+ if (upper->processed_time == processed_time) {
7905
+ return upper->original_time;
7829
7906
  }
7830
7907
 
7831
- // Check if the timestamp falls between two segments.
7832
- for (size_t i = 0; i < state->vad_segments.size() - 1; i++) {
7833
- const auto & curr = state->vad_segments[i];
7834
- const auto & next = state->vad_segments[i + 1];
7908
+ // Need to interpolate between two points
7909
+ auto lower = upper - 1;
7835
7910
 
7836
- if (t0 > curr.vad_end && t0 < next.vad_start) {
7837
- // Calculate how far we are through the gap as a proportion
7838
- float gap_proportion = 0.0f;
7839
- if (next.vad_start > curr.vad_end) {
7840
- gap_proportion = (t0 - curr.vad_end) / (next.vad_start - curr.vad_end);
7841
- }
7842
- float orig_t0 = curr.orig_end + gap_proportion * (next.orig_start - curr.orig_end);
7843
- return (int64_t)(orig_t0 * 100);
7844
- }
7845
- }
7911
+ int64_t processed_diff = upper->processed_time - lower->processed_time;
7912
+ int64_t original_diff = upper->original_time - lower->original_time;
7913
+ int64_t offset = processed_time - lower->processed_time;
7846
7914
 
7847
- // Handle the case where the timestamp is after the last segment.
7848
- if (t0 > state->vad_segments.back().vad_end) {
7849
- // For timestamps after the last segment, add the extra time to the end of the last segment
7850
- const auto& last = state->vad_segments.back();
7851
- // Calculate how far beyond the last segment
7852
- float extra_time = t0 - last.vad_end;
7853
- // Add this extra time to the original end time
7854
- float orig_t0 = last.orig_end + extra_time;
7855
- return (int64_t)(orig_t0 * 100);
7915
+ if (processed_diff == 0) {
7916
+ return lower->original_time;
7856
7917
  }
7857
7918
 
7858
- WHISPER_LOG_WARN("%s: Could not map t0 = %f to a VAD segment\n", __func__, t0);
7859
- return t0;
7919
+ // Perform linear interpolation
7920
+ return lower->original_time + (offset * original_diff) / processed_diff;
7860
7921
  }
7861
7922
 
7862
- int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) {
7863
- return whisper_full_get_segment_t0_from_state(ctx->state, i_segment);
7923
+ // Function to get the starting timestamp of a segment
7924
+ int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment) {
7925
+ // If VAD wasn't used, return the original timestamp
7926
+ if (!state->has_vad_segments || state->vad_mapping_table.empty()) {
7927
+ return state->result_all[i_segment].t0;
7928
+ }
7929
+
7930
+ // Get the processed timestamp
7931
+ int64_t t0 = state->result_all[i_segment].t0;
7932
+
7933
+ // Map to original time using the mapping table
7934
+ return map_processed_to_original_time(t0, state->vad_mapping_table);
7864
7935
  }
7865
7936
 
7937
+ // Function to get the ending timestamp of a segment
7866
7938
  int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment) {
7867
7939
  // If VAD wasn't used, return the original timestamp
7868
- if (!state->has_vad_segments || state->vad_segments.empty()) {
7940
+ if (!state->has_vad_segments || state->vad_mapping_table.empty()) {
7869
7941
  return state->result_all[i_segment].t1;
7870
7942
  }
7871
7943
 
7872
- // Get the end timestamp produced by whisper_full. whisper_full processes
7873
- // only the speech segments in this case so we need to map these timestamps
7874
- // back to the original audio.
7875
- float t1 = state->result_all[i_segment].t1 / 100.0f;
7876
-
7877
- // Find which VAD segment this timestamp belongs.
7878
- // TODO(danbev) This could be optimized by using a binary search if the number
7879
- // of segments exceed a certain limit. Also we might be able to assume that
7880
- // the access pattern is sequential and optimized for that too.
7881
- for (size_t i = 0; i < state->vad_segments.size(); i++) {
7882
- const auto& segment = state->vad_segments[i];
7883
-
7884
- // Check if the timestamp falls within this segment.
7885
- if (t1 >= segment.vad_start && t1 <= segment.vad_end) {
7886
- // Calculate the proportion through the filtered segment.
7887
- float proportion = 0.0f;
7888
- if (segment.vad_end > segment.vad_start) {
7889
- proportion = (t1 - segment.vad_start) / (segment.vad_end - segment.vad_start);
7890
- }
7891
- float orig_t1 = segment.orig_start + proportion * (segment.orig_end - segment.orig_start);
7892
- return (int64_t)(orig_t1 * 100);
7893
- }
7894
- }
7944
+ // Get the processed timestamp
7945
+ int64_t t1 = state->result_all[i_segment].t1;
7895
7946
 
7896
- // Check if the timestamp falls between two segments.
7897
- for (size_t i = 0; i < state->vad_segments.size() - 1; i++) {
7898
- const auto & curr = state->vad_segments[i];
7899
- const auto & next = state->vad_segments[i + 1];
7947
+ // Map to original time using the mapping table
7948
+ int64_t orig_t1 = map_processed_to_original_time(t1, state->vad_mapping_table);
7900
7949
 
7901
- if (t1 > curr.vad_end && t1 < next.vad_start) {
7902
- // Calculate how far we are through the gap as a proportion
7903
- float gap_proportion = 0.0f;
7904
- if (next.vad_start > curr.vad_end) {
7905
- gap_proportion = (t1 - curr.vad_end) / (next.vad_start - curr.vad_end);
7906
- }
7907
- // Map to the corresponding position in the original gap
7908
- float orig_t1 = curr.orig_end + gap_proportion * (next.orig_start - curr.orig_end);
7909
- return (int64_t)(orig_t1 * 100);
7910
- }
7911
- }
7950
+ // Get the corresponding t0 for this segment
7951
+ int64_t orig_t0 = whisper_full_get_segment_t0_from_state(state, i_segment);
7912
7952
 
7913
- // Handle the case where the timestamp is after the last segment
7914
- if (t1 > state->vad_segments.back().vad_end) {
7915
- // For the last segment, use the end of the last VAD segment
7916
- const auto& last = state->vad_segments.back();
7917
- // Calculate how far beyond the last segment
7918
- float extra_time = t1 - last.vad_end;
7919
- // Add this extra time to the original end time
7920
- float orig_t1 = last.orig_end + extra_time;
7921
- return (int64_t)(orig_t1 * 100);
7953
+ // Ensure minimum duration to prevent zero-length segments
7954
+ const int64_t min_duration = 10; // 10ms minimum
7955
+ if (orig_t1 - orig_t0 < min_duration) {
7956
+ orig_t1 = orig_t0 + min_duration;
7922
7957
  }
7923
7958
 
7924
- WHISPER_LOG_WARN("%s: Could not map t1 = %f to a VAD segment\n", __func__, t1);
7925
- return t1;
7959
+ return orig_t1;
7960
+ }
7961
+
7962
+
7963
+ int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) {
7964
+ return whisper_full_get_segment_t0_from_state(ctx->state, i_segment);
7926
7965
  }
7927
7966
 
7928
7967
  int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment) {
@@ -8154,8 +8193,6 @@ WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
8154
8193
  }
8155
8194
 
8156
8195
  WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
8157
- whisper_load_backends();
8158
-
8159
8196
  static std::string s;
8160
8197
  s = "";
8161
8198
  char strbuf[256];
@@ -8289,10 +8326,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
8289
8326
  // token-level timestamps
8290
8327
  //
8291
8328
 
8292
- static int timestamp_to_sample(int64_t t, int n_samples) {
8293
- return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
8294
- }
8295
-
8296
8329
  static int64_t sample_to_timestamp(int i_sample) {
8297
8330
  return (100ll*i_sample)/WHISPER_SAMPLE_RATE;
8298
8331
  }
@@ -8342,6 +8375,18 @@ static std::vector<float> get_signal_energy(const float * signal, int n_samples,
8342
8375
  return result;
8343
8376
  }
8344
8377
 
8378
+ static int timestamp_to_sample(int64_t t, int64_t segment_t0, int n_samples) {
8379
+ // Convert absolute timestamp to segment-relative timestamp
8380
+ int64_t relative_t = t - segment_t0;
8381
+ int sample = (int)((relative_t * WHISPER_SAMPLE_RATE) / 100);
8382
+ return std::max(0, std::min(n_samples - 1, sample));
8383
+ }
8384
+
8385
+ static int64_t sample_to_timestamp(int i_sample, int64_t segment_t0) {
8386
+ int64_t relative_timestamp = (100ll * i_sample) / WHISPER_SAMPLE_RATE;
8387
+ return relative_timestamp + segment_t0;
8388
+ }
8389
+
8345
8390
  static void whisper_exp_compute_token_level_timestamps(
8346
8391
  struct whisper_context & ctx,
8347
8392
  struct whisper_state & state,
@@ -8482,8 +8527,8 @@ static void whisper_exp_compute_token_level_timestamps(
8482
8527
  continue;
8483
8528
  }
8484
8529
 
8485
- int s0 = timestamp_to_sample(tokens[j].t0, n_samples);
8486
- int s1 = timestamp_to_sample(tokens[j].t1, n_samples);
8530
+ int s0 = timestamp_to_sample(tokens[j].t0, segment.t0, n_samples);
8531
+ int s1 = timestamp_to_sample(tokens[j].t1, segment.t0, n_samples);
8487
8532
 
8488
8533
  const int ss0 = std::max(s0 - hw, 0);
8489
8534
  const int ss1 = std::min(s1 + hw, n_samples);
@@ -8504,7 +8549,7 @@ static void whisper_exp_compute_token_level_timestamps(
8504
8549
  while (k > 0 && state.energy[k] > thold) {
8505
8550
  k--;
8506
8551
  }
8507
- tokens[j].t0 = sample_to_timestamp(k);
8552
+ tokens[j].t0 = sample_to_timestamp(k, segment.t0);
8508
8553
  if (tokens[j].t0 < tokens[j - 1].t1) {
8509
8554
  tokens[j].t0 = tokens[j - 1].t1;
8510
8555
  } else {
@@ -8515,7 +8560,7 @@ static void whisper_exp_compute_token_level_timestamps(
8515
8560
  k++;
8516
8561
  }
8517
8562
  s0 = k;
8518
- tokens[j].t0 = sample_to_timestamp(k);
8563
+ tokens[j].t0 = sample_to_timestamp(k, segment.t0);
8519
8564
  }
8520
8565
  }
8521
8566
 
@@ -8525,7 +8570,7 @@ static void whisper_exp_compute_token_level_timestamps(
8525
8570
  while (k < n_samples - 1 && state.energy[k] > thold) {
8526
8571
  k++;
8527
8572
  }
8528
- tokens[j].t1 = sample_to_timestamp(k);
8573
+ tokens[j].t1 = sample_to_timestamp(k, segment.t0);
8529
8574
  if (j < n - 1 && tokens[j].t1 > tokens[j + 1].t0) {
8530
8575
  tokens[j].t1 = tokens[j + 1].t0;
8531
8576
  } else {
@@ -8536,7 +8581,7 @@ static void whisper_exp_compute_token_level_timestamps(
8536
8581
  k--;
8537
8582
  }
8538
8583
  s1 = k;
8539
- tokens[j].t1 = sample_to_timestamp(k);
8584
+ tokens[j].t1 = sample_to_timestamp(k, segment.t0);
8540
8585
  }
8541
8586
  }
8542
8587
  }
@@ -8893,6 +8938,10 @@ void whisper_log_set(ggml_log_callback log_callback, void * user_data) {
8893
8938
  ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);
8894
8939
  }
8895
8940
 
8941
+ const char * whisper_version(void) {
8942
+ return WHISPER_VERSION;
8943
+ }
8944
+
8896
8945
  GGML_ATTRIBUTE_FORMAT(2, 3)
8897
8946
  static void whisper_log_internal(ggml_log_level level, const char * format, ...) {
8898
8947
  va_list args;
data/extsources.rb CHANGED
@@ -1,5 +1,10 @@
1
+ require "pathname"
2
+
3
+ root = Pathname("..")/".."
1
4
  ignored_dirs = %w[
2
5
  .devops
6
+ .github
7
+ ci
3
8
  examples/wchess/wchess.wasm
4
9
  examples/whisper.android
5
10
  examples/whisper.android.java
@@ -9,7 +14,7 @@ ignored_dirs = %w[
9
14
  models
10
15
  samples
11
16
  scripts
12
- ]
17
+ ].collect {|dir| root/dir}
13
18
  ignored_files = %w[
14
19
  AUTHORS
15
20
  Makefile
@@ -17,18 +22,19 @@ ignored_files = %w[
17
22
  README_sycl.md
18
23
  .gitignore
19
24
  .gitmodules
25
+ .dockerignore
20
26
  whisper.nvim
21
27
  twitch.sh
22
28
  yt-wsp.sh
29
+ close-issue.yml
23
30
  ]
24
31
 
25
32
  EXTSOURCES =
26
- `git ls-files -z ../..`.split("\x0")
27
- .select {|file|
28
- basename = File.basename(file)
29
-
30
- ignored_dirs.all? {|dir| !file.start_with?("../../#{dir}")} &&
31
- !ignored_files.include?(basename) &&
32
- (file.start_with?("../..") || file.start_with?("../javascript")) &&
33
- (!file.start_with?("../../.github/") || basename == "bindings-ruby.yml")
33
+ `git ls-files -z #{root}`.split("\x0")
34
+ .collect {|file| Pathname(file)}
35
+ .reject {|file|
36
+ ignored_dirs.any? {|dir| file.descend.any? {|desc| desc == dir}} ||
37
+ ignored_files.include?(file.basename.to_path) ||
38
+ (file.descend.to_a[1] != root && file.descend.to_a[1] != Pathname("..")/"javascript")
34
39
  }
40
+ .collect(&:to_path)