whispercpp 1.3.2 → 1.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +6 -3
- data/README.md +71 -14
- data/Rakefile +20 -7
- data/ext/.gitignore +4 -6
- data/ext/dependencies.rb +36 -24
- data/ext/extconf.rb +1 -1
- data/ext/options.rb +48 -184
- data/ext/ruby_whisper.c +18 -0
- data/ext/ruby_whisper_context.c +43 -12
- data/ext/ruby_whisper_model.c +1 -1
- data/ext/ruby_whisper_params.c +4 -2
- data/ext/ruby_whisper_segment.c +81 -4
- data/ext/ruby_whisper_transcribe.cpp +13 -7
- data/ext/ruby_whisper_vad_params.c +1 -1
- data/ext/sources/CMakeLists.txt +5 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
- data/ext/sources/examples/addon.node/addon.cpp +150 -31
- data/ext/sources/examples/addon.node/index.js +3 -0
- data/ext/sources/examples/addon.node/vad-example.js +132 -0
- data/ext/sources/examples/bench/bench.cpp +3 -2
- data/ext/sources/examples/cli/cli.cpp +3 -2
- data/ext/sources/examples/command/command.cpp +32 -8
- data/ext/sources/examples/common-whisper.cpp +14 -7
- data/ext/sources/examples/lsp/lsp.cpp +2 -0
- data/ext/sources/examples/quantize/quantize.cpp +3 -0
- data/ext/sources/examples/server/CMakeLists.txt +3 -0
- data/ext/sources/examples/server/server.cpp +169 -22
- data/ext/sources/examples/stream/stream.cpp +6 -0
- data/ext/sources/examples/talk-llama/CMakeLists.txt +4 -1
- data/ext/sources/examples/talk-llama/llama-arch.cpp +171 -3
- data/ext/sources/examples/talk-llama/llama-arch.h +28 -1
- data/ext/sources/examples/talk-llama/llama-batch.cpp +741 -272
- data/ext/sources/examples/talk-llama/llama-batch.h +112 -54
- data/ext/sources/examples/talk-llama/llama-chat.cpp +30 -8
- data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +520 -351
- data/ext/sources/examples/talk-llama/llama-context.h +38 -17
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
- data/ext/sources/examples/talk-llama/llama-cparams.h +1 -1
- data/ext/sources/examples/talk-llama/llama-graph.cpp +447 -372
- data/ext/sources/examples/talk-llama/llama-graph.h +128 -58
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +10 -2
- data/ext/sources/examples/talk-llama/llama-hparams.h +19 -2
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +279 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.h +128 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +1841 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +303 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +14 -472
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +86 -26
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +246 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +138 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1125 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +183 -0
- data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
- data/ext/sources/examples/talk-llama/llama-memory.h +88 -4
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +42 -17
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
- data/ext/sources/examples/talk-llama/llama-model.cpp +1863 -563
- data/ext/sources/examples/talk-llama/llama-model.h +27 -0
- data/ext/sources/examples/talk-llama/llama-quant.cpp +89 -6
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +65 -28
- data/ext/sources/examples/talk-llama/llama-vocab.h +1 -0
- data/ext/sources/examples/talk-llama/llama.cpp +11 -7
- data/ext/sources/examples/talk-llama/llama.h +147 -40
- data/ext/sources/examples/talk-llama/talk-llama.cpp +2 -0
- data/ext/sources/examples/talk-llama/unicode.cpp +5 -0
- data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +2 -0
- data/ext/sources/ggml/CMakeLists.txt +48 -3
- data/ext/sources/ggml/cmake/common.cmake +24 -0
- data/ext/sources/ggml/include/ggml-backend.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +2 -0
- data/ext/sources/ggml/include/ggml.h +144 -5
- data/ext/sources/ggml/src/CMakeLists.txt +82 -24
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +5 -0
- data/ext/sources/ggml/src/ggml-backend.cpp +46 -23
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +3 -3
- data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +1 -0
- data/ext/sources/ggml/src/ggml-cann/common.h +6 -1
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- data/ext/sources/ggml/src/ggml-common.h +4 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +133 -40
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- data/ext/sources/ggml/src/ggml-cpu/common.h +4 -3
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +146 -105
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1057 -174
- data/ext/sources/ggml/src/ggml-cpu/ops.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +1158 -0
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1571 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.h +98 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +330 -38
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +111 -18
- data/ext/sources/ggml/src/ggml-cpu/vec.h +303 -94
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +60 -37
- data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +22 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +2 -2
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +265 -123
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +19 -0
- data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/mmv.cu +257 -87
- data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +2 -3
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +5 -18
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +89 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +7 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- data/ext/sources/ggml/src/ggml-impl.h +127 -183
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +27 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +331 -49
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +564 -282
- data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +14 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1859 -489
- data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +72 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +201 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- data/ext/sources/ggml/src/ggml-quants.c +6 -8
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +28 -41
- data/ext/sources/ggml/src/ggml-sycl/conv.cpp +4 -10
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +117 -165
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +192 -53
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +648 -1039
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +18 -9
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +8 -105
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -100
- data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +60 -80
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +158 -203
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +55 -74
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -10
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +138 -27
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +12 -16
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +726 -282
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -1
- data/ext/sources/ggml/src/ggml.c +328 -48
- data/ext/sources/ggml/src/ggml.cpp +26 -0
- data/ext/sources/ggml/src/gguf.cpp +24 -3
- data/ext/sources/include/whisper.h +2 -0
- data/ext/sources/src/CMakeLists.txt +2 -0
- data/ext/sources/src/coreml/whisper-compat.h +10 -0
- data/ext/sources/src/coreml/whisper-compat.m +35 -0
- data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
- data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
- data/ext/sources/src/whisper.cpp +218 -169
- data/extsources.rb +15 -9
- data/lib/whisper/context.rb +15 -0
- data/lib/whisper/model/uri.rb +56 -1
- data/lib/whisper/segment.rb +58 -0
- data/sig/whisper.rbs +68 -38
- data/{tests → test}/helper.rb +1 -12
- data/{tests → test}/test_model.rb +9 -0
- data/test/test_package.rb +51 -0
- data/test/test_segment.rb +146 -0
- data/{tests → test}/test_whisper.rb +70 -0
- data/whispercpp.gemspec +2 -3
- metadata +91 -43
- data/ext/sources/.dockerignore +0 -3
- data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
- data/ext/sources/ci/run.sh +0 -336
- data/ext/sources/close-issue.yml +0 -28
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2739
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
- data/tests/test_package.rb +0 -46
- data/tests/test_segment.rb +0 -74
- /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
- /data/{tests → test}/jfk_reader/.gitignore +0 -0
- /data/{tests → test}/jfk_reader/extconf.rb +0 -0
- /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
- /data/{tests → test}/test_callback.rb +0 -0
- /data/{tests → test}/test_error.rb +0 -0
- /data/{tests → test}/test_params.rb +0 -0
- /data/{tests → test}/test_vad.rb +0 -0
- /data/{tests → test}/test_vad_params.rb +0 -0
data/ext/sources/src/whisper.cpp
CHANGED
@@ -206,15 +206,6 @@ static bool ggml_graph_compute_helper(
|
|
206
206
|
return t;
|
207
207
|
}
|
208
208
|
|
209
|
-
static void whisper_load_backends() {
|
210
|
-
#ifdef GGML_BACKEND_DL
|
211
|
-
static std::once_flag flag;
|
212
|
-
std::call_once(flag, []() {
|
213
|
-
ggml_backend_load_all();
|
214
|
-
});
|
215
|
-
#endif
|
216
|
-
}
|
217
|
-
|
218
209
|
// TODO: move these functions to ggml-base with support for ggml-backend?
|
219
210
|
|
220
211
|
static ggml_tensor * whisper_set_f32(struct ggml_tensor * t, float v) {
|
@@ -868,6 +859,11 @@ struct whisper_aheads_masks {
|
|
868
859
|
ggml_backend_buffer_t buffer = nullptr;
|
869
860
|
};
|
870
861
|
|
862
|
+
struct vad_time_mapping {
|
863
|
+
int64_t processed_time; // Time in processed (VAD) audio
|
864
|
+
int64_t original_time; // Corresponding time in original audio
|
865
|
+
};
|
866
|
+
|
871
867
|
struct whisper_state {
|
872
868
|
int64_t t_sample_us = 0;
|
873
869
|
int64_t t_encode_us = 0;
|
@@ -957,13 +953,15 @@ struct whisper_state {
|
|
957
953
|
whisper_vad_context * vad_context = nullptr;
|
958
954
|
|
959
955
|
struct vad_segment_info {
|
960
|
-
|
961
|
-
|
962
|
-
|
963
|
-
|
956
|
+
int64_t orig_start;
|
957
|
+
int64_t orig_end;
|
958
|
+
int64_t vad_start;
|
959
|
+
int64_t vad_end;
|
964
960
|
};
|
965
961
|
std::vector<vad_segment_info> vad_segments;
|
966
962
|
bool has_vad_segments = false;
|
963
|
+
|
964
|
+
std::vector<vad_time_mapping> vad_mapping_table;
|
967
965
|
};
|
968
966
|
|
969
967
|
struct whisper_context {
|
@@ -1322,8 +1320,6 @@ static size_t aheads_masks_nbytes(struct whisper_aheads_masks & aheads_masks) {
|
|
1322
1320
|
static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & params) {
|
1323
1321
|
ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);
|
1324
1322
|
|
1325
|
-
whisper_load_backends();
|
1326
|
-
|
1327
1323
|
ggml_backend_dev_t dev = nullptr;
|
1328
1324
|
|
1329
1325
|
int cnt = 0;
|
@@ -4335,8 +4331,6 @@ static int whisper_has_openvino(void) {
|
|
4335
4331
|
const char * whisper_print_system_info(void) {
|
4336
4332
|
static std::string s;
|
4337
4333
|
|
4338
|
-
whisper_load_backends();
|
4339
|
-
|
4340
4334
|
s = "";
|
4341
4335
|
s += "WHISPER : ";
|
4342
4336
|
s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
|
@@ -4420,8 +4414,8 @@ struct whisper_vad_model {
|
|
4420
4414
|
};
|
4421
4415
|
|
4422
4416
|
struct whisper_vad_segment {
|
4423
|
-
|
4424
|
-
|
4417
|
+
int64_t start;
|
4418
|
+
int64_t end;
|
4425
4419
|
};
|
4426
4420
|
|
4427
4421
|
struct whisper_vad_segments {
|
@@ -4469,6 +4463,15 @@ struct whisper_vad_params whisper_vad_default_params(void) {
|
|
4469
4463
|
return result;
|
4470
4464
|
}
|
4471
4465
|
|
4466
|
+
// Time conversion utility functions for whisper VAD
|
4467
|
+
static int cs_to_samples(int64_t cs) {
|
4468
|
+
return (int)((cs / 100.0) * WHISPER_SAMPLE_RATE + 0.5);
|
4469
|
+
}
|
4470
|
+
|
4471
|
+
static int64_t samples_to_cs(int samples) {
|
4472
|
+
return (int64_t)((samples / (double)WHISPER_SAMPLE_RATE) * 100.0 + 0.5);
|
4473
|
+
}
|
4474
|
+
|
4472
4475
|
static bool weight_buft_supported(const whisper_vad_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
|
4473
4476
|
bool op_supported = true;
|
4474
4477
|
|
@@ -5413,12 +5416,12 @@ struct whisper_vad_segments * whisper_vad_segments_from_probs(
|
|
5413
5416
|
(speeches[i].end + speech_pad_samples) : audio_length_samples;
|
5414
5417
|
}
|
5415
5418
|
|
5416
|
-
// Convert from samples to
|
5417
|
-
segments[i].start = (
|
5418
|
-
segments[i].end = (
|
5419
|
+
// Convert from samples to centiseconds
|
5420
|
+
segments[i].start = samples_to_cs(speeches[i].start);
|
5421
|
+
segments[i].end = samples_to_cs(speeches[i].end);
|
5419
5422
|
|
5420
5423
|
WHISPER_LOG_INFO("%s: VAD segment %d: start = %.2f, end = %.2f (duration: %.2f)\n",
|
5421
|
-
__func__, i, segments[i].start, segments[i].end, segments[i].end - segments[i].start);
|
5424
|
+
__func__, i, segments[i].start/100.0, segments[i].end/100.0, (segments[i].end - segments[i].start)/100.0);
|
5422
5425
|
}
|
5423
5426
|
|
5424
5427
|
whisper_vad_segments * vad_segments = new whisper_vad_segments;
|
@@ -6615,10 +6618,13 @@ static bool whisper_vad(
|
|
6615
6618
|
struct whisper_full_params params,
|
6616
6619
|
const float * samples,
|
6617
6620
|
int n_samples,
|
6618
|
-
std::vector<float> & filtered_samples
|
6619
|
-
|
6620
|
-
|
6621
|
-
|
6621
|
+
std::vector<float> & filtered_samples) {
|
6622
|
+
WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
|
6623
|
+
int filtered_n_samples = 0;
|
6624
|
+
|
6625
|
+
// Clear any existing mapping table
|
6626
|
+
state->vad_mapping_table.clear();
|
6627
|
+
state->has_vad_segments = false;
|
6622
6628
|
|
6623
6629
|
if (state->vad_context == nullptr) {
|
6624
6630
|
struct whisper_vad_context_params vad_ctx_params = whisper_vad_default_context_params();
|
@@ -6640,13 +6646,17 @@ static bool whisper_vad(
|
|
6640
6646
|
ctx->state->vad_segments.clear();
|
6641
6647
|
ctx->state->vad_segments.reserve(vad_segments->data.size());
|
6642
6648
|
|
6649
|
+
// Initialize the time mapping table
|
6650
|
+
state->vad_mapping_table.clear();
|
6651
|
+
state->vad_mapping_table.reserve(vad_segments->data.size() * 4);
|
6652
|
+
|
6643
6653
|
WHISPER_LOG_INFO("%s: detected %d speech segments\n", __func__, (int)vad_segments->data.size());
|
6644
6654
|
float overlap_seconds = vad_params.samples_overlap;
|
6645
6655
|
int overlap_samples = overlap_seconds * WHISPER_SAMPLE_RATE;
|
6646
6656
|
|
6647
6657
|
for (int i = 0; i < (int)vad_segments->data.size(); i++) {
|
6648
|
-
int segment_start_samples = vad_segments->data[i].start
|
6649
|
-
int segment_end_samples = vad_segments->data[i].end
|
6658
|
+
int segment_start_samples = cs_to_samples(vad_segments->data[i].start);
|
6659
|
+
int segment_end_samples = cs_to_samples(vad_segments->data[i].end);
|
6650
6660
|
|
6651
6661
|
if (i < (int)vad_segments->data.size() - 1) {
|
6652
6662
|
segment_end_samples += overlap_samples;
|
@@ -6655,9 +6665,9 @@ static bool whisper_vad(
|
|
6655
6665
|
filtered_n_samples += (segment_end_samples - segment_start_samples);
|
6656
6666
|
|
6657
6667
|
WHISPER_LOG_INFO("%s: Including segment %d: %.2f - %.2f (duration: %.2f)\n",
|
6658
|
-
__func__, i, vad_segments->data[i].start,
|
6659
|
-
vad_segments->data[i].end + (i < (int)vad_segments->data.size() - 1 ? overlap_seconds : 0),
|
6660
|
-
(vad_segments->data[i].end - vad_segments->data[i].start) +
|
6668
|
+
__func__, i, vad_segments->data[i].start/100.0,
|
6669
|
+
(vad_segments->data[i].end/100.0 + (i < (int)vad_segments->data.size() - 1 ? overlap_seconds : 0)),
|
6670
|
+
(vad_segments->data[i].end - vad_segments->data[i].start)/100.0 +
|
6661
6671
|
(i < (int)vad_segments->data.size() - 1 ? overlap_seconds : 0));
|
6662
6672
|
}
|
6663
6673
|
|
@@ -6679,8 +6689,8 @@ static bool whisper_vad(
|
|
6679
6689
|
|
6680
6690
|
int offset = 0;
|
6681
6691
|
for (int i = 0; i < (int)vad_segments->data.size(); i++) {
|
6682
|
-
int segment_start_samples = vad_segments->data[i].start
|
6683
|
-
int segment_end_samples = vad_segments->data[i].end
|
6692
|
+
int segment_start_samples = cs_to_samples(vad_segments->data[i].start);
|
6693
|
+
int segment_end_samples = cs_to_samples(vad_segments->data[i].end);
|
6684
6694
|
|
6685
6695
|
if (i < (int)vad_segments->data.size() - 1) {
|
6686
6696
|
segment_end_samples += overlap_samples;
|
@@ -6689,18 +6699,47 @@ static bool whisper_vad(
|
|
6689
6699
|
segment_start_samples = std::min(segment_start_samples, n_samples - 1);
|
6690
6700
|
segment_end_samples = std::min(segment_end_samples, n_samples);
|
6691
6701
|
int segment_length = segment_end_samples - segment_start_samples;
|
6692
|
-
|
6693
6702
|
if (segment_length > 0) {
|
6694
6703
|
whisper_state::vad_segment_info segment;
|
6695
6704
|
|
6696
6705
|
segment.orig_start = vad_segments->data[i].start;
|
6697
6706
|
segment.orig_end = vad_segments->data[i].end;
|
6698
6707
|
|
6699
|
-
segment.vad_start = offset
|
6700
|
-
segment.vad_end = (offset + segment_length)
|
6708
|
+
segment.vad_start = samples_to_cs(offset);
|
6709
|
+
segment.vad_end = samples_to_cs(offset + segment_length);
|
6710
|
+
|
6711
|
+
// Add segment boundaries to mapping table
|
6712
|
+
vad_time_mapping start_mapping = {segment.vad_start, segment.orig_start};
|
6713
|
+
vad_time_mapping end_mapping = {segment.vad_end, segment.orig_end};
|
6714
|
+
|
6715
|
+
state->vad_mapping_table.push_back(start_mapping);
|
6716
|
+
state->vad_mapping_table.push_back(end_mapping);
|
6717
|
+
|
6718
|
+
// Add intermediate points for longer segments to improve interpolation accuracy
|
6719
|
+
const int64_t min_segment_length = 100; // 1 second
|
6720
|
+
const int64_t point_interval = 20; // Add a point every 200ms
|
6721
|
+
|
6722
|
+
if (segment.vad_end - segment.vad_start > min_segment_length) {
|
6723
|
+
int64_t segment_duration = segment.vad_end - segment.vad_start;
|
6724
|
+
int num_points = (int)(segment_duration / point_interval) - 1;
|
6725
|
+
|
6726
|
+
for (int j = 1; j <= num_points; j++) {
|
6727
|
+
int64_t vad_time = segment.vad_start + j * point_interval;
|
6728
|
+
|
6729
|
+
if (vad_time >= segment.vad_end) continue;
|
6730
|
+
|
6731
|
+
int64_t vad_elapsed = vad_time - segment.vad_start;
|
6732
|
+
int64_t vad_total = segment.vad_end - segment.vad_start;
|
6733
|
+
int64_t orig_total = segment.orig_end - segment.orig_start;
|
6734
|
+
int64_t orig_time = segment.orig_start + (vad_elapsed * orig_total) / vad_total;
|
6735
|
+
|
6736
|
+
vad_time_mapping intermediate_mapping = {vad_time, orig_time};
|
6737
|
+
state->vad_mapping_table.push_back(intermediate_mapping);
|
6738
|
+
}
|
6739
|
+
}
|
6701
6740
|
|
6702
6741
|
WHISPER_LOG_INFO("%s: vad_segment_info: orig_start: %.2f, orig_end: %.2f, vad_start: %.2f, vad_end: %.2f\n",
|
6703
|
-
__func__, segment.orig_start, segment.orig_end, segment.vad_start, segment.vad_end);
|
6742
|
+
__func__, segment.orig_start/100.0, segment.orig_end/100.0, segment.vad_start/100.0, segment.vad_end/100.0);
|
6704
6743
|
ctx->state->vad_segments.push_back(segment);
|
6705
6744
|
|
6706
6745
|
// Copy this speech segment
|
@@ -6709,6 +6748,17 @@ static bool whisper_vad(
|
|
6709
6748
|
|
6710
6749
|
// Add silence after this segment (except after the last segment)
|
6711
6750
|
if (i < (int)vad_segments->data.size() - 1) {
|
6751
|
+
// Calculate the start and end time of the silence gap in processed audio
|
6752
|
+
int64_t silence_start_vad = samples_to_cs(offset);
|
6753
|
+
int64_t silence_end_vad = samples_to_cs(offset + silence_samples);
|
6754
|
+
// Calculate the corresponding original times
|
6755
|
+
int64_t orig_silence_start = segment.orig_end;
|
6756
|
+
int64_t orig_silence_end = vad_segments->data[i+1].start;
|
6757
|
+
|
6758
|
+
// Add mapping points for silence boundaries
|
6759
|
+
state->vad_mapping_table.push_back({silence_start_vad, orig_silence_start});
|
6760
|
+
state->vad_mapping_table.push_back({silence_end_vad, orig_silence_end});
|
6761
|
+
|
6712
6762
|
// Fill with zeros (silence)
|
6713
6763
|
memset(filtered_samples.data() + offset, 0, silence_samples * sizeof(float));
|
6714
6764
|
offset += silence_samples;
|
@@ -6716,6 +6766,24 @@ static bool whisper_vad(
|
|
6716
6766
|
}
|
6717
6767
|
}
|
6718
6768
|
|
6769
|
+
// Sort the mapping table by processed time
|
6770
|
+
std::sort(state->vad_mapping_table.begin(), state->vad_mapping_table.end(),
|
6771
|
+
[](const vad_time_mapping& a, const vad_time_mapping& b) {
|
6772
|
+
return a.processed_time < b.processed_time;
|
6773
|
+
});
|
6774
|
+
|
6775
|
+
// Remove any duplicate processed times to ensure monotonicity which is
|
6776
|
+
// needed for binary search and interpolation later.
|
6777
|
+
if (!state->vad_mapping_table.empty()) {
|
6778
|
+
auto last = std::unique(state->vad_mapping_table.begin(), state->vad_mapping_table.end(),
|
6779
|
+
[](const vad_time_mapping& a, const vad_time_mapping& b) {
|
6780
|
+
return a.processed_time == b.processed_time;
|
6781
|
+
});
|
6782
|
+
state->vad_mapping_table.erase(last, state->vad_mapping_table.end());
|
6783
|
+
}
|
6784
|
+
|
6785
|
+
WHISPER_LOG_INFO("%s: Created time mapping table with %d points\n", __func__, (int)state->vad_mapping_table.size());
|
6786
|
+
|
6719
6787
|
filtered_n_samples = offset;
|
6720
6788
|
WHISPER_LOG_INFO("%s: Reduced audio from %d to %d samples (%.1f%% reduction)\n",
|
6721
6789
|
__func__, n_samples, filtered_n_samples, 100.0f * (1.0f - (float)filtered_n_samples / n_samples));
|
@@ -6735,27 +6803,9 @@ int whisper_full_with_state(
|
|
6735
6803
|
|
6736
6804
|
result_all.clear();
|
6737
6805
|
|
6738
|
-
|
6739
|
-
int n_process_samples = n_samples;
|
6740
|
-
std::vector<float> vad_samples;
|
6741
|
-
|
6742
|
-
if (params.vad) {
|
6743
|
-
WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
|
6744
|
-
int vad_n_samples;
|
6745
|
-
if (!whisper_vad(ctx, state, params, samples, n_samples, vad_samples, vad_n_samples)) {
|
6746
|
-
WHISPER_LOG_ERROR("%s: failed to compute VAD\n", __func__);
|
6747
|
-
return -1;
|
6748
|
-
}
|
6749
|
-
if (vad_n_samples == 0) {
|
6750
|
-
return 0;
|
6751
|
-
}
|
6752
|
-
process_samples = vad_samples.data();
|
6753
|
-
n_process_samples = vad_n_samples;
|
6754
|
-
}
|
6755
|
-
|
6756
|
-
if (n_process_samples > 0) {
|
6806
|
+
if (n_samples > 0) {
|
6757
6807
|
// compute log mel spectrogram
|
6758
|
-
if (whisper_pcm_to_mel_with_state(ctx, state,
|
6808
|
+
if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
|
6759
6809
|
WHISPER_LOG_ERROR("%s: failed to compute log mel spectrogram\n", __func__);
|
6760
6810
|
return -2;
|
6761
6811
|
}
|
@@ -7665,6 +7715,21 @@ int whisper_full(
|
|
7665
7715
|
struct whisper_full_params params,
|
7666
7716
|
const float * samples,
|
7667
7717
|
int n_samples) {
|
7718
|
+
|
7719
|
+
std::vector<float> vad_samples;
|
7720
|
+
if (params.vad) {
|
7721
|
+
WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
|
7722
|
+
if (!whisper_vad(ctx, ctx->state, params, samples, n_samples, vad_samples)) {
|
7723
|
+
WHISPER_LOG_ERROR("%s: failed to compute VAD\n", __func__);
|
7724
|
+
return -1;
|
7725
|
+
}
|
7726
|
+
if (vad_samples.empty()) {
|
7727
|
+
ctx->state->result_all.clear();
|
7728
|
+
return 0;
|
7729
|
+
}
|
7730
|
+
samples = vad_samples.data();
|
7731
|
+
n_samples = vad_samples.size();
|
7732
|
+
}
|
7668
7733
|
return whisper_full_with_state(ctx, ctx->state, params, samples, n_samples);
|
7669
7734
|
}
|
7670
7735
|
|
@@ -7674,9 +7739,24 @@ int whisper_full_parallel(
|
|
7674
7739
|
const float * samples,
|
7675
7740
|
int n_samples,
|
7676
7741
|
int n_processors) {
|
7742
|
+
|
7677
7743
|
if (n_processors == 1) {
|
7678
7744
|
return whisper_full(ctx, params, samples, n_samples);
|
7679
7745
|
}
|
7746
|
+
|
7747
|
+
std::vector<float> vad_samples;
|
7748
|
+
if (params.vad) {
|
7749
|
+
WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
|
7750
|
+
if (!whisper_vad(ctx, ctx->state, params, samples, n_samples, vad_samples)) {
|
7751
|
+
WHISPER_LOG_ERROR("%s: failed to compute VAD\n", __func__);
|
7752
|
+
return -1;
|
7753
|
+
}
|
7754
|
+
if (vad_samples.empty()) {
|
7755
|
+
return 0;
|
7756
|
+
}
|
7757
|
+
samples = vad_samples.data();
|
7758
|
+
n_samples = vad_samples.size();
|
7759
|
+
}
|
7680
7760
|
int ret = 0;
|
7681
7761
|
|
7682
7762
|
// prepare separate states for each thread
|
@@ -7799,130 +7879,89 @@ int whisper_full_lang_id(struct whisper_context * ctx) {
|
|
7799
7879
|
return ctx->state->lang_id;
|
7800
7880
|
}
|
7801
7881
|
|
7802
|
-
int64_t
|
7803
|
-
|
7804
|
-
|
7805
|
-
return state->result_all[i_segment].t0;
|
7882
|
+
static int64_t map_processed_to_original_time(int64_t processed_time, const std::vector<vad_time_mapping> & mapping_table) {
|
7883
|
+
if (mapping_table.empty()) {
|
7884
|
+
return processed_time;
|
7806
7885
|
}
|
7807
7886
|
|
7808
|
-
|
7809
|
-
|
7810
|
-
|
7811
|
-
float t0 = state->result_all[i_segment].t0 / 100.0f;
|
7887
|
+
if (processed_time <= mapping_table.front().processed_time) {
|
7888
|
+
return mapping_table.front().original_time; // Before first mapping point
|
7889
|
+
}
|
7812
7890
|
|
7813
|
-
|
7814
|
-
|
7815
|
-
|
7816
|
-
// the access pattern is sequential and optimized for that too.
|
7817
|
-
for (size_t i = 0; i < state->vad_segments.size(); i++) {
|
7818
|
-
const auto & segment = state->vad_segments[i];
|
7891
|
+
if (processed_time >= mapping_table.back().processed_time) {
|
7892
|
+
return mapping_table.back().original_time; // After last mapping point
|
7893
|
+
}
|
7819
7894
|
|
7820
|
-
|
7821
|
-
|
7822
|
-
|
7823
|
-
|
7824
|
-
|
7825
|
-
}
|
7826
|
-
float orig_t0 = segment.orig_start + proportion * (segment.orig_end - segment.orig_start);
|
7827
|
-
return (int64_t)(orig_t0 * 100);
|
7895
|
+
// Binary search over the time map that finds the first entry that has a
|
7896
|
+
// processed time greater than or equal to the current processed time.
|
7897
|
+
auto upper = std::lower_bound(mapping_table.begin(), mapping_table.end(), processed_time,
|
7898
|
+
[](const vad_time_mapping & entry, int64_t time) {
|
7899
|
+
return entry.processed_time < time;
|
7828
7900
|
}
|
7901
|
+
);
|
7902
|
+
|
7903
|
+
// If exact match found
|
7904
|
+
if (upper->processed_time == processed_time) {
|
7905
|
+
return upper->original_time;
|
7829
7906
|
}
|
7830
7907
|
|
7831
|
-
//
|
7832
|
-
|
7833
|
-
const auto & curr = state->vad_segments[i];
|
7834
|
-
const auto & next = state->vad_segments[i + 1];
|
7908
|
+
// Need to interpolate between two points
|
7909
|
+
auto lower = upper - 1;
|
7835
7910
|
|
7836
|
-
|
7837
|
-
|
7838
|
-
|
7839
|
-
if (next.vad_start > curr.vad_end) {
|
7840
|
-
gap_proportion = (t0 - curr.vad_end) / (next.vad_start - curr.vad_end);
|
7841
|
-
}
|
7842
|
-
float orig_t0 = curr.orig_end + gap_proportion * (next.orig_start - curr.orig_end);
|
7843
|
-
return (int64_t)(orig_t0 * 100);
|
7844
|
-
}
|
7845
|
-
}
|
7911
|
+
int64_t processed_diff = upper->processed_time - lower->processed_time;
|
7912
|
+
int64_t original_diff = upper->original_time - lower->original_time;
|
7913
|
+
int64_t offset = processed_time - lower->processed_time;
|
7846
7914
|
|
7847
|
-
|
7848
|
-
|
7849
|
-
// For timestamps after the last segment, add the extra time to the end of the last segment
|
7850
|
-
const auto& last = state->vad_segments.back();
|
7851
|
-
// Calculate how far beyond the last segment
|
7852
|
-
float extra_time = t0 - last.vad_end;
|
7853
|
-
// Add this extra time to the original end time
|
7854
|
-
float orig_t0 = last.orig_end + extra_time;
|
7855
|
-
return (int64_t)(orig_t0 * 100);
|
7915
|
+
if (processed_diff == 0) {
|
7916
|
+
return lower->original_time;
|
7856
7917
|
}
|
7857
7918
|
|
7858
|
-
|
7859
|
-
return
|
7919
|
+
// Perform linear interpolation
|
7920
|
+
return lower->original_time + (offset * original_diff) / processed_diff;
|
7860
7921
|
}
|
7861
7922
|
|
7862
|
-
|
7863
|
-
|
7923
|
+
// Function to get the starting timestamp of a segment
|
7924
|
+
int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment) {
|
7925
|
+
// If VAD wasn't used, return the original timestamp
|
7926
|
+
if (!state->has_vad_segments || state->vad_mapping_table.empty()) {
|
7927
|
+
return state->result_all[i_segment].t0;
|
7928
|
+
}
|
7929
|
+
|
7930
|
+
// Get the processed timestamp
|
7931
|
+
int64_t t0 = state->result_all[i_segment].t0;
|
7932
|
+
|
7933
|
+
// Map to original time using the mapping table
|
7934
|
+
return map_processed_to_original_time(t0, state->vad_mapping_table);
|
7864
7935
|
}
|
7865
7936
|
|
7937
|
+
// Function to get the ending timestamp of a segment
|
7866
7938
|
int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment) {
|
7867
7939
|
// If VAD wasn't used, return the original timestamp
|
7868
|
-
if (!state->has_vad_segments || state->
|
7940
|
+
if (!state->has_vad_segments || state->vad_mapping_table.empty()) {
|
7869
7941
|
return state->result_all[i_segment].t1;
|
7870
7942
|
}
|
7871
7943
|
|
7872
|
-
// Get the
|
7873
|
-
|
7874
|
-
// back to the original audio.
|
7875
|
-
float t1 = state->result_all[i_segment].t1 / 100.0f;
|
7876
|
-
|
7877
|
-
// Find which VAD segment this timestamp belongs.
|
7878
|
-
// TODO(danbev) This could be optimized by using a binary search if the number
|
7879
|
-
// of segments exceed a certain limit. Also we might be able to assume that
|
7880
|
-
// the access pattern is sequential and optimized for that too.
|
7881
|
-
for (size_t i = 0; i < state->vad_segments.size(); i++) {
|
7882
|
-
const auto& segment = state->vad_segments[i];
|
7883
|
-
|
7884
|
-
// Check if the timestamp falls within this segment.
|
7885
|
-
if (t1 >= segment.vad_start && t1 <= segment.vad_end) {
|
7886
|
-
// Calculate the proportion through the filtered segment.
|
7887
|
-
float proportion = 0.0f;
|
7888
|
-
if (segment.vad_end > segment.vad_start) {
|
7889
|
-
proportion = (t1 - segment.vad_start) / (segment.vad_end - segment.vad_start);
|
7890
|
-
}
|
7891
|
-
float orig_t1 = segment.orig_start + proportion * (segment.orig_end - segment.orig_start);
|
7892
|
-
return (int64_t)(orig_t1 * 100);
|
7893
|
-
}
|
7894
|
-
}
|
7944
|
+
// Get the processed timestamp
|
7945
|
+
int64_t t1 = state->result_all[i_segment].t1;
|
7895
7946
|
|
7896
|
-
//
|
7897
|
-
|
7898
|
-
const auto & curr = state->vad_segments[i];
|
7899
|
-
const auto & next = state->vad_segments[i + 1];
|
7947
|
+
// Map to original time using the mapping table
|
7948
|
+
int64_t orig_t1 = map_processed_to_original_time(t1, state->vad_mapping_table);
|
7900
7949
|
|
7901
|
-
|
7902
|
-
|
7903
|
-
float gap_proportion = 0.0f;
|
7904
|
-
if (next.vad_start > curr.vad_end) {
|
7905
|
-
gap_proportion = (t1 - curr.vad_end) / (next.vad_start - curr.vad_end);
|
7906
|
-
}
|
7907
|
-
// Map to the corresponding position in the original gap
|
7908
|
-
float orig_t1 = curr.orig_end + gap_proportion * (next.orig_start - curr.orig_end);
|
7909
|
-
return (int64_t)(orig_t1 * 100);
|
7910
|
-
}
|
7911
|
-
}
|
7950
|
+
// Get the corresponding t0 for this segment
|
7951
|
+
int64_t orig_t0 = whisper_full_get_segment_t0_from_state(state, i_segment);
|
7912
7952
|
|
7913
|
-
//
|
7914
|
-
|
7915
|
-
|
7916
|
-
|
7917
|
-
// Calculate how far beyond the last segment
|
7918
|
-
float extra_time = t1 - last.vad_end;
|
7919
|
-
// Add this extra time to the original end time
|
7920
|
-
float orig_t1 = last.orig_end + extra_time;
|
7921
|
-
return (int64_t)(orig_t1 * 100);
|
7953
|
+
// Ensure minimum duration to prevent zero-length segments
|
7954
|
+
const int64_t min_duration = 10; // timestamps are in 10 ms units, so this is a 100 ms minimum
|
7955
|
+
if (orig_t1 - orig_t0 < min_duration) {
|
7956
|
+
orig_t1 = orig_t0 + min_duration;
|
7922
7957
|
}
|
7923
7958
|
|
7924
|
-
|
7925
|
-
|
7959
|
+
return orig_t1;
|
7960
|
+
}
|
7961
|
+
|
7962
|
+
|
7963
|
+
int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) {
|
7964
|
+
return whisper_full_get_segment_t0_from_state(ctx->state, i_segment);
|
7926
7965
|
}
|
7927
7966
|
|
7928
7967
|
int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment) {
|
@@ -8154,8 +8193,6 @@ WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
|
|
8154
8193
|
}
|
8155
8194
|
|
8156
8195
|
WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
8157
|
-
whisper_load_backends();
|
8158
|
-
|
8159
8196
|
static std::string s;
|
8160
8197
|
s = "";
|
8161
8198
|
char strbuf[256];
|
@@ -8289,10 +8326,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|
8289
8326
|
// token-level timestamps
|
8290
8327
|
//
|
8291
8328
|
|
8292
|
-
static int timestamp_to_sample(int64_t t, int n_samples) {
|
8293
|
-
return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
|
8294
|
-
}
|
8295
|
-
|
8296
8329
|
static int64_t sample_to_timestamp(int i_sample) {
|
8297
8330
|
return (100ll*i_sample)/WHISPER_SAMPLE_RATE;
|
8298
8331
|
}
|
@@ -8342,6 +8375,18 @@ static std::vector<float> get_signal_energy(const float * signal, int n_samples,
|
|
8342
8375
|
return result;
|
8343
8376
|
}
|
8344
8377
|
|
8378
|
+
static int timestamp_to_sample(int64_t t, int64_t segment_t0, int n_samples) {
|
8379
|
+
// Convert absolute timestamp to segment-relative timestamp
|
8380
|
+
int64_t relative_t = t - segment_t0;
|
8381
|
+
int sample = (int)((relative_t * WHISPER_SAMPLE_RATE) / 100);
|
8382
|
+
return std::max(0, std::min(n_samples - 1, sample));
|
8383
|
+
}
|
8384
|
+
|
8385
|
+
static int64_t sample_to_timestamp(int i_sample, int64_t segment_t0) {
|
8386
|
+
int64_t relative_timestamp = (100ll * i_sample) / WHISPER_SAMPLE_RATE;
|
8387
|
+
return relative_timestamp + segment_t0;
|
8388
|
+
}
|
8389
|
+
|
8345
8390
|
static void whisper_exp_compute_token_level_timestamps(
|
8346
8391
|
struct whisper_context & ctx,
|
8347
8392
|
struct whisper_state & state,
|
@@ -8482,8 +8527,8 @@ static void whisper_exp_compute_token_level_timestamps(
|
|
8482
8527
|
continue;
|
8483
8528
|
}
|
8484
8529
|
|
8485
|
-
int s0 = timestamp_to_sample(tokens[j].t0, n_samples);
|
8486
|
-
int s1 = timestamp_to_sample(tokens[j].t1, n_samples);
|
8530
|
+
int s0 = timestamp_to_sample(tokens[j].t0, segment.t0, n_samples);
|
8531
|
+
int s1 = timestamp_to_sample(tokens[j].t1, segment.t0, n_samples);
|
8487
8532
|
|
8488
8533
|
const int ss0 = std::max(s0 - hw, 0);
|
8489
8534
|
const int ss1 = std::min(s1 + hw, n_samples);
|
@@ -8504,7 +8549,7 @@ static void whisper_exp_compute_token_level_timestamps(
|
|
8504
8549
|
while (k > 0 && state.energy[k] > thold) {
|
8505
8550
|
k--;
|
8506
8551
|
}
|
8507
|
-
tokens[j].t0 = sample_to_timestamp(k);
|
8552
|
+
tokens[j].t0 = sample_to_timestamp(k, segment.t0);
|
8508
8553
|
if (tokens[j].t0 < tokens[j - 1].t1) {
|
8509
8554
|
tokens[j].t0 = tokens[j - 1].t1;
|
8510
8555
|
} else {
|
@@ -8515,7 +8560,7 @@ static void whisper_exp_compute_token_level_timestamps(
|
|
8515
8560
|
k++;
|
8516
8561
|
}
|
8517
8562
|
s0 = k;
|
8518
|
-
tokens[j].t0 = sample_to_timestamp(k);
|
8563
|
+
tokens[j].t0 = sample_to_timestamp(k, segment.t0);
|
8519
8564
|
}
|
8520
8565
|
}
|
8521
8566
|
|
@@ -8525,7 +8570,7 @@ static void whisper_exp_compute_token_level_timestamps(
|
|
8525
8570
|
while (k < n_samples - 1 && state.energy[k] > thold) {
|
8526
8571
|
k++;
|
8527
8572
|
}
|
8528
|
-
tokens[j].t1 = sample_to_timestamp(k);
|
8573
|
+
tokens[j].t1 = sample_to_timestamp(k, segment.t0);
|
8529
8574
|
if (j < n - 1 && tokens[j].t1 > tokens[j + 1].t0) {
|
8530
8575
|
tokens[j].t1 = tokens[j + 1].t0;
|
8531
8576
|
} else {
|
@@ -8536,7 +8581,7 @@ static void whisper_exp_compute_token_level_timestamps(
|
|
8536
8581
|
k--;
|
8537
8582
|
}
|
8538
8583
|
s1 = k;
|
8539
|
-
tokens[j].t1 = sample_to_timestamp(k);
|
8584
|
+
tokens[j].t1 = sample_to_timestamp(k, segment.t0);
|
8540
8585
|
}
|
8541
8586
|
}
|
8542
8587
|
}
|
@@ -8893,6 +8938,10 @@ void whisper_log_set(ggml_log_callback log_callback, void * user_data) {
|
|
8893
8938
|
ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);
|
8894
8939
|
}
|
8895
8940
|
|
8941
|
+
const char * whisper_version(void) {
|
8942
|
+
return WHISPER_VERSION;
|
8943
|
+
}
|
8944
|
+
|
8896
8945
|
GGML_ATTRIBUTE_FORMAT(2, 3)
|
8897
8946
|
static void whisper_log_internal(ggml_log_level level, const char * format, ...) {
|
8898
8947
|
va_list args;
|
data/extsources.rb
CHANGED
@@ -1,5 +1,10 @@
|
|
1
|
+
require "pathname"
|
2
|
+
|
3
|
+
root = Pathname("..")/".."
|
1
4
|
ignored_dirs = %w[
|
2
5
|
.devops
|
6
|
+
.github
|
7
|
+
ci
|
3
8
|
examples/wchess/wchess.wasm
|
4
9
|
examples/whisper.android
|
5
10
|
examples/whisper.android.java
|
@@ -9,7 +14,7 @@ ignored_dirs = %w[
|
|
9
14
|
models
|
10
15
|
samples
|
11
16
|
scripts
|
12
|
-
]
|
17
|
+
].collect {|dir| root/dir}
|
13
18
|
ignored_files = %w[
|
14
19
|
AUTHORS
|
15
20
|
Makefile
|
@@ -17,18 +22,19 @@ ignored_files = %w[
|
|
17
22
|
README_sycl.md
|
18
23
|
.gitignore
|
19
24
|
.gitmodules
|
25
|
+
.dockerignore
|
20
26
|
whisper.nvim
|
21
27
|
twitch.sh
|
22
28
|
yt-wsp.sh
|
29
|
+
close-issue.yml
|
23
30
|
]
|
24
31
|
|
25
32
|
EXTSOURCES =
|
26
|
-
`git ls-files -z
|
27
|
-
.
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
(file.start_with?("../..") || file.start_with?("../javascript")) &&
|
33
|
-
(!file.start_with?("../../.github/") || basename == "bindings-ruby.yml")
|
33
|
+
`git ls-files -z #{root}`.split("\x0")
|
34
|
+
.collect {|file| Pathname(file)}
|
35
|
+
.reject {|file|
|
36
|
+
ignored_dirs.any? {|dir| file.descend.any? {|desc| desc == dir}} ||
|
37
|
+
ignored_files.include?(file.basename.to_path) ||
|
38
|
+
(file.descend.to_a[1] != root && file.descend.to_a[1] != Pathname("..")/"javascript")
|
34
39
|
}
|
40
|
+
.collect(&:to_path)
|