whispercpp 1.3.2 → 1.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +6 -3
- data/README.md +71 -14
- data/Rakefile +20 -7
- data/ext/.gitignore +4 -6
- data/ext/dependencies.rb +36 -24
- data/ext/extconf.rb +1 -1
- data/ext/options.rb +48 -184
- data/ext/ruby_whisper.c +18 -0
- data/ext/ruby_whisper_context.c +43 -12
- data/ext/ruby_whisper_model.c +1 -1
- data/ext/ruby_whisper_params.c +4 -2
- data/ext/ruby_whisper_segment.c +81 -4
- data/ext/ruby_whisper_transcribe.cpp +13 -7
- data/ext/ruby_whisper_vad_params.c +1 -1
- data/ext/sources/CMakeLists.txt +5 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
- data/ext/sources/examples/addon.node/addon.cpp +150 -31
- data/ext/sources/examples/addon.node/index.js +3 -0
- data/ext/sources/examples/addon.node/vad-example.js +132 -0
- data/ext/sources/examples/bench/bench.cpp +3 -2
- data/ext/sources/examples/cli/cli.cpp +3 -2
- data/ext/sources/examples/command/command.cpp +32 -8
- data/ext/sources/examples/common-whisper.cpp +14 -7
- data/ext/sources/examples/lsp/lsp.cpp +2 -0
- data/ext/sources/examples/quantize/quantize.cpp +3 -0
- data/ext/sources/examples/server/CMakeLists.txt +3 -0
- data/ext/sources/examples/server/server.cpp +169 -22
- data/ext/sources/examples/stream/stream.cpp +6 -0
- data/ext/sources/examples/talk-llama/CMakeLists.txt +4 -1
- data/ext/sources/examples/talk-llama/llama-arch.cpp +171 -3
- data/ext/sources/examples/talk-llama/llama-arch.h +28 -1
- data/ext/sources/examples/talk-llama/llama-batch.cpp +741 -272
- data/ext/sources/examples/talk-llama/llama-batch.h +112 -54
- data/ext/sources/examples/talk-llama/llama-chat.cpp +30 -8
- data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +520 -351
- data/ext/sources/examples/talk-llama/llama-context.h +38 -17
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
- data/ext/sources/examples/talk-llama/llama-cparams.h +1 -1
- data/ext/sources/examples/talk-llama/llama-graph.cpp +447 -372
- data/ext/sources/examples/talk-llama/llama-graph.h +128 -58
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +10 -2
- data/ext/sources/examples/talk-llama/llama-hparams.h +19 -2
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +279 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.h +128 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +1841 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +303 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +14 -472
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +86 -26
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +246 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +138 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1125 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +183 -0
- data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
- data/ext/sources/examples/talk-llama/llama-memory.h +88 -4
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +42 -17
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
- data/ext/sources/examples/talk-llama/llama-model.cpp +1863 -563
- data/ext/sources/examples/talk-llama/llama-model.h +27 -0
- data/ext/sources/examples/talk-llama/llama-quant.cpp +89 -6
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +65 -28
- data/ext/sources/examples/talk-llama/llama-vocab.h +1 -0
- data/ext/sources/examples/talk-llama/llama.cpp +11 -7
- data/ext/sources/examples/talk-llama/llama.h +147 -40
- data/ext/sources/examples/talk-llama/talk-llama.cpp +2 -0
- data/ext/sources/examples/talk-llama/unicode.cpp +5 -0
- data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +2 -0
- data/ext/sources/ggml/CMakeLists.txt +48 -3
- data/ext/sources/ggml/cmake/common.cmake +24 -0
- data/ext/sources/ggml/include/ggml-backend.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +2 -0
- data/ext/sources/ggml/include/ggml.h +144 -5
- data/ext/sources/ggml/src/CMakeLists.txt +82 -24
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +5 -0
- data/ext/sources/ggml/src/ggml-backend.cpp +46 -23
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +3 -3
- data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +1 -0
- data/ext/sources/ggml/src/ggml-cann/common.h +6 -1
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- data/ext/sources/ggml/src/ggml-common.h +4 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +133 -40
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- data/ext/sources/ggml/src/ggml-cpu/common.h +4 -3
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +146 -105
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1057 -174
- data/ext/sources/ggml/src/ggml-cpu/ops.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +1158 -0
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1571 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.h +98 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +330 -38
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +111 -18
- data/ext/sources/ggml/src/ggml-cpu/vec.h +303 -94
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +60 -37
- data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +22 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +2 -2
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +265 -123
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +19 -0
- data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/mmv.cu +257 -87
- data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +2 -3
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +5 -18
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +89 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +7 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- data/ext/sources/ggml/src/ggml-impl.h +127 -183
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +27 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +331 -49
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +564 -282
- data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +14 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1859 -489
- data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +72 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +201 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- data/ext/sources/ggml/src/ggml-quants.c +6 -8
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +28 -41
- data/ext/sources/ggml/src/ggml-sycl/conv.cpp +4 -10
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +117 -165
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +192 -53
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +648 -1039
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +18 -9
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +8 -105
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -100
- data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +60 -80
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +158 -203
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +55 -74
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -10
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +138 -27
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +12 -16
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +726 -282
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -1
- data/ext/sources/ggml/src/ggml.c +328 -48
- data/ext/sources/ggml/src/ggml.cpp +26 -0
- data/ext/sources/ggml/src/gguf.cpp +24 -3
- data/ext/sources/include/whisper.h +2 -0
- data/ext/sources/src/CMakeLists.txt +2 -0
- data/ext/sources/src/coreml/whisper-compat.h +10 -0
- data/ext/sources/src/coreml/whisper-compat.m +35 -0
- data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
- data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
- data/ext/sources/src/whisper.cpp +218 -169
- data/extsources.rb +15 -9
- data/lib/whisper/context.rb +15 -0
- data/lib/whisper/model/uri.rb +56 -1
- data/lib/whisper/segment.rb +58 -0
- data/sig/whisper.rbs +68 -38
- data/{tests → test}/helper.rb +1 -12
- data/{tests → test}/test_model.rb +9 -0
- data/test/test_package.rb +51 -0
- data/test/test_segment.rb +146 -0
- data/{tests → test}/test_whisper.rb +70 -0
- data/whispercpp.gemspec +2 -3
- metadata +91 -43
- data/ext/sources/.dockerignore +0 -3
- data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
- data/ext/sources/ci/run.sh +0 -336
- data/ext/sources/close-issue.yml +0 -28
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2739
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
- data/tests/test_package.rb +0 -46
- data/tests/test_segment.rb +0 -74
- /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
- /data/{tests → test}/jfk_reader/.gitignore +0 -0
- /data/{tests → test}/jfk_reader/extconf.rb +0 -0
- /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
- /data/{tests → test}/test_callback.rb +0 -0
- /data/{tests → test}/test_error.rb +0 -0
- /data/{tests → test}/test_params.rb +0 -0
- /data/{tests → test}/test_vad.rb +0 -0
- /data/{tests → test}/test_vad_params.rb +0 -0
data/ext/ruby_whisper_context.c
CHANGED
@@ -11,6 +11,9 @@ extern ID id_new;
|
|
11
11
|
extern ID id_to_path;
|
12
12
|
extern ID id_URI;
|
13
13
|
extern ID id_pre_converted_models;
|
14
|
+
extern ID id_coreml_compiled_models;
|
15
|
+
extern ID id_cache;
|
16
|
+
extern ID id_n_processors;
|
14
17
|
|
15
18
|
extern VALUE cContext;
|
16
19
|
extern VALUE eError;
|
@@ -18,10 +21,12 @@ extern VALUE cModel;
|
|
18
21
|
|
19
22
|
extern const rb_data_type_t ruby_whisper_params_type;
|
20
23
|
extern VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self);
|
21
|
-
extern VALUE
|
22
|
-
extern VALUE
|
24
|
+
extern VALUE rb_whisper_model_s_new(VALUE context);
|
25
|
+
extern VALUE rb_whisper_segment_s_new(VALUE context, int index);
|
23
26
|
extern void prepare_transcription(ruby_whisper_params *rwp, VALUE *context);
|
24
27
|
|
28
|
+
ID transcribe_option_names[1];
|
29
|
+
|
25
30
|
static void
|
26
31
|
ruby_whisper_free(ruby_whisper *rw)
|
27
32
|
{
|
@@ -53,6 +58,9 @@ ruby_whisper_memsize(const void *p)
|
|
53
58
|
if (!rw) {
|
54
59
|
return 0;
|
55
60
|
}
|
61
|
+
if (rw->context) {
|
62
|
+
size += sizeof(rw->context);
|
63
|
+
}
|
56
64
|
return size;
|
57
65
|
}
|
58
66
|
|
@@ -79,6 +87,13 @@ ruby_whisper_normalize_model_path(VALUE model_path)
|
|
79
87
|
VALUE pre_converted_model = rb_hash_aref(pre_converted_models, model_path);
|
80
88
|
if (!NIL_P(pre_converted_model)) {
|
81
89
|
model_path = pre_converted_model;
|
90
|
+
#ifdef RUBY_WHISPER_USE_COREML
|
91
|
+
VALUE coreml_converted_models = rb_funcall(cModel, id_coreml_compiled_models, 0);
|
92
|
+
VALUE coreml_converted_model = rb_hash_aref(coreml_converted_models, pre_converted_model);
|
93
|
+
if (!NIL_P(coreml_converted_model)) {
|
94
|
+
rb_funcall(coreml_converted_model, id_cache, 0);
|
95
|
+
}
|
96
|
+
#endif
|
82
97
|
}
|
83
98
|
else if (TYPE(model_path) == T_STRING) {
|
84
99
|
const char * model_path_str = StringValueCStr(model_path);
|
@@ -293,13 +308,20 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self)
|
|
293
308
|
// Should check when samples.respond_to?(:length)?
|
294
309
|
} else {
|
295
310
|
if (TYPE(samples) == T_ARRAY) {
|
296
|
-
|
311
|
+
if (RARRAY_LEN(samples) > INT_MAX) {
|
312
|
+
rb_raise(rb_eArgError, "samples are too long");
|
313
|
+
}
|
314
|
+
n_samples = (int)RARRAY_LEN(samples);
|
297
315
|
} else if (memory_view_available_p) {
|
298
316
|
if (!rb_memory_view_get(samples, &view, RUBY_MEMORY_VIEW_SIMPLE)) {
|
299
317
|
view.obj = Qnil;
|
300
318
|
rb_raise(rb_eArgError, "unable to get a memory view");
|
301
319
|
}
|
302
|
-
|
320
|
+
ssize_t n_samples_size = view.byte_size / view.item_size;
|
321
|
+
if (n_samples_size > INT_MAX) {
|
322
|
+
rb_raise(rb_eArgError, "samples are too long");
|
323
|
+
}
|
324
|
+
n_samples = (int)n_samples_size;
|
303
325
|
} else if (rb_respond_to(samples, id_length)) {
|
304
326
|
n_samples = NUM2INT(rb_funcall(samples, id_length, 0));
|
305
327
|
} else {
|
@@ -387,10 +409,17 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self)
|
|
387
409
|
view.obj = Qnil;
|
388
410
|
rb_raise(rb_eArgError, "unable to get a memory view");
|
389
411
|
}
|
390
|
-
|
412
|
+
ssize_t n_samples_size = view.byte_size / view.item_size;
|
413
|
+
if (n_samples_size > INT_MAX) {
|
414
|
+
rb_raise(rb_eArgError, "samples are too long");
|
415
|
+
}
|
416
|
+
n_samples = (int)n_samples_size;
|
391
417
|
} else {
|
392
418
|
if (TYPE(samples) == T_ARRAY) {
|
393
|
-
|
419
|
+
if (RARRAY_LEN(samples) > INT_MAX) {
|
420
|
+
rb_raise(rb_eArgError, "samples are too long");
|
421
|
+
}
|
422
|
+
n_samples = (int)RARRAY_LEN(samples);
|
394
423
|
} else if (rb_respond_to(samples, id_length)) {
|
395
424
|
n_samples = NUM2INT(rb_funcall(samples, id_length, 0));
|
396
425
|
} else {
|
@@ -476,7 +505,7 @@ ruby_whisper_full_get_segment_t0(VALUE self, VALUE i_segment)
|
|
476
505
|
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
477
506
|
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
|
478
507
|
const int64_t t0 = whisper_full_get_segment_t0(rw->context, c_i_segment);
|
479
|
-
return
|
508
|
+
return LONG2NUM(t0);
|
480
509
|
}
|
481
510
|
|
482
511
|
/*
|
@@ -494,7 +523,7 @@ ruby_whisper_full_get_segment_t1(VALUE self, VALUE i_segment)
|
|
494
523
|
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
495
524
|
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
|
496
525
|
const int64_t t1 = whisper_full_get_segment_t1(rw->context, c_i_segment);
|
497
|
-
return
|
526
|
+
return LONG2NUM(t1);
|
498
527
|
}
|
499
528
|
|
500
529
|
/*
|
@@ -552,7 +581,7 @@ ruby_whisper_full_get_segment_no_speech_prob(VALUE self, VALUE i_segment)
|
|
552
581
|
static VALUE
|
553
582
|
ruby_whisper_full_get_segment(VALUE self, VALUE i_segment)
|
554
583
|
{
|
555
|
-
return
|
584
|
+
return rb_whisper_segment_s_new(self, NUM2INT(i_segment));
|
556
585
|
}
|
557
586
|
|
558
587
|
/*
|
@@ -586,7 +615,7 @@ ruby_whisper_each_segment(VALUE self)
|
|
586
615
|
|
587
616
|
const int n_segments = whisper_full_n_segments(rw->context);
|
588
617
|
for (int i = 0; i < n_segments; ++i) {
|
589
|
-
rb_yield(
|
618
|
+
rb_yield(rb_whisper_segment_s_new(self, i));
|
590
619
|
}
|
591
620
|
|
592
621
|
return self;
|
@@ -599,7 +628,7 @@ ruby_whisper_each_segment(VALUE self)
|
|
599
628
|
static VALUE
|
600
629
|
ruby_whisper_get_model(VALUE self)
|
601
630
|
{
|
602
|
-
return
|
631
|
+
return rb_whisper_model_s_new(self);
|
603
632
|
}
|
604
633
|
|
605
634
|
void
|
@@ -607,6 +636,8 @@ init_ruby_whisper_context(VALUE *mWhisper)
|
|
607
636
|
{
|
608
637
|
cContext = rb_define_class_under(*mWhisper, "Context", rb_cObject);
|
609
638
|
|
639
|
+
transcribe_option_names[0] = id_n_processors;
|
640
|
+
|
610
641
|
rb_define_alloc_func(cContext, ruby_whisper_allocate);
|
611
642
|
rb_define_method(cContext, "initialize", ruby_whisper_initialize, -1);
|
612
643
|
|
@@ -633,7 +664,7 @@ init_ruby_whisper_context(VALUE *mWhisper)
|
|
633
664
|
rb_define_method(cContext, "full", ruby_whisper_full, -1);
|
634
665
|
rb_define_method(cContext, "full_parallel", ruby_whisper_full_parallel, -1);
|
635
666
|
|
636
|
-
// High
|
667
|
+
// High level
|
637
668
|
rb_define_method(cContext, "full_get_segment", ruby_whisper_full_get_segment, 1);
|
638
669
|
rb_define_method(cContext, "each_segment", ruby_whisper_each_segment, 0);
|
639
670
|
|
data/ext/ruby_whisper_model.c
CHANGED
@@ -35,7 +35,7 @@ static VALUE ruby_whisper_model_allocate(VALUE klass) {
|
|
35
35
|
return TypedData_Make_Struct(klass, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
36
36
|
}
|
37
37
|
|
38
|
-
VALUE
|
38
|
+
VALUE rb_whisper_model_s_new(VALUE context) {
|
39
39
|
ruby_whisper_model *rwm;
|
40
40
|
const VALUE model = ruby_whisper_model_allocate(cModel);
|
41
41
|
TypedData_Get_Struct(model, ruby_whisper_model, &rb_whisper_model_type, rwm);
|
data/ext/ruby_whisper_params.c
CHANGED
@@ -34,7 +34,7 @@ extern VALUE cVADParams;
|
|
34
34
|
extern ID id_call;
|
35
35
|
|
36
36
|
extern VALUE ruby_whisper_normalize_model_path(VALUE model_path);
|
37
|
-
extern VALUE
|
37
|
+
extern VALUE rb_whisper_segment_s_new(VALUE context, int index);
|
38
38
|
extern const rb_data_type_t ruby_whisper_vad_params_type;
|
39
39
|
|
40
40
|
static ID param_names[RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT];
|
@@ -77,6 +77,8 @@ static ID id_vad_params;
|
|
77
77
|
static void
|
78
78
|
rb_whisper_callbcack_container_mark(ruby_whisper_callback_container *rwc)
|
79
79
|
{
|
80
|
+
if (rwc == NULL) return;
|
81
|
+
|
80
82
|
rb_gc_mark(rwc->user_data);
|
81
83
|
rb_gc_mark(rwc->callback);
|
82
84
|
rb_gc_mark(rwc->callbacks);
|
@@ -108,7 +110,7 @@ static void new_segment_callback(struct whisper_context *ctx, struct whisper_sta
|
|
108
110
|
const int n_segments = whisper_full_n_segments_from_state(state);
|
109
111
|
for (int i = n_new; i > 0; i--) {
|
110
112
|
int i_segment = n_segments - i;
|
111
|
-
VALUE segment =
|
113
|
+
VALUE segment = rb_whisper_segment_s_new(*container->context, i_segment);
|
112
114
|
for (int j = 0; j < callbacks_len; j++) {
|
113
115
|
VALUE cb = rb_ary_entry(container->callbacks, j);
|
114
116
|
rb_funcall(cb, id_call, 1, segment);
|
data/ext/ruby_whisper_segment.c
CHANGED
@@ -1,6 +1,15 @@
|
|
1
1
|
#include <ruby.h>
|
2
2
|
#include "ruby_whisper.h"
|
3
3
|
|
4
|
+
#define N_KEY_NAMES 5
|
5
|
+
|
6
|
+
static VALUE sym_start_time;
|
7
|
+
static VALUE sym_end_time;
|
8
|
+
static VALUE sym_text;
|
9
|
+
static VALUE sym_no_speech_prob;
|
10
|
+
static VALUE sym_speaker_turn_next;
|
11
|
+
static VALUE key_names;
|
12
|
+
|
4
13
|
extern const rb_data_type_t ruby_whisper_type;
|
5
14
|
|
6
15
|
extern VALUE cSegment;
|
@@ -38,7 +47,7 @@ ruby_whisper_segment_allocate(VALUE klass)
|
|
38
47
|
}
|
39
48
|
|
40
49
|
VALUE
|
41
|
-
|
50
|
+
rb_whisper_segment_s_new(VALUE context, int index)
|
42
51
|
{
|
43
52
|
ruby_whisper_segment *rws;
|
44
53
|
const VALUE segment = ruby_whisper_segment_allocate(cSegment);
|
@@ -63,7 +72,7 @@ ruby_whisper_segment_get_start_time(VALUE self)
|
|
63
72
|
TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
|
64
73
|
const int64_t t0 = whisper_full_get_segment_t0(rw->context, rws->index);
|
65
74
|
// able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
|
66
|
-
return
|
75
|
+
return LONG2NUM(t0 * 10);
|
67
76
|
}
|
68
77
|
|
69
78
|
/*
|
@@ -81,7 +90,7 @@ ruby_whisper_segment_get_end_time(VALUE self)
|
|
81
90
|
TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
|
82
91
|
const int64_t t1 = whisper_full_get_segment_t1(rw->context, rws->index);
|
83
92
|
// able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
|
84
|
-
return
|
93
|
+
return LONG2NUM(t1 * 10);
|
85
94
|
}
|
86
95
|
|
87
96
|
/*
|
@@ -129,15 +138,83 @@ ruby_whisper_segment_get_no_speech_prob(VALUE self)
|
|
129
138
|
return DBL2NUM(whisper_full_get_segment_no_speech_prob(rw->context, rws->index));
|
130
139
|
}
|
131
140
|
|
141
|
+
/*
|
142
|
+
* call-seq:
|
143
|
+
* deconstruct_keys(keys) -> hash
|
144
|
+
*
|
145
|
+
* Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
|
146
|
+
*
|
147
|
+
* whisper.each_segment do |segment|
|
148
|
+
* segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
|
149
|
+
*
|
150
|
+
* puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
|
151
|
+
* end
|
152
|
+
*/
|
153
|
+
static VALUE
|
154
|
+
ruby_whisper_segment_deconstruct_keys(VALUE self, VALUE keys)
|
155
|
+
{
|
156
|
+
ruby_whisper_segment *rws;
|
157
|
+
TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
|
158
|
+
ruby_whisper *rw;
|
159
|
+
TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
|
160
|
+
|
161
|
+
VALUE hash = rb_hash_new();
|
162
|
+
long n_keys;
|
163
|
+
if (NIL_P(keys)) {
|
164
|
+
keys = key_names;
|
165
|
+
n_keys = N_KEY_NAMES;
|
166
|
+
} else {
|
167
|
+
n_keys = RARRAY_LEN(keys);
|
168
|
+
if (n_keys > N_KEY_NAMES) {
|
169
|
+
return hash;
|
170
|
+
}
|
171
|
+
}
|
172
|
+
for (int i = 0; i < n_keys; i++) {
|
173
|
+
VALUE key = rb_ary_entry(keys, i);
|
174
|
+
if (key == sym_start_time) {
|
175
|
+
rb_hash_aset(hash, key, ruby_whisper_segment_get_start_time(self));
|
176
|
+
}
|
177
|
+
if (key == sym_end_time) {
|
178
|
+
rb_hash_aset(hash, key, ruby_whisper_segment_get_end_time(self));
|
179
|
+
}
|
180
|
+
if (key == sym_text) {
|
181
|
+
rb_hash_aset(hash, key, ruby_whisper_segment_get_text(self));
|
182
|
+
}
|
183
|
+
if (key == sym_no_speech_prob) {
|
184
|
+
rb_hash_aset(hash, key, ruby_whisper_segment_get_no_speech_prob(self));
|
185
|
+
}
|
186
|
+
if (key == sym_speaker_turn_next) {
|
187
|
+
rb_hash_aset(hash, key, ruby_whisper_segment_get_speaker_turn_next(self));
|
188
|
+
}
|
189
|
+
}
|
190
|
+
|
191
|
+
return hash;
|
192
|
+
}
|
193
|
+
|
132
194
|
void
|
133
195
|
init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cContext)
|
134
196
|
{
|
135
197
|
cSegment = rb_define_class_under(*mWhisper, "Segment", rb_cObject);
|
136
198
|
|
199
|
+
sym_start_time = ID2SYM(rb_intern("start_time"));
|
200
|
+
sym_end_time = ID2SYM(rb_intern("end_time"));
|
201
|
+
sym_text = ID2SYM(rb_intern("text"));
|
202
|
+
sym_no_speech_prob = ID2SYM(rb_intern("no_speech_prob"));
|
203
|
+
sym_speaker_turn_next = ID2SYM(rb_intern("speaker_turn_next"));
|
204
|
+
key_names = rb_ary_new3(
|
205
|
+
N_KEY_NAMES,
|
206
|
+
sym_start_time,
|
207
|
+
sym_end_time,
|
208
|
+
sym_text,
|
209
|
+
sym_no_speech_prob,
|
210
|
+
sym_speaker_turn_next
|
211
|
+
);
|
212
|
+
|
137
213
|
rb_define_alloc_func(cSegment, ruby_whisper_segment_allocate);
|
138
214
|
rb_define_method(cSegment, "start_time", ruby_whisper_segment_get_start_time, 0);
|
139
215
|
rb_define_method(cSegment, "end_time", ruby_whisper_segment_get_end_time, 0);
|
140
|
-
rb_define_method(cSegment, "
|
216
|
+
rb_define_method(cSegment, "speaker_turn_next?", ruby_whisper_segment_get_speaker_turn_next, 0);
|
141
217
|
rb_define_method(cSegment, "text", ruby_whisper_segment_get_text, 0);
|
142
218
|
rb_define_method(cSegment, "no_speech_prob", ruby_whisper_segment_get_no_speech_prob, 0);
|
219
|
+
rb_define_method(cSegment, "deconstruct_keys", ruby_whisper_segment_deconstruct_keys, 1);
|
143
220
|
}
|
@@ -13,6 +13,7 @@ extern const rb_data_type_t ruby_whisper_params_type;
|
|
13
13
|
|
14
14
|
extern ID id_to_s;
|
15
15
|
extern ID id_call;
|
16
|
+
extern ID transcribe_option_names[1];
|
16
17
|
|
17
18
|
extern void
|
18
19
|
prepare_transcription(ruby_whisper_params * rwp, VALUE * self);
|
@@ -34,9 +35,14 @@ VALUE
|
|
34
35
|
ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
|
35
36
|
ruby_whisper *rw;
|
36
37
|
ruby_whisper_params *rwp;
|
37
|
-
VALUE wave_file_path, blk, params;
|
38
|
+
VALUE wave_file_path, blk, params, kws;
|
39
|
+
VALUE opts[1];
|
40
|
+
|
41
|
+
rb_scan_args_kw(RB_SCAN_ARGS_LAST_HASH_KEYWORDS, argc, argv, "2:&", &wave_file_path, &params, &kws, &blk);
|
42
|
+
rb_get_kwargs(kws, transcribe_option_names, 0, 1, opts);
|
43
|
+
|
44
|
+
int n_processors = opts[0] == Qundef ? 1 : NUM2INT(opts[0]);
|
38
45
|
|
39
|
-
rb_scan_args(argc, argv, "02&", &wave_file_path, &params, &blk);
|
40
46
|
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
|
41
47
|
TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
|
42
48
|
|
@@ -66,20 +72,20 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
|
|
66
72
|
|
67
73
|
prepare_transcription(rwp, &self);
|
68
74
|
|
69
|
-
if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(),
|
75
|
+
if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), n_processors) != 0) {
|
70
76
|
fprintf(stderr, "failed to process audio\n");
|
71
77
|
return self;
|
72
78
|
}
|
79
|
+
if (NIL_P(blk)) {
|
80
|
+
return self;
|
81
|
+
}
|
73
82
|
const int n_segments = whisper_full_n_segments(rw->context);
|
74
83
|
VALUE output = rb_str_new2("");
|
75
84
|
for (int i = 0; i < n_segments; ++i) {
|
76
85
|
const char * text = whisper_full_get_segment_text(rw->context, i);
|
77
86
|
output = rb_str_concat(output, rb_str_new2(text));
|
78
87
|
}
|
79
|
-
|
80
|
-
if (blk != Qnil) {
|
81
|
-
rb_funcall(blk, idCall, 1, output);
|
82
|
-
}
|
88
|
+
rb_funcall(blk, id_call, 1, output);
|
83
89
|
return self;
|
84
90
|
}
|
85
91
|
#ifdef __cplusplus
|
@@ -249,7 +249,7 @@ ruby_whisper_vad_params_initialize(int argc, VALUE *argv, VALUE self)
|
|
249
249
|
rb_get_kwargs(kw_hash, param_names, 0, NUM_PARAMS, values);
|
250
250
|
|
251
251
|
for (i = 0; i < NUM_PARAMS; i++) {
|
252
|
-
id= param_names[i];
|
252
|
+
id = param_names[i];
|
253
253
|
value = values[i];
|
254
254
|
if (value == Qundef) {
|
255
255
|
continue;
|
data/ext/sources/CMakeLists.txt
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
cmake_minimum_required(VERSION 3.5) # for add_link_options and implicit target directories.
|
2
2
|
project("whisper.cpp" C CXX)
|
3
|
-
project("whisper.cpp" VERSION 1.7.
|
3
|
+
project("whisper.cpp" VERSION 1.7.6)
|
4
4
|
include(CheckIncludeFileCXX)
|
5
5
|
|
6
6
|
set(SOVERSION 1)
|
@@ -178,6 +178,10 @@ get_directory_property(WHISPER_TRANSIENT_DEFINES COMPILE_DEFINITIONS)
|
|
178
178
|
set_target_properties(whisper PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/whisper.h)
|
179
179
|
install(TARGETS whisper LIBRARY PUBLIC_HEADER)
|
180
180
|
|
181
|
+
target_compile_definitions(whisper PRIVATE
|
182
|
+
WHISPER_VERSION="${PROJECT_VERSION}"
|
183
|
+
)
|
184
|
+
|
181
185
|
configure_package_config_file(
|
182
186
|
${CMAKE_CURRENT_SOURCE_DIR}/cmake/whisper-config.cmake.in
|
183
187
|
${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake
|
@@ -1,37 +1,133 @@
|
|
1
|
-
const
|
2
|
-
const { whisper } = require(
|
3
|
-
|
4
|
-
"../../../build/Release/addon.node"
|
5
|
-
));
|
6
|
-
const { promisify } = require("util");
|
1
|
+
const { join } = require('path');
|
2
|
+
const { whisper } = require('../../../build/Release/addon.node');
|
3
|
+
const { promisify } = require('util');
|
7
4
|
|
8
5
|
const whisperAsync = promisify(whisper);
|
9
6
|
|
10
|
-
const
|
11
|
-
language:
|
12
|
-
model:
|
13
|
-
fname_inp:
|
7
|
+
const commonParams = {
|
8
|
+
language: 'en',
|
9
|
+
model: join(__dirname, '../../../models/ggml-base.en.bin'),
|
10
|
+
fname_inp: join(__dirname, '../../../samples/jfk.wav'),
|
14
11
|
use_gpu: true,
|
15
12
|
flash_attn: false,
|
16
13
|
no_prints: true,
|
17
|
-
comma_in_time: false,
|
18
|
-
translate: true,
|
19
14
|
no_timestamps: false,
|
15
|
+
detect_language: false,
|
20
16
|
audio_ctx: 0,
|
21
|
-
max_len: 0
|
22
|
-
prompt: "",
|
23
|
-
print_progress: false,
|
24
|
-
progress_callback: (progress) => {
|
25
|
-
console.log(`Progress: ${progress}`);
|
26
|
-
},
|
27
|
-
max_context: -1
|
17
|
+
max_len: 0
|
28
18
|
};
|
29
19
|
|
30
|
-
describe(
|
31
|
-
|
32
|
-
|
20
|
+
describe('Whisper.cpp Node.js addon with VAD support', () => {
|
21
|
+
test('Basic whisper transcription without VAD', async () => {
|
22
|
+
const params = {
|
23
|
+
...commonParams,
|
24
|
+
vad: false
|
25
|
+
};
|
33
26
|
|
34
|
-
|
35
|
-
|
27
|
+
const result = await whisperAsync(params);
|
28
|
+
|
29
|
+
expect(typeof result).toBe('object');
|
30
|
+
expect(Array.isArray(result.transcription)).toBe(true);
|
31
|
+
expect(result.transcription.length).toBeGreaterThan(0);
|
32
|
+
|
33
|
+
// Check that we got some transcription text
|
34
|
+
const text = result.transcription.map(segment => segment[2]).join(' ');
|
35
|
+
expect(text.length).toBeGreaterThan(0);
|
36
|
+
expect(text.toLowerCase()).toContain('ask not');
|
37
|
+
}, 30000);
|
38
|
+
|
39
|
+
test('VAD parameters validation', async () => {
|
40
|
+
// Test with invalid VAD model - should return empty transcription
|
41
|
+
const invalidParams = {
|
42
|
+
...commonParams,
|
43
|
+
vad: true,
|
44
|
+
vad_model: 'non-existent-model.bin',
|
45
|
+
vad_threshold: 0.5
|
46
|
+
};
|
47
|
+
|
48
|
+
// This should handle the error gracefully and return empty transcription
|
49
|
+
const result = await whisperAsync(invalidParams);
|
50
|
+
expect(typeof result).toBe('object');
|
51
|
+
expect(Array.isArray(result.transcription)).toBe(true);
|
52
|
+
// When VAD model doesn't exist, it should return empty transcription
|
53
|
+
expect(result.transcription.length).toBe(0);
|
54
|
+
}, 10000);
|
55
|
+
|
56
|
+
test('VAD parameter parsing', async () => {
|
57
|
+
// Test that VAD parameters are properly parsed (even if VAD model doesn't exist)
|
58
|
+
const vadParams = {
|
59
|
+
...commonParams,
|
60
|
+
vad: false, // Disabled so no model required
|
61
|
+
vad_threshold: 0.7,
|
62
|
+
vad_min_speech_duration_ms: 300,
|
63
|
+
vad_min_silence_duration_ms: 150,
|
64
|
+
vad_max_speech_duration_s: 45.0,
|
65
|
+
vad_speech_pad_ms: 50,
|
66
|
+
vad_samples_overlap: 0.15
|
67
|
+
};
|
68
|
+
|
69
|
+
const result = await whisperAsync(vadParams);
|
70
|
+
|
71
|
+
expect(typeof result).toBe('object');
|
72
|
+
expect(Array.isArray(result.transcription)).toBe(true);
|
73
|
+
}, 30000);
|
74
|
+
|
75
|
+
test('Progress callback with VAD disabled', async () => {
|
76
|
+
let progressCalled = false;
|
77
|
+
let lastProgress = 0;
|
78
|
+
|
79
|
+
const params = {
|
80
|
+
...commonParams,
|
81
|
+
vad: false,
|
82
|
+
progress_callback: (progress) => {
|
83
|
+
progressCalled = true;
|
84
|
+
lastProgress = progress;
|
85
|
+
expect(progress).toBeGreaterThanOrEqual(0);
|
86
|
+
expect(progress).toBeLessThanOrEqual(100);
|
87
|
+
}
|
88
|
+
};
|
89
|
+
|
90
|
+
const result = await whisperAsync(params);
|
91
|
+
|
92
|
+
expect(progressCalled).toBe(true);
|
93
|
+
expect(lastProgress).toBe(100);
|
94
|
+
expect(typeof result).toBe('object');
|
95
|
+
}, 30000);
|
96
|
+
|
97
|
+
test('Language detection without VAD', async () => {
|
98
|
+
const params = {
|
99
|
+
...commonParams,
|
100
|
+
vad: false,
|
101
|
+
detect_language: true,
|
102
|
+
language: 'auto'
|
103
|
+
};
|
104
|
+
|
105
|
+
const result = await whisperAsync(params);
|
106
|
+
|
107
|
+
expect(typeof result).toBe('object');
|
108
|
+
expect(typeof result.language).toBe('string');
|
109
|
+
expect(result.language.length).toBeGreaterThan(0);
|
110
|
+
}, 30000);
|
111
|
+
|
112
|
+
test('Basic transcription with all VAD parameters set', async () => {
|
113
|
+
// Test with VAD disabled but all parameters set to ensure no crashes
|
114
|
+
const params = {
|
115
|
+
...commonParams,
|
116
|
+
vad: false, // Disabled so it works without VAD model
|
117
|
+
vad_model: '', // Empty model path
|
118
|
+
vad_threshold: 0.6,
|
119
|
+
vad_min_speech_duration_ms: 200,
|
120
|
+
vad_min_silence_duration_ms: 80,
|
121
|
+
vad_max_speech_duration_s: 25.0,
|
122
|
+
vad_speech_pad_ms: 40,
|
123
|
+
vad_samples_overlap: 0.08
|
124
|
+
};
|
125
|
+
|
126
|
+
const result = await whisperAsync(params);
|
127
|
+
|
128
|
+
expect(typeof result).toBe('object');
|
129
|
+
expect(Array.isArray(result.transcription)).toBe(true);
|
130
|
+
expect(result.transcription.length).toBeGreaterThan(0);
|
131
|
+
}, 30000);
|
36
132
|
});
|
37
133
|
|