whispercpp 1.3.2 → 1.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +6 -3
- data/README.md +71 -14
- data/Rakefile +20 -7
- data/ext/.gitignore +4 -6
- data/ext/dependencies.rb +36 -24
- data/ext/extconf.rb +1 -1
- data/ext/options.rb +48 -184
- data/ext/ruby_whisper.c +18 -0
- data/ext/ruby_whisper_context.c +43 -12
- data/ext/ruby_whisper_model.c +1 -1
- data/ext/ruby_whisper_params.c +4 -2
- data/ext/ruby_whisper_segment.c +81 -4
- data/ext/ruby_whisper_transcribe.cpp +13 -7
- data/ext/ruby_whisper_vad_params.c +1 -1
- data/ext/sources/CMakeLists.txt +5 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
- data/ext/sources/examples/addon.node/addon.cpp +150 -31
- data/ext/sources/examples/addon.node/index.js +3 -0
- data/ext/sources/examples/addon.node/vad-example.js +132 -0
- data/ext/sources/examples/bench/bench.cpp +3 -2
- data/ext/sources/examples/cli/cli.cpp +3 -2
- data/ext/sources/examples/command/command.cpp +32 -8
- data/ext/sources/examples/common-whisper.cpp +14 -7
- data/ext/sources/examples/lsp/lsp.cpp +2 -0
- data/ext/sources/examples/quantize/quantize.cpp +3 -0
- data/ext/sources/examples/server/CMakeLists.txt +3 -0
- data/ext/sources/examples/server/server.cpp +169 -22
- data/ext/sources/examples/stream/stream.cpp +6 -0
- data/ext/sources/examples/talk-llama/CMakeLists.txt +4 -1
- data/ext/sources/examples/talk-llama/llama-arch.cpp +171 -3
- data/ext/sources/examples/talk-llama/llama-arch.h +28 -1
- data/ext/sources/examples/talk-llama/llama-batch.cpp +741 -272
- data/ext/sources/examples/talk-llama/llama-batch.h +112 -54
- data/ext/sources/examples/talk-llama/llama-chat.cpp +30 -8
- data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +520 -351
- data/ext/sources/examples/talk-llama/llama-context.h +38 -17
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
- data/ext/sources/examples/talk-llama/llama-cparams.h +1 -1
- data/ext/sources/examples/talk-llama/llama-graph.cpp +447 -372
- data/ext/sources/examples/talk-llama/llama-graph.h +128 -58
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +10 -2
- data/ext/sources/examples/talk-llama/llama-hparams.h +19 -2
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +279 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.h +128 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +1841 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +303 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +14 -472
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +86 -26
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +246 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +138 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1125 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +183 -0
- data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
- data/ext/sources/examples/talk-llama/llama-memory.h +88 -4
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +42 -17
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
- data/ext/sources/examples/talk-llama/llama-model.cpp +1863 -563
- data/ext/sources/examples/talk-llama/llama-model.h +27 -0
- data/ext/sources/examples/talk-llama/llama-quant.cpp +89 -6
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +65 -28
- data/ext/sources/examples/talk-llama/llama-vocab.h +1 -0
- data/ext/sources/examples/talk-llama/llama.cpp +11 -7
- data/ext/sources/examples/talk-llama/llama.h +147 -40
- data/ext/sources/examples/talk-llama/talk-llama.cpp +2 -0
- data/ext/sources/examples/talk-llama/unicode.cpp +5 -0
- data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +2 -0
- data/ext/sources/ggml/CMakeLists.txt +48 -3
- data/ext/sources/ggml/cmake/common.cmake +24 -0
- data/ext/sources/ggml/include/ggml-backend.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +2 -0
- data/ext/sources/ggml/include/ggml.h +144 -5
- data/ext/sources/ggml/src/CMakeLists.txt +82 -24
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +5 -0
- data/ext/sources/ggml/src/ggml-backend.cpp +46 -23
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +3 -3
- data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +1 -0
- data/ext/sources/ggml/src/ggml-cann/common.h +6 -1
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- data/ext/sources/ggml/src/ggml-common.h +4 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +133 -40
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- data/ext/sources/ggml/src/ggml-cpu/common.h +4 -3
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +146 -105
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1057 -174
- data/ext/sources/ggml/src/ggml-cpu/ops.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +1158 -0
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1571 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.h +98 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +330 -38
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +111 -18
- data/ext/sources/ggml/src/ggml-cpu/vec.h +303 -94
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +60 -37
- data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +22 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +2 -2
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +265 -123
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +19 -0
- data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/mmv.cu +257 -87
- data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +2 -3
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +5 -18
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +89 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +7 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- data/ext/sources/ggml/src/ggml-impl.h +127 -183
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +27 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +331 -49
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +564 -282
- data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +14 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1859 -489
- data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +72 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +201 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- data/ext/sources/ggml/src/ggml-quants.c +6 -8
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +28 -41
- data/ext/sources/ggml/src/ggml-sycl/conv.cpp +4 -10
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +117 -165
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +192 -53
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +648 -1039
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +18 -9
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +8 -105
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -100
- data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +60 -80
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +158 -203
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +55 -74
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -10
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +138 -27
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +12 -16
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +726 -282
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -1
- data/ext/sources/ggml/src/ggml.c +328 -48
- data/ext/sources/ggml/src/ggml.cpp +26 -0
- data/ext/sources/ggml/src/gguf.cpp +24 -3
- data/ext/sources/include/whisper.h +2 -0
- data/ext/sources/src/CMakeLists.txt +2 -0
- data/ext/sources/src/coreml/whisper-compat.h +10 -0
- data/ext/sources/src/coreml/whisper-compat.m +35 -0
- data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
- data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
- data/ext/sources/src/whisper.cpp +218 -169
- data/extsources.rb +15 -9
- data/lib/whisper/context.rb +15 -0
- data/lib/whisper/model/uri.rb +56 -1
- data/lib/whisper/segment.rb +58 -0
- data/sig/whisper.rbs +68 -38
- data/{tests → test}/helper.rb +1 -12
- data/{tests → test}/test_model.rb +9 -0
- data/test/test_package.rb +51 -0
- data/test/test_segment.rb +146 -0
- data/{tests → test}/test_whisper.rb +70 -0
- data/whispercpp.gemspec +2 -3
- metadata +91 -43
- data/ext/sources/.dockerignore +0 -3
- data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
- data/ext/sources/ci/run.sh +0 -336
- data/ext/sources/close-issue.yml +0 -28
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2739
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
- data/tests/test_package.rb +0 -46
- data/tests/test_segment.rb +0 -74
- /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
- /data/{tests → test}/jfk_reader/.gitignore +0 -0
- /data/{tests → test}/jfk_reader/extconf.rb +0 -0
- /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
- /data/{tests → test}/test_callback.rb +0 -0
- /data/{tests → test}/test_error.rb +0 -0
- /data/{tests → test}/test_params.rb +0 -0
- /data/{tests → test}/test_vad.rb +0 -0
- /data/{tests → test}/test_vad_params.rb +0 -0
@@ -9,6 +9,7 @@
|
|
9
9
|
#include <vector>
|
10
10
|
#include <cmath>
|
11
11
|
#include <cstdint>
|
12
|
+
#include <cfloat>
|
12
13
|
|
13
14
|
struct whisper_params {
|
14
15
|
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
@@ -38,6 +39,7 @@ struct whisper_params {
|
|
38
39
|
bool print_progress = false;
|
39
40
|
bool no_timestamps = false;
|
40
41
|
bool no_prints = false;
|
42
|
+
bool detect_language= false;
|
41
43
|
bool use_gpu = true;
|
42
44
|
bool flash_attn = false;
|
43
45
|
bool comma_in_time = true;
|
@@ -50,6 +52,16 @@ struct whisper_params {
|
|
50
52
|
std::vector<std::string> fname_out = {};
|
51
53
|
|
52
54
|
std::vector<float> pcmf32 = {}; // mono-channel F32 PCM
|
55
|
+
|
56
|
+
// Voice Activity Detection (VAD) parameters
|
57
|
+
bool vad = false;
|
58
|
+
std::string vad_model = "";
|
59
|
+
float vad_threshold = 0.5f;
|
60
|
+
int vad_min_speech_duration_ms = 250;
|
61
|
+
int vad_min_silence_duration_ms = 100;
|
62
|
+
float vad_max_speech_duration_s = FLT_MAX;
|
63
|
+
int vad_speech_pad_ms = 30;
|
64
|
+
float vad_samples_overlap = 0.1f;
|
53
65
|
};
|
54
66
|
|
55
67
|
struct whisper_print_user_data {
|
@@ -130,6 +142,11 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
|
|
130
142
|
|
131
143
|
void cb_log_disable(enum ggml_log_level, const char *, void *) {}
|
132
144
|
|
145
|
+
struct whisper_result {
|
146
|
+
std::vector<std::vector<std::string>> segments;
|
147
|
+
std::string language;
|
148
|
+
};
|
149
|
+
|
133
150
|
class ProgressWorker : public Napi::AsyncWorker {
|
134
151
|
public:
|
135
152
|
ProgressWorker(Napi::Function& callback, whisper_params params, Napi::Function progress_callback, Napi::Env env)
|
@@ -160,15 +177,27 @@ class ProgressWorker : public Napi::AsyncWorker {
|
|
160
177
|
|
161
178
|
void OnOK() override {
|
162
179
|
Napi::HandleScope scope(Env());
|
163
|
-
|
164
|
-
|
180
|
+
|
181
|
+
if (params.detect_language) {
|
182
|
+
Napi::Object resultObj = Napi::Object::New(Env());
|
183
|
+
resultObj.Set("language", Napi::String::New(Env(), result.language));
|
184
|
+
Callback().Call({Env().Null(), resultObj});
|
185
|
+
}
|
186
|
+
|
187
|
+
Napi::Object returnObj = Napi::Object::New(Env());
|
188
|
+
if (!result.language.empty()) {
|
189
|
+
returnObj.Set("language", Napi::String::New(Env(), result.language));
|
190
|
+
}
|
191
|
+
Napi::Array transcriptionArray = Napi::Array::New(Env(), result.segments.size());
|
192
|
+
for (uint64_t i = 0; i < result.segments.size(); ++i) {
|
165
193
|
Napi::Object tmp = Napi::Array::New(Env(), 3);
|
166
194
|
for (uint64_t j = 0; j < 3; ++j) {
|
167
|
-
tmp[j] = Napi::String::New(Env(), result[i][j]);
|
195
|
+
tmp[j] = Napi::String::New(Env(), result.segments[i][j]);
|
168
196
|
}
|
169
|
-
|
170
|
-
|
171
|
-
|
197
|
+
transcriptionArray[i] = tmp;
|
198
|
+
}
|
199
|
+
returnObj.Set("transcription", transcriptionArray);
|
200
|
+
Callback().Call({Env().Null(), returnObj});
|
172
201
|
}
|
173
202
|
|
174
203
|
// Progress callback function - using thread-safe function
|
@@ -185,12 +214,12 @@ class ProgressWorker : public Napi::AsyncWorker {
|
|
185
214
|
|
186
215
|
private:
|
187
216
|
whisper_params params;
|
188
|
-
|
217
|
+
whisper_result result;
|
189
218
|
Napi::Env env;
|
190
219
|
Napi::ThreadSafeFunction tsfn;
|
191
220
|
|
192
221
|
// Custom run function with progress callback support
|
193
|
-
int run_with_progress(whisper_params &params,
|
222
|
+
int run_with_progress(whisper_params &params, whisper_result & result) {
|
194
223
|
if (params.no_prints) {
|
195
224
|
whisper_log_set(cb_log_disable, NULL);
|
196
225
|
}
|
@@ -279,7 +308,8 @@ class ProgressWorker : public Napi::AsyncWorker {
|
|
279
308
|
wparams.print_timestamps = !params.no_timestamps;
|
280
309
|
wparams.print_special = params.print_special;
|
281
310
|
wparams.translate = params.translate;
|
282
|
-
wparams.language = params.language.c_str();
|
311
|
+
wparams.language = params.detect_language ? "auto" : params.language.c_str();
|
312
|
+
wparams.detect_language = params.detect_language;
|
283
313
|
wparams.n_threads = params.n_threads;
|
284
314
|
wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
|
285
315
|
wparams.offset_ms = params.offset_t_ms;
|
@@ -314,34 +344,38 @@ class ProgressWorker : public Napi::AsyncWorker {
|
|
314
344
|
};
|
315
345
|
wparams.progress_callback_user_data = this;
|
316
346
|
|
317
|
-
//
|
318
|
-
|
319
|
-
|
347
|
+
// Set VAD parameters
|
348
|
+
wparams.vad = params.vad;
|
349
|
+
wparams.vad_model_path = params.vad_model.c_str();
|
320
350
|
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
351
|
+
wparams.vad_params.threshold = params.vad_threshold;
|
352
|
+
wparams.vad_params.min_speech_duration_ms = params.vad_min_speech_duration_ms;
|
353
|
+
wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
|
354
|
+
wparams.vad_params.max_speech_duration_s = params.vad_max_speech_duration_s;
|
355
|
+
wparams.vad_params.speech_pad_ms = params.vad_speech_pad_ms;
|
356
|
+
wparams.vad_params.samples_overlap = params.vad_samples_overlap;
|
327
357
|
|
328
358
|
if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
|
329
359
|
fprintf(stderr, "failed to process audio\n");
|
330
360
|
return 10;
|
331
361
|
}
|
332
362
|
}
|
333
|
-
|
363
|
+
}
|
334
364
|
|
365
|
+
if (params.detect_language || params.language == "auto") {
|
366
|
+
result.language = whisper_lang_str(whisper_full_lang_id(ctx));
|
367
|
+
}
|
335
368
|
const int n_segments = whisper_full_n_segments(ctx);
|
336
|
-
result.resize(n_segments);
|
369
|
+
result.segments.resize(n_segments);
|
370
|
+
|
337
371
|
for (int i = 0; i < n_segments; ++i) {
|
338
372
|
const char * text = whisper_full_get_segment_text(ctx, i);
|
339
373
|
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
340
374
|
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
341
375
|
|
342
|
-
result[i].emplace_back(to_timestamp(t0, params.comma_in_time));
|
343
|
-
result[i].emplace_back(to_timestamp(t1, params.comma_in_time));
|
344
|
-
result[i].emplace_back(text);
|
376
|
+
result.segments[i].emplace_back(to_timestamp(t0, params.comma_in_time));
|
377
|
+
result.segments[i].emplace_back(to_timestamp(t1, params.comma_in_time));
|
378
|
+
result.segments[i].emplace_back(text);
|
345
379
|
}
|
346
380
|
|
347
381
|
whisper_print_timings(ctx);
|
@@ -362,13 +396,46 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
|
|
362
396
|
std::string language = whisper_params.Get("language").As<Napi::String>();
|
363
397
|
std::string model = whisper_params.Get("model").As<Napi::String>();
|
364
398
|
std::string input = whisper_params.Get("fname_inp").As<Napi::String>();
|
365
|
-
|
366
|
-
bool
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
399
|
+
|
400
|
+
bool use_gpu = true;
|
401
|
+
if (whisper_params.Has("use_gpu") && whisper_params.Get("use_gpu").IsBoolean()) {
|
402
|
+
use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>();
|
403
|
+
}
|
404
|
+
|
405
|
+
bool flash_attn = false;
|
406
|
+
if (whisper_params.Has("flash_attn") && whisper_params.Get("flash_attn").IsBoolean()) {
|
407
|
+
flash_attn = whisper_params.Get("flash_attn").As<Napi::Boolean>();
|
408
|
+
}
|
409
|
+
|
410
|
+
bool no_prints = false;
|
411
|
+
if (whisper_params.Has("no_prints") && whisper_params.Get("no_prints").IsBoolean()) {
|
412
|
+
no_prints = whisper_params.Get("no_prints").As<Napi::Boolean>();
|
413
|
+
}
|
414
|
+
|
415
|
+
bool no_timestamps = false;
|
416
|
+
if (whisper_params.Has("no_timestamps") && whisper_params.Get("no_timestamps").IsBoolean()) {
|
417
|
+
no_timestamps = whisper_params.Get("no_timestamps").As<Napi::Boolean>();
|
418
|
+
}
|
419
|
+
|
420
|
+
bool detect_language = false;
|
421
|
+
if (whisper_params.Has("detect_language") && whisper_params.Get("detect_language").IsBoolean()) {
|
422
|
+
detect_language = whisper_params.Get("detect_language").As<Napi::Boolean>();
|
423
|
+
}
|
424
|
+
|
425
|
+
int32_t audio_ctx = 0;
|
426
|
+
if (whisper_params.Has("audio_ctx") && whisper_params.Get("audio_ctx").IsNumber()) {
|
427
|
+
audio_ctx = whisper_params.Get("audio_ctx").As<Napi::Number>();
|
428
|
+
}
|
429
|
+
|
430
|
+
bool comma_in_time = true;
|
431
|
+
if (whisper_params.Has("comma_in_time") && whisper_params.Get("comma_in_time").IsBoolean()) {
|
432
|
+
comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
|
433
|
+
}
|
434
|
+
|
435
|
+
int32_t max_len = 0;
|
436
|
+
if (whisper_params.Has("max_len") && whisper_params.Get("max_len").IsNumber()) {
|
437
|
+
max_len = whisper_params.Get("max_len").As<Napi::Number>();
|
438
|
+
}
|
372
439
|
|
373
440
|
// Add support for max_context
|
374
441
|
int32_t max_context = -1;
|
@@ -384,7 +451,7 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
|
|
384
451
|
|
385
452
|
// Add support for print_progress
|
386
453
|
bool print_progress = false;
|
387
|
-
if (whisper_params.Has("print_progress")) {
|
454
|
+
if (whisper_params.Has("print_progress") && whisper_params.Get("print_progress").IsBoolean()) {
|
388
455
|
print_progress = whisper_params.Get("print_progress").As<Napi::Boolean>();
|
389
456
|
}
|
390
457
|
// Add support for progress_callback
|
@@ -393,6 +460,47 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
|
|
393
460
|
progress_callback = whisper_params.Get("progress_callback").As<Napi::Function>();
|
394
461
|
}
|
395
462
|
|
463
|
+
// Add support for VAD parameters
|
464
|
+
bool vad = false;
|
465
|
+
if (whisper_params.Has("vad") && whisper_params.Get("vad").IsBoolean()) {
|
466
|
+
vad = whisper_params.Get("vad").As<Napi::Boolean>();
|
467
|
+
}
|
468
|
+
|
469
|
+
std::string vad_model = "";
|
470
|
+
if (whisper_params.Has("vad_model") && whisper_params.Get("vad_model").IsString()) {
|
471
|
+
vad_model = whisper_params.Get("vad_model").As<Napi::String>();
|
472
|
+
}
|
473
|
+
|
474
|
+
float vad_threshold = 0.5f;
|
475
|
+
if (whisper_params.Has("vad_threshold") && whisper_params.Get("vad_threshold").IsNumber()) {
|
476
|
+
vad_threshold = whisper_params.Get("vad_threshold").As<Napi::Number>();
|
477
|
+
}
|
478
|
+
|
479
|
+
int vad_min_speech_duration_ms = 250;
|
480
|
+
if (whisper_params.Has("vad_min_speech_duration_ms") && whisper_params.Get("vad_min_speech_duration_ms").IsNumber()) {
|
481
|
+
vad_min_speech_duration_ms = whisper_params.Get("vad_min_speech_duration_ms").As<Napi::Number>();
|
482
|
+
}
|
483
|
+
|
484
|
+
int vad_min_silence_duration_ms = 100;
|
485
|
+
if (whisper_params.Has("vad_min_silence_duration_ms") && whisper_params.Get("vad_min_silence_duration_ms").IsNumber()) {
|
486
|
+
vad_min_silence_duration_ms = whisper_params.Get("vad_min_silence_duration_ms").As<Napi::Number>();
|
487
|
+
}
|
488
|
+
|
489
|
+
float vad_max_speech_duration_s = FLT_MAX;
|
490
|
+
if (whisper_params.Has("vad_max_speech_duration_s") && whisper_params.Get("vad_max_speech_duration_s").IsNumber()) {
|
491
|
+
vad_max_speech_duration_s = whisper_params.Get("vad_max_speech_duration_s").As<Napi::Number>();
|
492
|
+
}
|
493
|
+
|
494
|
+
int vad_speech_pad_ms = 30;
|
495
|
+
if (whisper_params.Has("vad_speech_pad_ms") && whisper_params.Get("vad_speech_pad_ms").IsNumber()) {
|
496
|
+
vad_speech_pad_ms = whisper_params.Get("vad_speech_pad_ms").As<Napi::Number>();
|
497
|
+
}
|
498
|
+
|
499
|
+
float vad_samples_overlap = 0.1f;
|
500
|
+
if (whisper_params.Has("vad_samples_overlap") && whisper_params.Get("vad_samples_overlap").IsNumber()) {
|
501
|
+
vad_samples_overlap = whisper_params.Get("vad_samples_overlap").As<Napi::Number>();
|
502
|
+
}
|
503
|
+
|
396
504
|
Napi::Value pcmf32Value = whisper_params.Get("pcmf32");
|
397
505
|
std::vector<float> pcmf32_vec;
|
398
506
|
if (pcmf32Value.IsTypedArray()) {
|
@@ -418,6 +526,17 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
|
|
418
526
|
params.max_context = max_context;
|
419
527
|
params.print_progress = print_progress;
|
420
528
|
params.prompt = prompt;
|
529
|
+
params.detect_language = detect_language;
|
530
|
+
|
531
|
+
// Set VAD parameters
|
532
|
+
params.vad = vad;
|
533
|
+
params.vad_model = vad_model;
|
534
|
+
params.vad_threshold = vad_threshold;
|
535
|
+
params.vad_min_speech_duration_ms = vad_min_speech_duration_ms;
|
536
|
+
params.vad_min_silence_duration_ms = vad_min_silence_duration_ms;
|
537
|
+
params.vad_max_speech_duration_s = vad_max_speech_duration_s;
|
538
|
+
params.vad_speech_pad_ms = vad_speech_pad_ms;
|
539
|
+
params.vad_samples_overlap = vad_samples_overlap;
|
421
540
|
|
422
541
|
Napi::Function callback = info[1].As<Napi::Function>();
|
423
542
|
// Create a new Worker class with progress callback support
|
@@ -17,6 +17,7 @@ const whisperParams = {
|
|
17
17
|
comma_in_time: false,
|
18
18
|
translate: true,
|
19
19
|
no_timestamps: false,
|
20
|
+
detect_language: false,
|
20
21
|
audio_ctx: 0,
|
21
22
|
max_len: 0,
|
22
23
|
progress_callback: (progress) => {
|
@@ -31,6 +32,8 @@ const params = Object.fromEntries(
|
|
31
32
|
const [key, value] = item.slice(2).split("=");
|
32
33
|
if (key === "audio_ctx") {
|
33
34
|
whisperParams[key] = parseInt(value);
|
35
|
+
} else if (key === "detect_language") {
|
36
|
+
whisperParams[key] = value === "true";
|
34
37
|
} else {
|
35
38
|
whisperParams[key] = value;
|
36
39
|
}
|
@@ -0,0 +1,132 @@
|
|
1
|
+
const path = require("path");
|
2
|
+
const { whisper } = require(path.join(
|
3
|
+
__dirname,
|
4
|
+
"../../build/Release/addon.node"
|
5
|
+
));
|
6
|
+
const { promisify } = require("util");
|
7
|
+
|
8
|
+
const whisperAsync = promisify(whisper);
|
9
|
+
|
10
|
+
// Example with VAD enabled
|
11
|
+
const vadParams = {
|
12
|
+
language: "en",
|
13
|
+
model: path.join(__dirname, "../../models/ggml-base.en.bin"),
|
14
|
+
fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
|
15
|
+
use_gpu: true,
|
16
|
+
flash_attn: false,
|
17
|
+
no_prints: false,
|
18
|
+
comma_in_time: true,
|
19
|
+
translate: false,
|
20
|
+
no_timestamps: false,
|
21
|
+
detect_language: false,
|
22
|
+
audio_ctx: 0,
|
23
|
+
max_len: 0,
|
24
|
+
// VAD parameters
|
25
|
+
vad: true,
|
26
|
+
vad_model: path.join(__dirname, "../../models/ggml-silero-v5.1.2.bin"), // You need to download this model
|
27
|
+
vad_threshold: 0.5,
|
28
|
+
vad_min_speech_duration_ms: 250,
|
29
|
+
vad_min_silence_duration_ms: 100,
|
30
|
+
vad_max_speech_duration_s: 30.0,
|
31
|
+
vad_speech_pad_ms: 30,
|
32
|
+
vad_samples_overlap: 0.1,
|
33
|
+
progress_callback: (progress) => {
|
34
|
+
console.log(`VAD Transcription progress: ${progress}%`);
|
35
|
+
}
|
36
|
+
};
|
37
|
+
|
38
|
+
// Example without VAD (traditional approach)
|
39
|
+
const traditionalParams = {
|
40
|
+
language: "en",
|
41
|
+
model: path.join(__dirname, "../../models/ggml-base.en.bin"),
|
42
|
+
fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
|
43
|
+
use_gpu: true,
|
44
|
+
flash_attn: false,
|
45
|
+
no_prints: false,
|
46
|
+
comma_in_time: true,
|
47
|
+
translate: false,
|
48
|
+
no_timestamps: false,
|
49
|
+
detect_language: false,
|
50
|
+
audio_ctx: 0,
|
51
|
+
max_len: 0,
|
52
|
+
vad: false, // Explicitly disable VAD
|
53
|
+
progress_callback: (progress) => {
|
54
|
+
console.log(`Traditional transcription progress: ${progress}%`);
|
55
|
+
}
|
56
|
+
};
|
57
|
+
|
58
|
+
async function runVADExample() {
|
59
|
+
try {
|
60
|
+
console.log("=== Whisper.cpp Node.js VAD Example ===\n");
|
61
|
+
|
62
|
+
// Check if VAD model exists
|
63
|
+
const fs = require('fs');
|
64
|
+
if (!fs.existsSync(vadParams.vad_model)) {
|
65
|
+
console.log("⚠️ VAD model not found. Please download the VAD model first:");
|
66
|
+
console.log(" ./models/download-vad-model.sh silero-v5.1.2");
|
67
|
+
console.log(" Or run: python models/convert-silero-vad-to-ggml.py");
|
68
|
+
console.log("\n Falling back to traditional transcription without VAD...\n");
|
69
|
+
|
70
|
+
// Run without VAD
|
71
|
+
console.log("🎵 Running traditional transcription...");
|
72
|
+
const traditionalResult = await whisperAsync(traditionalParams);
|
73
|
+
console.log("\n📝 Traditional transcription result:");
|
74
|
+
console.log(traditionalResult);
|
75
|
+
return;
|
76
|
+
}
|
77
|
+
|
78
|
+
console.log("🎵 Running transcription with VAD enabled...");
|
79
|
+
console.log("VAD Parameters:");
|
80
|
+
console.log(` - Threshold: ${vadParams.vad_threshold}`);
|
81
|
+
console.log(` - Min speech duration: ${vadParams.vad_min_speech_duration_ms}ms`);
|
82
|
+
console.log(` - Min silence duration: ${vadParams.vad_min_silence_duration_ms}ms`);
|
83
|
+
console.log(` - Max speech duration: ${vadParams.vad_max_speech_duration_s}s`);
|
84
|
+
console.log(` - Speech padding: ${vadParams.vad_speech_pad_ms}ms`);
|
85
|
+
console.log(` - Samples overlap: ${vadParams.vad_samples_overlap}\n`);
|
86
|
+
|
87
|
+
const startTime = Date.now();
|
88
|
+
const vadResult = await whisperAsync(vadParams);
|
89
|
+
const vadDuration = Date.now() - startTime;
|
90
|
+
|
91
|
+
console.log("\n✅ VAD transcription completed!");
|
92
|
+
console.log(`⏱️ Processing time: ${vadDuration}ms`);
|
93
|
+
console.log("\n📝 VAD transcription result:");
|
94
|
+
console.log(vadResult);
|
95
|
+
|
96
|
+
// Compare with traditional approach
|
97
|
+
console.log("\n🔄 Running traditional transcription for comparison...");
|
98
|
+
const traditionalStartTime = Date.now();
|
99
|
+
const traditionalResult = await whisperAsync(traditionalParams);
|
100
|
+
const traditionalDuration = Date.now() - traditionalStartTime;
|
101
|
+
|
102
|
+
console.log("\n✅ Traditional transcription completed!");
|
103
|
+
console.log(`⏱️ Processing time: ${traditionalDuration}ms`);
|
104
|
+
console.log("\n📝 Traditional transcription result:");
|
105
|
+
console.log(traditionalResult);
|
106
|
+
|
107
|
+
// Performance comparison
|
108
|
+
console.log("\n📊 Performance Comparison:");
|
109
|
+
console.log(`VAD: ${vadDuration}ms`);
|
110
|
+
console.log(`Traditional: ${traditionalDuration}ms`);
|
111
|
+
const speedup = traditionalDuration / vadDuration;
|
112
|
+
if (speedup > 1) {
|
113
|
+
console.log(`🚀 VAD is ${speedup.toFixed(2)}x faster!`);
|
114
|
+
} else {
|
115
|
+
console.log(`ℹ️ Traditional approach was ${(1/speedup).toFixed(2)}x faster in this case.`);
|
116
|
+
}
|
117
|
+
|
118
|
+
} catch (error) {
|
119
|
+
console.error("❌ Error during transcription:", error);
|
120
|
+
}
|
121
|
+
}
|
122
|
+
|
123
|
+
// Run the example
|
124
|
+
if (require.main === module) {
|
125
|
+
runVADExample();
|
126
|
+
}
|
127
|
+
|
128
|
+
module.exports = {
|
129
|
+
runVADExample,
|
130
|
+
vadParams,
|
131
|
+
traditionalParams
|
132
|
+
};
|
@@ -66,13 +66,12 @@ static int whisper_bench_full(const whisper_params & params) {
|
|
66
66
|
cparams.use_gpu = params.use_gpu;
|
67
67
|
cparams.flash_attn = params.flash_attn;
|
68
68
|
|
69
|
-
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
|
70
|
-
|
71
69
|
{
|
72
70
|
fprintf(stderr, "\n");
|
73
71
|
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", params.n_threads, std::thread::hardware_concurrency(), whisper_print_system_info());
|
74
72
|
}
|
75
73
|
|
74
|
+
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
|
76
75
|
if (ctx == nullptr) {
|
77
76
|
fprintf(stderr, "error: failed to initialize whisper context\n");
|
78
77
|
return 2;
|
@@ -156,6 +155,8 @@ static int whisper_bench_full(const whisper_params & params) {
|
|
156
155
|
}
|
157
156
|
|
158
157
|
int main(int argc, char ** argv) {
|
158
|
+
ggml_backend_load_all();
|
159
|
+
|
159
160
|
whisper_params params;
|
160
161
|
|
161
162
|
if (whisper_params_parse(argc, argv, params) == false) {
|
@@ -202,7 +202,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
|
|
202
202
|
else if ( arg == "--vad") { params.vad = true; }
|
203
203
|
else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = ARGV_NEXT; }
|
204
204
|
else if (arg == "-vt" || arg == "--vad-threshold") { params.vad_threshold = std::stof(ARGV_NEXT); }
|
205
|
-
else if (arg == "-
|
205
|
+
else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(ARGV_NEXT); }
|
206
206
|
else if (arg == "-vsd" || arg == "--vad-min-silence-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(ARGV_NEXT); }
|
207
207
|
else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s") { params.vad_max_speech_duration_s = std::stof(ARGV_NEXT); }
|
208
208
|
else if (arg == "-vp" || arg == "--vad-speech-pad-ms") { params.vad_speech_pad_ms = std::stoi(ARGV_NEXT); }
|
@@ -909,6 +909,8 @@ static void output_lrc(struct whisper_context * ctx, std::ofstream & fout, const
|
|
909
909
|
static void cb_log_disable(enum ggml_log_level , const char * , void * ) { }
|
910
910
|
|
911
911
|
int main(int argc, char ** argv) {
|
912
|
+
ggml_backend_load_all();
|
913
|
+
|
912
914
|
#if defined(_WIN32)
|
913
915
|
// Set the console output code page to UTF-8, while command line arguments
|
914
916
|
// are still encoded in the system's code page. In this way, we can print
|
@@ -988,7 +990,6 @@ int main(int argc, char ** argv) {
|
|
988
990
|
}
|
989
991
|
|
990
992
|
// whisper init
|
991
|
-
|
992
993
|
struct whisper_context_params cparams = whisper_context_default_params();
|
993
994
|
|
994
995
|
cparams.use_gpu = params.use_gpu;
|
@@ -251,7 +251,7 @@ static std::vector<std::string> get_words(const std::string &txt) {
|
|
251
251
|
|
252
252
|
// command-list mode
|
253
253
|
// guide the transcription to match the most likely command from a provided list
|
254
|
-
static int process_command_list(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
|
254
|
+
static int process_command_list(struct whisper_context * ctx, audio_async &audio, const whisper_params &params, std::ofstream &fout) {
|
255
255
|
fprintf(stderr, "\n");
|
256
256
|
fprintf(stderr, "%s: guided mode\n", __func__);
|
257
257
|
|
@@ -444,12 +444,16 @@ static int process_command_list(struct whisper_context * ctx, audio_async &audio
|
|
444
444
|
|
445
445
|
const float prob = probs_id[0].first;
|
446
446
|
const int index = probs_id[0].second;
|
447
|
+
const char * best_command = allowed_commands[index].c_str();
|
447
448
|
|
448
449
|
fprintf(stdout, "\n");
|
449
450
|
fprintf(stdout, "%s: detected command: %s%s%s | p = %f | t = %d ms\n", __func__,
|
450
|
-
"\033[1m",
|
451
|
+
"\033[1m", best_command, "\033[0m", prob,
|
451
452
|
(int) std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count());
|
452
453
|
fprintf(stdout, "\n");
|
454
|
+
if (fout.is_open()) {
|
455
|
+
fout << best_command << std::endl;
|
456
|
+
}
|
453
457
|
}
|
454
458
|
}
|
455
459
|
|
@@ -462,7 +466,7 @@ static int process_command_list(struct whisper_context * ctx, audio_async &audio
|
|
462
466
|
|
463
467
|
// always-prompt mode
|
464
468
|
// transcribe the voice into text after valid prompt
|
465
|
-
static int always_prompt_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
|
469
|
+
static int always_prompt_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params, std::ofstream & fout) {
|
466
470
|
bool is_running = true;
|
467
471
|
bool ask_prompt = true;
|
468
472
|
|
@@ -528,6 +532,9 @@ static int always_prompt_transcription(struct whisper_context * ctx, audio_async
|
|
528
532
|
|
529
533
|
if ((sim > 0.7f) && (command.size() > 0)) {
|
530
534
|
fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
|
535
|
+
if (fout.is_open()) {
|
536
|
+
fout << command << std::endl;
|
537
|
+
}
|
531
538
|
}
|
532
539
|
|
533
540
|
fprintf(stdout, "\n");
|
@@ -542,7 +549,7 @@ static int always_prompt_transcription(struct whisper_context * ctx, audio_async
|
|
542
549
|
|
543
550
|
// general-purpose mode
|
544
551
|
// freely transcribe the voice into text
|
545
|
-
static int process_general_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
|
552
|
+
static int process_general_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params, std::ofstream & fout) {
|
546
553
|
bool is_running = true;
|
547
554
|
bool have_prompt = false;
|
548
555
|
bool ask_prompt = true;
|
@@ -662,8 +669,10 @@ static int process_general_transcription(struct whisper_context * ctx, audio_asy
|
|
662
669
|
} else {
|
663
670
|
// cut the prompt from the decoded text
|
664
671
|
const std::string command = ::trim(txt.substr(best_len));
|
665
|
-
|
666
672
|
fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
|
673
|
+
if (fout.is_open()) {
|
674
|
+
fout << command << std::endl;
|
675
|
+
}
|
667
676
|
}
|
668
677
|
|
669
678
|
fprintf(stdout, "\n");
|
@@ -678,6 +687,8 @@ static int process_general_transcription(struct whisper_context * ctx, audio_asy
|
|
678
687
|
}
|
679
688
|
|
680
689
|
int main(int argc, char ** argv) {
|
690
|
+
ggml_backend_load_all();
|
691
|
+
|
681
692
|
whisper_params params;
|
682
693
|
|
683
694
|
if (whisper_params_parse(argc, argv, params) == false) {
|
@@ -698,6 +709,10 @@ int main(int argc, char ** argv) {
|
|
698
709
|
cparams.flash_attn = params.flash_attn;
|
699
710
|
|
700
711
|
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
|
712
|
+
if (ctx == nullptr) {
|
713
|
+
fprintf(stderr, "error: failed to initialize whisper context\n");
|
714
|
+
return 2;
|
715
|
+
}
|
701
716
|
|
702
717
|
// print some info about the processing
|
703
718
|
{
|
@@ -757,13 +772,22 @@ int main(int argc, char ** argv) {
|
|
757
772
|
}
|
758
773
|
}
|
759
774
|
|
775
|
+
std::ofstream fout;
|
776
|
+
if (params.fname_out.length() > 0) {
|
777
|
+
fout.open(params.fname_out);
|
778
|
+
if (!fout.is_open()) {
|
779
|
+
fprintf(stderr, "%s: failed to open output file '%s'!\n", __func__, params.fname_out.c_str());
|
780
|
+
return 1;
|
781
|
+
}
|
782
|
+
}
|
783
|
+
|
760
784
|
if (ret_val == 0) {
|
761
785
|
if (!params.commands.empty()) {
|
762
|
-
ret_val = process_command_list(ctx, audio, params);
|
786
|
+
ret_val = process_command_list(ctx, audio, params, fout);
|
763
787
|
} else if (!params.prompt.empty() && params.grammar_parsed.rules.empty()) {
|
764
|
-
ret_val = always_prompt_transcription(ctx, audio, params);
|
788
|
+
ret_val = always_prompt_transcription(ctx, audio, params, fout);
|
765
789
|
} else {
|
766
|
-
ret_val = process_general_transcription(ctx, audio, params);
|
790
|
+
ret_val = process_general_transcription(ctx, audio, params, fout);
|
767
791
|
}
|
768
792
|
}
|
769
793
|
|
@@ -112,13 +112,20 @@ bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std:
|
|
112
112
|
}
|
113
113
|
|
114
114
|
if (stereo) {
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
115
|
+
std::vector<float> stereo_data = pcmf32;
|
116
|
+
pcmf32.resize(frame_count);
|
117
|
+
|
118
|
+
for (uint64_t i = 0; i < frame_count; i++) {
|
119
|
+
pcmf32[i] = (stereo_data[2*i] + stereo_data[2*i + 1]);
|
120
|
+
}
|
121
|
+
|
122
|
+
pcmf32s.resize(2);
|
123
|
+
pcmf32s[0].resize(frame_count);
|
124
|
+
pcmf32s[1].resize(frame_count);
|
125
|
+
for (uint64_t i = 0; i < frame_count; i++) {
|
126
|
+
pcmf32s[0][i] = stereo_data[2*i];
|
127
|
+
pcmf32s[1][i] = stereo_data[2*i + 1];
|
128
|
+
}
|
122
129
|
}
|
123
130
|
|
124
131
|
ma_decoder_uninit(&decoder);
|
@@ -424,6 +424,8 @@ static void process_loop(struct whisper_context * ctx, audio_async &audio, const
|
|
424
424
|
}
|
425
425
|
|
426
426
|
int main(int argc, char ** argv) {
|
427
|
+
ggml_backend_load_all();
|
428
|
+
|
427
429
|
whisper_params params;
|
428
430
|
if (whisper_params_parse(argc, argv, params) == false) {
|
429
431
|
return 1;
|
@@ -1,4 +1,5 @@
|
|
1
1
|
#include "ggml.h"
|
2
|
+
#include "ggml-backend.h"
|
2
3
|
|
3
4
|
#include "common.h"
|
4
5
|
#include "common-ggml.h"
|
@@ -176,6 +177,8 @@ static bool whisper_model_quantize(const std::string & fname_inp, const std::str
|
|
176
177
|
}
|
177
178
|
|
178
179
|
int main(int argc, char ** argv) {
|
180
|
+
ggml_backend_load_all();
|
181
|
+
|
179
182
|
if (argc != 4) {
|
180
183
|
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
|
181
184
|
ggml_print_ftypes(stderr);
|