whispercpp 1.3.2 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (244)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -3
  3. data/README.md +71 -14
  4. data/Rakefile +20 -7
  5. data/ext/.gitignore +4 -6
  6. data/ext/dependencies.rb +36 -24
  7. data/ext/extconf.rb +1 -1
  8. data/ext/options.rb +48 -184
  9. data/ext/ruby_whisper.c +18 -0
  10. data/ext/ruby_whisper_context.c +43 -12
  11. data/ext/ruby_whisper_model.c +1 -1
  12. data/ext/ruby_whisper_params.c +4 -2
  13. data/ext/ruby_whisper_segment.c +81 -4
  14. data/ext/ruby_whisper_transcribe.cpp +13 -7
  15. data/ext/ruby_whisper_vad_params.c +1 -1
  16. data/ext/sources/CMakeLists.txt +5 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
  19. data/ext/sources/examples/addon.node/addon.cpp +150 -31
  20. data/ext/sources/examples/addon.node/index.js +3 -0
  21. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  22. data/ext/sources/examples/bench/bench.cpp +3 -2
  23. data/ext/sources/examples/cli/cli.cpp +3 -2
  24. data/ext/sources/examples/command/command.cpp +32 -8
  25. data/ext/sources/examples/common-whisper.cpp +14 -7
  26. data/ext/sources/examples/lsp/lsp.cpp +2 -0
  27. data/ext/sources/examples/quantize/quantize.cpp +3 -0
  28. data/ext/sources/examples/server/CMakeLists.txt +3 -0
  29. data/ext/sources/examples/server/server.cpp +169 -22
  30. data/ext/sources/examples/stream/stream.cpp +6 -0
  31. data/ext/sources/examples/talk-llama/CMakeLists.txt +4 -1
  32. data/ext/sources/examples/talk-llama/llama-arch.cpp +171 -3
  33. data/ext/sources/examples/talk-llama/llama-arch.h +28 -1
  34. data/ext/sources/examples/talk-llama/llama-batch.cpp +741 -272
  35. data/ext/sources/examples/talk-llama/llama-batch.h +112 -54
  36. data/ext/sources/examples/talk-llama/llama-chat.cpp +30 -8
  37. data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
  38. data/ext/sources/examples/talk-llama/llama-context.cpp +520 -351
  39. data/ext/sources/examples/talk-llama/llama-context.h +38 -17
  40. data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
  41. data/ext/sources/examples/talk-llama/llama-cparams.h +1 -1
  42. data/ext/sources/examples/talk-llama/llama-graph.cpp +447 -372
  43. data/ext/sources/examples/talk-llama/llama-graph.h +128 -58
  44. data/ext/sources/examples/talk-llama/llama-hparams.cpp +10 -2
  45. data/ext/sources/examples/talk-llama/llama-hparams.h +19 -2
  46. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +279 -0
  47. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.h +128 -0
  48. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +1841 -0
  49. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +303 -0
  50. data/ext/sources/examples/talk-llama/llama-kv-cache.h +14 -472
  51. data/ext/sources/examples/talk-llama/llama-kv-cells.h +86 -26
  52. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +246 -0
  53. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +138 -0
  54. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1125 -0
  55. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +183 -0
  56. data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
  57. data/ext/sources/examples/talk-llama/llama-memory.h +88 -4
  58. data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
  59. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +42 -17
  60. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
  61. data/ext/sources/examples/talk-llama/llama-model.cpp +1863 -563
  62. data/ext/sources/examples/talk-llama/llama-model.h +27 -0
  63. data/ext/sources/examples/talk-llama/llama-quant.cpp +89 -6
  64. data/ext/sources/examples/talk-llama/llama-vocab.cpp +65 -28
  65. data/ext/sources/examples/talk-llama/llama-vocab.h +1 -0
  66. data/ext/sources/examples/talk-llama/llama.cpp +11 -7
  67. data/ext/sources/examples/talk-llama/llama.h +147 -40
  68. data/ext/sources/examples/talk-llama/talk-llama.cpp +2 -0
  69. data/ext/sources/examples/talk-llama/unicode.cpp +5 -0
  70. data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
  71. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +2 -0
  72. data/ext/sources/ggml/CMakeLists.txt +48 -3
  73. data/ext/sources/ggml/cmake/common.cmake +24 -0
  74. data/ext/sources/ggml/include/ggml-backend.h +1 -1
  75. data/ext/sources/ggml/include/ggml-cpu.h +2 -0
  76. data/ext/sources/ggml/include/ggml.h +144 -5
  77. data/ext/sources/ggml/src/CMakeLists.txt +82 -24
  78. data/ext/sources/ggml/src/ggml-backend-reg.cpp +5 -0
  79. data/ext/sources/ggml/src/ggml-backend.cpp +46 -23
  80. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +3 -3
  81. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  82. data/ext/sources/ggml/src/ggml-cann/common.h +6 -1
  83. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  84. data/ext/sources/ggml/src/ggml-common.h +4 -0
  85. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +133 -40
  86. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  87. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  88. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  89. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  90. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  91. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  92. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  93. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  94. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  95. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  96. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  97. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  98. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  99. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  100. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  101. data/ext/sources/ggml/src/ggml-cpu/common.h +4 -3
  102. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  103. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +146 -105
  104. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  105. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  106. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  107. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  108. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  109. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1057 -174
  110. data/ext/sources/ggml/src/ggml-cpu/ops.h +8 -0
  111. data/ext/sources/ggml/src/ggml-cpu/quants.c +1158 -0
  112. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  113. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1571 -0
  114. data/ext/sources/ggml/src/ggml-cpu/repack.h +98 -0
  115. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +330 -38
  116. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  117. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +111 -18
  118. data/ext/sources/ggml/src/ggml-cpu/vec.h +303 -94
  119. data/ext/sources/ggml/src/ggml-cuda/common.cuh +60 -37
  120. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  121. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  122. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  123. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  124. data/ext/sources/ggml/src/ggml-cuda/convert.cu +22 -0
  125. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +5 -0
  126. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +2 -2
  127. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
  128. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  129. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +265 -123
  130. data/ext/sources/ggml/src/ggml-cuda/mean.cu +19 -0
  131. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  132. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +257 -87
  133. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +2 -3
  134. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  135. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +5 -18
  136. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  137. data/ext/sources/ggml/src/ggml-cuda/unary.cu +89 -0
  138. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +7 -0
  139. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  140. data/ext/sources/ggml/src/ggml-impl.h +127 -183
  141. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  142. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +27 -0
  143. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +331 -49
  144. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +564 -282
  145. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
  146. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +14 -0
  147. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1859 -489
  148. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  149. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  150. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  151. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +201 -0
  152. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  153. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  154. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  155. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  156. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  157. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  158. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  159. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  160. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  161. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  162. data/ext/sources/ggml/src/ggml-quants.c +6 -8
  163. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  164. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  165. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  166. data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
  167. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +28 -41
  168. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +4 -10
  169. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +117 -165
  170. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +192 -53
  171. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  172. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  173. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  174. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +648 -1039
  175. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  176. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  177. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +8 -105
  178. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -100
  179. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  180. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
  181. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +60 -80
  182. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  183. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +55 -74
  184. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -10
  185. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +138 -27
  186. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
  187. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  188. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  189. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  190. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  191. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +12 -16
  192. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
  193. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +726 -282
  194. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  195. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  196. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  197. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  198. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  199. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  200. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
  201. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  202. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -1
  203. data/ext/sources/ggml/src/ggml.c +328 -48
  204. data/ext/sources/ggml/src/ggml.cpp +26 -0
  205. data/ext/sources/ggml/src/gguf.cpp +24 -3
  206. data/ext/sources/include/whisper.h +2 -0
  207. data/ext/sources/src/CMakeLists.txt +2 -0
  208. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  209. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  210. data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
  211. data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
  212. data/ext/sources/src/whisper.cpp +218 -169
  213. data/extsources.rb +15 -9
  214. data/lib/whisper/context.rb +15 -0
  215. data/lib/whisper/model/uri.rb +56 -1
  216. data/lib/whisper/segment.rb +58 -0
  217. data/sig/whisper.rbs +68 -38
  218. data/{tests → test}/helper.rb +1 -12
  219. data/{tests → test}/test_model.rb +9 -0
  220. data/test/test_package.rb +51 -0
  221. data/test/test_segment.rb +146 -0
  222. data/{tests → test}/test_whisper.rb +70 -0
  223. data/whispercpp.gemspec +2 -3
  224. metadata +91 -43
  225. data/ext/sources/.dockerignore +0 -3
  226. data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
  227. data/ext/sources/ci/run.sh +0 -336
  228. data/ext/sources/close-issue.yml +0 -28
  229. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2739
  230. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  231. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
  232. data/tests/test_package.rb +0 -46
  233. data/tests/test_segment.rb +0 -74
  234. /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  235. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  236. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  237. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  238. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  239. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
  240. /data/{tests → test}/test_callback.rb +0 -0
  241. /data/{tests → test}/test_error.rb +0 -0
  242. /data/{tests → test}/test_params.rb +0 -0
  243. /data/{tests → test}/test_vad.rb +0 -0
  244. /data/{tests → test}/test_vad_params.rb +0 -0
@@ -9,6 +9,7 @@
9
9
  #include <vector>
10
10
  #include <cmath>
11
11
  #include <cstdint>
12
+ #include <cfloat>
12
13
 
13
14
  struct whisper_params {
14
15
  int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
@@ -38,6 +39,7 @@ struct whisper_params {
38
39
  bool print_progress = false;
39
40
  bool no_timestamps = false;
40
41
  bool no_prints = false;
42
+ bool detect_language= false;
41
43
  bool use_gpu = true;
42
44
  bool flash_attn = false;
43
45
  bool comma_in_time = true;
@@ -50,6 +52,16 @@ struct whisper_params {
50
52
  std::vector<std::string> fname_out = {};
51
53
 
52
54
  std::vector<float> pcmf32 = {}; // mono-channel F32 PCM
55
+
56
+ // Voice Activity Detection (VAD) parameters
57
+ bool vad = false;
58
+ std::string vad_model = "";
59
+ float vad_threshold = 0.5f;
60
+ int vad_min_speech_duration_ms = 250;
61
+ int vad_min_silence_duration_ms = 100;
62
+ float vad_max_speech_duration_s = FLT_MAX;
63
+ int vad_speech_pad_ms = 30;
64
+ float vad_samples_overlap = 0.1f;
53
65
  };
54
66
 
55
67
  struct whisper_print_user_data {
@@ -130,6 +142,11 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
130
142
 
131
143
  void cb_log_disable(enum ggml_log_level, const char *, void *) {}
132
144
 
145
+ struct whisper_result {
146
+ std::vector<std::vector<std::string>> segments;
147
+ std::string language;
148
+ };
149
+
133
150
  class ProgressWorker : public Napi::AsyncWorker {
134
151
  public:
135
152
  ProgressWorker(Napi::Function& callback, whisper_params params, Napi::Function progress_callback, Napi::Env env)
@@ -160,15 +177,27 @@ class ProgressWorker : public Napi::AsyncWorker {
160
177
 
161
178
  void OnOK() override {
162
179
  Napi::HandleScope scope(Env());
163
- Napi::Object res = Napi::Array::New(Env(), result.size());
164
- for (uint64_t i = 0; i < result.size(); ++i) {
180
+
181
+ if (params.detect_language) {
182
+ Napi::Object resultObj = Napi::Object::New(Env());
183
+ resultObj.Set("language", Napi::String::New(Env(), result.language));
184
+ Callback().Call({Env().Null(), resultObj});
185
+ }
186
+
187
+ Napi::Object returnObj = Napi::Object::New(Env());
188
+ if (!result.language.empty()) {
189
+ returnObj.Set("language", Napi::String::New(Env(), result.language));
190
+ }
191
+ Napi::Array transcriptionArray = Napi::Array::New(Env(), result.segments.size());
192
+ for (uint64_t i = 0; i < result.segments.size(); ++i) {
165
193
  Napi::Object tmp = Napi::Array::New(Env(), 3);
166
194
  for (uint64_t j = 0; j < 3; ++j) {
167
- tmp[j] = Napi::String::New(Env(), result[i][j]);
195
+ tmp[j] = Napi::String::New(Env(), result.segments[i][j]);
168
196
  }
169
- res[i] = tmp;
170
- }
171
- Callback().Call({Env().Null(), res});
197
+ transcriptionArray[i] = tmp;
198
+ }
199
+ returnObj.Set("transcription", transcriptionArray);
200
+ Callback().Call({Env().Null(), returnObj});
172
201
  }
173
202
 
174
203
  // Progress callback function - using thread-safe function
@@ -185,12 +214,12 @@ class ProgressWorker : public Napi::AsyncWorker {
185
214
 
186
215
  private:
187
216
  whisper_params params;
188
- std::vector<std::vector<std::string>> result;
217
+ whisper_result result;
189
218
  Napi::Env env;
190
219
  Napi::ThreadSafeFunction tsfn;
191
220
 
192
221
  // Custom run function with progress callback support
193
- int run_with_progress(whisper_params &params, std::vector<std::vector<std::string>> &result) {
222
+ int run_with_progress(whisper_params &params, whisper_result & result) {
194
223
  if (params.no_prints) {
195
224
  whisper_log_set(cb_log_disable, NULL);
196
225
  }
@@ -279,7 +308,8 @@ class ProgressWorker : public Napi::AsyncWorker {
279
308
  wparams.print_timestamps = !params.no_timestamps;
280
309
  wparams.print_special = params.print_special;
281
310
  wparams.translate = params.translate;
282
- wparams.language = params.language.c_str();
311
+ wparams.language = params.detect_language ? "auto" : params.language.c_str();
312
+ wparams.detect_language = params.detect_language;
283
313
  wparams.n_threads = params.n_threads;
284
314
  wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
285
315
  wparams.offset_ms = params.offset_t_ms;
@@ -314,34 +344,38 @@ class ProgressWorker : public Napi::AsyncWorker {
314
344
  };
315
345
  wparams.progress_callback_user_data = this;
316
346
 
317
- // Abort mechanism example
318
- {
319
- static bool is_aborted = false; // Note: this should be atomic to avoid data races
347
+ // Set VAD parameters
348
+ wparams.vad = params.vad;
349
+ wparams.vad_model_path = params.vad_model.c_str();
320
350
 
321
- wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
322
- bool is_aborted = *(bool*)user_data;
323
- return !is_aborted;
324
- };
325
- wparams.encoder_begin_callback_user_data = &is_aborted;
326
- }
351
+ wparams.vad_params.threshold = params.vad_threshold;
352
+ wparams.vad_params.min_speech_duration_ms = params.vad_min_speech_duration_ms;
353
+ wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
354
+ wparams.vad_params.max_speech_duration_s = params.vad_max_speech_duration_s;
355
+ wparams.vad_params.speech_pad_ms = params.vad_speech_pad_ms;
356
+ wparams.vad_params.samples_overlap = params.vad_samples_overlap;
327
357
 
328
358
  if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
329
359
  fprintf(stderr, "failed to process audio\n");
330
360
  return 10;
331
361
  }
332
362
  }
333
- }
363
+ }
334
364
 
365
+ if (params.detect_language || params.language == "auto") {
366
+ result.language = whisper_lang_str(whisper_full_lang_id(ctx));
367
+ }
335
368
  const int n_segments = whisper_full_n_segments(ctx);
336
- result.resize(n_segments);
369
+ result.segments.resize(n_segments);
370
+
337
371
  for (int i = 0; i < n_segments; ++i) {
338
372
  const char * text = whisper_full_get_segment_text(ctx, i);
339
373
  const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
340
374
  const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
341
375
 
342
- result[i].emplace_back(to_timestamp(t0, params.comma_in_time));
343
- result[i].emplace_back(to_timestamp(t1, params.comma_in_time));
344
- result[i].emplace_back(text);
376
+ result.segments[i].emplace_back(to_timestamp(t0, params.comma_in_time));
377
+ result.segments[i].emplace_back(to_timestamp(t1, params.comma_in_time));
378
+ result.segments[i].emplace_back(text);
345
379
  }
346
380
 
347
381
  whisper_print_timings(ctx);
@@ -362,13 +396,46 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
362
396
  std::string language = whisper_params.Get("language").As<Napi::String>();
363
397
  std::string model = whisper_params.Get("model").As<Napi::String>();
364
398
  std::string input = whisper_params.Get("fname_inp").As<Napi::String>();
365
- bool use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>();
366
- bool flash_attn = whisper_params.Get("flash_attn").As<Napi::Boolean>();
367
- bool no_prints = whisper_params.Get("no_prints").As<Napi::Boolean>();
368
- bool no_timestamps = whisper_params.Get("no_timestamps").As<Napi::Boolean>();
369
- int32_t audio_ctx = whisper_params.Get("audio_ctx").As<Napi::Number>();
370
- bool comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
371
- int32_t max_len = whisper_params.Get("max_len").As<Napi::Number>();
399
+
400
+ bool use_gpu = true;
401
+ if (whisper_params.Has("use_gpu") && whisper_params.Get("use_gpu").IsBoolean()) {
402
+ use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>();
403
+ }
404
+
405
+ bool flash_attn = false;
406
+ if (whisper_params.Has("flash_attn") && whisper_params.Get("flash_attn").IsBoolean()) {
407
+ flash_attn = whisper_params.Get("flash_attn").As<Napi::Boolean>();
408
+ }
409
+
410
+ bool no_prints = false;
411
+ if (whisper_params.Has("no_prints") && whisper_params.Get("no_prints").IsBoolean()) {
412
+ no_prints = whisper_params.Get("no_prints").As<Napi::Boolean>();
413
+ }
414
+
415
+ bool no_timestamps = false;
416
+ if (whisper_params.Has("no_timestamps") && whisper_params.Get("no_timestamps").IsBoolean()) {
417
+ no_timestamps = whisper_params.Get("no_timestamps").As<Napi::Boolean>();
418
+ }
419
+
420
+ bool detect_language = false;
421
+ if (whisper_params.Has("detect_language") && whisper_params.Get("detect_language").IsBoolean()) {
422
+ detect_language = whisper_params.Get("detect_language").As<Napi::Boolean>();
423
+ }
424
+
425
+ int32_t audio_ctx = 0;
426
+ if (whisper_params.Has("audio_ctx") && whisper_params.Get("audio_ctx").IsNumber()) {
427
+ audio_ctx = whisper_params.Get("audio_ctx").As<Napi::Number>();
428
+ }
429
+
430
+ bool comma_in_time = true;
431
+ if (whisper_params.Has("comma_in_time") && whisper_params.Get("comma_in_time").IsBoolean()) {
432
+ comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
433
+ }
434
+
435
+ int32_t max_len = 0;
436
+ if (whisper_params.Has("max_len") && whisper_params.Get("max_len").IsNumber()) {
437
+ max_len = whisper_params.Get("max_len").As<Napi::Number>();
438
+ }
372
439
 
373
440
  // Add support for max_context
374
441
  int32_t max_context = -1;
@@ -384,7 +451,7 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
384
451
 
385
452
  // Add support for print_progress
386
453
  bool print_progress = false;
387
- if (whisper_params.Has("print_progress")) {
454
+ if (whisper_params.Has("print_progress") && whisper_params.Get("print_progress").IsBoolean()) {
388
455
  print_progress = whisper_params.Get("print_progress").As<Napi::Boolean>();
389
456
  }
390
457
  // Add support for progress_callback
@@ -393,6 +460,47 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
393
460
  progress_callback = whisper_params.Get("progress_callback").As<Napi::Function>();
394
461
  }
395
462
 
463
+ // Add support for VAD parameters
464
+ bool vad = false;
465
+ if (whisper_params.Has("vad") && whisper_params.Get("vad").IsBoolean()) {
466
+ vad = whisper_params.Get("vad").As<Napi::Boolean>();
467
+ }
468
+
469
+ std::string vad_model = "";
470
+ if (whisper_params.Has("vad_model") && whisper_params.Get("vad_model").IsString()) {
471
+ vad_model = whisper_params.Get("vad_model").As<Napi::String>();
472
+ }
473
+
474
+ float vad_threshold = 0.5f;
475
+ if (whisper_params.Has("vad_threshold") && whisper_params.Get("vad_threshold").IsNumber()) {
476
+ vad_threshold = whisper_params.Get("vad_threshold").As<Napi::Number>();
477
+ }
478
+
479
+ int vad_min_speech_duration_ms = 250;
480
+ if (whisper_params.Has("vad_min_speech_duration_ms") && whisper_params.Get("vad_min_speech_duration_ms").IsNumber()) {
481
+ vad_min_speech_duration_ms = whisper_params.Get("vad_min_speech_duration_ms").As<Napi::Number>();
482
+ }
483
+
484
+ int vad_min_silence_duration_ms = 100;
485
+ if (whisper_params.Has("vad_min_silence_duration_ms") && whisper_params.Get("vad_min_silence_duration_ms").IsNumber()) {
486
+ vad_min_silence_duration_ms = whisper_params.Get("vad_min_silence_duration_ms").As<Napi::Number>();
487
+ }
488
+
489
+ float vad_max_speech_duration_s = FLT_MAX;
490
+ if (whisper_params.Has("vad_max_speech_duration_s") && whisper_params.Get("vad_max_speech_duration_s").IsNumber()) {
491
+ vad_max_speech_duration_s = whisper_params.Get("vad_max_speech_duration_s").As<Napi::Number>();
492
+ }
493
+
494
+ int vad_speech_pad_ms = 30;
495
+ if (whisper_params.Has("vad_speech_pad_ms") && whisper_params.Get("vad_speech_pad_ms").IsNumber()) {
496
+ vad_speech_pad_ms = whisper_params.Get("vad_speech_pad_ms").As<Napi::Number>();
497
+ }
498
+
499
+ float vad_samples_overlap = 0.1f;
500
+ if (whisper_params.Has("vad_samples_overlap") && whisper_params.Get("vad_samples_overlap").IsNumber()) {
501
+ vad_samples_overlap = whisper_params.Get("vad_samples_overlap").As<Napi::Number>();
502
+ }
503
+
396
504
  Napi::Value pcmf32Value = whisper_params.Get("pcmf32");
397
505
  std::vector<float> pcmf32_vec;
398
506
  if (pcmf32Value.IsTypedArray()) {
@@ -418,6 +526,17 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
418
526
  params.max_context = max_context;
419
527
  params.print_progress = print_progress;
420
528
  params.prompt = prompt;
529
+ params.detect_language = detect_language;
530
+
531
+ // Set VAD parameters
532
+ params.vad = vad;
533
+ params.vad_model = vad_model;
534
+ params.vad_threshold = vad_threshold;
535
+ params.vad_min_speech_duration_ms = vad_min_speech_duration_ms;
536
+ params.vad_min_silence_duration_ms = vad_min_silence_duration_ms;
537
+ params.vad_max_speech_duration_s = vad_max_speech_duration_s;
538
+ params.vad_speech_pad_ms = vad_speech_pad_ms;
539
+ params.vad_samples_overlap = vad_samples_overlap;
421
540
 
422
541
  Napi::Function callback = info[1].As<Napi::Function>();
423
542
  // Create a new Worker class with progress callback support
@@ -17,6 +17,7 @@ const whisperParams = {
17
17
  comma_in_time: false,
18
18
  translate: true,
19
19
  no_timestamps: false,
20
+ detect_language: false,
20
21
  audio_ctx: 0,
21
22
  max_len: 0,
22
23
  progress_callback: (progress) => {
@@ -31,6 +32,8 @@ const params = Object.fromEntries(
31
32
  const [key, value] = item.slice(2).split("=");
32
33
  if (key === "audio_ctx") {
33
34
  whisperParams[key] = parseInt(value);
35
+ } else if (key === "detect_language") {
36
+ whisperParams[key] = value === "true";
34
37
  } else {
35
38
  whisperParams[key] = value;
36
39
  }
@@ -0,0 +1,132 @@
1
+ const path = require("path");
2
+ const { whisper } = require(path.join(
3
+ __dirname,
4
+ "../../build/Release/addon.node"
5
+ ));
6
+ const { promisify } = require("util");
7
+
8
+ const whisperAsync = promisify(whisper);
9
+
10
+ // Example with VAD enabled
11
+ const vadParams = {
12
+ language: "en",
13
+ model: path.join(__dirname, "../../models/ggml-base.en.bin"),
14
+ fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
15
+ use_gpu: true,
16
+ flash_attn: false,
17
+ no_prints: false,
18
+ comma_in_time: true,
19
+ translate: false,
20
+ no_timestamps: false,
21
+ detect_language: false,
22
+ audio_ctx: 0,
23
+ max_len: 0,
24
+ // VAD parameters
25
+ vad: true,
26
+ vad_model: path.join(__dirname, "../../models/ggml-silero-v5.1.2.bin"), // You need to download this model
27
+ vad_threshold: 0.5,
28
+ vad_min_speech_duration_ms: 250,
29
+ vad_min_silence_duration_ms: 100,
30
+ vad_max_speech_duration_s: 30.0,
31
+ vad_speech_pad_ms: 30,
32
+ vad_samples_overlap: 0.1,
33
+ progress_callback: (progress) => {
34
+ console.log(`VAD Transcription progress: ${progress}%`);
35
+ }
36
+ };
37
+
38
+ // Example without VAD (traditional approach)
39
+ const traditionalParams = {
40
+ language: "en",
41
+ model: path.join(__dirname, "../../models/ggml-base.en.bin"),
42
+ fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
43
+ use_gpu: true,
44
+ flash_attn: false,
45
+ no_prints: false,
46
+ comma_in_time: true,
47
+ translate: false,
48
+ no_timestamps: false,
49
+ detect_language: false,
50
+ audio_ctx: 0,
51
+ max_len: 0,
52
+ vad: false, // Explicitly disable VAD
53
+ progress_callback: (progress) => {
54
+ console.log(`Traditional transcription progress: ${progress}%`);
55
+ }
56
+ };
57
+
58
+ async function runVADExample() {
59
+ try {
60
+ console.log("=== Whisper.cpp Node.js VAD Example ===\n");
61
+
62
+ // Check if VAD model exists
63
+ const fs = require('fs');
64
+ if (!fs.existsSync(vadParams.vad_model)) {
65
+ console.log("⚠️ VAD model not found. Please download the VAD model first:");
66
+ console.log(" ./models/download-vad-model.sh silero-v5.1.2");
67
+ console.log(" Or run: python models/convert-silero-vad-to-ggml.py");
68
+ console.log("\n Falling back to traditional transcription without VAD...\n");
69
+
70
+ // Run without VAD
71
+ console.log("🎵 Running traditional transcription...");
72
+ const traditionalResult = await whisperAsync(traditionalParams);
73
+ console.log("\n📝 Traditional transcription result:");
74
+ console.log(traditionalResult);
75
+ return;
76
+ }
77
+
78
+ console.log("🎵 Running transcription with VAD enabled...");
79
+ console.log("VAD Parameters:");
80
+ console.log(` - Threshold: ${vadParams.vad_threshold}`);
81
+ console.log(` - Min speech duration: ${vadParams.vad_min_speech_duration_ms}ms`);
82
+ console.log(` - Min silence duration: ${vadParams.vad_min_silence_duration_ms}ms`);
83
+ console.log(` - Max speech duration: ${vadParams.vad_max_speech_duration_s}s`);
84
+ console.log(` - Speech padding: ${vadParams.vad_speech_pad_ms}ms`);
85
+ console.log(` - Samples overlap: ${vadParams.vad_samples_overlap}\n`);
86
+
87
+ const startTime = Date.now();
88
+ const vadResult = await whisperAsync(vadParams);
89
+ const vadDuration = Date.now() - startTime;
90
+
91
+ console.log("\n✅ VAD transcription completed!");
92
+ console.log(`⏱️ Processing time: ${vadDuration}ms`);
93
+ console.log("\n📝 VAD transcription result:");
94
+ console.log(vadResult);
95
+
96
+ // Compare with traditional approach
97
+ console.log("\n🔄 Running traditional transcription for comparison...");
98
+ const traditionalStartTime = Date.now();
99
+ const traditionalResult = await whisperAsync(traditionalParams);
100
+ const traditionalDuration = Date.now() - traditionalStartTime;
101
+
102
+ console.log("\n✅ Traditional transcription completed!");
103
+ console.log(`⏱️ Processing time: ${traditionalDuration}ms`);
104
+ console.log("\n📝 Traditional transcription result:");
105
+ console.log(traditionalResult);
106
+
107
+ // Performance comparison
108
+ console.log("\n📊 Performance Comparison:");
109
+ console.log(`VAD: ${vadDuration}ms`);
110
+ console.log(`Traditional: ${traditionalDuration}ms`);
111
+ const speedup = traditionalDuration / vadDuration;
112
+ if (speedup > 1) {
113
+ console.log(`🚀 VAD is ${speedup.toFixed(2)}x faster!`);
114
+ } else {
115
+ console.log(`ℹ️ Traditional approach was ${(1/speedup).toFixed(2)}x faster in this case.`);
116
+ }
117
+
118
+ } catch (error) {
119
+ console.error("❌ Error during transcription:", error);
120
+ }
121
+ }
122
+
123
+ // Run the example
124
+ if (require.main === module) {
125
+ runVADExample();
126
+ }
127
+
128
+ module.exports = {
129
+ runVADExample,
130
+ vadParams,
131
+ traditionalParams
132
+ };
@@ -66,13 +66,12 @@ static int whisper_bench_full(const whisper_params & params) {
66
66
  cparams.use_gpu = params.use_gpu;
67
67
  cparams.flash_attn = params.flash_attn;
68
68
 
69
- struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
70
-
71
69
  {
72
70
  fprintf(stderr, "\n");
73
71
  fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", params.n_threads, std::thread::hardware_concurrency(), whisper_print_system_info());
74
72
  }
75
73
 
74
+ struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
76
75
  if (ctx == nullptr) {
77
76
  fprintf(stderr, "error: failed to initialize whisper context\n");
78
77
  return 2;
@@ -156,6 +155,8 @@ static int whisper_bench_full(const whisper_params & params) {
156
155
  }
157
156
 
158
157
  int main(int argc, char ** argv) {
158
+ ggml_backend_load_all();
159
+
159
160
  whisper_params params;
160
161
 
161
162
  if (whisper_params_parse(argc, argv, params) == false) {
@@ -202,7 +202,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
202
202
  else if ( arg == "--vad") { params.vad = true; }
203
203
  else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = ARGV_NEXT; }
204
204
  else if (arg == "-vt" || arg == "--vad-threshold") { params.vad_threshold = std::stof(ARGV_NEXT); }
205
- else if (arg == "-vsd" || arg == "--vad-min-speech-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(ARGV_NEXT); }
205
+ else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(ARGV_NEXT); }
206
206
  else if (arg == "-vsd" || arg == "--vad-min-silence-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(ARGV_NEXT); }
207
207
  else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s") { params.vad_max_speech_duration_s = std::stof(ARGV_NEXT); }
208
208
  else if (arg == "-vp" || arg == "--vad-speech-pad-ms") { params.vad_speech_pad_ms = std::stoi(ARGV_NEXT); }
@@ -909,6 +909,8 @@ static void output_lrc(struct whisper_context * ctx, std::ofstream & fout, const
909
909
  static void cb_log_disable(enum ggml_log_level , const char * , void * ) { }
910
910
 
911
911
  int main(int argc, char ** argv) {
912
+ ggml_backend_load_all();
913
+
912
914
  #if defined(_WIN32)
913
915
  // Set the console output code page to UTF-8, while command line arguments
914
916
  // are still encoded in the system's code page. In this way, we can print
@@ -988,7 +990,6 @@ int main(int argc, char ** argv) {
988
990
  }
989
991
 
990
992
  // whisper init
991
-
992
993
  struct whisper_context_params cparams = whisper_context_default_params();
993
994
 
994
995
  cparams.use_gpu = params.use_gpu;
@@ -251,7 +251,7 @@ static std::vector<std::string> get_words(const std::string &txt) {
251
251
 
252
252
  // command-list mode
253
253
  // guide the transcription to match the most likely command from a provided list
254
- static int process_command_list(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
254
+ static int process_command_list(struct whisper_context * ctx, audio_async &audio, const whisper_params &params, std::ofstream &fout) {
255
255
  fprintf(stderr, "\n");
256
256
  fprintf(stderr, "%s: guided mode\n", __func__);
257
257
 
@@ -444,12 +444,16 @@ static int process_command_list(struct whisper_context * ctx, audio_async &audio
444
444
 
445
445
  const float prob = probs_id[0].first;
446
446
  const int index = probs_id[0].second;
447
+ const char * best_command = allowed_commands[index].c_str();
447
448
 
448
449
  fprintf(stdout, "\n");
449
450
  fprintf(stdout, "%s: detected command: %s%s%s | p = %f | t = %d ms\n", __func__,
450
- "\033[1m", allowed_commands[index].c_str(), "\033[0m", prob,
451
+ "\033[1m", best_command, "\033[0m", prob,
451
452
  (int) std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count());
452
453
  fprintf(stdout, "\n");
454
+ if (fout.is_open()) {
455
+ fout << best_command << std::endl;
456
+ }
453
457
  }
454
458
  }
455
459
 
@@ -462,7 +466,7 @@ static int process_command_list(struct whisper_context * ctx, audio_async &audio
462
466
 
463
467
  // always-prompt mode
464
468
  // transcribe the voice into text after valid prompt
465
- static int always_prompt_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
469
+ static int always_prompt_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params, std::ofstream & fout) {
466
470
  bool is_running = true;
467
471
  bool ask_prompt = true;
468
472
 
@@ -528,6 +532,9 @@ static int always_prompt_transcription(struct whisper_context * ctx, audio_async
528
532
 
529
533
  if ((sim > 0.7f) && (command.size() > 0)) {
530
534
  fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
535
+ if (fout.is_open()) {
536
+ fout << command << std::endl;
537
+ }
531
538
  }
532
539
 
533
540
  fprintf(stdout, "\n");
@@ -542,7 +549,7 @@ static int always_prompt_transcription(struct whisper_context * ctx, audio_async
542
549
 
543
550
  // general-purpose mode
544
551
  // freely transcribe the voice into text
545
- static int process_general_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
552
+ static int process_general_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params, std::ofstream & fout) {
546
553
  bool is_running = true;
547
554
  bool have_prompt = false;
548
555
  bool ask_prompt = true;
@@ -662,8 +669,10 @@ static int process_general_transcription(struct whisper_context * ctx, audio_asy
662
669
  } else {
663
670
  // cut the prompt from the decoded text
664
671
  const std::string command = ::trim(txt.substr(best_len));
665
-
666
672
  fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
673
+ if (fout.is_open()) {
674
+ fout << command << std::endl;
675
+ }
667
676
  }
668
677
 
669
678
  fprintf(stdout, "\n");
@@ -678,6 +687,8 @@ static int process_general_transcription(struct whisper_context * ctx, audio_asy
678
687
  }
679
688
 
680
689
  int main(int argc, char ** argv) {
690
+ ggml_backend_load_all();
691
+
681
692
  whisper_params params;
682
693
 
683
694
  if (whisper_params_parse(argc, argv, params) == false) {
@@ -698,6 +709,10 @@ int main(int argc, char ** argv) {
698
709
  cparams.flash_attn = params.flash_attn;
699
710
 
700
711
  struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
712
+ if (ctx == nullptr) {
713
+ fprintf(stderr, "error: failed to initialize whisper context\n");
714
+ return 2;
715
+ }
701
716
 
702
717
  // print some info about the processing
703
718
  {
@@ -757,13 +772,22 @@ int main(int argc, char ** argv) {
757
772
  }
758
773
  }
759
774
 
775
+ std::ofstream fout;
776
+ if (params.fname_out.length() > 0) {
777
+ fout.open(params.fname_out);
778
+ if (!fout.is_open()) {
779
+ fprintf(stderr, "%s: failed to open output file '%s'!\n", __func__, params.fname_out.c_str());
780
+ return 1;
781
+ }
782
+ }
783
+
760
784
  if (ret_val == 0) {
761
785
  if (!params.commands.empty()) {
762
- ret_val = process_command_list(ctx, audio, params);
786
+ ret_val = process_command_list(ctx, audio, params, fout);
763
787
  } else if (!params.prompt.empty() && params.grammar_parsed.rules.empty()) {
764
- ret_val = always_prompt_transcription(ctx, audio, params);
788
+ ret_val = always_prompt_transcription(ctx, audio, params, fout);
765
789
  } else {
766
- ret_val = process_general_transcription(ctx, audio, params);
790
+ ret_val = process_general_transcription(ctx, audio, params, fout);
767
791
  }
768
792
  }
769
793
 
@@ -112,13 +112,20 @@ bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std:
112
112
  }
113
113
 
114
114
  if (stereo) {
115
- pcmf32s.resize(2);
116
- pcmf32s[0].resize(frame_count);
117
- pcmf32s[1].resize(frame_count);
118
- for (uint64_t i = 0; i < frame_count; i++) {
119
- pcmf32s[0][i] = pcmf32[2*i];
120
- pcmf32s[1][i] = pcmf32[2*i + 1];
121
- }
115
+ std::vector<float> stereo_data = pcmf32;
116
+ pcmf32.resize(frame_count);
117
+
118
+ for (uint64_t i = 0; i < frame_count; i++) {
119
+ pcmf32[i] = (stereo_data[2*i] + stereo_data[2*i + 1]);
120
+ }
121
+
122
+ pcmf32s.resize(2);
123
+ pcmf32s[0].resize(frame_count);
124
+ pcmf32s[1].resize(frame_count);
125
+ for (uint64_t i = 0; i < frame_count; i++) {
126
+ pcmf32s[0][i] = stereo_data[2*i];
127
+ pcmf32s[1][i] = stereo_data[2*i + 1];
128
+ }
122
129
  }
123
130
 
124
131
  ma_decoder_uninit(&decoder);
@@ -424,6 +424,8 @@ static void process_loop(struct whisper_context * ctx, audio_async &audio, const
424
424
  }
425
425
 
426
426
  int main(int argc, char ** argv) {
427
+ ggml_backend_load_all();
428
+
427
429
  whisper_params params;
428
430
  if (whisper_params_parse(argc, argv, params) == false) {
429
431
  return 1;
@@ -1,4 +1,5 @@
1
1
  #include "ggml.h"
2
+ #include "ggml-backend.h"
2
3
 
3
4
  #include "common.h"
4
5
  #include "common-ggml.h"
@@ -176,6 +177,8 @@ static bool whisper_model_quantize(const std::string & fname_inp, const std::str
176
177
  }
177
178
 
178
179
  int main(int argc, char ** argv) {
180
+ ggml_backend_load_all();
181
+
179
182
  if (argc != 4) {
180
183
  fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
181
184
  ggml_print_ftypes(stderr);
@@ -1,3 +1,6 @@
1
+ set(CMAKE_CXX_STANDARD 17)
2
+ set(CMAKE_CXX_STANDARD_REQUIRED ON)
3
+
1
4
  set(TARGET whisper-server)
2
5
  add_executable(${TARGET} server.cpp httplib.h)
3
6