scribe-cli 0.17.1__tar.gz → 0.18.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/PKG-INFO +3 -1
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/docs/backends.md +2 -2
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/docs/cli.md +22 -3
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/docs/tray.md +9 -4
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/pyproject.toml +11 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/_version.py +3 -3
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/app.py +67 -35
- scribe_cli-0.18.0/scribe/audio.py +379 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/backends/openai_realtime.py +29 -6
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/menu.py +60 -1
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/models.py +80 -27
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe_cli.egg-info/PKG-INFO +3 -1
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe_cli.egg-info/SOURCES.txt +2 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe_cli.egg-info/requires.txt +3 -0
- scribe_cli-0.18.0/scribe_data/silero_vad.LICENSE +21 -0
- scribe_cli-0.18.0/scribe_data/silero_vad.onnx +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/tests/test_pseudo_streaming.py +159 -34
- scribe_cli-0.17.1/scribe/audio.py +0 -76
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/.github/FUNDING.yml +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/.github/workflows/pypi.yml +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/.gitignore +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/LICENSE +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/README.md +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/docs/app-tray-menu.png +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/docs/desktop-install.md +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/docs/installation.md +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/docs/keyboard.md +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/docs/roadmap-libei.md +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/icon.xcf +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/__init__.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/backends/__init__.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/backends/groq.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/backends/openai_api.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/backends/vosk.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/backends/whisper.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/backends/whisper_futo.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/install_desktop.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/keyboard.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/models.toml +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/saverecording.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/session.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/testpynput.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/typers/__init__.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/typers/base.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/typers/eitype.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/typers/pynput.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/typers/wtype.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/typers/ydotool.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/util.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe_cli.egg-info/dependency_links.txt +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe_cli.egg-info/entry_points.txt +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe_cli.egg-info/top_level.txt +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe_data/__init__.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe_data/share/icon.png +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe_data/share/icon_recording.png +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe_data/share/icon_writing.png +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe_data/templates/scribe.desktop +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scripts/bench_whisper_local.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scripts/test_python_versions_install.sh +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/setup.cfg +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/tests/test_openai_realtime_coalesce.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-0.18.0}/tests/test_whisper_futo.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: scribe-cli
|
|
3
|
-
Version: 0.17.1
|
|
3
|
+
Version: 0.18.0
|
|
4
4
|
Summary: Speech-to-text CLI and system-tray app for dictating into any focused window. Local (vosk, faster-whisper) or cloud (groq, openai) backends, batch or streaming.
|
|
5
5
|
Author-email: Mahé Perrette <mahe.perrette@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -52,6 +52,7 @@ Requires-Dist: unidecode
|
|
|
52
52
|
Requires-Dist: termcolor
|
|
53
53
|
Requires-Dist: platformdirs
|
|
54
54
|
Requires-Dist: desktop-ai-core>=0.2.0
|
|
55
|
+
Requires-Dist: onnxruntime
|
|
55
56
|
Provides-Extra: keyboard
|
|
56
57
|
Requires-Dist: pynput; extra == "keyboard"
|
|
57
58
|
Provides-Extra: whisper
|
|
@@ -69,6 +70,7 @@ Requires-Dist: soundfile; extra == "openai"
|
|
|
69
70
|
Provides-Extra: groq
|
|
70
71
|
Requires-Dist: openai<3,>=2.37.0; extra == "groq"
|
|
71
72
|
Requires-Dist: soundfile; extra == "groq"
|
|
73
|
+
Provides-Extra: vad
|
|
72
74
|
Provides-Extra: all
|
|
73
75
|
Requires-Dist: pynput; extra == "all"
|
|
74
76
|
Requires-Dist: faster-whisper; extra == "all"
|
|
@@ -225,8 +225,8 @@ Whisper's prompt window is capped at ~224 tokens; 200 chars of French
|
|
|
225
225
|
sits well under that and leaves room for your static prompt + words
|
|
226
226
|
list.
|
|
227
227
|
|
|
228
|
-
The rolling tail is **dropped**
|
|
229
|
-
|
|
228
|
+
The rolling tail is **dropped** when the silence between two
|
|
229
|
+
utterances exceeds 1.5 seconds — a long pause is treated as a new
|
|
230
230
|
sentence/idea boundary, where carrying a possibly-bad prior chunk
|
|
231
231
|
forward biases the next one more than it helps. This mirrors
|
|
232
232
|
`whisper.cpp`'s `--keep-context off` default: prior-text conditioning
|
|
@@ -65,20 +65,39 @@ flag suppresses only its own side (giving `--prompt ""` still loads
|
|
|
65
65
|
| `--type-direct` | In keystroke mode, type the transcription as keystrokes instead of synthesising Ctrl+V. |
|
|
66
66
|
| `-o, --output-file FILE` | Also append the transcription to this file. |
|
|
67
67
|
|
|
68
|
-
## Silence detection
|
|
68
|
+
## Silence detection
|
|
69
69
|
|
|
70
70
|
| Flag | Default | Purpose |
|
|
71
71
|
|----------------------------|---------|------------------------------------------------------------------------|
|
|
72
72
|
| `--duration SECS` | `120` | Max recording duration in seconds. |
|
|
73
|
-
| `--silence-db DB` | `-40` | dBFS volume floor for "this frame is silent". Used by every silence-driven behavior. |
|
|
74
73
|
| `--silence-duration SECS` | `0.6` | How long silence must persist before triggering a backend's silence behavior (realtime auto-commit, pseudo-streaming cut). |
|
|
75
74
|
|
|
75
|
+
## Voice activity detection
|
|
76
|
+
|
|
77
|
+
scribe ships two silence-detection backends. By default
|
|
78
|
+
(`--vad-mode auto`) it picks **silero-vad** when `onnxruntime` is
|
|
79
|
+
importable (always true on a stock `pip install scribe-cli` since
|
|
80
|
+
`onnxruntime` is a base dependency) and falls back to a plain dB
|
|
81
|
+
volume threshold otherwise. silero is much more robust to ambient
|
|
82
|
+
noise (clicks, fan, traffic) and to soft speech than dB, which drops
|
|
83
|
+
sub-threshold syllables and gets fooled by loud non-speech.
|
|
84
|
+
|
|
85
|
+
The dB and silero parameter groups are independent — the inactive
|
|
86
|
+
mode's knobs are ignored.
|
|
87
|
+
|
|
88
|
+
| Flag | Default | Purpose |
|
|
89
|
+
|-------------------------------|---------|------------------------------------------------------------------------|
|
|
90
|
+
| `--vad-mode {auto,db,silero}` | `auto` | Silence-detection backend. `auto` picks silero when available, dB otherwise. |
|
|
91
|
+
| `--vad-threshold FLOAT` | `0.5` | **[silero only]** Speech-probability threshold in `[0,1]`. Lower = more permissive (catches quiet speech and more noise); higher = stricter. |
|
|
92
|
+
| `--vad-min-silence-ms INT` | `300` | **[silero only]** Minimum sustained low-probability span before speech-end fires, in ms. silero's onset/offset smoothing window. |
|
|
93
|
+
| `--silence-db DB` | `-40` | **[dB only]** dBFS volume floor for "this frame is silent". Ignored when silero is the active mode. |
|
|
94
|
+
|
|
76
95
|
## Realtime (`gpt-realtime-whisper`)
|
|
77
96
|
|
|
78
97
|
| Flag | Default | Purpose |
|
|
79
98
|
|---------------------------------------------------|----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|
80
99
|
| `--realtime-delay {minimal,low,medium,high,xhigh}` | `medium` | Trade off latency vs accuracy on `gpt-realtime-whisper`. Lower = faster partials but more paste churn in the focused window. |
|
|
81
|
-
| `--realtime-gate` / `--no-realtime-gate` | on | Drop silent frames (per `--
|
|
100
|
+
| `--realtime-gate` / `--no-realtime-gate` | on | Drop silent frames (per the active `--vad-mode`) before sending them over the WebSocket so silent audio isn't billed as input tokens. After `--silence-duration` of silence, also commit mid-session so trailing words flush live. |
|
|
82
101
|
|
|
83
102
|
Streaming models (Vosk, `gpt-realtime-whisper`) ignore the batch
|
|
84
103
|
silence-chunking knobs; they have their own end-of-utterance signal.
|
|
@@ -58,10 +58,15 @@ Options ▶
|
|
|
58
58
|
Keyboard backend ▶ eitype / pynput / ydotool / wtype
|
|
59
59
|
(rows incompatible with this OS are hidden;
|
|
60
60
|
submenu hidden entirely when ≤ 1 row left)
|
|
61
|
-
Advanced ▶ silence duration,
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
61
|
+
Advanced ▶ silence duration, VAD mode toggle
|
|
62
|
+
(silero ↔ dB), per-mode VAD knobs
|
|
63
|
+
(silero: speech-probability threshold,
|
|
64
|
+
min silence duration; dB: silence
|
|
65
|
+
threshold — only the active mode's
|
|
66
|
+
knobs are shown), realtime gate,
|
|
67
|
+
pseudo-streaming [experimental],
|
|
68
|
+
streaming window [experimental],
|
|
69
|
+
output file
|
|
65
70
|
Quit
|
|
66
71
|
```
|
|
67
72
|
|
|
@@ -22,6 +22,11 @@ dependencies = [
|
|
|
22
22
|
"termcolor",
|
|
23
23
|
"platformdirs",
|
|
24
24
|
"desktop-ai-core>=0.2.0",
|
|
25
|
+
# Runs the bundled silero VAD ONNX model (~2 MB shipped in scribe_data).
|
|
26
|
+
# In base deps so silero is available out of the box — see scribe/audio.py.
|
|
27
|
+
# `faster-whisper` already pulls it transitively, so installing with
|
|
28
|
+
# [whisper] is free; standalone adds ~57 MB which is trivial for an STT tool.
|
|
29
|
+
"onnxruntime",
|
|
25
30
|
]
|
|
26
31
|
|
|
27
32
|
classifiers = [
|
|
@@ -67,12 +72,18 @@ vosk = ["vosk"]
|
|
|
67
72
|
app = ["pystray", "PyGObject"]
|
|
68
73
|
openai = ["openai>=2.37.0,<3", "soundfile"]
|
|
69
74
|
groq = ["openai>=2.37.0,<3", "soundfile"]
|
|
75
|
+
# [vad] is now a no-op alias kept for back-compat (`pip install scribe-cli[vad]`
|
|
76
|
+
# was the documented install before onnxruntime moved into base deps).
|
|
77
|
+
vad = []
|
|
70
78
|
all = ["pynput", "faster-whisper", "pywhispercpp", "openai>=2.37.0,<3", "soundfile", "vosk", "pystray"]
|
|
71
79
|
|
|
72
80
|
|
|
73
81
|
[tool.setuptools]
|
|
74
82
|
packages = [ "scribe", "scribe_data" ]
|
|
75
83
|
|
|
84
|
+
[tool.setuptools.package-data]
|
|
85
|
+
scribe_data = ["share/*.png", "templates/*", "silero_vad.onnx", "silero_vad.LICENSE"]
|
|
86
|
+
|
|
76
87
|
[tool.setuptools_scm]
|
|
77
88
|
write_to = "scribe/_version.py"
|
|
78
89
|
|
|
@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
|
|
|
18
18
|
commit_id: str | None
|
|
19
19
|
__commit_id__: str | None
|
|
20
20
|
|
|
21
|
-
__version__ = version = '0.17.1'
|
|
22
|
-
__version_tuple__ = version_tuple = (0, 17, 1)
|
|
21
|
+
__version__ = version = '0.18.0'
|
|
22
|
+
__version_tuple__ = version_tuple = (0, 18, 0)
|
|
23
23
|
|
|
24
|
-
__commit_id__ = commit_id = '
|
|
24
|
+
__commit_id__ = commit_id = 'gd48d707c7'
|
|
@@ -171,7 +171,8 @@ def _resolve_prompt_and_words(prompt_text, prompt_file, words, words_file):
|
|
|
171
171
|
|
|
172
172
|
|
|
173
173
|
def _build_backend_kwargs(backend, model, language, samplerate, duration,
|
|
174
|
-
silence_db,
|
|
174
|
+
silence_db, silence_duration,
|
|
175
|
+
vad_mode, vad_threshold, vad_min_silence_ms,
|
|
175
176
|
download_folder_vosk, download_folder_whisper,
|
|
176
177
|
download_folder_whisper_futo,
|
|
177
178
|
realtime_delay, realtime_gate,
|
|
@@ -186,6 +187,8 @@ def _build_backend_kwargs(backend, model, language, samplerate, duration,
|
|
|
186
187
|
word_blob = " ".join(words)
|
|
187
188
|
merged_prompt = f"{prompt_text} {word_blob}" if prompt_text else word_blob
|
|
188
189
|
|
|
190
|
+
vad_kwargs = dict(vad_mode=vad_mode, vad_threshold=vad_threshold,
|
|
191
|
+
vad_min_silence_ms=vad_min_silence_ms)
|
|
189
192
|
if backend == "vosk":
|
|
190
193
|
# Vosk has no soft prompt; only a hard grammar. Silently ignore for now.
|
|
191
194
|
return dict(model_name=model, language=language, samplerate=samplerate,
|
|
@@ -194,11 +197,12 @@ def _build_backend_kwargs(backend, model, language, samplerate, duration,
|
|
|
194
197
|
if backend == "whisper":
|
|
195
198
|
return dict(model_name=model, language=language, samplerate=samplerate,
|
|
196
199
|
timeout=duration, silence_duration=silence_duration,
|
|
197
|
-
silence_thresh=silence_db,
|
|
200
|
+
silence_thresh=silence_db,
|
|
198
201
|
pseudo_streaming=pseudo_streaming, streaming_window=streaming_window,
|
|
199
202
|
prompt=prompt_text,
|
|
200
203
|
hotwords=(" ".join(words) if words else None),
|
|
201
|
-
model_kwargs={"download_root": download_folder_whisper}
|
|
204
|
+
model_kwargs={"download_root": download_folder_whisper},
|
|
205
|
+
**vad_kwargs)
|
|
202
206
|
if backend == "whisper-futo":
|
|
203
207
|
# pywhispercpp 1.4.1 exposes `initial_prompt`; the backend folds
|
|
204
208
|
# words+prompt into it (and adds a rolling chunk-tail in
|
|
@@ -206,17 +210,19 @@ def _build_backend_kwargs(backend, model, language, samplerate, duration,
|
|
|
206
210
|
# everything into the prompt like the cloud backends do.
|
|
207
211
|
return dict(model_name=model, language=language, samplerate=samplerate,
|
|
208
212
|
timeout=duration, silence_duration=silence_duration,
|
|
209
|
-
silence_thresh=silence_db,
|
|
213
|
+
silence_thresh=silence_db,
|
|
210
214
|
pseudo_streaming=pseudo_streaming, streaming_window=streaming_window,
|
|
211
215
|
prompt=merged_prompt,
|
|
212
|
-
download_folder=download_folder_whisper_futo
|
|
216
|
+
download_folder=download_folder_whisper_futo,
|
|
217
|
+
**vad_kwargs)
|
|
213
218
|
if backend in ("openai", "groq"):
|
|
214
219
|
from scribe.backends.openai_api import REALTIME_MODELS
|
|
215
220
|
kwargs = dict(model_name=model, samplerate=samplerate,
|
|
216
221
|
timeout=duration, silence_duration=silence_duration,
|
|
217
|
-
silence_thresh=silence_db,
|
|
222
|
+
silence_thresh=silence_db,
|
|
218
223
|
pseudo_streaming=pseudo_streaming, streaming_window=streaming_window,
|
|
219
|
-
prompt=merged_prompt
|
|
224
|
+
prompt=merged_prompt,
|
|
225
|
+
**vad_kwargs)
|
|
220
226
|
if backend == "openai" and model in REALTIME_MODELS:
|
|
221
227
|
kwargs["realtime_delay"] = realtime_delay
|
|
222
228
|
kwargs["realtime_gate"] = realtime_gate
|
|
@@ -231,7 +237,8 @@ def _build_backend_kwargs(backend, model, language, samplerate, duration,
|
|
|
231
237
|
|
|
232
238
|
def get_transcriber(model=None, backend=None, dummy=False, interactive=True, language=None,
|
|
233
239
|
samplerate=None, duration=None,
|
|
234
|
-
silence_db=None,
|
|
240
|
+
silence_db=None, silence_duration=0.6,
|
|
241
|
+
vad_mode="auto", vad_threshold=0.5, vad_min_silence_ms=300,
|
|
235
242
|
download_folder_vosk=None, download_folder_whisper=None,
|
|
236
243
|
download_folder_whisper_futo=None,
|
|
237
244
|
realtime_delay="medium", realtime_gate=True,
|
|
@@ -261,17 +268,14 @@ def get_transcriber(model=None, backend=None, dummy=False, interactive=True, lan
|
|
|
261
268
|
else:
|
|
262
269
|
model = _prompt_model_for_backend(backend, language, interactive)
|
|
263
270
|
print(f"Selected model: {model}")
|
|
264
|
-
# silence_db is the
|
|
265
|
-
# -40
|
|
266
|
-
# gate) used only in pseudo-streaming via hysteresis; -25 keeps ambient
|
|
267
|
-
# noise (keyboard, breathing) from triggering a chunk.
|
|
271
|
+
# silence_db is the single volume floor used by the dB fallback. Silero
|
|
272
|
+
# mode ignores it. Default -40 dBFS — keeps the gate simple by design.
|
|
268
273
|
if silence_db is None:
|
|
269
274
|
silence_db = -40.0
|
|
270
|
-
if silence_onset_db is None:
|
|
271
|
-
silence_onset_db = -25.0 if pseudo_streaming else silence_db
|
|
272
275
|
prompt_text, word_list = _resolve_prompt_and_words(prompt, prompt_file, words, words_file)
|
|
273
276
|
backend_kwargs = _build_backend_kwargs(backend, model, language, samplerate, duration,
|
|
274
|
-
silence_db,
|
|
277
|
+
silence_db, silence_duration,
|
|
278
|
+
vad_mode, vad_threshold, vad_min_silence_ms,
|
|
275
279
|
download_folder_vosk, download_folder_whisper,
|
|
276
280
|
download_folder_whisper_futo,
|
|
277
281
|
realtime_delay, realtime_gate,
|
|
@@ -335,21 +339,9 @@ def get_parser():
|
|
|
335
339
|
group.add_argument("-o", "--output-file",
|
|
336
340
|
help="Also append the transcription to this file.")
|
|
337
341
|
|
|
338
|
-
group = parser.add_argument_group("Silence detection
|
|
342
|
+
group = parser.add_argument_group("Silence detection")
|
|
339
343
|
group.add_argument("--duration", default=120, type=float,
|
|
340
344
|
help="Max recording duration in seconds (default: %(default)s).")
|
|
341
|
-
group.add_argument("--silence-db", default=None, type=float,
|
|
342
|
-
help="LOW silence floor in dBFS — applied while we're "
|
|
343
|
-
"already inside an utterance, so soft trailing "
|
|
344
|
-
"syllables aren't cut. Default: -40. Used by every "
|
|
345
|
-
"silence-driven behavior (pseudo-streaming pause "
|
|
346
|
-
"detection, realtime gate, realtime auto-commit).")
|
|
347
|
-
group.add_argument("--silence-onset-db", default=None, type=float,
|
|
348
|
-
help="HIGH silence floor in dBFS — applied before we've "
|
|
349
|
-
"started capturing speech (audio buffer empty). "
|
|
350
|
-
"Stricter so ambient noise (keyboard, breathing) "
|
|
351
|
-
"doesn't trigger a chunk. Default: -25 in "
|
|
352
|
-
"pseudo-streaming, same as --silence-db otherwise.")
|
|
353
345
|
group.add_argument("--silence-duration", default=0.6, type=float,
|
|
354
346
|
help="Seconds of silence required before triggering a "
|
|
355
347
|
"backend's silence behavior (default: %(default)s). "
|
|
@@ -358,6 +350,31 @@ def get_parser():
|
|
|
358
350
|
"batch backends: candidate cut point within the "
|
|
359
351
|
"streaming window.")
|
|
360
352
|
|
|
353
|
+
group = parser.add_argument_group("Voice activity detection")
|
|
354
|
+
group.add_argument("--vad-mode", choices=("auto", "db", "silero"), default="auto",
|
|
355
|
+
help="Silence-detection backend (default: %(default)s). "
|
|
356
|
+
"'auto' picks silero if installed, dB otherwise. "
|
|
357
|
+
"'silero' uses silero-vad — much more robust to "
|
|
358
|
+
"ambient noise (ticks, fan, traffic) AND to soft "
|
|
359
|
+
"speech (the dB gate drops sub-threshold syllables; "
|
|
360
|
+
"silero recognises speech spectrally). "
|
|
361
|
+
"'db' is a volume-threshold fallback used when "
|
|
362
|
+
"onnxruntime is unavailable (see --silence-db). "
|
|
363
|
+
"The dB and silero parameter groups are independent.")
|
|
364
|
+
group.add_argument("--vad-threshold", default=0.5, type=float,
|
|
365
|
+
help="[silero only] Speech-probability threshold in [0,1] "
|
|
366
|
+
"(default: %(default)s). Lower = more permissive (catches "
|
|
367
|
+
"quiet speech but also more noise); higher = stricter.")
|
|
368
|
+
group.add_argument("--vad-min-silence-ms", default=300, type=int,
|
|
369
|
+
help="[silero only] Minimum sustained low-probability span before "
|
|
370
|
+
"speech-end is emitted, in ms (default: %(default)s). "
|
|
371
|
+
"Acts as silero's onset/offset smoothing window.")
|
|
372
|
+
group.add_argument("--silence-db", default=None, type=float,
|
|
373
|
+
help="[dB only] Silence floor in dBFS for the dB-mode "
|
|
374
|
+
"fallback (default: -40). Ignored when "
|
|
375
|
+
"--vad-mode=silero (or =auto and silero is "
|
|
376
|
+
"available).")
|
|
377
|
+
|
|
361
378
|
group = parser.add_argument_group("Realtime (gpt-realtime-whisper)")
|
|
362
379
|
group.add_argument("--realtime-delay",
|
|
363
380
|
choices=("minimal", "low", "medium", "high", "xhigh"),
|
|
@@ -367,10 +384,10 @@ def get_parser():
|
|
|
367
384
|
"paste churn in the focused window).")
|
|
368
385
|
group.add_argument("--realtime-gate", action=argparse.BooleanOptionalAction,
|
|
369
386
|
default=True,
|
|
370
|
-
help="Drop silent frames (per --
|
|
371
|
-
"them over the WebSocket so silent audio
|
|
372
|
-
"as input tokens (default: on; pass
|
|
373
|
-
"to disable).")
|
|
387
|
+
help="Drop silent frames (per the active --vad-mode) before "
|
|
388
|
+
"sending them over the WebSocket so silent audio "
|
|
389
|
+
"isn't billed as input tokens (default: on; pass "
|
|
390
|
+
"--no-realtime-gate to disable).")
|
|
374
391
|
|
|
375
392
|
group = parser.add_argument_group("Pseudo-streaming (experimental)")
|
|
376
393
|
group.add_argument("--pseudo-streaming", action="store_true",
|
|
@@ -538,14 +555,24 @@ def create_app(micro, app_state):
|
|
|
538
555
|
image = Image.open(Path(scribe_data.__file__).parent / "share" / "icon.png")
|
|
539
556
|
image_recording = Image.open(Path(scribe_data.__file__).parent / "share" / "icon_recording.png")
|
|
540
557
|
image_writing = Image.open(Path(scribe_data.__file__).parent / "share" / "icon_writing.png")
|
|
558
|
+
# Composite (red + writing 'a'): shown while recording AND the silence
|
|
559
|
+
# gate says speech is active. Gives the user a visual confirmation that
|
|
560
|
+
# the audio is actually being captured/sent — not just sitting in
|
|
561
|
+
# detected silence. Plain red = recording but waiting for speech.
|
|
562
|
+
image_recording_active = Image.alpha_composite(
|
|
563
|
+
image_recording.convert("RGBA"), image_writing.convert("RGBA"),
|
|
564
|
+
)
|
|
541
565
|
|
|
542
566
|
if transcriber.backend == "vosk":
|
|
543
|
-
#
|
|
544
|
-
|
|
567
|
+
# vosk transcribes while recording — both recording sub-states show
|
|
568
|
+
# the composite (no meaningful "waiting" since vosk streams
|
|
569
|
+
# continuously).
|
|
570
|
+
image_recording = image_recording_active
|
|
545
571
|
|
|
546
572
|
state_images = {
|
|
547
573
|
None: image,
|
|
548
574
|
"recording": image_recording,
|
|
575
|
+
"recording_active": image_recording_active,
|
|
549
576
|
"busy": image_writing,
|
|
550
577
|
}
|
|
551
578
|
|
|
@@ -564,7 +591,12 @@ def create_app(micro, app_state):
|
|
|
564
591
|
return "busy"
|
|
565
592
|
s = icon._session
|
|
566
593
|
if s.recording:
|
|
567
|
-
|
|
594
|
+
# session.waiting flips True after silence_duration of detected
|
|
595
|
+
# silence, False on the first non-silent chunk. The composite
|
|
596
|
+
# ("recording_active") tells the user audio is actually being
|
|
597
|
+
# sent to the backend — solves the "is it hearing me?" question
|
|
598
|
+
# without printing partial transcripts to the tray.
|
|
599
|
+
return "recording" if s.waiting else "recording_active"
|
|
568
600
|
if s.busy:
|
|
569
601
|
return "busy"
|
|
570
602
|
return None
|