scribe-cli 0.17.0__tar.gz → 0.18.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/PKG-INFO +3 -1
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/docs/backends.md +35 -1
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/docs/cli.md +22 -3
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/docs/keyboard.md +31 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/docs/tray.md +9 -4
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/pyproject.toml +11 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/_version.py +3 -3
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/app.py +98 -25
- scribe_cli-0.18.0/scribe/audio.py +379 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/backends/openai_api.py +3 -1
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/backends/openai_realtime.py +108 -9
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/backends/whisper.py +2 -1
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/backends/whisper_futo.py +81 -20
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/menu.py +60 -1
- scribe_cli-0.18.0/scribe/models.py +333 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/session.py +10 -1
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe_cli.egg-info/PKG-INFO +3 -1
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe_cli.egg-info/SOURCES.txt +6 -1
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe_cli.egg-info/requires.txt +3 -0
- scribe_cli-0.18.0/scribe_data/silero_vad.LICENSE +21 -0
- scribe_cli-0.18.0/scribe_data/silero_vad.onnx +0 -0
- scribe_cli-0.18.0/tests/test_openai_realtime_coalesce.py +221 -0
- scribe_cli-0.18.0/tests/test_pseudo_streaming.py +413 -0
- scribe_cli-0.18.0/tests/test_whisper_futo.py +245 -0
- scribe_cli-0.17.0/scribe/audio.py +0 -76
- scribe_cli-0.17.0/scribe/models.py +0 -182
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/.github/FUNDING.yml +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/.github/workflows/pypi.yml +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/.gitignore +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/LICENSE +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/README.md +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/docs/app-tray-menu.png +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/docs/desktop-install.md +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/docs/installation.md +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/docs/roadmap-libei.md +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/icon.xcf +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/__init__.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/backends/__init__.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/backends/groq.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/backends/vosk.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/install_desktop.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/keyboard.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/models.toml +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/saverecording.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/testpynput.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/typers/__init__.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/typers/base.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/typers/eitype.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/typers/pynput.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/typers/wtype.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/typers/ydotool.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/util.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe_cli.egg-info/dependency_links.txt +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe_cli.egg-info/entry_points.txt +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe_cli.egg-info/top_level.txt +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe_data/__init__.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe_data/share/icon.png +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe_data/share/icon_recording.png +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe_data/share/icon_writing.png +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe_data/templates/scribe.desktop +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scripts/bench_whisper_local.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scripts/test_python_versions_install.sh +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.18.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: scribe-cli
|
|
3
|
-
Version: 0.17.0
|
|
3
|
+
Version: 0.18.0
|
|
4
4
|
Summary: Speech-to-text CLI and system-tray app for dictating into any focused window. Local (vosk, faster-whisper) or cloud (groq, openai) backends, batch or streaming.
|
|
5
5
|
Author-email: Mahé Perrette <mahe.perrette@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -52,6 +52,7 @@ Requires-Dist: unidecode
|
|
|
52
52
|
Requires-Dist: termcolor
|
|
53
53
|
Requires-Dist: platformdirs
|
|
54
54
|
Requires-Dist: desktop-ai-core>=0.2.0
|
|
55
|
+
Requires-Dist: onnxruntime
|
|
55
56
|
Provides-Extra: keyboard
|
|
56
57
|
Requires-Dist: pynput; extra == "keyboard"
|
|
57
58
|
Provides-Extra: whisper
|
|
@@ -69,6 +70,7 @@ Requires-Dist: soundfile; extra == "openai"
|
|
|
69
70
|
Provides-Extra: groq
|
|
70
71
|
Requires-Dist: openai<3,>=2.37.0; extra == "groq"
|
|
71
72
|
Requires-Dist: soundfile; extra == "groq"
|
|
73
|
+
Provides-Extra: vad
|
|
72
74
|
Provides-Extra: all
|
|
73
75
|
Requires-Dist: pynput; extra == "all"
|
|
74
76
|
Requires-Dist: faster-whisper; extra == "all"
|
|
@@ -149,9 +149,10 @@ differently:
|
|
|
149
149
|
| Backend | `--prompt` | `--words` |
|
|
150
150
|
|--------------------------------------|-------------------------------|--------------------------------------------------------|
|
|
151
151
|
| `whisper` (faster-whisper, local) | passed as `initial_prompt=` | passed as `hotwords=` — a **dedicated biasing channel** separate from the prompt |
|
|
152
|
+
| `whisper-futo` (pywhispercpp, local) | passed as `initial_prompt=` | joined onto the prompt string (no separate hotwords channel here) |
|
|
152
153
|
| `openai` batch (`gpt-4o*-transcribe`) | passed as `prompt=` | joined onto the prompt string |
|
|
153
154
|
| `groq` (`whisper-large-v3-turbo`) | passed as `prompt=` | joined onto the prompt string |
|
|
154
|
-
| `openai` realtime (`gpt-realtime-whisper`) |
|
|
155
|
+
| `openai` realtime (`gpt-realtime-whisper`) | *silently ignored* — the model rejects the prompt parameter server-side (HTTP 400 *"The 'prompt' parameter is not supported for this model."*). The kwarg stays accepted for plumbing compatibility but never reaches the API. | same — joined into the (ignored) prompt |
|
|
155
156
|
| `vosk` | *ignored* (no soft prompt) | *ignored* (Vosk only supports a hard `grammar` allowlist; not yet exposed) |
|
|
156
157
|
|
|
157
158
|
The whisper-family APIs cap the prompt around ~224 tokens; longer
|
|
@@ -202,3 +203,36 @@ more than latency.
|
|
|
202
203
|
|
|
203
204
|
This is experimental and off by default. The tray menu surfaces the
|
|
204
205
|
same toggle under Options ▶ Advanced ▶ Pseudo-streaming.
|
|
206
|
+
|
|
207
|
+
### Cross-chunk prompt context
|
|
208
|
+
|
|
209
|
+
In pseudo-streaming mode scribe automatically augments each chunk's
|
|
210
|
+
prompt with the trailing ~200 characters of the *previous* chunk's
|
|
211
|
+
transcription. This rolling tail is concatenated onto whatever static
|
|
212
|
+
`--prompt` / `--words` you configured and reaches the backend through
|
|
213
|
+
the same channel as the static prompt (the vocabulary biasing table
|
|
214
|
+
above). The motivation is cross-chunk continuity:
|
|
215
|
+
|
|
216
|
+
- **Capitalization drift** — without context, a chunk that starts
|
|
217
|
+
right after a period might come back lowercased.
|
|
218
|
+
- **Article gender (FR/IT/ES/…)** — `"la nouveau"` → `"le nouveau"`
|
|
219
|
+
once the prior chunk has established the noun.
|
|
220
|
+
- **Language lock** — `whisper.cpp` auto-detects language per call;
|
|
221
|
+
feeding the previous chunk's tokens keeps the language stable
|
|
222
|
+
across cuts.
|
|
223
|
+
|
|
224
|
+
Whisper's prompt window is capped at ~224 tokens; 200 chars of French
|
|
225
|
+
sits well under that and leaves room for your static prompt + words
|
|
226
|
+
list.
|
|
227
|
+
|
|
228
|
+
The rolling tail is **dropped** when the silence between two
|
|
229
|
+
utterances exceeds 1.5 seconds — a long pause is treated as a new
|
|
230
|
+
sentence/idea boundary, where carrying a possibly-bad prior chunk
|
|
231
|
+
forward biases the next one more than it helps. This mirrors
|
|
232
|
+
`whisper.cpp`'s `--keep-context off` default: prior-text conditioning
|
|
233
|
+
can self-reinforce errors (hallucinations, decoder repetition loops)
|
|
234
|
+
more readily than it provides useful continuity, so we cap it at
|
|
235
|
+
natural sentence boundaries.
|
|
236
|
+
|
|
237
|
+
Short pauses (mid-sentence punctuation) keep the context; starting a
|
|
238
|
+
new recording always clears it.
|
|
@@ -65,20 +65,39 @@ flag suppresses only its own side (giving `--prompt ""` still loads
|
|
|
65
65
|
| `--type-direct` | In keystroke mode, type the transcription as keystrokes instead of synthesising Ctrl+V. |
|
|
66
66
|
| `-o, --output-file FILE` | Also append the transcription to this file. |
|
|
67
67
|
|
|
68
|
-
## Silence detection
|
|
68
|
+
## Silence detection
|
|
69
69
|
|
|
70
70
|
| Flag | Default | Purpose |
|
|
71
71
|
|----------------------------|---------|------------------------------------------------------------------------|
|
|
72
72
|
| `--duration SECS` | `120` | Max recording duration in seconds. |
|
|
73
|
-
| `--silence-db DB` | `-40` | dBFS volume floor for "this frame is silent". Used by every silence-driven behavior. |
|
|
74
73
|
| `--silence-duration SECS` | `0.6` | How long silence must persist before triggering a backend's silence behavior (realtime auto-commit, pseudo-streaming cut). |
|
|
75
74
|
|
|
75
|
+
## Voice activity detection
|
|
76
|
+
|
|
77
|
+
scribe ships two silence-detection backends. By default
|
|
78
|
+
(`--vad-mode auto`) it picks **silero-vad** when `onnxruntime` is
|
|
79
|
+
importable (always true on a stock `pip install scribe-cli` since
|
|
80
|
+
`onnxruntime` is a base dependency) and falls back to a plain dB
|
|
81
|
+
volume threshold otherwise. silero is much more robust to ambient
|
|
82
|
+
noise (clicks, fan, traffic) and to soft speech than dB, which drops
|
|
83
|
+
sub-threshold syllables and gets fooled by loud non-speech.
|
|
84
|
+
|
|
85
|
+
The dB and silero parameter groups are independent — the inactive
|
|
86
|
+
mode's knobs are ignored.
|
|
87
|
+
|
|
88
|
+
| Flag | Default | Purpose |
|
|
89
|
+
|-------------------------------|---------|------------------------------------------------------------------------|
|
|
90
|
+
| `--vad-mode {auto,db,silero}` | `auto` | Silence-detection backend. `auto` picks silero when available, dB otherwise. |
|
|
91
|
+
| `--vad-threshold FLOAT` | `0.5` | **[silero only]** Speech-probability threshold in `[0,1]`. Lower = more permissive (catches quiet speech and more noise); higher = stricter. |
|
|
92
|
+
| `--vad-min-silence-ms INT` | `300` | **[silero only]** Minimum sustained low-probability span before speech-end fires, in ms. Acts as silero's onset/offset smoothing window. |
|
|
93
|
+
| `--silence-db DB` | `-40` | **[dB only]** dBFS volume floor for "this frame is silent". Ignored when silero is the active mode. |
|
|
94
|
+
|
|
76
95
|
## Realtime (`gpt-realtime-whisper`)
|
|
77
96
|
|
|
78
97
|
| Flag | Default | Purpose |
|
|
79
98
|
|---------------------------------------------------|----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|
80
99
|
| `--realtime-delay {minimal,low,medium,high,xhigh}` | `medium` | Trade off latency vs accuracy on `gpt-realtime-whisper`. Lower = faster partials but more paste churn in the focused window. |
|
|
81
|
-
| `--realtime-gate` / `--no-realtime-gate` | on | Drop silent frames (per `--silence-db`) before sending them over the WebSocket so silent audio isn't billed as input tokens. |
|
|
100
|
+
| `--realtime-gate` / `--no-realtime-gate` | on | Drop silent frames (per the active `--vad-mode`) before sending them over the WebSocket so silent audio isn't billed as input tokens. After `--silence-duration` of silence, also commit mid-session so trailing words flush live. |
|
|
82
101
|
|
|
83
102
|
Streaming models (Vosk, `gpt-realtime-whisper`) ignore the batch
|
|
84
103
|
silence-chunking knobs; they have their own end-of-utterance signal.
|
|
@@ -167,3 +167,34 @@ If `eitype` is unavailable, two older workarounds also work:
|
|
|
167
167
|
Roadmap for native libei integration (eventual Python bindings,
|
|
168
168
|
expanded compositor support) is tracked in
|
|
169
169
|
[docs/roadmap-libei.md](roadmap-libei.md).
|
|
170
|
+
|
|
171
|
+
## Realtime backend: delta coalescing
|
|
172
|
+
|
|
173
|
+
The `gpt-realtime-whisper` backend emits one transcription delta per
|
|
174
|
+
word/subword at ~30–80 ms intervals — much faster than the
|
|
175
|
+
`pyperclip.copy()` + Ctrl+V cycle can settle on Wayland (≥100 ms,
|
|
176
|
+
because `wl-copy` is asynchronous). Pasting every delta led to
|
|
177
|
+
clipboard races where successive copies overwrote each other before
|
|
178
|
+
Ctrl+V landed, manifesting as dropped and duplicated words
|
|
179
|
+
(*"fait fait le mot mot time time…"*).
|
|
180
|
+
|
|
181
|
+
In **paste mode** (default keystroke output) scribe therefore
|
|
182
|
+
coalesces deltas: incoming tokens accumulate into a small buffer and
|
|
183
|
+
are flushed only when *either* ~400 ms have elapsed since the last
|
|
184
|
+
flush, *or* the buffer ends on sentence-final punctuation
|
|
185
|
+
(`. ! ? \n`). A 200 ms floor between any two flushes prevents
|
|
186
|
+
back-to-back punctuation flushes from racing each other through the
|
|
187
|
+
clipboard.
|
|
188
|
+
|
|
189
|
+
With **`--type-direct`** the coalescing is bypassed entirely — each
|
|
190
|
+
delta goes through the chosen typer as a raw keystroke synchronously
|
|
191
|
+
(uinput / xtest / portal libei), no clipboard involved, no race to
|
|
192
|
+
defeat. The UX is also snappier: tokens appear one at a time rather
|
|
193
|
+
than in ~400 ms-cadenced bursts.
|
|
194
|
+
|
|
195
|
+
macOS and Windows clipboards are synchronous, so the race that
|
|
196
|
+
motivates coalescing is essentially a Wayland artefact; scribe still
|
|
197
|
+
coalesces in paste mode there for consistency, but it's harmless.
|
|
198
|
+
This whole behaviour is realtime-specific — Vosk's per-phrase commits
|
|
199
|
+
already arrive at a sane cadence, and the pseudo-streaming backends
|
|
200
|
+
emit one chunk per silence cut (already coarse enough).
|
|
@@ -58,10 +58,15 @@ Options ▶
|
|
|
58
58
|
Keyboard backend ▶ eitype / pynput / ydotool / wtype
|
|
59
59
|
(rows incompatible with this OS are hidden;
|
|
60
60
|
submenu hidden entirely when ≤ 1 row left)
|
|
61
|
-
Advanced ▶ silence duration,
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
61
|
+
Advanced ▶ silence duration, VAD mode toggle
|
|
62
|
+
(silero ↔ dB), per-mode VAD knobs
|
|
63
|
+
(silero: speech-probability threshold,
|
|
64
|
+
min silence duration; dB: silence
|
|
65
|
+
threshold — only the active mode's
|
|
66
|
+
knobs are shown), realtime gate,
|
|
67
|
+
pseudo-streaming [experimental],
|
|
68
|
+
streaming window [experimental],
|
|
69
|
+
output file
|
|
65
70
|
Quit
|
|
66
71
|
```
|
|
67
72
|
|
|
@@ -22,6 +22,11 @@ dependencies = [
|
|
|
22
22
|
"termcolor",
|
|
23
23
|
"platformdirs",
|
|
24
24
|
"desktop-ai-core>=0.2.0",
|
|
25
|
+
# Runs the bundled silero VAD ONNX model (~2 MB shipped in scribe_data).
|
|
26
|
+
# In base deps so silero is available out of the box — see scribe/audio.py.
|
|
27
|
+
# `faster-whisper` already pulls it transitively, so installing with
|
|
28
|
+
# [whisper] is free; standalone adds ~57 MB which is trivial for an STT tool.
|
|
29
|
+
"onnxruntime",
|
|
25
30
|
]
|
|
26
31
|
|
|
27
32
|
classifiers = [
|
|
@@ -67,12 +72,18 @@ vosk = ["vosk"]
|
|
|
67
72
|
app = ["pystray", "PyGObject"]
|
|
68
73
|
openai = ["openai>=2.37.0,<3", "soundfile"]
|
|
69
74
|
groq = ["openai>=2.37.0,<3", "soundfile"]
|
|
75
|
+
# [vad] is now a no-op alias kept for back-compat (`pip install scribe-cli[vad]`
|
|
76
|
+
# was the documented install before onnxruntime moved into base deps).
|
|
77
|
+
vad = []
|
|
70
78
|
all = ["pynput", "faster-whisper", "pywhispercpp", "openai>=2.37.0,<3", "soundfile", "vosk", "pystray"]
|
|
71
79
|
|
|
72
80
|
|
|
73
81
|
[tool.setuptools]
|
|
74
82
|
packages = [ "scribe", "scribe_data" ]
|
|
75
83
|
|
|
84
|
+
[tool.setuptools.package-data]
|
|
85
|
+
scribe_data = ["share/*.png", "templates/*", "silero_vad.onnx", "silero_vad.LICENSE"]
|
|
86
|
+
|
|
76
87
|
[tool.setuptools_scm]
|
|
77
88
|
write_to = "scribe/_version.py"
|
|
78
89
|
|
|
@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
|
|
|
18
18
|
commit_id: str | None
|
|
19
19
|
__commit_id__: str | None
|
|
20
20
|
|
|
21
|
-
__version__ = version = '0.
|
|
22
|
-
__version_tuple__ = version_tuple = (0,
|
|
21
|
+
__version__ = version = '0.18.0'
|
|
22
|
+
__version_tuple__ = version_tuple = (0, 18, 0)
|
|
23
23
|
|
|
24
|
-
__commit_id__ = commit_id = '
|
|
24
|
+
__commit_id__ = commit_id = 'gd48d707c7'
|
|
@@ -66,7 +66,10 @@ class DummyTranscriber:
|
|
|
66
66
|
|
|
67
67
|
whisper_models = ["tiny", "base", "small", "medium", "large-v3", "large-v3-turbo"]
|
|
68
68
|
whisper_english_models = ["tiny.en", "base.en", "small.en", "medium.en"]
|
|
69
|
-
# FUTO ACFT publishes only tiny/base/small (+ .en variants)
|
|
69
|
+
# FUTO ACFT publishes only tiny/base/small (+ .en variants). Community
|
|
70
|
+
# conversions exist for large/turbo but their large-v3 encoder is
|
|
71
|
+
# incompatible with the audio_ctx shrinkage that's the point of this
|
|
72
|
+
# backend — for large models use the `whisper` backend instead.
|
|
70
73
|
whisper_futo_models = ["tiny", "base", "small"]
|
|
71
74
|
whisper_futo_english_models = ["tiny.en", "base.en", "small.en"]
|
|
72
75
|
whisperapi_models = ["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "gpt-realtime-whisper"]
|
|
@@ -169,6 +172,7 @@ def _resolve_prompt_and_words(prompt_text, prompt_file, words, words_file):
|
|
|
169
172
|
|
|
170
173
|
def _build_backend_kwargs(backend, model, language, samplerate, duration,
|
|
171
174
|
silence_db, silence_duration,
|
|
175
|
+
vad_mode, vad_threshold, vad_min_silence_ms,
|
|
172
176
|
download_folder_vosk, download_folder_whisper,
|
|
173
177
|
download_folder_whisper_futo,
|
|
174
178
|
realtime_delay, realtime_gate,
|
|
@@ -183,6 +187,8 @@ def _build_backend_kwargs(backend, model, language, samplerate, duration,
|
|
|
183
187
|
word_blob = " ".join(words)
|
|
184
188
|
merged_prompt = f"{prompt_text} {word_blob}" if prompt_text else word_blob
|
|
185
189
|
|
|
190
|
+
vad_kwargs = dict(vad_mode=vad_mode, vad_threshold=vad_threshold,
|
|
191
|
+
vad_min_silence_ms=vad_min_silence_ms)
|
|
186
192
|
if backend == "vosk":
|
|
187
193
|
# Vosk has no soft prompt; only a hard grammar. Silently ignore for now.
|
|
188
194
|
return dict(model_name=model, language=language, samplerate=samplerate,
|
|
@@ -190,25 +196,33 @@ def _build_backend_kwargs(backend, model, language, samplerate, duration,
|
|
|
190
196
|
model_kwargs={"download_root": download_folder_vosk})
|
|
191
197
|
if backend == "whisper":
|
|
192
198
|
return dict(model_name=model, language=language, samplerate=samplerate,
|
|
193
|
-
timeout=duration, silence_duration=silence_duration,
|
|
199
|
+
timeout=duration, silence_duration=silence_duration,
|
|
200
|
+
silence_thresh=silence_db,
|
|
194
201
|
pseudo_streaming=pseudo_streaming, streaming_window=streaming_window,
|
|
195
202
|
prompt=prompt_text,
|
|
196
203
|
hotwords=(" ".join(words) if words else None),
|
|
197
|
-
model_kwargs={"download_root": download_folder_whisper}
|
|
204
|
+
model_kwargs={"download_root": download_folder_whisper},
|
|
205
|
+
**vad_kwargs)
|
|
198
206
|
if backend == "whisper-futo":
|
|
199
|
-
#
|
|
200
|
-
#
|
|
201
|
-
#
|
|
207
|
+
# pywhispercpp 1.4.1 exposes `initial_prompt`; the backend folds
|
|
208
|
+
# words+prompt into it (and adds a rolling chunk-tail in
|
|
209
|
+
# pseudo-streaming). No separate hotwords channel here — fold
|
|
210
|
+
# everything into the prompt like the cloud backends do.
|
|
202
211
|
return dict(model_name=model, language=language, samplerate=samplerate,
|
|
203
|
-
timeout=duration, silence_duration=silence_duration,
|
|
212
|
+
timeout=duration, silence_duration=silence_duration,
|
|
213
|
+
silence_thresh=silence_db,
|
|
204
214
|
pseudo_streaming=pseudo_streaming, streaming_window=streaming_window,
|
|
205
|
-
|
|
215
|
+
prompt=merged_prompt,
|
|
216
|
+
download_folder=download_folder_whisper_futo,
|
|
217
|
+
**vad_kwargs)
|
|
206
218
|
if backend in ("openai", "groq"):
|
|
207
219
|
from scribe.backends.openai_api import REALTIME_MODELS
|
|
208
220
|
kwargs = dict(model_name=model, samplerate=samplerate,
|
|
209
|
-
timeout=duration, silence_duration=silence_duration,
|
|
221
|
+
timeout=duration, silence_duration=silence_duration,
|
|
222
|
+
silence_thresh=silence_db,
|
|
210
223
|
pseudo_streaming=pseudo_streaming, streaming_window=streaming_window,
|
|
211
|
-
prompt=merged_prompt
|
|
224
|
+
prompt=merged_prompt,
|
|
225
|
+
**vad_kwargs)
|
|
212
226
|
if backend == "openai" and model in REALTIME_MODELS:
|
|
213
227
|
kwargs["realtime_delay"] = realtime_delay
|
|
214
228
|
kwargs["realtime_gate"] = realtime_gate
|
|
@@ -223,7 +237,8 @@ def _build_backend_kwargs(backend, model, language, samplerate, duration,
|
|
|
223
237
|
|
|
224
238
|
def get_transcriber(model=None, backend=None, dummy=False, interactive=True, language=None,
|
|
225
239
|
samplerate=None, duration=None,
|
|
226
|
-
silence_db
|
|
240
|
+
silence_db=None, silence_duration=0.6,
|
|
241
|
+
vad_mode="auto", vad_threshold=0.5, vad_min_silence_ms=300,
|
|
227
242
|
download_folder_vosk=None, download_folder_whisper=None,
|
|
228
243
|
download_folder_whisper_futo=None,
|
|
229
244
|
realtime_delay="medium", realtime_gate=True,
|
|
@@ -253,9 +268,14 @@ def get_transcriber(model=None, backend=None, dummy=False, interactive=True, lan
|
|
|
253
268
|
else:
|
|
254
269
|
model = _prompt_model_for_backend(backend, language, interactive)
|
|
255
270
|
print(f"Selected model: {model}")
|
|
271
|
+
# silence_db is the single volume floor used by the dB fallback. Silero
|
|
272
|
+
# mode ignores it. Default -40 dBFS — keeps the gate simple by design.
|
|
273
|
+
if silence_db is None:
|
|
274
|
+
silence_db = -40.0
|
|
256
275
|
prompt_text, word_list = _resolve_prompt_and_words(prompt, prompt_file, words, words_file)
|
|
257
276
|
backend_kwargs = _build_backend_kwargs(backend, model, language, samplerate, duration,
|
|
258
277
|
silence_db, silence_duration,
|
|
278
|
+
vad_mode, vad_threshold, vad_min_silence_ms,
|
|
259
279
|
download_folder_vosk, download_folder_whisper,
|
|
260
280
|
download_folder_whisper_futo,
|
|
261
281
|
realtime_delay, realtime_gate,
|
|
@@ -319,14 +339,9 @@ def get_parser():
|
|
|
319
339
|
group.add_argument("-o", "--output-file",
|
|
320
340
|
help="Also append the transcription to this file.")
|
|
321
341
|
|
|
322
|
-
group = parser.add_argument_group("Silence detection
|
|
342
|
+
group = parser.add_argument_group("Silence detection")
|
|
323
343
|
group.add_argument("--duration", default=120, type=float,
|
|
324
344
|
help="Max recording duration in seconds (default: %(default)s).")
|
|
325
|
-
group.add_argument("--silence-db", default=-40.0, type=float,
|
|
326
|
-
help="dBFS volume floor for 'this frame is silent' "
|
|
327
|
-
"(default: %(default)s). Used by every silence-driven "
|
|
328
|
-
"behavior (realtime gate, realtime auto-commit, "
|
|
329
|
-
"pseudo-streaming chunking).")
|
|
330
345
|
group.add_argument("--silence-duration", default=0.6, type=float,
|
|
331
346
|
help="Seconds of silence required before triggering a "
|
|
332
347
|
"backend's silence behavior (default: %(default)s). "
|
|
@@ -335,6 +350,31 @@ def get_parser():
|
|
|
335
350
|
"batch backends: candidate cut point within the "
|
|
336
351
|
"streaming window.")
|
|
337
352
|
|
|
353
|
+
group = parser.add_argument_group("Voice activity detection")
|
|
354
|
+
group.add_argument("--vad-mode", choices=("auto", "db", "silero"), default="auto",
|
|
355
|
+
help="Silence-detection backend (default: %(default)s). "
|
|
356
|
+
"'auto' picks silero if installed, dB otherwise. "
|
|
357
|
+
"'silero' uses silero-vad — much more robust to "
|
|
358
|
+
"ambient noise (ticks, fan, traffic) AND to soft "
|
|
359
|
+
"speech (the dB gate drops sub-threshold syllables; "
|
|
360
|
+
"silero recognises speech spectrally). "
|
|
361
|
+
"'db' is a volume-threshold fallback used when "
|
|
362
|
+
"onnxruntime is unavailable (see --silence-db). "
|
|
363
|
+
"The dB and silero parameter groups are independent.")
|
|
364
|
+
group.add_argument("--vad-threshold", default=0.5, type=float,
|
|
365
|
+
help="[silero only] Speech-probability threshold in [0,1] "
|
|
366
|
+
"(default: %(default)s). Lower = more permissive (catches "
|
|
367
|
+
"quiet speech but also more noise); higher = stricter.")
|
|
368
|
+
group.add_argument("--vad-min-silence-ms", default=300, type=int,
|
|
369
|
+
help="[silero only] Minimum sustained low-probability span before "
|
|
370
|
+
"speech-end is emitted, in ms (default: %(default)s). "
|
|
371
|
+
"Acts as silero's onset/offset smoothing window.")
|
|
372
|
+
group.add_argument("--silence-db", default=None, type=float,
|
|
373
|
+
help="[dB only] Silence floor in dBFS for the dB-mode "
|
|
374
|
+
"fallback (default: -40). Ignored when "
|
|
375
|
+
"--vad-mode=silero (or =auto and silero is "
|
|
376
|
+
"available).")
|
|
377
|
+
|
|
338
378
|
group = parser.add_argument_group("Realtime (gpt-realtime-whisper)")
|
|
339
379
|
group.add_argument("--realtime-delay",
|
|
340
380
|
choices=("minimal", "low", "medium", "high", "xhigh"),
|
|
@@ -344,10 +384,10 @@ def get_parser():
|
|
|
344
384
|
"paste churn in the focused window).")
|
|
345
385
|
group.add_argument("--realtime-gate", action=argparse.BooleanOptionalAction,
|
|
346
386
|
default=True,
|
|
347
|
-
help="Drop silent frames (per --
|
|
348
|
-
"them over the WebSocket so silent audio
|
|
349
|
-
"as input tokens (default: on; pass
|
|
350
|
-
"to disable).")
|
|
387
|
+
help="Drop silent frames (per the active --vad-mode) before "
|
|
388
|
+
"sending them over the WebSocket so silent audio "
|
|
389
|
+
"isn't billed as input tokens (default: on; pass "
|
|
390
|
+
"--no-realtime-gate to disable).")
|
|
351
391
|
|
|
352
392
|
group = parser.add_argument_group("Pseudo-streaming (experimental)")
|
|
353
393
|
group.add_argument("--pseudo-streaming", action="store_true",
|
|
@@ -399,8 +439,16 @@ def start_recording(micro, session, mode="keystroke", typer="auto",
|
|
|
399
439
|
# Query the live transcriber instance — the registered class may dispatch
|
|
400
440
|
# to a streaming sibling for specific models (e.g. openai →
|
|
401
441
|
# gpt-realtime-whisper), so a class-level lookup via BACKENDS would lie.
|
|
442
|
+
# Pseudo-streaming also yields chunks (silence-cut batch transcriptions)
|
|
443
|
+
# so the output should treat it the same: live paste/type per chunk.
|
|
402
444
|
backend_obj = getattr(session, "backend", session)
|
|
403
|
-
|
|
445
|
+
if isinstance(backend_obj, str):
|
|
446
|
+
is_streaming = False
|
|
447
|
+
else:
|
|
448
|
+
is_streaming = (
|
|
449
|
+
bool(getattr(backend_obj, "supports_streaming", False))
|
|
450
|
+
or bool(getattr(backend_obj, "pseudo_streaming", False))
|
|
451
|
+
)
|
|
404
452
|
# Clipboard is written in clipboard mode (the user pastes manually) and in
|
|
405
453
|
# paste-based keystroke mode (the paste source). type_direct keystroke
|
|
406
454
|
# mode bypasses the clipboard entirely — we type the chunks/text raw.
|
|
@@ -427,6 +475,16 @@ def start_recording(micro, session, mode="keystroke", typer="auto",
|
|
|
427
475
|
import pyperclip
|
|
428
476
|
session.log("The transcription will be copied to clipboard as it becomes available.")
|
|
429
477
|
|
|
478
|
+
# Tell streaming backends whether their output is about to hit the
|
|
479
|
+
# clipboard-paste race or a direct-keystroke typer. The realtime
|
|
480
|
+
# backend's per-token deltas only need coalescing in paste mode;
|
|
481
|
+
# type-direct (ydotool/wtype/pynput via uinput/xtest) types each
|
|
482
|
+
# character synchronously and benefits from raw per-delta emission
|
|
483
|
+
# for snappier UX. Set as a plain attribute — backends that don't
|
|
484
|
+
# implement coalescing ignore it.
|
|
485
|
+
if not isinstance(backend_obj, str) and hasattr(backend_obj, "_coalesce_deltas"):
|
|
486
|
+
backend_obj._coalesce_deltas = do_live_paste
|
|
487
|
+
|
|
430
488
|
fulltext = ""
|
|
431
489
|
|
|
432
490
|
for result in session.start_recording(micro, **greetings):
|
|
@@ -497,14 +555,24 @@ def create_app(micro, app_state):
|
|
|
497
555
|
image = Image.open(Path(scribe_data.__file__).parent / "share" / "icon.png")
|
|
498
556
|
image_recording = Image.open(Path(scribe_data.__file__).parent / "share" / "icon_recording.png")
|
|
499
557
|
image_writing = Image.open(Path(scribe_data.__file__).parent / "share" / "icon_writing.png")
|
|
558
|
+
# Composite (red + writing 'a'): shown while recording AND the silence
|
|
559
|
+
# gate says speech is active. Gives the user a visual confirmation that
|
|
560
|
+
# the audio is actually being captured/sent — not just sitting in
|
|
561
|
+
# detected silence. Plain red = recording but waiting for speech.
|
|
562
|
+
image_recording_active = Image.alpha_composite(
|
|
563
|
+
image_recording.convert("RGBA"), image_writing.convert("RGBA"),
|
|
564
|
+
)
|
|
500
565
|
|
|
501
566
|
if transcriber.backend == "vosk":
|
|
502
|
-
#
|
|
503
|
-
|
|
567
|
+
# vosk transcribes while recording — both recording sub-states show
|
|
568
|
+
# the composite (no meaningful "waiting" since vosk streams
|
|
569
|
+
# continuously).
|
|
570
|
+
image_recording = image_recording_active
|
|
504
571
|
|
|
505
572
|
state_images = {
|
|
506
573
|
None: image,
|
|
507
574
|
"recording": image_recording,
|
|
575
|
+
"recording_active": image_recording_active,
|
|
508
576
|
"busy": image_writing,
|
|
509
577
|
}
|
|
510
578
|
|
|
@@ -523,7 +591,12 @@ def create_app(micro, app_state):
|
|
|
523
591
|
return "busy"
|
|
524
592
|
s = icon._session
|
|
525
593
|
if s.recording:
|
|
526
|
-
|
|
594
|
+
# session.waiting flips True after silence_duration of detected
|
|
595
|
+
# silence, False on the first non-silent chunk. The composite
|
|
596
|
+
# ("recording_active") tells the user audio is actually being
|
|
597
|
+
# sent to the backend — solves the "is it hearing me?" question
|
|
598
|
+
# without printing partial transcripts to the tray.
|
|
599
|
+
return "recording" if s.waiting else "recording_active"
|
|
527
600
|
if s.busy:
|
|
528
601
|
return "busy"
|
|
529
602
|
return None
|