scribe-cli 0.17.0__tar.gz → 0.18.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/PKG-INFO +3 -1
  2. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/docs/backends.md +35 -1
  3. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/docs/cli.md +22 -3
  4. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/docs/keyboard.md +31 -0
  5. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/docs/tray.md +9 -4
  6. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/pyproject.toml +11 -0
  7. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/_version.py +3 -3
  8. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/app.py +98 -25
  9. scribe_cli-0.18.0/scribe/audio.py +379 -0
  10. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/backends/openai_api.py +3 -1
  11. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/backends/openai_realtime.py +108 -9
  12. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/backends/whisper.py +2 -1
  13. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/backends/whisper_futo.py +81 -20
  14. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/menu.py +60 -1
  15. scribe_cli-0.18.0/scribe/models.py +333 -0
  16. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/session.py +10 -1
  17. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe_cli.egg-info/PKG-INFO +3 -1
  18. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe_cli.egg-info/SOURCES.txt +6 -1
  19. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe_cli.egg-info/requires.txt +3 -0
  20. scribe_cli-0.18.0/scribe_data/silero_vad.LICENSE +21 -0
  21. scribe_cli-0.18.0/scribe_data/silero_vad.onnx +0 -0
  22. scribe_cli-0.18.0/tests/test_openai_realtime_coalesce.py +221 -0
  23. scribe_cli-0.18.0/tests/test_pseudo_streaming.py +413 -0
  24. scribe_cli-0.18.0/tests/test_whisper_futo.py +245 -0
  25. scribe_cli-0.17.0/scribe/audio.py +0 -76
  26. scribe_cli-0.17.0/scribe/models.py +0 -182
  27. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/.github/FUNDING.yml +0 -0
  28. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/.github/workflows/pypi.yml +0 -0
  29. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/.gitignore +0 -0
  30. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/LICENSE +0 -0
  31. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/README.md +0 -0
  32. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/docs/app-tray-menu.png +0 -0
  33. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/docs/desktop-install.md +0 -0
  34. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/docs/installation.md +0 -0
  35. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/docs/roadmap-libei.md +0 -0
  36. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/icon.xcf +0 -0
  37. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/__init__.py +0 -0
  38. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/backends/__init__.py +0 -0
  39. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/backends/groq.py +0 -0
  40. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/backends/vosk.py +0 -0
  41. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/install_desktop.py +0 -0
  42. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/keyboard.py +0 -0
  43. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/models.toml +0 -0
  44. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/saverecording.py +0 -0
  45. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/testpynput.py +0 -0
  46. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/typers/__init__.py +0 -0
  47. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/typers/base.py +0 -0
  48. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/typers/eitype.py +0 -0
  49. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/typers/pynput.py +0 -0
  50. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/typers/wtype.py +0 -0
  51. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/typers/ydotool.py +0 -0
  52. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe/util.py +0 -0
  53. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe_cli.egg-info/dependency_links.txt +0 -0
  54. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe_cli.egg-info/entry_points.txt +0 -0
  55. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe_cli.egg-info/top_level.txt +0 -0
  56. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe_data/__init__.py +0 -0
  57. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe_data/share/icon.png +0 -0
  58. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe_data/share/icon_recording.png +0 -0
  59. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe_data/share/icon_writing.png +0 -0
  60. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scribe_data/templates/scribe.desktop +0 -0
  61. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scripts/bench_whisper_local.py +0 -0
  62. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/scripts/test_python_versions_install.sh +0 -0
  63. {scribe_cli-0.17.0 → scribe_cli-0.18.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: scribe-cli
3
- Version: 0.17.0
3
+ Version: 0.18.0
4
4
  Summary: Speech-to-text CLI and system-tray app for dictating into any focused window. Local (vosk, faster-whisper) or cloud (groq, openai) backends, batch or streaming.
5
5
  Author-email: Mahé Perrette <mahe.perrette@gmail.com>
6
6
  License: MIT License
@@ -52,6 +52,7 @@ Requires-Dist: unidecode
52
52
  Requires-Dist: termcolor
53
53
  Requires-Dist: platformdirs
54
54
  Requires-Dist: desktop-ai-core>=0.2.0
55
+ Requires-Dist: onnxruntime
55
56
  Provides-Extra: keyboard
56
57
  Requires-Dist: pynput; extra == "keyboard"
57
58
  Provides-Extra: whisper
@@ -69,6 +70,7 @@ Requires-Dist: soundfile; extra == "openai"
69
70
  Provides-Extra: groq
70
71
  Requires-Dist: openai<3,>=2.37.0; extra == "groq"
71
72
  Requires-Dist: soundfile; extra == "groq"
73
+ Provides-Extra: vad
72
74
  Provides-Extra: all
73
75
  Requires-Dist: pynput; extra == "all"
74
76
  Requires-Dist: faster-whisper; extra == "all"
@@ -149,9 +149,10 @@ differently:
149
149
  | Backend | `--prompt` | `--words` |
150
150
  |--------------------------------------|-------------------------------|--------------------------------------------------------|
151
151
  | `whisper` (faster-whisper, local) | passed as `initial_prompt=` | passed as `hotwords=` — a **dedicated biasing channel** separate from the prompt |
152
+ | `whisper-futo` (pywhispercpp, local) | passed as `initial_prompt=` | joined onto the prompt string (no separate hotwords channel here) |
152
153
  | `openai` batch (`gpt-4o*-transcribe`) | passed as `prompt=` | joined onto the prompt string |
153
154
  | `groq` (`whisper-large-v3-turbo`) | passed as `prompt=` | joined onto the prompt string |
154
- | `openai` realtime (`gpt-realtime-whisper`) | included in the session config as `transcription.prompt` | joined onto the prompt string |
155
+ | `openai` realtime (`gpt-realtime-whisper`) | *silently ignored* — the model rejects the prompt parameter server-side (HTTP 400 *"The 'prompt' parameter is not supported for this model."*). The kwarg stays accepted for plumbing compatibility but never reaches the API. | same — joined into the (ignored) prompt |
155
156
  | `vosk` | *ignored* (no soft prompt) | *ignored* (Vosk only supports a hard `grammar` allowlist; not yet exposed) |
156
157
 
157
158
  The whisper-family APIs cap the prompt around ~224 tokens; longer
@@ -202,3 +203,36 @@ more than latency.
202
203
 
203
204
  This is experimental and off by default. The tray menu surfaces the
204
205
  same toggle under Options ▶ Advanced ▶ Pseudo-streaming.
206
+
207
+ ### Cross-chunk prompt context
208
+
209
+ In pseudo-streaming mode scribe automatically augments each chunk's
210
+ prompt with the trailing ~200 characters of the *previous* chunk's
211
+ transcription. This rolling tail is concatenated onto whatever static
212
+ `--prompt` / `--words` you configured and reaches the backend through
213
+ the same channel as the static prompt (the vocabulary biasing table
214
+ above). The motivation is cross-chunk continuity:
215
+
216
+ - **Capitalization drift** — without context, a chunk that starts
217
+ right after a period might come back lowercased.
218
+ - **Article gender (FR/IT/ES/…)** — `"la nouveau"` → `"le nouveau"`
219
+ once the prior chunk has established the noun.
220
+ - **Language lock** — `whisper.cpp` auto-detects language per call;
221
+ feeding the previous chunk's tokens keeps the language stable
222
+ across cuts.
223
+
224
+ Whisper's prompt window is capped at ~224 tokens; 200 chars of French
225
+ sits well under that and leaves room for your static prompt + words
226
+ list.
227
+
228
+ The rolling tail is **dropped** when the silence between two
229
+ utterances exceeds 1.5 seconds — a long pause is treated as a new
230
+ sentence/idea boundary, where carrying a possibly-bad prior chunk
231
+ forward biases the next one more than it helps. This mirrors
232
+ `whisper.cpp`'s `--keep-context off` default: prior-text conditioning
233
+ can self-reinforce errors (hallucinations, decoder repetition loops)
234
+ more readily than it provides useful continuity, so we cap it at
235
+ natural sentence boundaries.
236
+
237
+ Short pauses (mid-sentence punctuation) keep the context; the cut at
238
+ the start of every new recording also clears it.
@@ -65,20 +65,39 @@ flag suppresses only its own side (giving `--prompt ""` still loads
65
65
  | `--type-direct` | In keystroke mode, type the transcription as keystrokes instead of synthesising Ctrl+V. |
66
66
  | `-o, --output-file FILE` | Also append the transcription to this file. |
67
67
 
68
- ## Silence detection (shared)
68
+ ## Silence detection
69
69
 
70
70
  | Flag | Default | Purpose |
71
71
  |----------------------------|---------|------------------------------------------------------------------------|
72
72
  | `--duration SECS` | `120` | Max recording duration in seconds. |
73
- | `--silence-db DB` | `-40` | dBFS volume floor for "this frame is silent". Used by every silence-driven behavior. |
74
73
  | `--silence-duration SECS` | `0.6` | How long silence must persist before triggering a backend's silence behavior (realtime auto-commit, pseudo-streaming cut). |
75
74
 
75
+ ## Voice activity detection
76
+
77
+ scribe ships two silence-detection backends. By default
78
+ (`--vad-mode auto`) it picks **silero-vad** when `onnxruntime` is
79
+ importable (always true on a stock `pip install scribe-cli` since
80
+ `onnxruntime` is a base dependency) and falls back to a plain dB
81
+ volume threshold otherwise. silero is much more robust to ambient
82
+ noise (clicks, fan, traffic) and to soft speech than dB, which drops
83
+ sub-threshold syllables and gets fooled by loud non-speech.
84
+
85
+ The dB and silero parameter groups are independent — the inactive
86
+ mode's knobs are ignored.
87
+
88
+ | Flag | Default | Purpose |
89
+ |-------------------------------|---------|------------------------------------------------------------------------|
90
+ | `--vad-mode {auto,db,silero}` | `auto` | Silence-detection backend. `auto` picks silero when available, dB otherwise. |
91
+ | `--vad-threshold FLOAT` | `0.5` | **[silero only]** Speech-probability threshold in `[0,1]`. Lower = more permissive (catches quiet speech and more noise); higher = stricter. |
92
+ | `--vad-min-silence-ms INT` | `300` | **[silero only]** Minimum sustained low-probability span before speech-end fires, in ms. silero's onset/offset smoothing window. |
93
+ | `--silence-db DB` | `-40` | **[dB only]** dBFS volume floor for "this frame is silent". Ignored when silero is the active mode. |
94
+
76
95
  ## Realtime (`gpt-realtime-whisper`)
77
96
 
78
97
  | Flag | Default | Purpose |
79
98
  |---------------------------------------------------|----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
80
99
  | `--realtime-delay {minimal,low,medium,high,xhigh}` | `medium` | Trade off latency vs accuracy on `gpt-realtime-whisper`. Lower = faster partials but more paste churn in the focused window. |
81
- | `--realtime-gate` / `--no-realtime-gate` | on | Drop silent frames (per `--silence-db`) before sending them over the WebSocket so silent audio isn't billed as input tokens. After `--silence-duration` of silence, also commit mid-session so trailing words flush live. |
100
+ | `--realtime-gate` / `--no-realtime-gate` | on | Drop silent frames (per the active `--vad-mode`) before sending them over the WebSocket so silent audio isn't billed as input tokens. After `--silence-duration` of silence, also commit mid-session so trailing words flush live. |
82
101
 
83
102
  Streaming models (Vosk, `gpt-realtime-whisper`) ignore the batch
84
103
  silence-chunking knobs; they have their own end-of-utterance signal.
@@ -167,3 +167,34 @@ If `eitype` is unavailable, two older workarounds also work:
167
167
  Roadmap for native libei integration (eventual Python bindings,
168
168
  expanded compositor support) is tracked in
169
169
  [docs/roadmap-libei.md](roadmap-libei.md).
170
+
171
+ ## Realtime backend: delta coalescing
172
+
173
+ The `gpt-realtime-whisper` backend emits one transcription delta per
174
+ word/subword at ~30–80 ms intervals — much faster than the
175
+ `pyperclip.copy()` + Ctrl+V cycle can settle on Wayland (≥100 ms,
176
+ because `wl-copy` is asynchronous). Pasting every delta led to
177
+ clipboard races where successive copies overwrote each other before
178
+ Ctrl+V landed, manifesting as dropped and duplicated words
179
+ (*"fait fait le mot mot time time…"*).
180
+
181
+ In **paste mode** (default keystroke output) scribe therefore
182
+ coalesces deltas: incoming tokens accumulate into a small buffer and
183
+ are flushed only when *either* ~400 ms have elapsed since the last
184
+ flush, *or* the buffer ends on sentence-final punctuation
185
+ (`. ! ? \n`). A 200 ms floor between any two flushes prevents
186
+ back-to-back punctuation flushes from racing each other through the
187
+ clipboard.
188
+
189
+ With **`--type-direct`** the coalescing is bypassed entirely — each
190
+ delta goes through the chosen typer as a raw keystroke synchronously
191
+ (uinput / xtest / portal libei), no clipboard involved, no race to
192
+ defeat. The UX is also snappier: tokens appear one at a time rather
193
+ than in ~400 ms-cadenced bursts.
194
+
195
+ macOS and Windows clipboards are synchronous, so the race that
196
+ motivates coalescing is essentially a Wayland artefact; scribe still
197
+ coalesces in paste mode there for consistency, but it's harmless.
198
+ This whole behaviour is realtime-specific — Vosk's per-phrase commits
199
+ already arrive at a sane cadence, and the pseudo-streaming backends
200
+ emit one chunk per silence cut (already coarse enough).
@@ -58,10 +58,15 @@ Options ▶
58
58
  Keyboard backend ▶ eitype / pynput / ydotool / wtype
59
59
  (rows incompatible with this OS are hidden;
60
60
  submenu hidden entirely when ≤ 1 row left)
61
- Advanced ▶ silence duration, silence threshold,
62
- realtime gate, pseudo-streaming
63
- [experimental], streaming window
64
- [experimental], output file
61
+ Advanced ▶ silence duration, VAD mode toggle
62
+ (silero ↔ dB), per-mode VAD knobs
63
+ (silero: speech-probability threshold,
64
+ min silence duration; dB: silence
65
+ threshold — only the active mode's
66
+ knobs are shown), realtime gate,
67
+ pseudo-streaming [experimental],
68
+ streaming window [experimental],
69
+ output file
65
70
  Quit
66
71
  ```
67
72
 
@@ -22,6 +22,11 @@ dependencies = [
22
22
  "termcolor",
23
23
  "platformdirs",
24
24
  "desktop-ai-core>=0.2.0",
25
+ # Runs the bundled silero VAD ONNX model (~2 MB shipped in scribe_data).
26
+ # In base deps so silero is available out of the box — see scribe/audio.py.
27
+ # `faster-whisper` already pulls it transitively, so installing with
28
+ # [whisper] is free; standalone adds ~57 MB which is trivial for an STT tool.
29
+ "onnxruntime",
25
30
  ]
26
31
 
27
32
  classifiers = [
@@ -67,12 +72,18 @@ vosk = ["vosk"]
67
72
  app = ["pystray", "PyGObject"]
68
73
  openai = ["openai>=2.37.0,<3", "soundfile"]
69
74
  groq = ["openai>=2.37.0,<3", "soundfile"]
75
+ # [vad] is now a no-op alias kept for back-compat (`pip install scribe-cli[vad]`
76
+ # was the documented install before onnxruntime moved into base deps).
77
+ vad = []
70
78
  all = ["pynput", "faster-whisper", "pywhispercpp", "openai>=2.37.0,<3", "soundfile", "vosk", "pystray"]
71
79
 
72
80
 
73
81
  [tool.setuptools]
74
82
  packages = [ "scribe", "scribe_data" ]
75
83
 
84
+ [tool.setuptools.package-data]
85
+ scribe_data = ["share/*.png", "templates/*", "silero_vad.onnx", "silero_vad.LICENSE"]
86
+
76
87
  [tool.setuptools_scm]
77
88
  write_to = "scribe/_version.py"
78
89
 
@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
18
18
  commit_id: str | None
19
19
  __commit_id__: str | None
20
20
 
21
- __version__ = version = '0.17.0'
22
- __version_tuple__ = version_tuple = (0, 17, 0)
21
+ __version__ = version = '0.18.0'
22
+ __version_tuple__ = version_tuple = (0, 18, 0)
23
23
 
24
- __commit_id__ = commit_id = 'gbfcd2e228'
24
+ __commit_id__ = commit_id = 'gd48d707c7'
@@ -66,7 +66,10 @@ class DummyTranscriber:
66
66
 
67
67
  whisper_models = ["tiny", "base", "small", "medium", "large-v3", "large-v3-turbo"]
68
68
  whisper_english_models = ["tiny.en", "base.en", "small.en", "medium.en"]
69
- # FUTO ACFT publishes only tiny/base/small (+ .en variants); no medium/large/turbo.
69
+ # FUTO ACFT publishes only tiny/base/small (+ .en variants). Community
70
+ # conversions exist for large/turbo but their large-v3 encoder is
71
+ # incompatible with the audio_ctx shrinkage that's the point of this
72
+ # backend — for large models use the `whisper` backend instead.
70
73
  whisper_futo_models = ["tiny", "base", "small"]
71
74
  whisper_futo_english_models = ["tiny.en", "base.en", "small.en"]
72
75
  whisperapi_models = ["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "gpt-realtime-whisper"]
@@ -169,6 +172,7 @@ def _resolve_prompt_and_words(prompt_text, prompt_file, words, words_file):
169
172
 
170
173
  def _build_backend_kwargs(backend, model, language, samplerate, duration,
171
174
  silence_db, silence_duration,
175
+ vad_mode, vad_threshold, vad_min_silence_ms,
172
176
  download_folder_vosk, download_folder_whisper,
173
177
  download_folder_whisper_futo,
174
178
  realtime_delay, realtime_gate,
@@ -183,6 +187,8 @@ def _build_backend_kwargs(backend, model, language, samplerate, duration,
183
187
  word_blob = " ".join(words)
184
188
  merged_prompt = f"{prompt_text} {word_blob}" if prompt_text else word_blob
185
189
 
190
+ vad_kwargs = dict(vad_mode=vad_mode, vad_threshold=vad_threshold,
191
+ vad_min_silence_ms=vad_min_silence_ms)
186
192
  if backend == "vosk":
187
193
  # Vosk has no soft prompt; only a hard grammar. Silently ignore for now.
188
194
  return dict(model_name=model, language=language, samplerate=samplerate,
@@ -190,25 +196,33 @@ def _build_backend_kwargs(backend, model, language, samplerate, duration,
190
196
  model_kwargs={"download_root": download_folder_vosk})
191
197
  if backend == "whisper":
192
198
  return dict(model_name=model, language=language, samplerate=samplerate,
193
- timeout=duration, silence_duration=silence_duration, silence_thresh=silence_db,
199
+ timeout=duration, silence_duration=silence_duration,
200
+ silence_thresh=silence_db,
194
201
  pseudo_streaming=pseudo_streaming, streaming_window=streaming_window,
195
202
  prompt=prompt_text,
196
203
  hotwords=(" ".join(words) if words else None),
197
- model_kwargs={"download_root": download_folder_whisper})
204
+ model_kwargs={"download_root": download_folder_whisper},
205
+ **vad_kwargs)
198
206
  if backend == "whisper-futo":
199
- # whisper.cpp via pywhispercpp doesn't take prompt/hotwords through the
200
- # same surface; drop them for now. Audio_ctx is computed per-call inside
201
- # the backend from actual audio length (the ACFT speedup).
207
+ # pywhispercpp 1.4.1 exposes `initial_prompt`; the backend folds
208
+ # words+prompt into it (and adds a rolling chunk-tail in
209
+ # pseudo-streaming). No separate hotwords channel here — fold
210
+ # everything into the prompt like the cloud backends do.
202
211
  return dict(model_name=model, language=language, samplerate=samplerate,
203
- timeout=duration, silence_duration=silence_duration, silence_thresh=silence_db,
212
+ timeout=duration, silence_duration=silence_duration,
213
+ silence_thresh=silence_db,
204
214
  pseudo_streaming=pseudo_streaming, streaming_window=streaming_window,
205
- download_folder=download_folder_whisper_futo)
215
+ prompt=merged_prompt,
216
+ download_folder=download_folder_whisper_futo,
217
+ **vad_kwargs)
206
218
  if backend in ("openai", "groq"):
207
219
  from scribe.backends.openai_api import REALTIME_MODELS
208
220
  kwargs = dict(model_name=model, samplerate=samplerate,
209
- timeout=duration, silence_duration=silence_duration, silence_thresh=silence_db,
221
+ timeout=duration, silence_duration=silence_duration,
222
+ silence_thresh=silence_db,
210
223
  pseudo_streaming=pseudo_streaming, streaming_window=streaming_window,
211
- prompt=merged_prompt)
224
+ prompt=merged_prompt,
225
+ **vad_kwargs)
212
226
  if backend == "openai" and model in REALTIME_MODELS:
213
227
  kwargs["realtime_delay"] = realtime_delay
214
228
  kwargs["realtime_gate"] = realtime_gate
@@ -223,7 +237,8 @@ def _build_backend_kwargs(backend, model, language, samplerate, duration,
223
237
 
224
238
  def get_transcriber(model=None, backend=None, dummy=False, interactive=True, language=None,
225
239
  samplerate=None, duration=None,
226
- silence_db=-40.0, silence_duration=0.6,
240
+ silence_db=None, silence_duration=0.6,
241
+ vad_mode="auto", vad_threshold=0.5, vad_min_silence_ms=300,
227
242
  download_folder_vosk=None, download_folder_whisper=None,
228
243
  download_folder_whisper_futo=None,
229
244
  realtime_delay="medium", realtime_gate=True,
@@ -253,9 +268,14 @@ def get_transcriber(model=None, backend=None, dummy=False, interactive=True, lan
253
268
  else:
254
269
  model = _prompt_model_for_backend(backend, language, interactive)
255
270
  print(f"Selected model: {model}")
271
+ # silence_db is the single volume floor used by the dB fallback. Silero
272
+ # mode ignores it. Default -40 dBFS — keeps the gate simple by design.
273
+ if silence_db is None:
274
+ silence_db = -40.0
256
275
  prompt_text, word_list = _resolve_prompt_and_words(prompt, prompt_file, words, words_file)
257
276
  backend_kwargs = _build_backend_kwargs(backend, model, language, samplerate, duration,
258
277
  silence_db, silence_duration,
278
+ vad_mode, vad_threshold, vad_min_silence_ms,
259
279
  download_folder_vosk, download_folder_whisper,
260
280
  download_folder_whisper_futo,
261
281
  realtime_delay, realtime_gate,
@@ -319,14 +339,9 @@ def get_parser():
319
339
  group.add_argument("-o", "--output-file",
320
340
  help="Also append the transcription to this file.")
321
341
 
322
- group = parser.add_argument_group("Silence detection (shared)")
342
+ group = parser.add_argument_group("Silence detection")
323
343
  group.add_argument("--duration", default=120, type=float,
324
344
  help="Max recording duration in seconds (default: %(default)s).")
325
- group.add_argument("--silence-db", default=-40.0, type=float,
326
- help="dBFS volume floor for 'this frame is silent' "
327
- "(default: %(default)s). Used by every silence-driven "
328
- "behavior (realtime gate, realtime auto-commit, "
329
- "pseudo-streaming chunking).")
330
345
  group.add_argument("--silence-duration", default=0.6, type=float,
331
346
  help="Seconds of silence required before triggering a "
332
347
  "backend's silence behavior (default: %(default)s). "
@@ -335,6 +350,31 @@ def get_parser():
335
350
  "batch backends: candidate cut point within the "
336
351
  "streaming window.")
337
352
 
353
+ group = parser.add_argument_group("Voice activity detection")
354
+ group.add_argument("--vad-mode", choices=("auto", "db", "silero"), default="auto",
355
+ help="Silence-detection backend (default: %(default)s). "
356
+ "'auto' picks silero if installed, dB otherwise. "
357
+ "'silero' uses silero-vad — much more robust to "
358
+ "ambient noise (ticks, fan, traffic) AND to soft "
359
+ "speech (the dB gate drops sub-threshold syllables; "
360
+ "silero recognises speech spectrally). "
361
+ "'db' is a volume-threshold fallback used when "
362
+ "onnxruntime is unavailable (see --silence-db). "
363
+ "The dB and silero parameter groups are independent.")
364
+ group.add_argument("--vad-threshold", default=0.5, type=float,
365
+ help="[silero only] Speech-probability threshold in [0,1] "
366
+ "(default: %(default)s). Lower = more permissive (catches "
367
+ "quiet speech but also more noise); higher = stricter.")
368
+ group.add_argument("--vad-min-silence-ms", default=300, type=int,
369
+ help="[silero only] Minimum sustained low-probability span before "
370
+ "speech-end is emitted, in ms (default: %(default)s). "
371
+ "Acts as silero's onset/offset smoothing window.")
372
+ group.add_argument("--silence-db", default=None, type=float,
373
+ help="[dB only] Silence floor in dBFS for the dB-mode "
374
+ "fallback (default: -40). Ignored when "
375
+ "--vad-mode=silero (or =auto and silero is "
376
+ "available).")
377
+
338
378
  group = parser.add_argument_group("Realtime (gpt-realtime-whisper)")
339
379
  group.add_argument("--realtime-delay",
340
380
  choices=("minimal", "low", "medium", "high", "xhigh"),
@@ -344,10 +384,10 @@ def get_parser():
344
384
  "paste churn in the focused window).")
345
385
  group.add_argument("--realtime-gate", action=argparse.BooleanOptionalAction,
346
386
  default=True,
347
- help="Drop silent frames (per --silence-db) before sending "
348
- "them over the WebSocket so silent audio isn't billed "
349
- "as input tokens (default: on; pass --no-realtime-gate "
350
- "to disable).")
387
+ help="Drop silent frames (per the active --vad-mode) before "
388
+ "sending them over the WebSocket so silent audio "
389
+ "isn't billed as input tokens (default: on; pass "
390
+ "--no-realtime-gate to disable).")
351
391
 
352
392
  group = parser.add_argument_group("Pseudo-streaming (experimental)")
353
393
  group.add_argument("--pseudo-streaming", action="store_true",
@@ -399,8 +439,16 @@ def start_recording(micro, session, mode="keystroke", typer="auto",
399
439
  # Query the live transcriber instance — the registered class may dispatch
400
440
  # to a streaming sibling for specific models (e.g. openai →
401
441
  # gpt-realtime-whisper), so a class-level lookup via BACKENDS would lie.
442
+ # Pseudo-streaming also yields chunks (silence-cut batch transcriptions)
443
+ # so the output should treat it the same: live paste/type per chunk.
402
444
  backend_obj = getattr(session, "backend", session)
403
- is_streaming = bool(getattr(backend_obj, "supports_streaming", False)) if not isinstance(backend_obj, str) else False
445
+ if isinstance(backend_obj, str):
446
+ is_streaming = False
447
+ else:
448
+ is_streaming = (
449
+ bool(getattr(backend_obj, "supports_streaming", False))
450
+ or bool(getattr(backend_obj, "pseudo_streaming", False))
451
+ )
404
452
  # Clipboard is written in clipboard mode (the user pastes manually) and in
405
453
  # paste-based keystroke mode (the paste source). type_direct keystroke
406
454
  # mode bypasses the clipboard entirely — we type the chunks/text raw.
@@ -427,6 +475,16 @@ def start_recording(micro, session, mode="keystroke", typer="auto",
427
475
  import pyperclip
428
476
  session.log("The transcription will be copied to clipboard as it becomes available.")
429
477
 
478
+ # Tell streaming backends whether their output is about to hit the
479
+ # clipboard-paste race or a direct-keystroke typer. The realtime
480
+ # backend's per-token deltas only need coalescing in paste mode;
481
+ # type-direct (ydotool/wtype/pynput via uinput/xtest) types each
482
+ # character synchronously and benefits from raw per-delta emission
483
+ # for snappier UX. Set as a plain attribute — backends that don't
484
+ # implement coalescing ignore it.
485
+ if not isinstance(backend_obj, str) and hasattr(backend_obj, "_coalesce_deltas"):
486
+ backend_obj._coalesce_deltas = do_live_paste
487
+
430
488
  fulltext = ""
431
489
 
432
490
  for result in session.start_recording(micro, **greetings):
@@ -497,14 +555,24 @@ def create_app(micro, app_state):
497
555
  image = Image.open(Path(scribe_data.__file__).parent / "share" / "icon.png")
498
556
  image_recording = Image.open(Path(scribe_data.__file__).parent / "share" / "icon_recording.png")
499
557
  image_writing = Image.open(Path(scribe_data.__file__).parent / "share" / "icon_writing.png")
558
+ # Composite (red + writing 'a'): shown while recording AND the silence
559
+ # gate says speech is active. Gives the user a visual confirmation that
560
+ # the audio is actually being captured/sent — not just sitting in
561
+ # detected silence. Plain red = recording but waiting for speech.
562
+ image_recording_active = Image.alpha_composite(
563
+ image_recording.convert("RGBA"), image_writing.convert("RGBA"),
564
+ )
500
565
 
501
566
  if transcriber.backend == "vosk":
502
- # Recording and writing happen at the same time in this backend.
503
- image_recording = Image.alpha_composite(image_recording.convert("RGBA"), image_writing.convert("RGBA"))
567
+ # vosk transcribes while recording — both recording sub-states show
568
+ # the composite (no meaningful "waiting" since vosk streams
569
+ # continuously).
570
+ image_recording = image_recording_active
504
571
 
505
572
  state_images = {
506
573
  None: image,
507
574
  "recording": image_recording,
575
+ "recording_active": image_recording_active,
508
576
  "busy": image_writing,
509
577
  }
510
578
 
@@ -523,7 +591,12 @@ def create_app(micro, app_state):
523
591
  return "busy"
524
592
  s = icon._session
525
593
  if s.recording:
526
- return "recording"
594
+ # session.waiting flips True after silence_duration of detected
595
+ # silence, False on the first non-silent chunk. The composite
596
+ # ("recording_active") tells the user audio is actually being
597
+ # sent to the backend — solves the "is it hearing me?" question
598
+ # without printing partial transcripts to the tray.
599
+ return "recording" if s.waiting else "recording_active"
527
600
  if s.busy:
528
601
  return "busy"
529
602
  return None