scribe-cli 0.17.1__tar.gz → 0.18.0__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/PKG-INFO +3 -1
  2. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/docs/backends.md +2 -2
  3. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/docs/cli.md +22 -3
  4. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/docs/tray.md +9 -4
  5. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/pyproject.toml +11 -0
  6. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/_version.py +3 -3
  7. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/app.py +67 -35
  8. scribe_cli-0.18.0/scribe/audio.py +379 -0
  9. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/backends/openai_realtime.py +29 -6
  10. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/menu.py +60 -1
  11. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/models.py +80 -27
  12. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe_cli.egg-info/PKG-INFO +3 -1
  13. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe_cli.egg-info/SOURCES.txt +2 -0
  14. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe_cli.egg-info/requires.txt +3 -0
  15. scribe_cli-0.18.0/scribe_data/silero_vad.LICENSE +21 -0
  16. scribe_cli-0.18.0/scribe_data/silero_vad.onnx +0 -0
  17. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/tests/test_pseudo_streaming.py +159 -34
  18. scribe_cli-0.17.1/scribe/audio.py +0 -76
  19. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/.github/FUNDING.yml +0 -0
  20. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/.github/workflows/pypi.yml +0 -0
  21. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/.gitignore +0 -0
  22. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/LICENSE +0 -0
  23. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/README.md +0 -0
  24. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/docs/app-tray-menu.png +0 -0
  25. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/docs/desktop-install.md +0 -0
  26. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/docs/installation.md +0 -0
  27. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/docs/keyboard.md +0 -0
  28. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/docs/roadmap-libei.md +0 -0
  29. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/icon.xcf +0 -0
  30. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/__init__.py +0 -0
  31. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/backends/__init__.py +0 -0
  32. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/backends/groq.py +0 -0
  33. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/backends/openai_api.py +0 -0
  34. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/backends/vosk.py +0 -0
  35. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/backends/whisper.py +0 -0
  36. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/backends/whisper_futo.py +0 -0
  37. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/install_desktop.py +0 -0
  38. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/keyboard.py +0 -0
  39. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/models.toml +0 -0
  40. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/saverecording.py +0 -0
  41. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/session.py +0 -0
  42. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/testpynput.py +0 -0
  43. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/typers/__init__.py +0 -0
  44. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/typers/base.py +0 -0
  45. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/typers/eitype.py +0 -0
  46. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/typers/pynput.py +0 -0
  47. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/typers/wtype.py +0 -0
  48. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/typers/ydotool.py +0 -0
  49. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe/util.py +0 -0
  50. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe_cli.egg-info/dependency_links.txt +0 -0
  51. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe_cli.egg-info/entry_points.txt +0 -0
  52. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe_cli.egg-info/top_level.txt +0 -0
  53. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe_data/__init__.py +0 -0
  54. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe_data/share/icon.png +0 -0
  55. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe_data/share/icon_recording.png +0 -0
  56. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe_data/share/icon_writing.png +0 -0
  57. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scribe_data/templates/scribe.desktop +0 -0
  58. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scripts/bench_whisper_local.py +0 -0
  59. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/scripts/test_python_versions_install.sh +0 -0
  60. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/setup.cfg +0 -0
  61. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/tests/test_openai_realtime_coalesce.py +0 -0
  62. {scribe_cli-0.17.1 → scribe_cli-0.18.0}/tests/test_whisper_futo.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: scribe-cli
3
- Version: 0.17.1
3
+ Version: 0.18.0
4
4
  Summary: Speech-to-text CLI and system-tray app for dictating into any focused window. Local (vosk, faster-whisper) or cloud (groq, openai) backends, batch or streaming.
5
5
  Author-email: Mahé Perrette <mahe.perrette@gmail.com>
6
6
  License: MIT License
@@ -52,6 +52,7 @@ Requires-Dist: unidecode
52
52
  Requires-Dist: termcolor
53
53
  Requires-Dist: platformdirs
54
54
  Requires-Dist: desktop-ai-core>=0.2.0
55
+ Requires-Dist: onnxruntime
55
56
  Provides-Extra: keyboard
56
57
  Requires-Dist: pynput; extra == "keyboard"
57
58
  Provides-Extra: whisper
@@ -69,6 +70,7 @@ Requires-Dist: soundfile; extra == "openai"
69
70
  Provides-Extra: groq
70
71
  Requires-Dist: openai<3,>=2.37.0; extra == "groq"
71
72
  Requires-Dist: soundfile; extra == "groq"
73
+ Provides-Extra: vad
72
74
  Provides-Extra: all
73
75
  Requires-Dist: pynput; extra == "all"
74
76
  Requires-Dist: faster-whisper; extra == "all"
@@ -225,8 +225,8 @@ Whisper's prompt window is capped at ~224 tokens; 200 chars of French
225
225
  sits well under that and leaves room for your static prompt + words
226
226
  list.
227
227
 
228
- The rolling tail is **dropped** whenever the pause that triggered the
229
- chunk cut exceeded 1.5 seconds — a long pause is treated as a new
228
+ The rolling tail is **dropped** when the silence between two
229
+ utterances exceeds 1.5 seconds — a long pause is treated as a new
230
230
  sentence/idea boundary, where carrying a possibly-bad prior chunk
231
231
  forward biases the next one more than it helps. This mirrors
232
232
  `whisper.cpp`'s `--keep-context off` default: prior-text conditioning
@@ -65,20 +65,39 @@ flag suppresses only its own side (giving `--prompt ""` still loads
65
65
  | `--type-direct` | In keystroke mode, type the transcription as keystrokes instead of synthesising Ctrl+V. |
66
66
  | `-o, --output-file FILE` | Also append the transcription to this file. |
67
67
 
68
- ## Silence detection (shared)
68
+ ## Silence detection
69
69
 
70
70
  | Flag | Default | Purpose |
71
71
  |----------------------------|---------|------------------------------------------------------------------------|
72
72
  | `--duration SECS` | `120` | Max recording duration in seconds. |
73
- | `--silence-db DB` | `-40` | dBFS volume floor for "this frame is silent". Used by every silence-driven behavior. |
74
73
  | `--silence-duration SECS` | `0.6` | How long silence must persist before triggering a backend's silence behavior (realtime auto-commit, pseudo-streaming cut). |
75
74
 
75
+ ## Voice activity detection
76
+
77
+ scribe ships two silence-detection backends. By default
78
+ (`--vad-mode auto`) it picks **silero-vad** when `onnxruntime` is
79
+ importable (always true on a stock `pip install scribe-cli` since
80
+ `onnxruntime` is a base dependency) and falls back to a plain dB
81
+ volume threshold otherwise. silero is much more robust to ambient
82
+ noise (clicks, fan, traffic) and to soft speech than dB, which drops
83
+ sub-threshold syllables and gets fooled by loud non-speech.
84
+
85
+ The dB and silero parameter groups are independent — the inactive
86
+ mode's knobs are ignored.
87
+
88
+ | Flag | Default | Purpose |
89
+ |-------------------------------|---------|------------------------------------------------------------------------|
90
+ | `--vad-mode {auto,db,silero}` | `auto` | Silence-detection backend. `auto` picks silero when available, dB otherwise. |
91
+ | `--vad-threshold FLOAT` | `0.5` | **[silero only]** Speech-probability threshold in `[0,1]`. Lower = more permissive (catches quiet speech and more noise); higher = stricter. |
92
+ | `--vad-min-silence-ms INT` | `300` | **[silero only]** Minimum sustained low-probability span before speech-end fires, in ms. silero's onset/offset smoothing window. |
93
+ | `--silence-db DB` | `-40` | **[dB only]** dBFS volume floor for "this frame is silent". Ignored when silero is the active mode. |
94
+
76
95
  ## Realtime (`gpt-realtime-whisper`)
77
96
 
78
97
  | Flag | Default | Purpose |
79
98
  |---------------------------------------------------|----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
80
99
  | `--realtime-delay {minimal,low,medium,high,xhigh}` | `medium` | Trade off latency vs accuracy on `gpt-realtime-whisper`. Lower = faster partials but more paste churn in the focused window. |
81
- | `--realtime-gate` / `--no-realtime-gate` | on | Drop silent frames (per `--silence-db`) before sending them over the WebSocket so silent audio isn't billed as input tokens. After `--silence-duration` of silence, also commit mid-session so trailing words flush live. |
100
+ | `--realtime-gate` / `--no-realtime-gate` | on | Drop silent frames (per the active `--vad-mode`) before sending them over the WebSocket so silent audio isn't billed as input tokens. After `--silence-duration` of silence, also commit mid-session so trailing words flush live. |
82
101
 
83
102
  Streaming models (Vosk, `gpt-realtime-whisper`) ignore the batch
84
103
  silence-chunking knobs; they have their own end-of-utterance signal.
@@ -58,10 +58,15 @@ Options ▶
58
58
  Keyboard backend ▶ eitype / pynput / ydotool / wtype
59
59
  (rows incompatible with this OS are hidden;
60
60
  submenu hidden entirely when ≤ 1 row left)
61
- Advanced ▶ silence duration, silence threshold,
62
- realtime gate, pseudo-streaming
63
- [experimental], streaming window
64
- [experimental], output file
61
+ Advanced ▶ silence duration, VAD mode toggle
62
+ (silero ↔ dB), per-mode VAD knobs
63
+ (silero: speech-probability threshold,
64
+ min silence duration; dB: silence
65
+ threshold — only the active mode's
66
+ knobs are shown), realtime gate,
67
+ pseudo-streaming [experimental],
68
+ streaming window [experimental],
69
+ output file
65
70
  Quit
66
71
  ```
67
72
 
@@ -22,6 +22,11 @@ dependencies = [
22
22
  "termcolor",
23
23
  "platformdirs",
24
24
  "desktop-ai-core>=0.2.0",
25
+ # Runs the bundled silero VAD ONNX model (~2 MB shipped in scribe_data).
26
+ # In base deps so silero is available out of the box — see scribe/audio.py.
27
+ # `faster-whisper` already pulls it transitively, so installing with
28
+ # [whisper] is free; standalone adds ~57 MB which is trivial for an STT tool.
29
+ "onnxruntime",
25
30
  ]
26
31
 
27
32
  classifiers = [
@@ -67,12 +72,18 @@ vosk = ["vosk"]
67
72
  app = ["pystray", "PyGObject"]
68
73
  openai = ["openai>=2.37.0,<3", "soundfile"]
69
74
  groq = ["openai>=2.37.0,<3", "soundfile"]
75
+ # [vad] is now a no-op alias kept for back-compat (`pip install scribe-cli[vad]`
76
+ # was the documented install before onnxruntime moved into base deps).
77
+ vad = []
70
78
  all = ["pynput", "faster-whisper", "pywhispercpp", "openai>=2.37.0,<3", "soundfile", "vosk", "pystray"]
71
79
 
72
80
 
73
81
  [tool.setuptools]
74
82
  packages = [ "scribe", "scribe_data" ]
75
83
 
84
+ [tool.setuptools.package-data]
85
+ scribe_data = ["share/*.png", "templates/*", "silero_vad.onnx", "silero_vad.LICENSE"]
86
+
76
87
  [tool.setuptools_scm]
77
88
  write_to = "scribe/_version.py"
78
89
 
@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
18
18
  commit_id: str | None
19
19
  __commit_id__: str | None
20
20
 
21
- __version__ = version = '0.17.1'
22
- __version_tuple__ = version_tuple = (0, 17, 1)
21
+ __version__ = version = '0.18.0'
22
+ __version_tuple__ = version_tuple = (0, 18, 0)
23
23
 
24
- __commit_id__ = commit_id = 'g67d90f5e4'
24
+ __commit_id__ = commit_id = 'gd48d707c7'
@@ -171,7 +171,8 @@ def _resolve_prompt_and_words(prompt_text, prompt_file, words, words_file):
171
171
 
172
172
 
173
173
  def _build_backend_kwargs(backend, model, language, samplerate, duration,
174
- silence_db, silence_onset_db, silence_duration,
174
+ silence_db, silence_duration,
175
+ vad_mode, vad_threshold, vad_min_silence_ms,
175
176
  download_folder_vosk, download_folder_whisper,
176
177
  download_folder_whisper_futo,
177
178
  realtime_delay, realtime_gate,
@@ -186,6 +187,8 @@ def _build_backend_kwargs(backend, model, language, samplerate, duration,
186
187
  word_blob = " ".join(words)
187
188
  merged_prompt = f"{prompt_text} {word_blob}" if prompt_text else word_blob
188
189
 
190
+ vad_kwargs = dict(vad_mode=vad_mode, vad_threshold=vad_threshold,
191
+ vad_min_silence_ms=vad_min_silence_ms)
189
192
  if backend == "vosk":
190
193
  # Vosk has no soft prompt; only a hard grammar. Silently ignore for now.
191
194
  return dict(model_name=model, language=language, samplerate=samplerate,
@@ -194,11 +197,12 @@ def _build_backend_kwargs(backend, model, language, samplerate, duration,
194
197
  if backend == "whisper":
195
198
  return dict(model_name=model, language=language, samplerate=samplerate,
196
199
  timeout=duration, silence_duration=silence_duration,
197
- silence_thresh=silence_db, silence_thresh_onset=silence_onset_db,
200
+ silence_thresh=silence_db,
198
201
  pseudo_streaming=pseudo_streaming, streaming_window=streaming_window,
199
202
  prompt=prompt_text,
200
203
  hotwords=(" ".join(words) if words else None),
201
- model_kwargs={"download_root": download_folder_whisper})
204
+ model_kwargs={"download_root": download_folder_whisper},
205
+ **vad_kwargs)
202
206
  if backend == "whisper-futo":
203
207
  # pywhispercpp 1.4.1 exposes `initial_prompt`; the backend folds
204
208
  # words+prompt into it (and adds a rolling chunk-tail in
@@ -206,17 +210,19 @@ def _build_backend_kwargs(backend, model, language, samplerate, duration,
206
210
  # everything into the prompt like the cloud backends do.
207
211
  return dict(model_name=model, language=language, samplerate=samplerate,
208
212
  timeout=duration, silence_duration=silence_duration,
209
- silence_thresh=silence_db, silence_thresh_onset=silence_onset_db,
213
+ silence_thresh=silence_db,
210
214
  pseudo_streaming=pseudo_streaming, streaming_window=streaming_window,
211
215
  prompt=merged_prompt,
212
- download_folder=download_folder_whisper_futo)
216
+ download_folder=download_folder_whisper_futo,
217
+ **vad_kwargs)
213
218
  if backend in ("openai", "groq"):
214
219
  from scribe.backends.openai_api import REALTIME_MODELS
215
220
  kwargs = dict(model_name=model, samplerate=samplerate,
216
221
  timeout=duration, silence_duration=silence_duration,
217
- silence_thresh=silence_db, silence_thresh_onset=silence_onset_db,
222
+ silence_thresh=silence_db,
218
223
  pseudo_streaming=pseudo_streaming, streaming_window=streaming_window,
219
- prompt=merged_prompt)
224
+ prompt=merged_prompt,
225
+ **vad_kwargs)
220
226
  if backend == "openai" and model in REALTIME_MODELS:
221
227
  kwargs["realtime_delay"] = realtime_delay
222
228
  kwargs["realtime_gate"] = realtime_gate
@@ -231,7 +237,8 @@ def _build_backend_kwargs(backend, model, language, samplerate, duration,
231
237
 
232
238
  def get_transcriber(model=None, backend=None, dummy=False, interactive=True, language=None,
233
239
  samplerate=None, duration=None,
234
- silence_db=None, silence_onset_db=None, silence_duration=0.6,
240
+ silence_db=None, silence_duration=0.6,
241
+ vad_mode="auto", vad_threshold=0.5, vad_min_silence_ms=300,
235
242
  download_folder_vosk=None, download_folder_whisper=None,
236
243
  download_folder_whisper_futo=None,
237
244
  realtime_delay="medium", realtime_gate=True,
@@ -261,17 +268,14 @@ def get_transcriber(model=None, backend=None, dummy=False, interactive=True, lan
261
268
  else:
262
269
  model = _prompt_model_for_backend(backend, language, interactive)
263
270
  print(f"Selected model: {model}")
264
- # silence_db is the LOW threshold (in-speech pause detection) default
265
- # -40 in all modes. silence_onset_db is the HIGH threshold (speech-start
266
- # gate) used only in pseudo-streaming via hysteresis; -25 keeps ambient
267
- # noise (keyboard, breathing) from triggering a chunk.
271
+ # silence_db is the single volume floor used by the dB fallback. Silero
272
+ # mode ignores it. Defaults to -40 dBFS; a single threshold keeps the gate simple by design.
268
273
  if silence_db is None:
269
274
  silence_db = -40.0
270
- if silence_onset_db is None:
271
- silence_onset_db = -25.0 if pseudo_streaming else silence_db
272
275
  prompt_text, word_list = _resolve_prompt_and_words(prompt, prompt_file, words, words_file)
273
276
  backend_kwargs = _build_backend_kwargs(backend, model, language, samplerate, duration,
274
- silence_db, silence_onset_db, silence_duration,
277
+ silence_db, silence_duration,
278
+ vad_mode, vad_threshold, vad_min_silence_ms,
275
279
  download_folder_vosk, download_folder_whisper,
276
280
  download_folder_whisper_futo,
277
281
  realtime_delay, realtime_gate,
@@ -335,21 +339,9 @@ def get_parser():
335
339
  group.add_argument("-o", "--output-file",
336
340
  help="Also append the transcription to this file.")
337
341
 
338
- group = parser.add_argument_group("Silence detection (shared)")
342
+ group = parser.add_argument_group("Silence detection")
339
343
  group.add_argument("--duration", default=120, type=float,
340
344
  help="Max recording duration in seconds (default: %(default)s).")
341
- group.add_argument("--silence-db", default=None, type=float,
342
- help="LOW silence floor in dBFS — applied while we're "
343
- "already inside an utterance, so soft trailing "
344
- "syllables aren't cut. Default: -40. Used by every "
345
- "silence-driven behavior (pseudo-streaming pause "
346
- "detection, realtime gate, realtime auto-commit).")
347
- group.add_argument("--silence-onset-db", default=None, type=float,
348
- help="HIGH silence floor in dBFS — applied before we've "
349
- "started capturing speech (audio buffer empty). "
350
- "Stricter so ambient noise (keyboard, breathing) "
351
- "doesn't trigger a chunk. Default: -25 in "
352
- "pseudo-streaming, same as --silence-db otherwise.")
353
345
  group.add_argument("--silence-duration", default=0.6, type=float,
354
346
  help="Seconds of silence required before triggering a "
355
347
  "backend's silence behavior (default: %(default)s). "
@@ -358,6 +350,31 @@ def get_parser():
358
350
  "batch backends: candidate cut point within the "
359
351
  "streaming window.")
360
352
 
353
+ group = parser.add_argument_group("Voice activity detection")
354
+ group.add_argument("--vad-mode", choices=("auto", "db", "silero"), default="auto",
355
+ help="Silence-detection backend (default: %(default)s). "
356
+ "'auto' picks silero if installed, dB otherwise. "
357
+ "'silero' uses silero-vad — much more robust to "
358
+ "ambient noise (ticks, fan, traffic) AND to soft "
359
+ "speech (the dB gate drops sub-threshold syllables; "
360
+ "silero recognises speech spectrally). "
361
+ "'db' is a volume-threshold fallback used when "
362
+ "onnxruntime is unavailable (see --silence-db). "
363
+ "The dB and silero parameter groups are independent.")
364
+ group.add_argument("--vad-threshold", default=0.5, type=float,
365
+ help="[silero only] Speech-probability threshold in [0,1] "
366
+ "(default: %(default)s). Lower = more permissive (catches "
367
+ "quiet speech but also more noise); higher = stricter.")
368
+ group.add_argument("--vad-min-silence-ms", default=300, type=int,
369
+ help="[silero only] Minimum sustained low-probability span before "
370
+ "speech-end is emitted, in ms (default: %(default)s). "
371
+ "Acts as silero's onset/offset smoothing window.")
372
+ group.add_argument("--silence-db", default=None, type=float,
373
+ help="[dB only] Silence floor in dBFS for the dB-mode "
374
+ "fallback (default: -40). Ignored when "
375
+ "--vad-mode=silero (or =auto and silero is "
376
+ "available).")
377
+
361
378
  group = parser.add_argument_group("Realtime (gpt-realtime-whisper)")
362
379
  group.add_argument("--realtime-delay",
363
380
  choices=("minimal", "low", "medium", "high", "xhigh"),
@@ -367,10 +384,10 @@ def get_parser():
367
384
  "paste churn in the focused window).")
368
385
  group.add_argument("--realtime-gate", action=argparse.BooleanOptionalAction,
369
386
  default=True,
370
- help="Drop silent frames (per --silence-db) before sending "
371
- "them over the WebSocket so silent audio isn't billed "
372
- "as input tokens (default: on; pass --no-realtime-gate "
373
- "to disable).")
387
+ help="Drop silent frames (per the active --vad-mode) before "
388
+ "sending them over the WebSocket so silent audio "
389
+ "isn't billed as input tokens (default: on; pass "
390
+ "--no-realtime-gate to disable).")
374
391
 
375
392
  group = parser.add_argument_group("Pseudo-streaming (experimental)")
376
393
  group.add_argument("--pseudo-streaming", action="store_true",
@@ -538,14 +555,24 @@ def create_app(micro, app_state):
538
555
  image = Image.open(Path(scribe_data.__file__).parent / "share" / "icon.png")
539
556
  image_recording = Image.open(Path(scribe_data.__file__).parent / "share" / "icon_recording.png")
540
557
  image_writing = Image.open(Path(scribe_data.__file__).parent / "share" / "icon_writing.png")
558
+ # Composite (red + writing 'a'): shown while recording AND while the silence
559
+ # gate says speech is active. Gives the user a visual confirmation that
560
+ # the audio is actually being captured/sent — not just sitting in
561
+ # detected silence. Plain red = recording but waiting for speech.
562
+ image_recording_active = Image.alpha_composite(
563
+ image_recording.convert("RGBA"), image_writing.convert("RGBA"),
564
+ )
541
565
 
542
566
  if transcriber.backend == "vosk":
543
- # Recording and writing happen at the same time in this backend.
544
- image_recording = Image.alpha_composite(image_recording.convert("RGBA"), image_writing.convert("RGBA"))
567
+ # vosk transcribes while recording, so both recording sub-states show
568
+ # the composite (no meaningful "waiting" since vosk streams
569
+ # continuously).
570
+ image_recording = image_recording_active
545
571
 
546
572
  state_images = {
547
573
  None: image,
548
574
  "recording": image_recording,
575
+ "recording_active": image_recording_active,
549
576
  "busy": image_writing,
550
577
  }
551
578
 
@@ -564,7 +591,12 @@ def create_app(micro, app_state):
564
591
  return "busy"
565
592
  s = icon._session
566
593
  if s.recording:
567
- return "recording"
594
+ # session.waiting flips True after silence_duration of detected
595
+ # silence, False on the first non-silent chunk. The composite
596
+ # ("recording_active") tells the user audio is actually being
597
+ # sent to the backend — answers the "is it hearing me?" question
598
+ # without printing partial transcripts to the tray.
599
+ return "recording" if s.waiting else "recording_active"
568
600
  if s.busy:
569
601
  return "busy"
570
602
  return None