scribe-cli 1.0.1__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/PKG-INFO +3 -2
  2. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/README.md +2 -1
  3. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/docs/backends.md +77 -0
  4. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/docs/cli.md +1 -0
  5. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/_version.py +3 -3
  6. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/app.py +103 -22
  7. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/backends/openai_api.py +1 -0
  8. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/backends/whisper.py +5 -1
  9. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/backends/whisper_futo.py +5 -0
  10. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/dialog.py +26 -0
  11. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/menu.py +126 -2
  12. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/models.py +68 -5
  13. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe_cli.egg-info/PKG-INFO +3 -2
  14. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe_cli.egg-info/SOURCES.txt +3 -0
  15. scribe_cli-1.1.0/tests/test_compose_prompt.py +153 -0
  16. scribe_cli-1.1.0/tests/test_debug_logging.py +191 -0
  17. scribe_cli-1.1.0/tests/test_prompt_file_picker.py +165 -0
  18. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/tests/test_pseudo_streaming.py +103 -5
  19. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/.github/FUNDING.yml +0 -0
  20. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/.github/workflows/pypi.yml +0 -0
  21. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/.gitignore +0 -0
  22. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/LICENSE +0 -0
  23. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/docs/app-tray-menu.png +0 -0
  24. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/docs/desktop-install.md +0 -0
  25. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/docs/installation.md +0 -0
  26. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/docs/output.md +0 -0
  27. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/docs/roadmap-libei.md +0 -0
  28. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/docs/tray.md +0 -0
  29. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/icon.xcf +0 -0
  30. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/pyproject.toml +0 -0
  31. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/__init__.py +0 -0
  32. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/audio.py +0 -0
  33. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/backends/__init__.py +0 -0
  34. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/backends/groq.py +0 -0
  35. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/backends/openai_realtime.py +0 -0
  36. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/backends/vosk.py +0 -0
  37. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/install_desktop.py +0 -0
  38. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/keyboard.py +0 -0
  39. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/models.toml +0 -0
  40. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/output.py +0 -0
  41. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/saverecording.py +0 -0
  42. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/session.py +0 -0
  43. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/testpynput.py +0 -0
  44. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/typers/__init__.py +0 -0
  45. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/typers/base.py +0 -0
  46. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/typers/eitype.py +0 -0
  47. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/typers/pynput.py +0 -0
  48. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/typers/wtype.py +0 -0
  49. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/typers/ydotool.py +0 -0
  50. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe/util.py +0 -0
  51. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe_cli.egg-info/dependency_links.txt +0 -0
  52. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe_cli.egg-info/entry_points.txt +0 -0
  53. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe_cli.egg-info/requires.txt +0 -0
  54. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe_cli.egg-info/top_level.txt +0 -0
  55. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe_data/__init__.py +0 -0
  56. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe_data/share/icon.png +0 -0
  57. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe_data/share/icon_recording.png +0 -0
  58. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe_data/share/icon_writing.png +0 -0
  59. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe_data/silero_vad.LICENSE +0 -0
  60. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe_data/silero_vad.onnx +0 -0
  61. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scribe_data/templates/scribe.desktop +0 -0
  62. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scripts/bench_whisper_local.py +0 -0
  63. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/scripts/test_python_versions_install.sh +0 -0
  64. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/setup.cfg +0 -0
  65. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/tests/test_backend_matrix.py +0 -0
  66. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/tests/test_openai_realtime_coalesce.py +0 -0
  67. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/tests/test_output.py +0 -0
  68. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/tests/test_output_file_picker.py +0 -0
  69. {scribe_cli-1.0.1 → scribe_cli-1.1.0}/tests/test_whisper_futo.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: scribe-cli
3
- Version: 1.0.1
3
+ Version: 1.1.0
4
4
  Summary: Speech-to-text CLI and system-tray app for dictating into any focused window. Local (vosk, faster-whisper) or cloud (groq, openai) backends, batch or streaming.
5
5
  Author-email: Mahé Perrette <mahe.perrette@gmail.com>
6
6
  License: MIT License
@@ -217,7 +217,8 @@ I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe`
217
217
  - [Installation & dependencies](docs/installation.md) — PortAudio,
218
218
  extras, Ubuntu / GNOME tray libs.
219
219
  - [Backends in detail](docs/backends.md) — model lists, when to pick
220
- which, the realtime model.
220
+ which, the realtime model, and [Streaming recipes](docs/backends.md#streaming-recipes--two-profiles)
221
+ (Balanced / Patient profiles).
221
222
  - [Output modes & typer backends](docs/output.md) — keystroke vs
222
223
  clipboard, Wayland / `eitype`, `--type-direct`.
223
224
  - [System tray & global hotkeys](docs/tray.md) — menu tree, icon
@@ -113,7 +113,8 @@ I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe`
113
113
  - [Installation & dependencies](docs/installation.md) — PortAudio,
114
114
  extras, Ubuntu / GNOME tray libs.
115
115
  - [Backends in detail](docs/backends.md) — model lists, when to pick
116
- which, the realtime model.
116
+ which, the realtime model, and [Streaming recipes](docs/backends.md#streaming-recipes--two-profiles)
117
+ (Balanced / Patient profiles).
117
118
  - [Output modes & typer backends](docs/output.md) — keystroke vs
118
119
  clipboard, Wayland / `eitype`, `--type-direct`.
119
120
  - [System tray & global hotkeys](docs/tray.md) — menu tree, icon
@@ -164,6 +164,31 @@ the one place a separate "dictionary" really exists — everywhere else
164
164
  `--words` is just a convenience to keep your word list out of the
165
165
  prompt string in the CLI.
166
166
 
167
+ ### Prompt style biases output style
168
+
169
+ Whisper mirrors the *style* of whatever prompt it receives. A
170
+ prompt like `"Tierney Comet"` (a bare wordlist) biases the model
171
+ toward unpunctuated, list-style output — sentences come out without
172
+ periods. A prompt like `"Tierney, Comet."` (or any prose ending in a
173
+ period) biases it toward punctuated output. Two practical
174
+ consequences:
175
+
176
+ - **`--prompt` is yours to control.** If your `prompt.txt` ends with
177
+ a period and looks like a sentence, your transcripts will be
178
+ punctuated. If it ends with a bare keyword, they probably won't.
179
+ This effect is most visible in **Stream mode**, where Whisper sees
180
+ short audio chunks and leans more heavily on the prompt for style
181
+ cues.
182
+ - **`--words` is auto-formatted by scribe.** For backends that fold
183
+ words into the prompt (`whisper-futo`, `openai`, `groq`), scribe
184
+ renders the word list as `"word1, word2, …, wordN."` — comma-
185
+ separated with a single terminal period — so your `words.txt` can
186
+ stay a bare list with no special formatting and the bias still
187
+ comes out punctuated. Stray punctuation on individual entries is
188
+ stripped first, so `words.txt` content is normalised regardless of
189
+ layout. On `whisper` (faster-whisper, local), words go to the
190
+ dedicated `hotwords` channel and bypass the prompt entirely.
191
+
167
192
  Both flags read from the corresponding `*-file` argument when present.
168
193
  Inline + file inputs are combined.
169
194
 
@@ -250,6 +275,21 @@ Once the buffer has grown to at least `--stream-chunk-min` (default
250
275
  (default 10 s) regardless of silence, to cap latency. The session
251
276
  continues until you stop it manually.
252
277
 
278
+ The **first** chunk of a streaming thread uses a different floor:
279
+ `--stream-first-chunk-min` (default 3 s). The bootstrap chunk has no
280
+ prior text to bias Whisper's punctuation/casing, so a longer audio
281
+ window lets the model produce a properly-punctuated transcript whose
282
+ tail then seeds the rolling prompt for every chunk after it.
283
+ Subsequent chunks fall back to `--stream-chunk-min`. The override
284
+ also re-engages right after a context-reset silence (i.e. when a long
285
+ pause cleared the rolling tail — see *Cross-chunk prompt context*
286
+ below). Set `--stream-first-chunk-min` equal to `--stream-chunk-min`
287
+ to disable the override. It's automatically inactive when
288
+ `--stream-context-length 0` (Patient profile), where there is no
289
+ rolling context to bootstrap. Internally clamped to `≤
290
+ --stream-chunk-max` so a misconfigured pair can't deadlock the
291
+ chunker.
292
+
253
293
  ### Does pseudo-streaming change the API cost?
254
294
 
255
295
  For cloud backends, going from one big transcription to N chunked
@@ -321,3 +361,40 @@ arbitrarily long pauses.
321
361
 
322
362
  Short pauses (mid-sentence punctuation) keep the context; the cut at
323
363
  the start of every new recording also clears it.
364
+
365
+ ### Streaming recipes — two profiles
366
+
367
+ The defaults stream phrases in as you talk; the Patient profile waits
368
+ for natural pauses and transcribes one utterance at a time. They make
369
+ opposite trade-offs around the same fundamental tension: short audio
370
+ windows give Whisper less to work with, so cross-chunk *context*
371
+ matters more in Balanced, less in Patient.
372
+
373
+ #### Balanced (default)
374
+
375
+ ```bash
376
+ scribe --stream
377
+ ```
378
+
379
+ Phrases commit every ~10 s or on a 0.6 s pause, with a 200-char
380
+ rolling prompt carrying earlier text forward as context for each new
381
+ chunk. Whisper sees short audio windows in isolation; the rolling
382
+ context partially compensates by telling the model what was just
383
+ said. Good live-feel, small per-chunk accuracy hit vs. Patient.
384
+
385
+ #### Patient (auto-clip)
386
+
387
+ ```bash
388
+ scribe --stream \
389
+ --stream-chunk-min 0.5 \
390
+ --stream-chunk-max 300 \
391
+ --stream-chunk-silence-break 2 \
392
+ --stream-context-length 0
393
+ ```
394
+
395
+ Each utterance is a complete self-contained sentence. scribe waits
396
+ for a 2 s pause, transcribes the whole thing at once, then waits for
397
+ the next one. No rolling context (`context-length 0`) because each
398
+ chunk is already a full utterance — there's nothing short to
399
+ compensate for. Highest per-chunk accuracy; no text appears until
400
+ you finish talking.
@@ -121,6 +121,7 @@ silence-chunking knobs; they have their own end-of-utterance signal.
121
121
  | `--clip` | default | Transcribe the whole recording at end. Same as the tray's **Mode: Clip**. |
122
122
  | `--stream-chunk-max SECS` | `10` | Maximum chunk duration in seconds. Force-cut fires at this threshold when no silence pause has been detected (default `10`). |
123
123
  | `--stream-chunk-min SECS` | `1.5` | Minimum chunk size before a silence-cut is allowed (default `1.5`). Prevents very short clips that cause Whisper hallucinations. |
124
+ | `--stream-first-chunk-min SECS` | `3.0` | Minimum chunk size for the *first* chunk of a streaming thread (default `3.0`). Higher than `--stream-chunk-min` so the bootstrap chunk has enough audio for Whisper to produce a punctuated transcript whose tail seeds the rolling prompt for the rest. Applies on recording start and right after a context-reset silence. Inactive when `--stream-context-length 0`. Clamped to `≤ --stream-chunk-max`. Set equal to `--stream-chunk-min` to disable. |
124
125
  | `--stream-chunk-silence-break SECS` | `0.6` | Silence duration that triggers a chunk cut (default `0.6`). Special value `0` enables Auto mode (best-silence-in-window at force-cut time). |
125
126
  | `--stream-context-reset-silence X` | `3.0` | Multiplier of `--stream-chunk-silence-break` above which the rolling cross-chunk prompt context is discarded (default `3.0`, i.e. 1.8 s at default silence-break). Use `inf` to never reset. |
126
127
  | `--clip-timeout SECS` | `120` | Auto-stop after this many seconds in Clip mode (default `120`). |
@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
18
18
  commit_id: str | None
19
19
  __commit_id__: str | None
20
20
 
21
- __version__ = version = '1.0.1'
22
- __version_tuple__ = version_tuple = (1, 0, 1)
21
+ __version__ = version = '1.1.0'
22
+ __version_tuple__ = version_tuple = (1, 1, 0)
23
23
 
24
- __commit_id__ = commit_id = 'g768aa6b57'
24
+ __commit_id__ = commit_id = 'g9b8b835fd'
@@ -144,6 +144,29 @@ DEFAULT_WORDS_FILE = os.path.join(SCRIBE_CONFIG_DIR, "words.txt")
144
144
  DEFAULT_OUTPUT_FILE = os.path.join(platformdirs.user_desktop_dir(), "scribe-notes.txt")
145
145
 
146
146
 
147
def autodiscover_prompt_files(o):
    """Record the default ``prompt.txt`` / ``words.txt`` paths on namespace ``o``.

    For each of the two kinds, the default path is adopted only when *both*
    the inline value (``o.prompt`` / ``o.words``) and the file value
    (``o.prompt_file`` / ``o.words_file``) are ``None`` and the default path
    exists on disk. Any non-``None`` value -- including an empty string --
    counts as an explicit choice and suppresses the default. Attributes that
    are missing from ``o`` are read as ``None``, so minimal namespaces work.

    The rule is intended to match the fallback used when the prompt and
    words are resolved, so that later readers of ``o`` (for example menu
    labels) see the same file that is actually loaded.

    ``o`` is modified in place and a one-line notice is printed for every
    default that is picked up. Nothing is returned.
    """

    def _all_unset(*names):
        # Lazy on purpose: stops at the first attribute that is set.
        return all(getattr(o, name, None) is None for name in names)

    if _all_unset("prompt", "prompt_file") and os.path.exists(DEFAULT_PROMPT_FILE):
        o.prompt_file = DEFAULT_PROMPT_FILE
        print(f"Using default prompt file: {DEFAULT_PROMPT_FILE}")

    if _all_unset("words", "words_file") and os.path.exists(DEFAULT_WORDS_FILE):
        o.words_file = DEFAULT_WORDS_FILE
        print(f"Using default words file: {DEFAULT_WORDS_FILE}")
168
+
169
+
147
170
  def _resolve_prompt_and_words(prompt_text, prompt_file, words, words_file):
148
171
  """Read --prompt-file / --words-file from disk and merge with the inline
149
172
  flags. Returns ``(prompt_str_or_None, words_list_or_empty)``.
@@ -178,6 +201,43 @@ def _resolve_prompt_and_words(prompt_text, prompt_file, words, words_file):
178
201
  return (prompt_text or None), words
179
202
 
180
203
 
204
# Characters trimmed from both ends of every word-list entry before the
# entries are joined into a prompt: whitespace plus common punctuation.
_WORD_STRIP_CHARS = " \t\r\n.,;:!?"


def _format_words_for_prompt(words):
    """Turn a word list into one comma-separated, period-terminated string.

    ``["Tierney", "Comet"]`` becomes ``"Tierney, Comet."``. Each entry is
    first trimmed of the characters in ``_WORD_STRIP_CHARS`` at both ends;
    entries that end up empty are dropped. ``None``, an empty list, or a
    list whose entries all trim to nothing yields ``""``.
    """
    kept = []
    for raw in words or ():
        trimmed = raw.strip(_WORD_STRIP_CHARS)
        if trimmed:
            kept.append(trimmed)
    return (", ".join(kept) + ".") if kept else ""


def compose_prompt_for_backend(backend, prompt_text, words):
    """Build the ``(prompt, hotwords)`` pair handed to a backend.

    * ``backend == "whisper"``: the prompt is passed through on its own and
      the words are joined with single spaces, unmodified, as the second
      element. Either element is ``None`` when its input is empty.
    * any other backend: the words are rendered by
      ``_format_words_for_prompt`` and appended to the prompt after a single
      space; the second element is always ``None``.

    Empty results are normalised to ``None`` so callers can tell "nothing to
    pass" apart from a real value.
    """
    if backend == "whisper":
        hotwords = " ".join(words) if words else None
        return (prompt_text or None), hotwords

    blob = _format_words_for_prompt(words)
    if not blob:
        combined = prompt_text
    elif not prompt_text:
        combined = blob
    else:
        combined = f"{prompt_text} {blob}"
    return (combined or None), None
239
+
240
+
181
241
  def _build_backend_kwargs(backend, model, language, samplerate, duration,
182
242
  silence_db, stream_chunk_silence_break, realtime_commit_silence,
183
243
  vad_mode, vad_threshold, vad_min_silence_ms,
@@ -185,17 +245,11 @@ def _build_backend_kwargs(backend, model, language, samplerate, duration,
185
245
  download_folder_whisper_futo,
186
246
  realtime_delay, realtime_gate,
187
247
  pseudo_streaming, stream_chunk_max,
188
- stream_chunk_min, stream_context_reset_silence,
248
+ stream_chunk_min, stream_first_chunk_min,
249
+ stream_context_reset_silence,
189
250
  stream_context_length,
190
- prompt_text, words, dry_run=False):
191
- # Cloud whisper variants (OpenAI batch, Groq, OpenAI realtime) take a
192
- # single `prompt` string — fold the word list into it. faster-whisper
193
- # gets the word list separately via `hotwords=` (dedicated biasing
194
- # channel), so we pass it through unmerged.
195
- merged_prompt = prompt_text
196
- if words and backend != "whisper":
197
- word_blob = " ".join(words)
198
- merged_prompt = f"{prompt_text} {word_blob}" if prompt_text else word_blob
251
+ prompt_text, words, dry_run=False, debug=False):
252
+ composed_prompt, composed_hotwords = compose_prompt_for_backend(backend, prompt_text, words)
199
253
 
200
254
  vad_kwargs = dict(vad_mode=vad_mode, vad_threshold=vad_threshold,
201
255
  vad_min_silence_ms=vad_min_silence_ms)
@@ -204,7 +258,7 @@ def _build_backend_kwargs(backend, model, language, samplerate, duration,
204
258
  return dict(model_name=model, language=language, samplerate=samplerate,
205
259
  timeout=None,
206
260
  model_kwargs={"download_root": download_folder_vosk},
207
- dry_run=dry_run)
261
+ dry_run=dry_run, debug=debug)
208
262
  if backend == "whisper":
209
263
  return dict(model_name=model, language=language, samplerate=samplerate,
210
264
  timeout=duration,
@@ -213,12 +267,13 @@ def _build_backend_kwargs(backend, model, language, samplerate, duration,
213
267
  silence_thresh=silence_db,
214
268
  pseudo_streaming=pseudo_streaming, stream_chunk_max=stream_chunk_max,
215
269
  stream_chunk_min=stream_chunk_min,
270
+ stream_first_chunk_min=stream_first_chunk_min,
216
271
  stream_context_reset_silence=stream_context_reset_silence,
217
272
  stream_context_length=stream_context_length,
218
- prompt=prompt_text,
219
- hotwords=(" ".join(words) if words else None),
273
+ prompt=composed_prompt,
274
+ hotwords=composed_hotwords,
220
275
  model_kwargs={"download_root": download_folder_whisper},
221
- dry_run=dry_run,
276
+ dry_run=dry_run, debug=debug,
222
277
  **vad_kwargs)
223
278
  if backend == "whisper-futo":
224
279
  # pywhispercpp 1.4.1 exposes `initial_prompt`; the backend folds
@@ -232,11 +287,12 @@ def _build_backend_kwargs(backend, model, language, samplerate, duration,
232
287
  silence_thresh=silence_db,
233
288
  pseudo_streaming=pseudo_streaming, stream_chunk_max=stream_chunk_max,
234
289
  stream_chunk_min=stream_chunk_min,
290
+ stream_first_chunk_min=stream_first_chunk_min,
235
291
  stream_context_reset_silence=stream_context_reset_silence,
236
292
  stream_context_length=stream_context_length,
237
- prompt=merged_prompt,
293
+ prompt=composed_prompt,
238
294
  download_folder=download_folder_whisper_futo,
239
- dry_run=dry_run,
295
+ dry_run=dry_run, debug=debug,
240
296
  **vad_kwargs)
241
297
  if backend in ("openai", "groq"):
242
298
  from scribe.backends.openai_api import REALTIME_MODELS
@@ -247,10 +303,11 @@ def _build_backend_kwargs(backend, model, language, samplerate, duration,
247
303
  silence_thresh=silence_db,
248
304
  pseudo_streaming=pseudo_streaming, stream_chunk_max=stream_chunk_max,
249
305
  stream_chunk_min=stream_chunk_min,
306
+ stream_first_chunk_min=stream_first_chunk_min,
250
307
  stream_context_reset_silence=stream_context_reset_silence,
251
308
  stream_context_length=stream_context_length,
252
- prompt=merged_prompt,
253
- dry_run=dry_run,
309
+ prompt=composed_prompt,
310
+ dry_run=dry_run, debug=debug,
254
311
  **vad_kwargs)
255
312
  if backend == "openai" and model in REALTIME_MODELS:
256
313
  kwargs["realtime_delay"] = realtime_delay
@@ -272,10 +329,11 @@ def get_transcriber(model=None, backend=None, dummy=False, interactive=True, lan
272
329
  download_folder_whisper_futo=None,
273
330
  realtime_delay="medium", realtime_gate=True,
274
331
  pseudo_streaming=False, stream_chunk_max=10.0,
275
- stream_chunk_min=1.5, stream_context_reset_silence=3.0,
332
+ stream_chunk_min=1.5, stream_first_chunk_min=3.0,
333
+ stream_context_reset_silence=3.0,
276
334
  stream_context_length=200,
277
335
  prompt=None, prompt_file=None, words=None, words_file=None,
278
- dry_run=False, **kwargs):
336
+ dry_run=False, debug=False, **kwargs):
279
337
  if dummy:
280
338
  return DummyTranscriber("whisper", "dummy")
281
339
  if model and not backend:
@@ -313,9 +371,10 @@ def get_transcriber(model=None, backend=None, dummy=False, interactive=True, lan
313
371
  download_folder_whisper_futo,
314
372
  realtime_delay, realtime_gate,
315
373
  pseudo_streaming, stream_chunk_max,
316
- stream_chunk_min, stream_context_reset_silence,
374
+ stream_chunk_min, stream_first_chunk_min,
375
+ stream_context_reset_silence,
317
376
  stream_context_length,
318
- prompt_text, word_list, dry_run=dry_run)
377
+ prompt_text, word_list, dry_run=dry_run, debug=debug)
319
378
  try:
320
379
  return _build_transcriber(backend, **backend_kwargs)
321
380
  except Exception as error:
@@ -378,6 +437,10 @@ def get_parser():
378
437
  "Used by tests/test_backend_matrix.py to exercise the "
379
438
  "recording pipeline without network access or every "
380
439
  "model on disk.")
440
+ group.add_argument("--debug", action="store_true", dest="debug",
441
+ help="Log one line per STT request (model, language, "
442
+ "prompt, audio length) for diagnosing transcription "
443
+ "issues.")
381
444
 
382
445
  group = parser.add_argument_group("Output")
383
446
  group.add_argument("-m", "--mode",
@@ -480,6 +543,16 @@ def get_parser():
480
543
  group.add_argument("--streaming-window", type=lambda s: 2.0 * float(s),
481
544
  dest="stream_chunk_max", default=argparse.SUPPRESS,
482
545
  help=argparse.SUPPRESS)
546
+ group.add_argument("--stream-first-chunk-min", default=3.0, type=float,
547
+ dest="stream_first_chunk_min",
548
+ help="Minimum chunk size in seconds for the *first* chunk "
549
+ "of a streaming thread (default: %(default)s). Higher "
550
+ "than --stream-chunk-min so the bootstrap chunk has "
551
+ "enough audio for Whisper to produce a punctuated "
552
+ "transcript, whose tail then seeds the rolling prompt "
553
+ "for subsequent chunks. Applies on recording start and "
554
+ "right after a context-reset silence. Clamped to "
555
+ "<= --stream-chunk-max.")
483
556
  group.add_argument("--stream-chunk-min", default=1.5, type=float,
484
557
  help="Minimum chunk size in seconds before a silence-cut "
485
558
  "is allowed in --stream mode (default: %(default)s). "
@@ -788,6 +861,14 @@ def main(args=None):
788
861
  parser = get_parser()
789
862
  o = parser.parse_args(args)
790
863
 
864
+ # Surface auto-discovered prompt.txt / words.txt defaults on the
865
+ # namespace before downstream consumers read it. Without this, the
866
+ # tray menu's "Prompt file: …" / "Words file: …" labels show "(none)"
867
+ # even when scribe is actively biasing on a default file — the file
868
+ # was being loaded by `_resolve_prompt_and_words`, but the resolved
869
+ # path stayed local to that function and never propagated to `o`.
870
+ autodiscover_prompt_files(o)
871
+
791
872
  # Reconcile --stream / --clip with the legacy --pseudo-streaming flag.
792
873
  # --stream / --clip win when present; otherwise the existing
793
874
  # --pseudo-streaming boolean drives the default.
@@ -57,6 +57,7 @@ class OpenaiAPITranscriber(WhisperTranscriber):
57
57
  buffer.name = "audio.wav" # Set a filename with a valid extension
58
58
  prompt = self.compose_prompt(self._prompt)
59
59
  extra = {"prompt": prompt} if prompt else {}
60
+ self.debug_log_request(audio_bytes, model=self.model_name, prompt=prompt)
60
61
  try:
61
62
  transcription = self.model.audio.transcriptions.create(
62
63
  model=self.model_name,
@@ -33,12 +33,16 @@ class WhisperTranscriber(AbstractTranscriber):
33
33
  self.update_streaming_context(text)
34
34
  return {"text": text}
35
35
  audio_array = np.frombuffer(audio_bytes, dtype=np.int16).flatten().astype(np.float32) / 32768.0
36
+ composed_prompt = self.compose_prompt(self._prompt)
37
+ self.debug_log_request(audio_bytes, model=self.model_name,
38
+ language=self.language, prompt=composed_prompt,
39
+ hotwords=self._hotwords)
36
40
  segments, _info = self.model.transcribe(
37
41
  audio_array,
38
42
  language=self.language,
39
43
  vad_filter=True,
40
44
  beam_size=1,
41
- initial_prompt=self.compose_prompt(self._prompt),
45
+ initial_prompt=composed_prompt,
42
46
  hotwords=self._hotwords,
43
47
  no_speech_threshold=0.6,
44
48
  log_prob_threshold=-1.0,
@@ -165,6 +165,11 @@ class WhisperFutoTranscriber(AbstractTranscriber):
165
165
  # recording is a single longer utterance.
166
166
  if self.pseudo_streaming:
167
167
  kwargs["max_tokens"] = max(12, int(duration_s * 12))
168
+ self.debug_log_request(audio_bytes, model=self.model_name,
169
+ language=kwargs.get("language"),
170
+ prompt=kwargs.get("initial_prompt"),
171
+ audio_ctx=kwargs.get("audio_ctx"),
172
+ max_tokens=kwargs.get("max_tokens"))
168
173
  segments = self.model.transcribe(audio, **kwargs)
169
174
  text = "".join(s.text for s in segments)
170
175
  if self.pseudo_streaming:
@@ -7,6 +7,32 @@ Kept scribe-local for now so the worktree is self-contained; promotion to
7
7
  from __future__ import annotations
8
8
 
9
9
 
10
def select_file_open(
    title: str = "Choose file",
    initial_dir: str | None = None,
    initial_file: str | None = None,
    filetypes: list[tuple[str, str]] | None = None,
) -> str | None:
    """Show an 'Open' file dialog and return the selected path.

    Args:
        title: Caption of the dialog window.
        initial_dir: Directory the dialog starts in, if given.
        initial_file: File name pre-selected in the dialog, if given.
        filetypes: ``(label, pattern)`` pairs for the type filter. When empty
            or ``None``, "All files" and "Text" filters are offered.

    Returns:
        The chosen path, or ``None`` when the dialog yields an empty result
        (for example because the user cancelled).

    A Tk root is created and withdrawn for each call, and is destroyed again
    in a ``finally`` clause whether or not the dialog call succeeds.
    """
    import tkinter
    from tkinter import filedialog

    if not filetypes:
        filetypes = [("All files", "*.*"), ("Text", "*.txt")]

    hidden_root = tkinter.Tk()
    hidden_root.withdraw()
    try:
        chosen = filedialog.askopenfilename(
            title=title,
            initialdir=initial_dir,
            initialfile=initial_file,
            filetypes=filetypes,
        )
    finally:
        hidden_root.destroy()
    return chosen if chosen else None
34
+
35
+
10
36
  def select_file_save(
11
37
  title: str = "Choose output file",
12
38
  initial_dir: str | None = None,
@@ -514,6 +514,73 @@ class AppState(AbstractFrontendApp):
514
514
  self._refresh_tray_menu()
515
515
  return True
516
516
 
517
def cb_pick_prompt_file_path(self, view, item):
    """Menu callback: choose the prompt file through a file dialog.

    Delegates to ``_pick_prompt_or_words`` with the namespace key
    ``"prompt_file"`` and the dialog caption ``"Choose prompt file"``, and
    returns whatever that helper returns. ``view`` and ``item`` are accepted
    as callback arguments but are not used here.
    """
    attr, caption = "prompt_file", "Choose prompt file"
    return self._pick_prompt_or_words(attr, caption)
524
+
525
def cb_pick_words_file_path(self, view, item):
    """Menu callback: choose the words file through a file dialog.

    Delegates to ``_pick_prompt_or_words`` with the namespace key
    ``"words_file"`` and the dialog caption ``"Choose words file"``, and
    returns whatever that helper returns. ``view`` and ``item`` are accepted
    as callback arguments but are not used here.
    """
    attr, caption = "words_file", "Choose words file"
    return self._pick_prompt_or_words(attr, caption)
528
+
529
def cb_reload_prompt_files(self, view, item):
    """Menu callback: reload the prompt and words without opening a dialog.

    Runs ``_reload_prompt_into_transcriber`` and then ``_refresh_tray_menu``,
    in that order, and always returns ``True``. Useful after editing the
    currently selected files in an external editor. ``view`` and ``item``
    are accepted as callback arguments but are not used here.
    """
    # Reload first so the menu refresh happens after the new state is in place.
    self._reload_prompt_into_transcriber()
    self._refresh_tray_menu()

    return True
537
+
538
def _pick_prompt_or_words(self, attr, title):
    """Common implementation behind the prompt-file and words-file pickers.

    Args:
        attr: Name of the attribute on ``self.o`` (and key in ``self.params``)
            to update, e.g. ``"prompt_file"`` or ``"words_file"``.
        title: Caption for the file dialog.

    The dialog starts at the directory and file name of the current value of
    ``attr``; when there is no current value it starts in
    ``SCRIBE_CONFIG_DIR`` with no file pre-selected. If the dialog returns
    ``None`` nothing is changed. Otherwise the chosen path is stored on
    ``self.o`` and in ``self.params``, the prompt is reloaded into the
    transcriber, and the tray menu is refreshed.

    Returns:
        ``True`` in every case.
    """
    from os.path import basename, dirname

    from scribe.app import SCRIBE_CONFIG_DIR
    from scribe.dialog import select_file_open

    previous = getattr(self.o, attr, None)
    if previous:
        start_dir, start_file = dirname(previous), basename(previous)
    else:
        start_dir, start_file = SCRIBE_CONFIG_DIR, None

    chosen = select_file_open(title=title,
                              initial_dir=start_dir,
                              initial_file=start_file)
    if chosen is not None:
        setattr(self.o, attr, chosen)
        self.params[attr] = chosen
        self._reload_prompt_into_transcriber()
        self._refresh_tray_menu()
    return True
560
+
561
def _reload_prompt_into_transcriber(self):
    """Re-resolve the prompt and words and push them into the transcriber.

    Reads ``prompt``, ``prompt_file``, ``words`` and ``words_file`` from
    ``self.o`` (missing attributes are treated as ``None``), resolves them
    with ``_resolve_prompt_and_words``, composes the result for the
    transcriber's ``backend`` attribute (``None`` if it has none) and stores
    it on the transcriber: ``_prompt`` is always assigned, ``_hotwords`` only
    when the transcriber already has that attribute.

    When ``self.transcriber`` is ``None`` this is a true no-op: the check
    runs before anything is imported or resolved, so no prompt or words
    source is consulted when there is nothing to update.
    """
    t = self.transcriber
    if t is None:
        # Guard first: resolving the prompt and words is pointless (and can
        # fail) when there is no transcriber to receive the result.
        return

    from scribe.app import _resolve_prompt_and_words, compose_prompt_for_backend

    o = self.o
    prompt_text, word_list = _resolve_prompt_and_words(
        getattr(o, "prompt", None),
        getattr(o, "prompt_file", None),
        getattr(o, "words", None),
        getattr(o, "words_file", None),
    )
    backend = getattr(t, "backend", None)
    composed_prompt, composed_hotwords = compose_prompt_for_backend(
        backend, prompt_text, word_list)
    t._prompt = composed_prompt
    # Only transcribers that expose a separate hotwords slot get it updated.
    if hasattr(t, "_hotwords"):
        t._hotwords = composed_hotwords
583
+
517
584
  def cb_set_input_mode(self, type_direct: bool) -> Callable:
518
585
  """Factory: callback for the Keyboard → Input mode radio.
519
586
 
@@ -1175,7 +1242,7 @@ def _stream_advanced_submenu(app_state) -> Menu:
1175
1242
 
1176
1243
  chunk_max_item = Item("max",
1177
1244
  _picker_submenu("Chunk max",
1178
- [3.0, 5.0, 10.0, 20.0, None],
1245
+ [3.0, 5.0, 10.0, 15.0, 20.0, None],
1179
1246
  get_chunk_max, _chunk_max_label,
1180
1247
  app_state.cb_set_stream_chunk_max),
1181
1248
  help="Chunk max",
@@ -1202,7 +1269,7 @@ def _stream_advanced_submenu(app_state) -> Menu:
1202
1269
 
1203
1270
  context_reset_item = Item("reset",
1204
1271
  _picker_submenu("Context reset",
1205
- [1.0, 1.5, 2.0, 3.0, 5.0, 10.0, math.inf],
1272
+ [1.0, 1.5, 2.0, 3.0, 5.0, 8.0, 10.0, math.inf],
1206
1273
  get_context_reset, _context_reset_label,
1207
1274
  app_state.cb_set_stream_context_reset_silence),
1208
1275
  help="Context reset",
@@ -1312,6 +1379,51 @@ def _keyboard_advanced_submenu(app_state) -> Menu:
1312
1379
  return Menu(items, name="Keyboard (advanced)")
1313
1380
 
1314
1381
 
1382
+ def _prompt_status_label(o) -> str:
1383
+ """Short status string for the Options → Prompt label: which of the two
1384
+ files is loaded, e.g. ``"prompt+words"``, ``"words only"``, ``"none"``.
1385
+ Keeps the menu line compact while still telling the user whether *any*
1386
+ bias is in effect; basenames live inside the submenu items."""
1387
+ has_prompt = bool(getattr(o, "prompt_file", None) or getattr(o, "prompt", None))
1388
+ has_words = bool(getattr(o, "words_file", None) or getattr(o, "words", None))
1389
+ if has_prompt and has_words:
1390
+ return "prompt + words"
1391
+ if has_prompt:
1392
+ return "prompt only"
1393
+ if has_words:
1394
+ return "words only"
1395
+ return "none"
1396
+
1397
+
1398
def _prompt_files_submenu(app_state) -> Menu:
    """Build the Options → Prompt submenu.

    The menu named ``"Prompt"`` holds three entries:

    * a prompt-file entry bound to ``app_state.cb_pick_prompt_file_path``,
    * a words-file entry bound to ``app_state.cb_pick_words_file_path``,
    * a "Reload now" entry bound to ``app_state.cb_reload_prompt_files``.

    The two file entries get a dynamic label of the form
    ``"<Kind> file: <basename>"``, or ``"<Kind> file: (none)"`` when the
    corresponding option on ``app_state.o`` is unset, so the currently
    selected file is visible without opening a dialog.
    """
    from os.path import basename

    def _file_label(option_name, title):
        # Read the option lazily so the label tracks later changes.
        selected = getattr(app_state.o, option_name, None)
        if selected:
            shown = basename(selected)
        else:
            shown = '(none)'
        return f"{title} file: {shown}"

    entries = []

    pick_prompt = Item("prompt", app_state.cb_pick_prompt_file_path,
                       help="Prompt file (free-text style hint)")
    pick_prompt.label_fn = lambda: _file_label("prompt_file", "Prompt")
    entries.append(pick_prompt)

    pick_words = Item("words", app_state.cb_pick_words_file_path,
                      help="Words file (vocabulary bias)")
    pick_words.label_fn = lambda: _file_label("words_file", "Words")
    entries.append(pick_words)

    # Lets the user pick up on-disk edits to the selected files without
    # going through the picker again.
    entries.append(Item("Reload now", app_state.cb_reload_prompt_files,
                        help="Re-read the selected files from disk"))

    return Menu(entries, name="Prompt")
1425
+
1426
+
1315
1427
  def _toggle_options_menu(app_state) -> Menu:
1316
1428
  is_terminal = _is_terminal_frontend(app_state)
1317
1429
 
@@ -1336,6 +1448,17 @@ def _toggle_options_menu(app_state) -> Menu:
1336
1448
  output_item = Item("output", _output_mode_submenu(app_state), help="Output")
1337
1449
  output_item.label_fn = lambda: f"Output: {_output_mode_label(app_state.o)}"
1338
1450
 
1451
+ # Prompt sub-menu: file pickers for the prompt file + words file so the
1452
+ # user can see which file is biasing the model and swap it without
1453
+ # restarting. Visible only for backends that actually consume the
1454
+ # prompt; vosk silently ignores it and openai-realtime rejects it
1455
+ # server-side.
1456
+ prompt_item = Item("prompt", _prompt_files_submenu(app_state), help="Prompt")
1457
+ prompt_item.label_fn = lambda: (
1458
+ f"Prompt: "
1459
+ f"{_prompt_status_label(app_state.o)}"
1460
+ )
1461
+
1339
1462
  # Keyboard sub-menu: only meaningful when Output=Keyboard. Holds the
1340
1463
  # Input mode (keystroke vs paste) and the Backend typer radio.
1341
1464
  keyboard_item = Item("kbd", _keyboard_advanced_submenu(app_state),
@@ -1351,6 +1474,7 @@ def _toggle_options_menu(app_state) -> Menu:
1351
1474
  stream_advanced_item,
1352
1475
  clip_timeout_item,
1353
1476
  output_item,
1477
+ prompt_item,
1354
1478
  keyboard_item,
1355
1479
  Item("x", app_state.cb_toggle_frontend, help="Toggle tray app mode",
1356
1480
  checked=lambda item: getattr(app_state.o, "frontend", None) == "tray",