scribe-cli 0.17.0__tar.gz → 0.17.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/PKG-INFO +1 -1
  2. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/docs/backends.md +35 -1
  3. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/docs/keyboard.md +31 -0
  4. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/_version.py +3 -3
  5. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/app.py +57 -16
  6. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/backends/openai_api.py +3 -1
  7. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/backends/openai_realtime.py +79 -3
  8. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/backends/whisper.py +2 -1
  9. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/backends/whisper_futo.py +81 -20
  10. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/models.py +109 -11
  11. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/session.py +10 -1
  12. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe_cli.egg-info/PKG-INFO +1 -1
  13. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe_cli.egg-info/SOURCES.txt +4 -1
  14. scribe_cli-0.17.1/tests/test_openai_realtime_coalesce.py +221 -0
  15. scribe_cli-0.17.1/tests/test_pseudo_streaming.py +288 -0
  16. scribe_cli-0.17.1/tests/test_whisper_futo.py +245 -0
  17. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/.github/FUNDING.yml +0 -0
  18. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/.github/workflows/pypi.yml +0 -0
  19. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/.gitignore +0 -0
  20. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/LICENSE +0 -0
  21. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/README.md +0 -0
  22. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/docs/app-tray-menu.png +0 -0
  23. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/docs/cli.md +0 -0
  24. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/docs/desktop-install.md +0 -0
  25. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/docs/installation.md +0 -0
  26. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/docs/roadmap-libei.md +0 -0
  27. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/docs/tray.md +0 -0
  28. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/icon.xcf +0 -0
  29. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/pyproject.toml +0 -0
  30. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/__init__.py +0 -0
  31. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/audio.py +0 -0
  32. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/backends/__init__.py +0 -0
  33. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/backends/groq.py +0 -0
  34. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/backends/vosk.py +0 -0
  35. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/install_desktop.py +0 -0
  36. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/keyboard.py +0 -0
  37. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/menu.py +0 -0
  38. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/models.toml +0 -0
  39. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/saverecording.py +0 -0
  40. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/testpynput.py +0 -0
  41. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/typers/__init__.py +0 -0
  42. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/typers/base.py +0 -0
  43. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/typers/eitype.py +0 -0
  44. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/typers/pynput.py +0 -0
  45. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/typers/wtype.py +0 -0
  46. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/typers/ydotool.py +0 -0
  47. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/util.py +0 -0
  48. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe_cli.egg-info/dependency_links.txt +0 -0
  49. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe_cli.egg-info/entry_points.txt +0 -0
  50. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe_cli.egg-info/requires.txt +0 -0
  51. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe_cli.egg-info/top_level.txt +0 -0
  52. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe_data/__init__.py +0 -0
  53. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe_data/share/icon.png +0 -0
  54. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe_data/share/icon_recording.png +0 -0
  55. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe_data/share/icon_writing.png +0 -0
  56. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe_data/templates/scribe.desktop +0 -0
  57. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scripts/bench_whisper_local.py +0 -0
  58. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scripts/test_python_versions_install.sh +0 -0
  59. {scribe_cli-0.17.0 → scribe_cli-0.17.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: scribe-cli
3
- Version: 0.17.0
3
+ Version: 0.17.1
4
4
  Summary: Speech-to-text CLI and system-tray app for dictating into any focused window. Local (vosk, faster-whisper) or cloud (groq, openai) backends, batch or streaming.
5
5
  Author-email: Mahé Perrette <mahe.perrette@gmail.com>
6
6
  License: MIT License
@@ -149,9 +149,10 @@ differently:
149
149
  | Backend | `--prompt` | `--words` |
150
150
  |--------------------------------------|-------------------------------|--------------------------------------------------------|
151
151
  | `whisper` (faster-whisper, local) | passed as `initial_prompt=` | passed as `hotwords=` — a **dedicated biasing channel** separate from the prompt |
152
+ | `whisper-futo` (pywhispercpp, local) | passed as `initial_prompt=` | joined onto the prompt string (no separate hotwords channel here) |
152
153
  | `openai` batch (`gpt-4o*-transcribe`) | passed as `prompt=` | joined onto the prompt string |
153
154
  | `groq` (`whisper-large-v3-turbo`) | passed as `prompt=` | joined onto the prompt string |
154
- | `openai` realtime (`gpt-realtime-whisper`) | included in the session config as `transcription.prompt` | joined onto the prompt string |
155
+ | `openai` realtime (`gpt-realtime-whisper`) | *silently ignored* — the model rejects the prompt parameter server-side (HTTP 400 *"The 'prompt' parameter is not supported for this model."*). The kwarg stays accepted for plumbing compatibility but never reaches the API. | same — joined into the (ignored) prompt |
155
156
  | `vosk` | *ignored* (no soft prompt) | *ignored* (Vosk only supports a hard `grammar` allowlist; not yet exposed) |
156
157
 
157
158
  The whisper-family APIs cap the prompt around ~224 tokens; longer
@@ -202,3 +203,36 @@ more than latency.
202
203
 
203
204
  This is experimental and off by default. The tray menu surfaces the
204
205
  same toggle under Options ▶ Advanced ▶ Pseudo-streaming.
206
+
207
+ ### Cross-chunk prompt context
208
+
209
+ In pseudo-streaming mode scribe automatically augments each chunk's
210
+ prompt with the trailing ~200 characters of the *previous* chunk's
211
+ transcription. This rolling tail is concatenated onto whatever static
212
+ `--prompt` / `--words` you configured and reaches the backend through
213
+ the same channel as the static prompt (the vocabulary biasing table
214
+ above). The motivation is cross-chunk continuity:
215
+
216
+ - **Capitalization drift** — without context, a chunk that starts
217
+ right after a period might come back lowercased.
218
+ - **Article gender (FR/IT/ES/…)** — `"la nouveau"` → `"le nouveau"`
219
+ once the prior chunk has established the noun.
220
+ - **Language lock** — `whisper.cpp` auto-detects language per call;
221
+ feeding the previous chunk's tokens keeps the language stable
222
+ across cuts.
223
+
224
+ Whisper's prompt window is capped at ~224 tokens; 200 chars of French
225
+ sits well under that and leaves room for your static prompt + words
226
+ list.
227
+
228
+ The rolling tail is **dropped** whenever the pause that triggered the
229
+ chunk cut exceeded 1.5 seconds — a long pause is treated as a new
230
+ sentence/idea boundary, where carrying a possibly-bad prior chunk
231
+ forward biases the next one more than it helps. This mirrors
232
+ `whisper.cpp`'s `--keep-context off` default: prior-text conditioning
233
+ can self-reinforce errors (hallucinations, decoder repetition loops)
234
+ more readily than it provides useful continuity, so we cap it at
235
+ natural sentence boundaries.
236
+
237
+ Short pauses (mid-sentence punctuation) keep the context; the cut at
238
+ the start of every new recording also clears it.
@@ -167,3 +167,34 @@ If `eitype` is unavailable, two older workarounds also work:
167
167
  Roadmap for native libei integration (eventual Python bindings,
168
168
  expanded compositor support) is tracked in
169
169
  [docs/roadmap-libei.md](roadmap-libei.md).
170
+
171
+ ## Realtime backend: delta coalescing
172
+
173
+ The `gpt-realtime-whisper` backend emits one transcription delta per
174
+ word/subword at ~30–80 ms intervals — much faster than the
175
+ `pyperclip.copy()` + Ctrl+V cycle can settle on Wayland (≥100 ms,
176
+ because `wl-copy` is asynchronous). Pasting every delta led to
177
+ clipboard races where successive copies overwrote each other before
178
+ Ctrl+V landed, manifesting as dropped and duplicated words
179
+ (*"fait fait le mot mot time time…"*).
180
+
181
+ In **paste mode** (default keystroke output) scribe therefore
182
+ coalesces deltas: incoming tokens accumulate into a small buffer and
183
+ are flushed only when *either* ~400 ms have elapsed since the last
184
+ flush, *or* the buffer ends on sentence-final punctuation
185
+ (`. ! ? \n`). A 200 ms floor between any two flushes prevents
186
+ back-to-back punctuation flushes from racing each other through the
187
+ clipboard.
188
+
189
+ With **`--type-direct`** the coalescing is bypassed entirely — each
190
+ delta goes through the chosen typer as a raw keystroke synchronously
191
+ (uinput / xtest / portal libei), no clipboard involved, no race to
192
+ defeat. The UX is also snappier: tokens appear one at a time rather
193
+ than in ~400 ms-cadenced bursts.
194
+
195
+ macOS and Windows clipboards are synchronous, so the race that
196
+ motivates coalescing is essentially a Wayland artefact; scribe still
197
+ coalesces in paste mode there for consistency, but it's harmless.
198
+ This whole behaviour is realtime-specific — Vosk's per-phrase commits
199
+ already arrive at a sane cadence, and the pseudo-streaming backends
200
+ emit one chunk per silence cut (already coarse enough).
@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
18
18
  commit_id: str | None
19
19
  __commit_id__: str | None
20
20
 
21
- __version__ = version = '0.17.0'
22
- __version_tuple__ = version_tuple = (0, 17, 0)
21
+ __version__ = version = '0.17.1'
22
+ __version_tuple__ = version_tuple = (0, 17, 1)
23
23
 
24
- __commit_id__ = commit_id = 'gbfcd2e228'
24
+ __commit_id__ = commit_id = 'g67d90f5e4'
@@ -66,7 +66,10 @@ class DummyTranscriber:
66
66
 
67
67
  whisper_models = ["tiny", "base", "small", "medium", "large-v3", "large-v3-turbo"]
68
68
  whisper_english_models = ["tiny.en", "base.en", "small.en", "medium.en"]
69
- # FUTO ACFT publishes only tiny/base/small (+ .en variants); no medium/large/turbo.
69
+ # FUTO ACFT publishes only tiny/base/small (+ .en variants). Community
70
+ # conversions exist for large/turbo but their large-v3 encoder is
71
+ # incompatible with the audio_ctx shrinkage that's the point of this
72
+ # backend — for large models use the `whisper` backend instead.
70
73
  whisper_futo_models = ["tiny", "base", "small"]
71
74
  whisper_futo_english_models = ["tiny.en", "base.en", "small.en"]
72
75
  whisperapi_models = ["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "gpt-realtime-whisper"]
@@ -168,7 +171,7 @@ def _resolve_prompt_and_words(prompt_text, prompt_file, words, words_file):
168
171
 
169
172
 
170
173
  def _build_backend_kwargs(backend, model, language, samplerate, duration,
171
- silence_db, silence_duration,
174
+ silence_db, silence_onset_db, silence_duration,
172
175
  download_folder_vosk, download_folder_whisper,
173
176
  download_folder_whisper_futo,
174
177
  realtime_delay, realtime_gate,
@@ -190,23 +193,28 @@ def _build_backend_kwargs(backend, model, language, samplerate, duration,
190
193
  model_kwargs={"download_root": download_folder_vosk})
191
194
  if backend == "whisper":
192
195
  return dict(model_name=model, language=language, samplerate=samplerate,
193
- timeout=duration, silence_duration=silence_duration, silence_thresh=silence_db,
196
+ timeout=duration, silence_duration=silence_duration,
197
+ silence_thresh=silence_db, silence_thresh_onset=silence_onset_db,
194
198
  pseudo_streaming=pseudo_streaming, streaming_window=streaming_window,
195
199
  prompt=prompt_text,
196
200
  hotwords=(" ".join(words) if words else None),
197
201
  model_kwargs={"download_root": download_folder_whisper})
198
202
  if backend == "whisper-futo":
199
- # whisper.cpp via pywhispercpp doesn't take prompt/hotwords through the
200
- # same surface; drop them for now. Audio_ctx is computed per-call inside
201
- # the backend from actual audio length (the ACFT speedup).
203
+ # pywhispercpp 1.4.1 exposes `initial_prompt`; the backend folds
204
+ # words+prompt into it (and adds a rolling chunk-tail in
205
+ # pseudo-streaming). No separate hotwords channel here — fold
206
+ # everything into the prompt like the cloud backends do.
202
207
  return dict(model_name=model, language=language, samplerate=samplerate,
203
- timeout=duration, silence_duration=silence_duration, silence_thresh=silence_db,
208
+ timeout=duration, silence_duration=silence_duration,
209
+ silence_thresh=silence_db, silence_thresh_onset=silence_onset_db,
204
210
  pseudo_streaming=pseudo_streaming, streaming_window=streaming_window,
211
+ prompt=merged_prompt,
205
212
  download_folder=download_folder_whisper_futo)
206
213
  if backend in ("openai", "groq"):
207
214
  from scribe.backends.openai_api import REALTIME_MODELS
208
215
  kwargs = dict(model_name=model, samplerate=samplerate,
209
- timeout=duration, silence_duration=silence_duration, silence_thresh=silence_db,
216
+ timeout=duration, silence_duration=silence_duration,
217
+ silence_thresh=silence_db, silence_thresh_onset=silence_onset_db,
210
218
  pseudo_streaming=pseudo_streaming, streaming_window=streaming_window,
211
219
  prompt=merged_prompt)
212
220
  if backend == "openai" and model in REALTIME_MODELS:
@@ -223,7 +231,7 @@ def _build_backend_kwargs(backend, model, language, samplerate, duration,
223
231
 
224
232
  def get_transcriber(model=None, backend=None, dummy=False, interactive=True, language=None,
225
233
  samplerate=None, duration=None,
226
- silence_db=-40.0, silence_duration=0.6,
234
+ silence_db=None, silence_onset_db=None, silence_duration=0.6,
227
235
  download_folder_vosk=None, download_folder_whisper=None,
228
236
  download_folder_whisper_futo=None,
229
237
  realtime_delay="medium", realtime_gate=True,
@@ -253,9 +261,17 @@ def get_transcriber(model=None, backend=None, dummy=False, interactive=True, lan
253
261
  else:
254
262
  model = _prompt_model_for_backend(backend, language, interactive)
255
263
  print(f"Selected model: {model}")
264
+ # silence_db is the LOW threshold (in-speech pause detection) — default
265
+ # -40 in all modes. silence_onset_db is the HIGH threshold (speech-start
266
+ # gate) used only in pseudo-streaming via hysteresis; -25 keeps ambient
267
+ # noise (keyboard, breathing) from triggering a chunk.
268
+ if silence_db is None:
269
+ silence_db = -40.0
270
+ if silence_onset_db is None:
271
+ silence_onset_db = -25.0 if pseudo_streaming else silence_db
256
272
  prompt_text, word_list = _resolve_prompt_and_words(prompt, prompt_file, words, words_file)
257
273
  backend_kwargs = _build_backend_kwargs(backend, model, language, samplerate, duration,
258
- silence_db, silence_duration,
274
+ silence_db, silence_onset_db, silence_duration,
259
275
  download_folder_vosk, download_folder_whisper,
260
276
  download_folder_whisper_futo,
261
277
  realtime_delay, realtime_gate,
@@ -322,11 +338,18 @@ def get_parser():
322
338
  group = parser.add_argument_group("Silence detection (shared)")
323
339
  group.add_argument("--duration", default=120, type=float,
324
340
  help="Max recording duration in seconds (default: %(default)s).")
325
- group.add_argument("--silence-db", default=-40.0, type=float,
326
- help="dBFS volume floor for 'this frame is silent' "
327
- "(default: %(default)s). Used by every silence-driven "
328
- "behavior (realtime gate, realtime auto-commit, "
329
- "pseudo-streaming chunking).")
341
+ group.add_argument("--silence-db", default=None, type=float,
342
+ help="LOW silence floor in dBFS applied while we're "
343
+ "already inside an utterance, so soft trailing "
344
+ "syllables aren't cut. Default: -40. Used by every "
345
+ "silence-driven behavior (pseudo-streaming pause "
346
+ "detection, realtime gate, realtime auto-commit).")
347
+ group.add_argument("--silence-onset-db", default=None, type=float,
348
+ help="HIGH silence floor in dBFS — applied before we've "
349
+ "started capturing speech (audio buffer empty). "
350
+ "Stricter so ambient noise (keyboard, breathing) "
351
+ "doesn't trigger a chunk. Default: -25 in "
352
+ "pseudo-streaming, same as --silence-db otherwise.")
330
353
  group.add_argument("--silence-duration", default=0.6, type=float,
331
354
  help="Seconds of silence required before triggering a "
332
355
  "backend's silence behavior (default: %(default)s). "
@@ -399,8 +422,16 @@ def start_recording(micro, session, mode="keystroke", typer="auto",
399
422
  # Query the live transcriber instance — the registered class may dispatch
400
423
  # to a streaming sibling for specific models (e.g. openai →
401
424
  # gpt-realtime-whisper), so a class-level lookup via BACKENDS would lie.
425
+ # Pseudo-streaming also yields chunks (silence-cut batch transcriptions)
426
+ # so the output should treat it the same: live paste/type per chunk.
402
427
  backend_obj = getattr(session, "backend", session)
403
- is_streaming = bool(getattr(backend_obj, "supports_streaming", False)) if not isinstance(backend_obj, str) else False
428
+ if isinstance(backend_obj, str):
429
+ is_streaming = False
430
+ else:
431
+ is_streaming = (
432
+ bool(getattr(backend_obj, "supports_streaming", False))
433
+ or bool(getattr(backend_obj, "pseudo_streaming", False))
434
+ )
404
435
  # Clipboard is written in clipboard mode (the user pastes manually) and in
405
436
  # paste-based keystroke mode (the paste source). type_direct keystroke
406
437
  # mode bypasses the clipboard entirely — we type the chunks/text raw.
@@ -427,6 +458,16 @@ def start_recording(micro, session, mode="keystroke", typer="auto",
427
458
  import pyperclip
428
459
  session.log("The transcription will be copied to clipboard as it becomes available.")
429
460
 
461
+ # Tell streaming backends whether their output is about to hit the
462
+ # clipboard-paste race or a direct-keystroke typer. The realtime
463
+ # backend's per-token deltas only need coalescing in paste mode;
464
+ # type-direct (ydotool/wtype/pynput via uinput/xtest) types each
465
+ # character synchronously and benefits from raw per-delta emission
466
+ # for snappier UX. Set as a plain attribute — backends that don't
467
+ # implement coalescing ignore it.
468
+ if not isinstance(backend_obj, str) and hasattr(backend_obj, "_coalesce_deltas"):
469
+ backend_obj._coalesce_deltas = do_live_paste
470
+
430
471
  fulltext = ""
431
472
 
432
473
  for result in session.start_recording(micro, **greetings):
@@ -47,7 +47,8 @@ class OpenaiAPITranscriber(WhisperTranscriber):
47
47
  sf.write(buffer, audio_data, self.samplerate, format='WAV')
48
48
  buffer.seek(0)
49
49
  buffer.name = "audio.wav" # Set a filename with a valid extension
50
- extra = {"prompt": self._prompt} if self._prompt else {}
50
+ prompt = self.compose_prompt(self._prompt)
51
+ extra = {"prompt": prompt} if prompt else {}
51
52
  try:
52
53
  transcription = self.model.audio.transcriptions.create(
53
54
  model=self.model_name,
@@ -58,6 +59,7 @@ class OpenaiAPITranscriber(WhisperTranscriber):
58
59
  title, message = format_openai_error(e)
59
60
  self.notify_error(title, message)
60
61
  return {"text": ""}
62
+ self.update_streaming_context(transcription.text)
61
63
  return {"text": transcription.text}
62
64
 
63
65
 
@@ -2,6 +2,7 @@ import base64
2
2
  import logging
3
3
  import queue
4
4
  import threading
5
+ import time
5
6
  from typing import ClassVar
6
7
 
7
8
  import numpy as np
@@ -34,6 +35,29 @@ class OpenaiRealtimeTranscriber(AbstractStreamingTranscriber):
34
35
  # click) followed by silence would otherwise trigger an error popup.
35
36
  _SERVER_COMMIT_MIN_MS = 100.0
36
37
 
38
+ # Coalesce token-level deltas before yielding to the app layer.
39
+ # gpt-realtime-whisper emits one delta per word/subword (~30-80 ms
40
+ # apart). The live-paste path (paste_via_clipboard) needs ~100 ms
41
+ # per call to defeat Wayland's wl-copy async race — pasting every
42
+ # delta caused token drops + duplications because the clipboard got
43
+ # overwritten before Ctrl+V landed.
44
+ #
45
+ # _INTERVAL: regular cadence for in-progress sentences (no punct
46
+ # yet). Long enough that most short sentences finish before it fires
47
+ # — that way the natural commit point is the period, not a mid-
48
+ # sentence timeout (which would split a phrase across two pastes and
49
+ # race them through the clipboard).
50
+ #
51
+ # _MIN_INTERVAL: floor between successive flushes regardless of
52
+ # trigger. Even when the buffer ends on a period, we hold the flush
53
+ # until the floor has elapsed since the prior one. Two punctuation
54
+ # flushes <200ms apart was the residual failure mode that mangled
55
+ # rapid repeated phrases ("Tout rentre dans l'ordre. Tout rentre
56
+ # dans l'ordre.") even after the initial coalescing landed.
57
+ _DELTA_FLUSH_INTERVAL_S = 0.4
58
+ _DELTA_FLUSH_MIN_INTERVAL_S = 0.2
59
+ _DELTA_FLUSH_PUNCT = frozenset(".!?\n")
60
+
37
61
  def __init__(self, model_name="gpt-realtime-whisper", language=None, model_kwargs={},
38
62
  model=None, realtime_delay="medium",
39
63
  realtime_gate=True, prompt=None, **kwargs):
@@ -66,6 +90,16 @@ class OpenaiRealtimeTranscriber(AbstractStreamingTranscriber):
66
90
  self._has_uncommitted_audio = False
67
91
  self._silent_samples = 0
68
92
  self._uncommitted_ms = 0.0
93
+ # Delta coalescing state (see _DELTA_FLUSH_INTERVAL_S). The flag
94
+ # below is set by the app layer at recording time: True when
95
+ # live-paste-via-clipboard is the output (clipboard race exists
96
+ # → coalesce); False in type-direct mode (uinput/xtest tap each
97
+ # character — no clipboard, no race, no need to batch). Default
98
+ # True so backends instantiated outside the scribe app loop
99
+ # (smoke tests, library use) keep the safer batched behaviour.
100
+ self._coalesce_deltas = True
101
+ self._delta_buffer = ""
102
+ self._last_delta_flush = 0.0
69
103
 
70
104
  def _session_config(self) -> dict:
71
105
  # gpt-realtime-whisper does NOT support server VAD (rejected as
@@ -73,11 +107,17 @@ class OpenaiRealtimeTranscriber(AbstractStreamingTranscriber):
73
107
  # The streaming knob for this model is `delay` — "minimal" emits
74
108
  # partials as early as possible; higher values trade latency for
75
109
  # accuracy. Surfaced as the --realtime-delay CLI flag.
110
+ #
111
+ # NOTE: this model also rejects `prompt` server-side
112
+ # (400 "The 'prompt' parameter is not supported for this model.",
113
+ # param `session.audio.input.transcription.prompt`). The shared
114
+ # backend kwarg `prompt` is silently ignored here — the
115
+ # pseudo-streaming chunk-tail context machinery doesn't apply
116
+ # either (this backend is true streaming, not chunked). If a
117
+ # future REALTIME_MODELS entry supports it, gate by model name.
76
118
  transcription: dict = {"model": self.model_name, "delay": self._realtime_delay}
77
119
  if self.language:
78
120
  transcription["language"] = self.language
79
- if self._prompt:
80
- transcription["prompt"] = self._prompt
81
121
  audio_input: dict = {
82
122
  "format": {"type": "audio/pcm", "rate": self._GA_SAMPLE_RATE},
83
123
  "transcription": transcription,
@@ -100,6 +140,8 @@ class OpenaiRealtimeTranscriber(AbstractStreamingTranscriber):
100
140
  self._has_uncommitted_audio = False
101
141
  self._silent_samples = 0
102
142
  self._uncommitted_ms = 0.0
143
+ self._delta_buffer = ""
144
+ self._last_delta_flush = time.time()
103
145
 
104
146
  self._client = openai.OpenAI()
105
147
 
@@ -250,6 +292,9 @@ class OpenaiRealtimeTranscriber(AbstractStreamingTranscriber):
250
292
  else:
251
293
  self._silent_samples = 0
252
294
 
295
+ # Drain queue. Errors surface immediately in both modes. Text
296
+ # deltas either get buffered for coalesced flush (paste mode)
297
+ # or yielded raw (type-direct mode — see _coalesce_deltas).
253
298
  while True:
254
299
  try:
255
300
  item = self._event_queue.get_nowait()
@@ -259,7 +304,31 @@ class OpenaiRealtimeTranscriber(AbstractStreamingTranscriber):
259
304
  title, message = item["_error"]
260
305
  self.notify_error(title, message)
261
306
  continue
262
- yield item
307
+ text = item.get("text", "")
308
+ if not text:
309
+ continue
310
+ if self._coalesce_deltas:
311
+ self._delta_buffer += text
312
+ else:
313
+ yield {"text": text}
314
+
315
+ # Flush the coalesced buffer when both:
316
+ # (a) the floor _DELTA_FLUSH_MIN_INTERVAL_S has elapsed since
317
+ # the last flush — no two pastes within the clipboard race
318
+ # window, regardless of trigger; and
319
+ # (b) either the regular interval elapsed, or the buffer ends
320
+ # on sentence-final punctuation (natural commit boundary).
321
+ # In raw-delta mode the buffer stays empty so this is a no-op.
322
+ if self._delta_buffer:
323
+ now = time.time()
324
+ elapsed = now - self._last_delta_flush
325
+ ends_on_punct = self._delta_buffer[-1] in self._DELTA_FLUSH_PUNCT
326
+ if elapsed >= self._DELTA_FLUSH_MIN_INTERVAL_S and (
327
+ elapsed >= self._DELTA_FLUSH_INTERVAL_S or ends_on_punct
328
+ ):
329
+ yield {"text": self._delta_buffer}
330
+ self._delta_buffer = ""
331
+ self._last_delta_flush = now
263
332
 
264
333
  def finalize(self):
265
334
  if self._connection is None or self._closed:
@@ -288,7 +357,14 @@ class OpenaiRealtimeTranscriber(AbstractStreamingTranscriber):
288
357
  # transcript was already streamed live as `text` deltas during
289
358
  # recording, so we only return the tail.
290
359
  self._completed_event.wait(timeout=self._FINALIZE_TIMEOUT)
360
+ # Start with whatever sat in the coalescing buffer (deltas seen
361
+ # by feed_audio but not yet flushed by the interval/punct check),
362
+ # then append any tail deltas the recv_loop pushed in after the
363
+ # recording loop exited.
291
364
  tail_parts: list[str] = []
365
+ if self._delta_buffer:
366
+ tail_parts.append(self._delta_buffer)
367
+ self._delta_buffer = ""
292
368
  while True:
293
369
  try:
294
370
  item = self._event_queue.get_nowait()
@@ -29,7 +29,7 @@ class WhisperTranscriber(AbstractTranscriber):
29
29
  language=self.language,
30
30
  vad_filter=True,
31
31
  beam_size=1,
32
- initial_prompt=self._prompt,
32
+ initial_prompt=self.compose_prompt(self._prompt),
33
33
  hotwords=self._hotwords,
34
34
  no_speech_threshold=0.6,
35
35
  log_prob_threshold=-1.0,
@@ -37,6 +37,7 @@ class WhisperTranscriber(AbstractTranscriber):
37
37
  temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
38
38
  )
39
39
  text = "".join(segment.text for segment in segments)
40
+ self.update_streaming_context(text)
40
41
  return {"text": text}
41
42
 
42
43
  def finalize(self):
@@ -15,6 +15,7 @@ from __future__ import annotations
15
15
 
16
16
  import math
17
17
  import os
18
+ import re
18
19
  from pathlib import Path
19
20
  from typing import ClassVar
20
21
 
@@ -23,9 +24,36 @@ import numpy as np
23
24
  from scribe.models import AbstractTranscriber
24
25
 
25
26
 
27
+ # Whisper hallucinates sound-effect annotations like "(music)", "[Applause]"
28
+ # on near-silence, and occasionally emits IPA-modifier-letter garbage
29
+ # (U+02B0–02FF) or U+FFFD when the audio is unintelligible. Two filters:
30
+ # - WHOLE_RE: chunk is one such artifact end-to-end → drop.
31
+ # - INLINE_RE: artifact embedded mid-text ("Bonjour (typing) ça va") →
32
+ # substitute out. Restricted to ASCII letters (any case), spaces and
33
+ # hyphens inside the brackets so legitimate French parentheticals
34
+ # (accented text) are preserved. pywhispercpp 1.4.1 advertises
35
+ # `suppress_non_speech_tokens` in its schema but the C struct doesn't
36
+ # expose it, so this lives at the text layer.
37
+ _NON_SPEECH_WHOLE_RE = re.compile(r"^\s*[(\[*][^()\[\]*]{1,60}[)\]*]\s*[.!?]?\s*$")
38
+ # Allow any case ([Breathing], [KNOCKING], [Door opens], (footsteps)) and
39
+ # consume any trailing punctuation so adjacent text doesn't end up with
40
+ # stray commas. Substitute with a space (not "") so adjacent words don't
41
+ # collide when the noise token has no surrounding whitespace
42
+ # ("[door][door]" or "word(typing)word"); a follow-up \s+ collapse cleans
43
+ # up any doubles.
44
+ _NON_SPEECH_INLINE_RE = re.compile(r"[(\[][A-Za-z][A-Za-z\s\-]{0,30}[)\]][.,!?:;]?")
45
+ _WHITESPACE_RE = re.compile(r"\s+")
46
+ _PHONETIC_RE = re.compile(r"[ʰ-˿�]")
47
+
48
+
26
49
  _FUTO_BASE_URL = "https://voiceinput.futo.org/VoiceInput/"
27
50
 
28
- # Map user-visible model name → ggml filename on FUTO's CDN.
51
+ # Map user-visible model name → ggml filename on FUTO's CDN. FUTO publishes
52
+ # only tiny/base/small (+ .en variants). The DeadBranches community q8_0 of
53
+ # large-v3-turbo was tried briefly but its large-v3 encoder is incompatible
54
+ # with the audio_ctx-shrinkage that's the whole point of this backend
55
+ # (Progress: 1612% / CJK garbage on short clips), so we stick to the FUTO
56
+ # set where ACFT works as advertised.
29
57
  _FUTO_MODELS: dict[str, str] = {
30
58
  "tiny": "tiny_acft_q8_0.bin",
31
59
  "tiny.en": "tiny_en_acft_q8_0.bin",
@@ -90,7 +118,7 @@ class WhisperFutoTranscriber(AbstractTranscriber):
90
118
  is_local: ClassVar[bool] = True
91
119
 
92
120
  def __init__(self, model_name, language=None, model=None, model_kwargs={},
93
- download_folder=None, **kwargs):
121
+ download_folder=None, prompt=None, **kwargs):
94
122
  if model is None:
95
123
  from pywhispercpp.model import Model
96
124
  path = _model_path(model_name, download_folder)
@@ -101,29 +129,62 @@ class WhisperFutoTranscriber(AbstractTranscriber):
101
129
  init_kwargs = {k: v for k, v in model_kwargs.items() if k != "n_threads"}
102
130
  model = Model(str(path), n_threads=n_threads, **init_kwargs)
103
131
  super().__init__(model, model_name, language, model_kwargs=model_kwargs, **kwargs)
132
+ self._prompt = prompt
104
133
 
105
134
  def transcribe_audio(self, audio_bytes):
106
135
  self.log("\nTranscribing")
107
136
  audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
108
- # ACFT shortcut: shrink the encoder window to the actual audio length.
109
- # Works for both explicit language and auto-detect (whisper.cpp runs its
110
- # language ID head on the same shrunk encoder output; FUTO's L2-distill
111
- # training preserves enough representational quality at short contexts).
112
- # pywhispercpp wants "" (not "auto") to request auto-detection.
113
137
  duration_s = len(audio) / self.samplerate
114
- audio_ctx = min(_AUDIO_CTX_MAX,
115
- max(_AUDIO_CTX_MIN,
116
- math.ceil(duration_s * _AUDIO_CTX_PER_SECOND)))
117
- segments = self.model.transcribe(
118
- audio,
119
- language=self.language or "",
120
- audio_ctx=audio_ctx,
121
- no_speech_thold=0.6,
122
- entropy_thold=2.4,
123
- logprob_thold=-1.0,
124
- temperature_inc=0.2,
125
- )
126
- return {"text": "".join(s.text for s in segments)}
138
+
139
+ # ACFT shortcut: shrink the encoder window to the actual audio
140
+ # length. This is the whole point of the FUTO backend — without it,
141
+ # a 2 s clip runs against the full 30 s window and inference is
142
+ # 5-10× slower. Safe for the FUTO ACFT set (tiny/base/small +
143
+ # .en) which was trained to preserve quality at short audio_ctx.
144
+ # pywhispercpp wants "" (not "auto") to request auto-detect.
145
+ kwargs = {
146
+ "language": self.language or "",
147
+ "audio_ctx": min(_AUDIO_CTX_MAX,
148
+ max(_AUDIO_CTX_MIN,
149
+ math.ceil(duration_s * _AUDIO_CTX_PER_SECOND))),
150
+ }
151
+ prompt = self.compose_prompt(self._prompt)
152
+ if prompt:
153
+ kwargs["initial_prompt"] = prompt
154
+ # Streaming-only safety nets. max_tokens caps decoder repetition
155
+ # loops on short silence-split chunks; the non-speech filter
156
+ # below drops "(music)"-style hallucinations from those same
157
+ # tiny chunks. Both can clip real speech in batch where the
158
+ # recording is a single longer utterance.
159
+ if self.pseudo_streaming:
160
+ kwargs["max_tokens"] = max(12, int(duration_s * 12))
161
+ segments = self.model.transcribe(audio, **kwargs)
162
+ text = "".join(s.text for s in segments)
163
+ if self.pseudo_streaming:
164
+ # Inline pass first: catches concatenated noise tokens like
165
+ # "[door opens][door closes]" and mid-sentence "(typing)"
166
+ # inserts. Replace with " " then collapse to avoid gluing
167
+ # adjacent words. Whole-chunk fallback catches artifacts the
168
+ # inline pattern misses (internal punctuation inside brackets).
169
+ text = _NON_SPEECH_INLINE_RE.sub(" ", text)
170
+ text = _WHITESPACE_RE.sub(" ", text).strip()
171
+ if _NON_SPEECH_WHOLE_RE.match(text):
172
+ text = ""
173
+ else:
174
+ text = text.strip()
175
+ # Phonetic garbage (IPA modifier letters, U+FFFD) is always a
176
+ # decode failure — drop in both modes.
177
+ if _PHONETIC_RE.search(text):
178
+ text = ""
179
+ # Carry the cleaned text forward as cross-chunk context. Done
180
+ # post-filter so hallucination/phonetic-garbage chunks (now "")
181
+ # don't poison the next chunk's prompt.
182
+ self.update_streaming_context(text)
183
+ # Trailing space lets pseudo-streaming chunks concatenate cleanly
184
+ # (vosk convention). Harmless in batch mode — downstream strips.
185
+ if text:
186
+ text += " "
187
+ return {"text": text}
127
188
 
128
189
  def finalize(self):
129
190
  if len(self.session.audio_buffer) == 0: