scribe-cli 0.17.0__tar.gz → 0.17.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/PKG-INFO +1 -1
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/docs/backends.md +35 -1
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/docs/keyboard.md +31 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/_version.py +3 -3
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/app.py +57 -16
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/backends/openai_api.py +3 -1
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/backends/openai_realtime.py +79 -3
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/backends/whisper.py +2 -1
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/backends/whisper_futo.py +81 -20
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/models.py +109 -11
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/session.py +10 -1
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe_cli.egg-info/PKG-INFO +1 -1
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe_cli.egg-info/SOURCES.txt +4 -1
- scribe_cli-0.17.1/tests/test_openai_realtime_coalesce.py +221 -0
- scribe_cli-0.17.1/tests/test_pseudo_streaming.py +288 -0
- scribe_cli-0.17.1/tests/test_whisper_futo.py +245 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/.github/FUNDING.yml +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/.github/workflows/pypi.yml +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/.gitignore +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/LICENSE +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/README.md +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/docs/app-tray-menu.png +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/docs/cli.md +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/docs/desktop-install.md +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/docs/installation.md +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/docs/roadmap-libei.md +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/docs/tray.md +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/icon.xcf +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/pyproject.toml +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/__init__.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/audio.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/backends/__init__.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/backends/groq.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/backends/vosk.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/install_desktop.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/keyboard.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/menu.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/models.toml +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/saverecording.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/testpynput.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/typers/__init__.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/typers/base.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/typers/eitype.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/typers/pynput.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/typers/wtype.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/typers/ydotool.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe/util.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe_cli.egg-info/dependency_links.txt +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe_cli.egg-info/entry_points.txt +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe_cli.egg-info/requires.txt +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe_cli.egg-info/top_level.txt +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe_data/__init__.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe_data/share/icon.png +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe_data/share/icon_recording.png +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe_data/share/icon_writing.png +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scribe_data/templates/scribe.desktop +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scripts/bench_whisper_local.py +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/scripts/test_python_versions_install.sh +0 -0
- {scribe_cli-0.17.0 → scribe_cli-0.17.1}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: scribe-cli
|
|
3
|
-
Version: 0.17.0
|
|
3
|
+
Version: 0.17.1
|
|
4
4
|
Summary: Speech-to-text CLI and system-tray app for dictating into any focused window. Local (vosk, faster-whisper) or cloud (groq, openai) backends, batch or streaming.
|
|
5
5
|
Author-email: Mahé Perrette <mahe.perrette@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -149,9 +149,10 @@ differently:
|
|
|
149
149
|
| Backend | `--prompt` | `--words` |
|
|
150
150
|
|--------------------------------------|-------------------------------|--------------------------------------------------------|
|
|
151
151
|
| `whisper` (faster-whisper, local) | passed as `initial_prompt=` | passed as `hotwords=` — a **dedicated biasing channel** separate from the prompt |
|
|
152
|
+
| `whisper-futo` (pywhispercpp, local) | passed as `initial_prompt=` | joined onto the prompt string (no separate hotwords channel here) |
|
|
152
153
|
| `openai` batch (`gpt-4o*-transcribe`) | passed as `prompt=` | joined onto the prompt string |
|
|
153
154
|
| `groq` (`whisper-large-v3-turbo`) | passed as `prompt=` | joined onto the prompt string |
|
|
154
|
-
| `openai` realtime (`gpt-realtime-whisper`) |
|
|
155
|
+
| `openai` realtime (`gpt-realtime-whisper`) | *silently ignored* — the model rejects the prompt parameter server-side (HTTP 400 *"The 'prompt' parameter is not supported for this model."*). The kwarg stays accepted for plumbing compatibility but never reaches the API. | same — joined into the (ignored) prompt |
|
|
155
156
|
| `vosk` | *ignored* (no soft prompt) | *ignored* (Vosk only supports a hard `grammar` allowlist; not yet exposed) |
|
|
156
157
|
|
|
157
158
|
The whisper-family APIs cap the prompt around ~224 tokens; longer
|
|
@@ -202,3 +203,36 @@ more than latency.
|
|
|
202
203
|
|
|
203
204
|
This is experimental and off by default. The tray menu surfaces the
|
|
204
205
|
same toggle under Options ▶ Advanced ▶ Pseudo-streaming.
|
|
206
|
+
|
|
207
|
+
### Cross-chunk prompt context
|
|
208
|
+
|
|
209
|
+
In pseudo-streaming mode scribe automatically augments each chunk's
|
|
210
|
+
prompt with the trailing ~200 characters of the *previous* chunk's
|
|
211
|
+
transcription. This rolling tail is concatenated onto whatever static
|
|
212
|
+
`--prompt` / `--words` you configured and reaches the backend through
|
|
213
|
+
the same channel as the static prompt (the vocabulary biasing table
|
|
214
|
+
above). The motivation is cross-chunk continuity:
|
|
215
|
+
|
|
216
|
+
- **Capitalization drift** — without context, a chunk that starts
|
|
217
|
+
right after a period might come back lowercased.
|
|
218
|
+
- **Article gender (FR/IT/ES/…)** — `"la nouveau"` → `"le nouveau"`
|
|
219
|
+
once the prior chunk has established the noun.
|
|
220
|
+
- **Language lock** — `whisper.cpp` auto-detects language per call;
|
|
221
|
+
feeding the previous chunk's tokens keeps the language stable
|
|
222
|
+
across cuts.
|
|
223
|
+
|
|
224
|
+
Whisper's prompt window is capped at ~224 tokens; 200 chars of French
|
|
225
|
+
sits well under that and leaves room for your static prompt + words
|
|
226
|
+
list.
|
|
227
|
+
|
|
228
|
+
The rolling tail is **dropped** whenever the pause that triggered the
|
|
229
|
+
chunk cut exceeded 1.5 seconds — a long pause is treated as a new
|
|
230
|
+
sentence/idea boundary, where carrying a possibly-bad prior chunk
|
|
231
|
+
forward biases the next one more than it helps. This mirrors
|
|
232
|
+
`whisper.cpp`'s `--keep-context off` default: prior-text conditioning
|
|
233
|
+
can self-reinforce errors (hallucinations, decoder repetition loops)
|
|
234
|
+
more readily than it provides useful continuity, so we cap it at
|
|
235
|
+
natural sentence boundaries.
|
|
236
|
+
|
|
237
|
+
Short pauses (mid-sentence punctuation) keep the context; the cut at
|
|
238
|
+
the start of every new recording also clears it.
|
|
@@ -167,3 +167,34 @@ If `eitype` is unavailable, two older workarounds also work:
|
|
|
167
167
|
Roadmap for native libei integration (eventual Python bindings,
|
|
168
168
|
expanded compositor support) is tracked in
|
|
169
169
|
[docs/roadmap-libei.md](roadmap-libei.md).
|
|
170
|
+
|
|
171
|
+
## Realtime backend: delta coalescing
|
|
172
|
+
|
|
173
|
+
The `gpt-realtime-whisper` backend emits one transcription delta per
|
|
174
|
+
word/subword at ~30–80 ms intervals — much faster than the
|
|
175
|
+
`pyperclip.copy()` + Ctrl+V cycle can settle on Wayland (≥100 ms,
|
|
176
|
+
because `wl-copy` is asynchronous). Pasting every delta led to
|
|
177
|
+
clipboard races where successive copies overwrote each other before
|
|
178
|
+
Ctrl+V landed, manifesting as dropped and duplicated words
|
|
179
|
+
(*"fait fait le mot mot time time…"*).
|
|
180
|
+
|
|
181
|
+
In **paste mode** (default keystroke output) scribe therefore
|
|
182
|
+
coalesces deltas: incoming tokens accumulate into a small buffer and
|
|
183
|
+
are flushed only when *either* ~400 ms have elapsed since the last
|
|
184
|
+
flush, *or* the buffer ends on sentence-final punctuation
|
|
185
|
+
(`. ! ? \n`). A 200 ms floor between any two flushes prevents
|
|
186
|
+
back-to-back punctuation flushes from racing each other through the
|
|
187
|
+
clipboard.
|
|
188
|
+
|
|
189
|
+
With **`--type-direct`** the coalescing is bypassed entirely — each
|
|
190
|
+
delta goes through the chosen typer as a raw keystroke synchronously
|
|
191
|
+
(uinput / xtest / portal libei), no clipboard involved, no race to
|
|
192
|
+
defeat. The UX is also snappier: tokens appear one at a time rather
|
|
193
|
+
than in ~400 ms-cadenced bursts.
|
|
194
|
+
|
|
195
|
+
macOS and Windows clipboards are synchronous, so the race that
|
|
196
|
+
motivates coalescing is essentially a Wayland artefact; scribe still
|
|
197
|
+
coalesces in paste mode there for consistency, but it's harmless.
|
|
198
|
+
This whole behaviour is realtime-specific — Vosk's per-phrase commits
|
|
199
|
+
already arrive at a sane cadence, and the pseudo-streaming backends
|
|
200
|
+
emit one chunk per silence cut (already coarse enough).
|
|
@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
|
|
|
18
18
|
commit_id: str | None
|
|
19
19
|
__commit_id__: str | None
|
|
20
20
|
|
|
21
|
-
__version__ = version = '0.17.0'
|
|
22
|
-
__version_tuple__ = version_tuple = (0, 17, 0)
|
|
21
|
+
__version__ = version = '0.17.1'
|
|
22
|
+
__version_tuple__ = version_tuple = (0, 17, 1)
|
|
23
23
|
|
|
24
|
-
__commit_id__ = commit_id = '
|
|
24
|
+
__commit_id__ = commit_id = 'g67d90f5e4'
|
|
@@ -66,7 +66,10 @@ class DummyTranscriber:
|
|
|
66
66
|
|
|
67
67
|
whisper_models = ["tiny", "base", "small", "medium", "large-v3", "large-v3-turbo"]
|
|
68
68
|
whisper_english_models = ["tiny.en", "base.en", "small.en", "medium.en"]
|
|
69
|
-
# FUTO ACFT publishes only tiny/base/small (+ .en variants)
|
|
69
|
+
# FUTO ACFT publishes only tiny/base/small (+ .en variants). Community
|
|
70
|
+
# conversions exist for large/turbo but their large-v3 encoder is
|
|
71
|
+
# incompatible with the audio_ctx shrinkage that's the point of this
|
|
72
|
+
# backend — for large models use the `whisper` backend instead.
|
|
70
73
|
whisper_futo_models = ["tiny", "base", "small"]
|
|
71
74
|
whisper_futo_english_models = ["tiny.en", "base.en", "small.en"]
|
|
72
75
|
whisperapi_models = ["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "gpt-realtime-whisper"]
|
|
@@ -168,7 +171,7 @@ def _resolve_prompt_and_words(prompt_text, prompt_file, words, words_file):
|
|
|
168
171
|
|
|
169
172
|
|
|
170
173
|
def _build_backend_kwargs(backend, model, language, samplerate, duration,
|
|
171
|
-
silence_db, silence_duration,
|
|
174
|
+
silence_db, silence_onset_db, silence_duration,
|
|
172
175
|
download_folder_vosk, download_folder_whisper,
|
|
173
176
|
download_folder_whisper_futo,
|
|
174
177
|
realtime_delay, realtime_gate,
|
|
@@ -190,23 +193,28 @@ def _build_backend_kwargs(backend, model, language, samplerate, duration,
|
|
|
190
193
|
model_kwargs={"download_root": download_folder_vosk})
|
|
191
194
|
if backend == "whisper":
|
|
192
195
|
return dict(model_name=model, language=language, samplerate=samplerate,
|
|
193
|
-
timeout=duration, silence_duration=silence_duration,
|
|
196
|
+
timeout=duration, silence_duration=silence_duration,
|
|
197
|
+
silence_thresh=silence_db, silence_thresh_onset=silence_onset_db,
|
|
194
198
|
pseudo_streaming=pseudo_streaming, streaming_window=streaming_window,
|
|
195
199
|
prompt=prompt_text,
|
|
196
200
|
hotwords=(" ".join(words) if words else None),
|
|
197
201
|
model_kwargs={"download_root": download_folder_whisper})
|
|
198
202
|
if backend == "whisper-futo":
|
|
199
|
-
#
|
|
200
|
-
#
|
|
201
|
-
#
|
|
203
|
+
# pywhispercpp 1.4.1 exposes `initial_prompt`; the backend folds
|
|
204
|
+
# words+prompt into it (and adds a rolling chunk-tail in
|
|
205
|
+
# pseudo-streaming). No separate hotwords channel here — fold
|
|
206
|
+
# everything into the prompt like the cloud backends do.
|
|
202
207
|
return dict(model_name=model, language=language, samplerate=samplerate,
|
|
203
|
-
timeout=duration, silence_duration=silence_duration,
|
|
208
|
+
timeout=duration, silence_duration=silence_duration,
|
|
209
|
+
silence_thresh=silence_db, silence_thresh_onset=silence_onset_db,
|
|
204
210
|
pseudo_streaming=pseudo_streaming, streaming_window=streaming_window,
|
|
211
|
+
prompt=merged_prompt,
|
|
205
212
|
download_folder=download_folder_whisper_futo)
|
|
206
213
|
if backend in ("openai", "groq"):
|
|
207
214
|
from scribe.backends.openai_api import REALTIME_MODELS
|
|
208
215
|
kwargs = dict(model_name=model, samplerate=samplerate,
|
|
209
|
-
timeout=duration, silence_duration=silence_duration,
|
|
216
|
+
timeout=duration, silence_duration=silence_duration,
|
|
217
|
+
silence_thresh=silence_db, silence_thresh_onset=silence_onset_db,
|
|
210
218
|
pseudo_streaming=pseudo_streaming, streaming_window=streaming_window,
|
|
211
219
|
prompt=merged_prompt)
|
|
212
220
|
if backend == "openai" and model in REALTIME_MODELS:
|
|
@@ -223,7 +231,7 @@ def _build_backend_kwargs(backend, model, language, samplerate, duration,
|
|
|
223
231
|
|
|
224
232
|
def get_transcriber(model=None, backend=None, dummy=False, interactive=True, language=None,
|
|
225
233
|
samplerate=None, duration=None,
|
|
226
|
-
silence_db
|
|
234
|
+
silence_db=None, silence_onset_db=None, silence_duration=0.6,
|
|
227
235
|
download_folder_vosk=None, download_folder_whisper=None,
|
|
228
236
|
download_folder_whisper_futo=None,
|
|
229
237
|
realtime_delay="medium", realtime_gate=True,
|
|
@@ -253,9 +261,17 @@ def get_transcriber(model=None, backend=None, dummy=False, interactive=True, lan
|
|
|
253
261
|
else:
|
|
254
262
|
model = _prompt_model_for_backend(backend, language, interactive)
|
|
255
263
|
print(f"Selected model: {model}")
|
|
264
|
+
# silence_db is the LOW threshold (in-speech pause detection) — default
|
|
265
|
+
# -40 in all modes. silence_onset_db is the HIGH threshold (speech-start
|
|
266
|
+
# gate) used only in pseudo-streaming via hysteresis; -25 keeps ambient
|
|
267
|
+
# noise (keyboard, breathing) from triggering a chunk.
|
|
268
|
+
if silence_db is None:
|
|
269
|
+
silence_db = -40.0
|
|
270
|
+
if silence_onset_db is None:
|
|
271
|
+
silence_onset_db = -25.0 if pseudo_streaming else silence_db
|
|
256
272
|
prompt_text, word_list = _resolve_prompt_and_words(prompt, prompt_file, words, words_file)
|
|
257
273
|
backend_kwargs = _build_backend_kwargs(backend, model, language, samplerate, duration,
|
|
258
|
-
silence_db, silence_duration,
|
|
274
|
+
silence_db, silence_onset_db, silence_duration,
|
|
259
275
|
download_folder_vosk, download_folder_whisper,
|
|
260
276
|
download_folder_whisper_futo,
|
|
261
277
|
realtime_delay, realtime_gate,
|
|
@@ -322,11 +338,18 @@ def get_parser():
|
|
|
322
338
|
group = parser.add_argument_group("Silence detection (shared)")
|
|
323
339
|
group.add_argument("--duration", default=120, type=float,
|
|
324
340
|
help="Max recording duration in seconds (default: %(default)s).")
|
|
325
|
-
group.add_argument("--silence-db", default
|
|
326
|
-
help="
|
|
327
|
-
"
|
|
328
|
-
"
|
|
329
|
-
"pseudo-streaming
|
|
341
|
+
group.add_argument("--silence-db", default=None, type=float,
|
|
342
|
+
help="LOW silence floor in dBFS — applied while we're "
|
|
343
|
+
"already inside an utterance, so soft trailing "
|
|
344
|
+
"syllables aren't cut. Default: -40. Used by every "
|
|
345
|
+
"silence-driven behavior (pseudo-streaming pause "
|
|
346
|
+
"detection, realtime gate, realtime auto-commit).")
|
|
347
|
+
group.add_argument("--silence-onset-db", default=None, type=float,
|
|
348
|
+
help="HIGH silence floor in dBFS — applied before we've "
|
|
349
|
+
"started capturing speech (audio buffer empty). "
|
|
350
|
+
"Stricter so ambient noise (keyboard, breathing) "
|
|
351
|
+
"doesn't trigger a chunk. Default: -25 in "
|
|
352
|
+
"pseudo-streaming, same as --silence-db otherwise.")
|
|
330
353
|
group.add_argument("--silence-duration", default=0.6, type=float,
|
|
331
354
|
help="Seconds of silence required before triggering a "
|
|
332
355
|
"backend's silence behavior (default: %(default)s). "
|
|
@@ -399,8 +422,16 @@ def start_recording(micro, session, mode="keystroke", typer="auto",
|
|
|
399
422
|
# Query the live transcriber instance — the registered class may dispatch
|
|
400
423
|
# to a streaming sibling for specific models (e.g. openai →
|
|
401
424
|
# gpt-realtime-whisper), so a class-level lookup via BACKENDS would lie.
|
|
425
|
+
# Pseudo-streaming also yields chunks (silence-cut batch transcriptions)
|
|
426
|
+
# so the output should treat it the same: live paste/type per chunk.
|
|
402
427
|
backend_obj = getattr(session, "backend", session)
|
|
403
|
-
|
|
428
|
+
if isinstance(backend_obj, str):
|
|
429
|
+
is_streaming = False
|
|
430
|
+
else:
|
|
431
|
+
is_streaming = (
|
|
432
|
+
bool(getattr(backend_obj, "supports_streaming", False))
|
|
433
|
+
or bool(getattr(backend_obj, "pseudo_streaming", False))
|
|
434
|
+
)
|
|
404
435
|
# Clipboard is written in clipboard mode (the user pastes manually) and in
|
|
405
436
|
# paste-based keystroke mode (the paste source). type_direct keystroke
|
|
406
437
|
# mode bypasses the clipboard entirely — we type the chunks/text raw.
|
|
@@ -427,6 +458,16 @@ def start_recording(micro, session, mode="keystroke", typer="auto",
|
|
|
427
458
|
import pyperclip
|
|
428
459
|
session.log("The transcription will be copied to clipboard as it becomes available.")
|
|
429
460
|
|
|
461
|
+
# Tell streaming backends whether their output is about to hit the
|
|
462
|
+
# clipboard-paste race or a direct-keystroke typer. The realtime
|
|
463
|
+
# backend's per-token deltas only need coalescing in paste mode;
|
|
464
|
+
# type-direct (ydotool/wtype/pynput via uinput/xtest) types each
|
|
465
|
+
# character synchronously and benefits from raw per-delta emission
|
|
466
|
+
# for snappier UX. Set as a plain attribute — backends that don't
|
|
467
|
+
# implement coalescing ignore it.
|
|
468
|
+
if not isinstance(backend_obj, str) and hasattr(backend_obj, "_coalesce_deltas"):
|
|
469
|
+
backend_obj._coalesce_deltas = do_live_paste
|
|
470
|
+
|
|
430
471
|
fulltext = ""
|
|
431
472
|
|
|
432
473
|
for result in session.start_recording(micro, **greetings):
|
|
@@ -47,7 +47,8 @@ class OpenaiAPITranscriber(WhisperTranscriber):
|
|
|
47
47
|
sf.write(buffer, audio_data, self.samplerate, format='WAV')
|
|
48
48
|
buffer.seek(0)
|
|
49
49
|
buffer.name = "audio.wav" # Set a filename with a valid extension
|
|
50
|
-
|
|
50
|
+
prompt = self.compose_prompt(self._prompt)
|
|
51
|
+
extra = {"prompt": prompt} if prompt else {}
|
|
51
52
|
try:
|
|
52
53
|
transcription = self.model.audio.transcriptions.create(
|
|
53
54
|
model=self.model_name,
|
|
@@ -58,6 +59,7 @@ class OpenaiAPITranscriber(WhisperTranscriber):
|
|
|
58
59
|
title, message = format_openai_error(e)
|
|
59
60
|
self.notify_error(title, message)
|
|
60
61
|
return {"text": ""}
|
|
62
|
+
self.update_streaming_context(transcription.text)
|
|
61
63
|
return {"text": transcription.text}
|
|
62
64
|
|
|
63
65
|
|
|
@@ -2,6 +2,7 @@ import base64
|
|
|
2
2
|
import logging
|
|
3
3
|
import queue
|
|
4
4
|
import threading
|
|
5
|
+
import time
|
|
5
6
|
from typing import ClassVar
|
|
6
7
|
|
|
7
8
|
import numpy as np
|
|
@@ -34,6 +35,29 @@ class OpenaiRealtimeTranscriber(AbstractStreamingTranscriber):
|
|
|
34
35
|
# click) followed by silence would otherwise trigger an error popup.
|
|
35
36
|
_SERVER_COMMIT_MIN_MS = 100.0
|
|
36
37
|
|
|
38
|
+
# Coalesce token-level deltas before yielding to the app layer.
|
|
39
|
+
# gpt-realtime-whisper emits one delta per word/subword (~30-80 ms
|
|
40
|
+
# apart). The live-paste path (paste_via_clipboard) needs ~100 ms
|
|
41
|
+
# per call to defeat Wayland's wl-copy async race — pasting every
|
|
42
|
+
# delta caused token drops + duplications because the clipboard got
|
|
43
|
+
# overwritten before Ctrl+V landed.
|
|
44
|
+
#
|
|
45
|
+
# _INTERVAL: regular cadence for in-progress sentences (no punct
|
|
46
|
+
# yet). Long enough that most short sentences finish before it fires
|
|
47
|
+
# — that way the natural commit point is the period, not a mid-
|
|
48
|
+
# sentence timeout (which would split a phrase across two pastes and
|
|
49
|
+
# race them through the clipboard).
|
|
50
|
+
#
|
|
51
|
+
# _MIN_INTERVAL: floor between successive flushes regardless of
|
|
52
|
+
# trigger. Even when the buffer ends on a period, we hold the flush
|
|
53
|
+
# until the floor has elapsed since the prior one. Two punctuation
|
|
54
|
+
# flushes <200ms apart was the residual failure mode that mangled
|
|
55
|
+
# rapid repeated phrases ("Tout rentre dans l'ordre. Tout rentre
|
|
56
|
+
# dans l'ordre.") even after the initial coalescing landed.
|
|
57
|
+
_DELTA_FLUSH_INTERVAL_S = 0.4
|
|
58
|
+
_DELTA_FLUSH_MIN_INTERVAL_S = 0.2
|
|
59
|
+
_DELTA_FLUSH_PUNCT = frozenset(".!?\n")
|
|
60
|
+
|
|
37
61
|
def __init__(self, model_name="gpt-realtime-whisper", language=None, model_kwargs={},
|
|
38
62
|
model=None, realtime_delay="medium",
|
|
39
63
|
realtime_gate=True, prompt=None, **kwargs):
|
|
@@ -66,6 +90,16 @@ class OpenaiRealtimeTranscriber(AbstractStreamingTranscriber):
|
|
|
66
90
|
self._has_uncommitted_audio = False
|
|
67
91
|
self._silent_samples = 0
|
|
68
92
|
self._uncommitted_ms = 0.0
|
|
93
|
+
# Delta coalescing state (see _DELTA_FLUSH_INTERVAL_S). The flag
|
|
94
|
+
# below is set by the app layer at recording time: True when
|
|
95
|
+
# live-paste-via-clipboard is the output (clipboard race exists
|
|
96
|
+
# → coalesce); False in type-direct mode (uinput/xtest tap each
|
|
97
|
+
# character — no clipboard, no race, no need to batch). Default
|
|
98
|
+
# True so backends instantiated outside the scribe app loop
|
|
99
|
+
# (smoke tests, library use) keep the safer batched behaviour.
|
|
100
|
+
self._coalesce_deltas = True
|
|
101
|
+
self._delta_buffer = ""
|
|
102
|
+
self._last_delta_flush = 0.0
|
|
69
103
|
|
|
70
104
|
def _session_config(self) -> dict:
|
|
71
105
|
# gpt-realtime-whisper does NOT support server VAD (rejected as
|
|
@@ -73,11 +107,17 @@ class OpenaiRealtimeTranscriber(AbstractStreamingTranscriber):
|
|
|
73
107
|
# The streaming knob for this model is `delay` — "minimal" emits
|
|
74
108
|
# partials as early as possible; higher values trade latency for
|
|
75
109
|
# accuracy. Surfaced as the --realtime-delay CLI flag.
|
|
110
|
+
#
|
|
111
|
+
# NOTE: this model also rejects `prompt` server-side
|
|
112
|
+
# (400 "The 'prompt' parameter is not supported for this model.",
|
|
113
|
+
# param `session.audio.input.transcription.prompt`). The shared
|
|
114
|
+
# backend kwarg `prompt` is silently ignored here — the
|
|
115
|
+
# pseudo-streaming chunk-tail context machinery doesn't apply
|
|
116
|
+
# either (this backend is true streaming, not chunked). If a
|
|
117
|
+
# future REALTIME_MODELS entry supports it, gate by model name.
|
|
76
118
|
transcription: dict = {"model": self.model_name, "delay": self._realtime_delay}
|
|
77
119
|
if self.language:
|
|
78
120
|
transcription["language"] = self.language
|
|
79
|
-
if self._prompt:
|
|
80
|
-
transcription["prompt"] = self._prompt
|
|
81
121
|
audio_input: dict = {
|
|
82
122
|
"format": {"type": "audio/pcm", "rate": self._GA_SAMPLE_RATE},
|
|
83
123
|
"transcription": transcription,
|
|
@@ -100,6 +140,8 @@ class OpenaiRealtimeTranscriber(AbstractStreamingTranscriber):
|
|
|
100
140
|
self._has_uncommitted_audio = False
|
|
101
141
|
self._silent_samples = 0
|
|
102
142
|
self._uncommitted_ms = 0.0
|
|
143
|
+
self._delta_buffer = ""
|
|
144
|
+
self._last_delta_flush = time.time()
|
|
103
145
|
|
|
104
146
|
self._client = openai.OpenAI()
|
|
105
147
|
|
|
@@ -250,6 +292,9 @@ class OpenaiRealtimeTranscriber(AbstractStreamingTranscriber):
|
|
|
250
292
|
else:
|
|
251
293
|
self._silent_samples = 0
|
|
252
294
|
|
|
295
|
+
# Drain queue. Errors surface immediately in both modes. Text
|
|
296
|
+
# deltas either get buffered for coalesced flush (paste mode)
|
|
297
|
+
# or yielded raw (type-direct mode — see _coalesce_deltas).
|
|
253
298
|
while True:
|
|
254
299
|
try:
|
|
255
300
|
item = self._event_queue.get_nowait()
|
|
@@ -259,7 +304,31 @@ class OpenaiRealtimeTranscriber(AbstractStreamingTranscriber):
|
|
|
259
304
|
title, message = item["_error"]
|
|
260
305
|
self.notify_error(title, message)
|
|
261
306
|
continue
|
|
262
|
-
|
|
307
|
+
text = item.get("text", "")
|
|
308
|
+
if not text:
|
|
309
|
+
continue
|
|
310
|
+
if self._coalesce_deltas:
|
|
311
|
+
self._delta_buffer += text
|
|
312
|
+
else:
|
|
313
|
+
yield {"text": text}
|
|
314
|
+
|
|
315
|
+
# Flush the coalesced buffer when both:
|
|
316
|
+
# (a) the floor _DELTA_FLUSH_MIN_INTERVAL_S has elapsed since
|
|
317
|
+
# the last flush — no two pastes within the clipboard race
|
|
318
|
+
# window, regardless of trigger; and
|
|
319
|
+
# (b) either the regular interval elapsed, or the buffer ends
|
|
320
|
+
# on sentence-final punctuation (natural commit boundary).
|
|
321
|
+
# In raw-delta mode the buffer stays empty so this is a no-op.
|
|
322
|
+
if self._delta_buffer:
|
|
323
|
+
now = time.time()
|
|
324
|
+
elapsed = now - self._last_delta_flush
|
|
325
|
+
ends_on_punct = self._delta_buffer[-1] in self._DELTA_FLUSH_PUNCT
|
|
326
|
+
if elapsed >= self._DELTA_FLUSH_MIN_INTERVAL_S and (
|
|
327
|
+
elapsed >= self._DELTA_FLUSH_INTERVAL_S or ends_on_punct
|
|
328
|
+
):
|
|
329
|
+
yield {"text": self._delta_buffer}
|
|
330
|
+
self._delta_buffer = ""
|
|
331
|
+
self._last_delta_flush = now
|
|
263
332
|
|
|
264
333
|
def finalize(self):
|
|
265
334
|
if self._connection is None or self._closed:
|
|
@@ -288,7 +357,14 @@ class OpenaiRealtimeTranscriber(AbstractStreamingTranscriber):
|
|
|
288
357
|
# transcript was already streamed live as `text` deltas during
|
|
289
358
|
# recording, so we only return the tail.
|
|
290
359
|
self._completed_event.wait(timeout=self._FINALIZE_TIMEOUT)
|
|
360
|
+
# Start with whatever sat in the coalescing buffer (deltas seen
|
|
361
|
+
# by feed_audio but not yet flushed by the interval/punct check),
|
|
362
|
+
# then append any tail deltas the recv_loop pushed in after the
|
|
363
|
+
# recording loop exited.
|
|
291
364
|
tail_parts: list[str] = []
|
|
365
|
+
if self._delta_buffer:
|
|
366
|
+
tail_parts.append(self._delta_buffer)
|
|
367
|
+
self._delta_buffer = ""
|
|
292
368
|
while True:
|
|
293
369
|
try:
|
|
294
370
|
item = self._event_queue.get_nowait()
|
|
@@ -29,7 +29,7 @@ class WhisperTranscriber(AbstractTranscriber):
|
|
|
29
29
|
language=self.language,
|
|
30
30
|
vad_filter=True,
|
|
31
31
|
beam_size=1,
|
|
32
|
-
initial_prompt=self._prompt,
|
|
32
|
+
initial_prompt=self.compose_prompt(self._prompt),
|
|
33
33
|
hotwords=self._hotwords,
|
|
34
34
|
no_speech_threshold=0.6,
|
|
35
35
|
log_prob_threshold=-1.0,
|
|
@@ -37,6 +37,7 @@ class WhisperTranscriber(AbstractTranscriber):
|
|
|
37
37
|
temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
|
|
38
38
|
)
|
|
39
39
|
text = "".join(segment.text for segment in segments)
|
|
40
|
+
self.update_streaming_context(text)
|
|
40
41
|
return {"text": text}
|
|
41
42
|
|
|
42
43
|
def finalize(self):
|
|
@@ -15,6 +15,7 @@ from __future__ import annotations
|
|
|
15
15
|
|
|
16
16
|
import math
|
|
17
17
|
import os
|
|
18
|
+
import re
|
|
18
19
|
from pathlib import Path
|
|
19
20
|
from typing import ClassVar
|
|
20
21
|
|
|
@@ -23,9 +24,36 @@ import numpy as np
|
|
|
23
24
|
from scribe.models import AbstractTranscriber
|
|
24
25
|
|
|
25
26
|
|
|
27
|
+
# Whisper hallucinates sound-effect annotations like "(music)", "[Applause]"
|
|
28
|
+
# on near-silence, and occasionally emits IPA-modifier-letter garbage
|
|
29
|
+
# (U+02B0–02FF) or U+FFFD when the audio is unintelligible. Two filters:
|
|
30
|
+
# - WHOLE_RE: chunk is one such artifact end-to-end → drop.
|
|
31
|
+
# - INLINE_RE: artifact embedded mid-text ("Bonjour (typing) ça va") →
|
|
32
|
+
# substitute out. Restricted to ASCII letters, spaces and hyphens
|
|
33
|
+
# inside the brackets so legitimate French parentheticals (accented
|
|
34
|
+
# characters) are preserved. pywhispercpp 1.4.1 advertises
|
|
35
|
+
# `suppress_non_speech_tokens` in its schema but the C struct doesn't
|
|
36
|
+
# expose it, so this lives at the text layer.
|
|
37
|
+
_NON_SPEECH_WHOLE_RE = re.compile(r"^\s*[(\[*][^()\[\]*]{1,60}[)\]*]\s*[.!?]?\s*$")
|
|
38
|
+
# Allow any case ([Breathing], [KNOCKING], [Door opens], (footsteps)) and
|
|
39
|
+
# consume any trailing punctuation so adjacent text doesn't end up with
|
|
40
|
+
# stray commas. Substitute with a space (not "") so adjacent words don't
|
|
41
|
+
# collide when the noise token has no surrounding whitespace
|
|
42
|
+
# ("[door][door]" or "word(typing)word"); a follow-up \s+ collapse cleans
|
|
43
|
+
# up any doubles.
|
|
44
|
+
_NON_SPEECH_INLINE_RE = re.compile(r"[(\[][A-Za-z][A-Za-z\s\-]{0,30}[)\]][.,!?:;]?")
|
|
45
|
+
_WHITESPACE_RE = re.compile(r"\s+")
|
|
46
|
+
_PHONETIC_RE = re.compile(r"[ʰ-˿�]")
|
|
47
|
+
|
|
48
|
+
|
|
26
49
|
_FUTO_BASE_URL = "https://voiceinput.futo.org/VoiceInput/"
|
|
27
50
|
|
|
28
|
-
# Map user-visible model name → ggml filename on FUTO's CDN.
|
|
51
|
+
# Map user-visible model name → ggml filename on FUTO's CDN. FUTO publishes
|
|
52
|
+
# only tiny/base/small (+ .en variants). The DeadBranches community q8_0 of
|
|
53
|
+
# large-v3-turbo was tried briefly but its large-v3 encoder is incompatible
|
|
54
|
+
# with the audio_ctx-shrinkage that's the whole point of this backend
|
|
55
|
+
# (Progress: 1612% / CJK garbage on short clips), so we stick to the FUTO
|
|
56
|
+
# set where ACFT works as advertised.
|
|
29
57
|
_FUTO_MODELS: dict[str, str] = {
|
|
30
58
|
"tiny": "tiny_acft_q8_0.bin",
|
|
31
59
|
"tiny.en": "tiny_en_acft_q8_0.bin",
|
|
@@ -90,7 +118,7 @@ class WhisperFutoTranscriber(AbstractTranscriber):
|
|
|
90
118
|
is_local: ClassVar[bool] = True
|
|
91
119
|
|
|
92
120
|
def __init__(self, model_name, language=None, model=None, model_kwargs={},
|
|
93
|
-
download_folder=None, **kwargs):
|
|
121
|
+
download_folder=None, prompt=None, **kwargs):
|
|
94
122
|
if model is None:
|
|
95
123
|
from pywhispercpp.model import Model
|
|
96
124
|
path = _model_path(model_name, download_folder)
|
|
@@ -101,29 +129,62 @@ class WhisperFutoTranscriber(AbstractTranscriber):
|
|
|
101
129
|
init_kwargs = {k: v for k, v in model_kwargs.items() if k != "n_threads"}
|
|
102
130
|
model = Model(str(path), n_threads=n_threads, **init_kwargs)
|
|
103
131
|
super().__init__(model, model_name, language, model_kwargs=model_kwargs, **kwargs)
|
|
132
|
+
self._prompt = prompt
|
|
104
133
|
|
|
105
134
|
def transcribe_audio(self, audio_bytes):
    """Transcribe one buffer of PCM audio and return ``{"text": ...}``.

    ``audio_bytes`` is read as int16 samples and scaled to float32 by
    1/32768 before being handed to ``self.model.transcribe``. The returned
    text is post-filtered (see inline comments); when non-empty it ends
    with a single trailing space. The filtered text is also passed to
    ``self.update_streaming_context`` before returning.
    """
    self.log("\nTranscribing")
    pcm = np.frombuffer(audio_bytes, dtype=np.int16)
    samples = pcm.astype(np.float32) / 32768.0
    seconds = len(samples) / self.samplerate

    # pywhispercpp wants "" (not "auto") to request auto-detect.
    language = self.language or ""

    # ACFT shortcut: size the encoder window to the actual clip length,
    # clamped to [_AUDIO_CTX_MIN, _AUDIO_CTX_MAX]. Without this a short
    # clip is run against the full window and inference is much slower;
    # the FUTO ACFT models (tiny/base/small + .en) were trained to keep
    # quality at short audio_ctx.
    audio_ctx = math.ceil(seconds * _AUDIO_CTX_PER_SECOND)
    audio_ctx = max(_AUDIO_CTX_MIN, audio_ctx)
    audio_ctx = min(_AUDIO_CTX_MAX, audio_ctx)

    options = {"language": language, "audio_ctx": audio_ctx}

    initial_prompt = self.compose_prompt(self._prompt)
    if initial_prompt:
        options["initial_prompt"] = initial_prompt

    # Streaming-only safety net: cap the token budget so the decoder
    # cannot loop on short silence-split chunks. Not applied in batch,
    # where it could clip a single longer utterance.
    if self.pseudo_streaming:
        options["max_tokens"] = max(12, int(seconds * 12))

    pieces = self.model.transcribe(samples, **options)
    transcript = "".join(piece.text for piece in pieces)

    if not self.pseudo_streaming:
        transcript = transcript.strip()
    else:
        # Inline pass first: removes bracketed noise tokens such as
        # "[door opens][door closes]" or a mid-sentence "(typing)",
        # substituting a space so neighbouring words are not glued
        # together, then collapses the resulting whitespace runs.
        without_noise = _NON_SPEECH_INLINE_RE.sub(" ", transcript)
        transcript = _WHITESPACE_RE.sub(" ", without_noise).strip()
        # Whole-chunk fallback for artifacts the inline pattern misses
        # (e.g. internal punctuation inside the brackets).
        if _NON_SPEECH_WHOLE_RE.match(transcript):
            transcript = ""

    # Phonetic garbage (IPA modifier letters, U+FFFD) is treated as a
    # decode failure in both modes: drop the whole chunk.
    if _PHONETIC_RE.search(transcript):
        transcript = ""

    # Feed the *filtered* text forward as cross-chunk context so that
    # dropped chunks (now "") don't poison the next chunk's prompt.
    self.update_streaming_context(transcript)

    # Trailing space lets pseudo-streaming chunks concatenate cleanly
    # (vosk convention); empty results stay empty.
    return {"text": (transcript + " ") if transcript else transcript}
|
|
127
188
|
|
|
128
189
|
def finalize(self):
|
|
129
190
|
if len(self.session.audio_buffer) == 0:
|