scribe-cli 0.16.0__tar.gz → 0.17.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/PKG-INFO +58 -12
  2. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/README.md +53 -11
  3. scribe_cli-0.17.1/docs/app-tray-menu.png +0 -0
  4. scribe_cli-0.17.1/docs/backends.md +238 -0
  5. scribe_cli-0.17.1/docs/cli.md +137 -0
  6. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/docs/keyboard.md +31 -0
  7. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/docs/tray.md +4 -2
  8. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/pyproject.toml +3 -1
  9. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/_version.py +3 -3
  10. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/app.py +230 -40
  11. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/backends/__init__.py +3 -0
  12. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/backends/groq.py +4 -2
  13. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/backends/openai_api.py +7 -2
  14. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/backends/openai_realtime.py +167 -32
  15. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/backends/vosk.py +6 -4
  16. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/backends/whisper.py +11 -1
  17. scribe_cli-0.17.1/scribe/backends/whisper_futo.py +201 -0
  18. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/menu.py +210 -15
  19. scribe_cli-0.17.1/scribe/models.py +280 -0
  20. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/session.py +10 -1
  21. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe_cli.egg-info/PKG-INFO +58 -12
  22. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe_cli.egg-info/SOURCES.txt +6 -1
  23. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe_cli.egg-info/requires.txt +5 -0
  24. scribe_cli-0.17.1/scripts/bench_whisper_local.py +156 -0
  25. scribe_cli-0.17.1/tests/test_openai_realtime_coalesce.py +221 -0
  26. scribe_cli-0.17.1/tests/test_pseudo_streaming.py +288 -0
  27. scribe_cli-0.17.1/tests/test_whisper_futo.py +245 -0
  28. scribe_cli-0.16.0/docs/app-tray-menu.png +0 -0
  29. scribe_cli-0.16.0/docs/backends.md +0 -122
  30. scribe_cli-0.16.0/docs/cli.md +0 -95
  31. scribe_cli-0.16.0/scribe/models.py +0 -144
  32. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/.github/FUNDING.yml +0 -0
  33. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/.github/workflows/pypi.yml +0 -0
  34. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/.gitignore +0 -0
  35. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/LICENSE +0 -0
  36. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/docs/desktop-install.md +0 -0
  37. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/docs/installation.md +0 -0
  38. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/docs/roadmap-libei.md +0 -0
  39. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/icon.xcf +0 -0
  40. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/__init__.py +0 -0
  41. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/audio.py +0 -0
  42. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/install_desktop.py +0 -0
  43. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/keyboard.py +0 -0
  44. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/models.toml +0 -0
  45. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/saverecording.py +0 -0
  46. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/testpynput.py +0 -0
  47. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/typers/__init__.py +0 -0
  48. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/typers/base.py +0 -0
  49. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/typers/eitype.py +0 -0
  50. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/typers/pynput.py +0 -0
  51. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/typers/wtype.py +0 -0
  52. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/typers/ydotool.py +0 -0
  53. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/util.py +0 -0
  54. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe_cli.egg-info/dependency_links.txt +0 -0
  55. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe_cli.egg-info/entry_points.txt +0 -0
  56. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe_cli.egg-info/top_level.txt +0 -0
  57. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe_data/__init__.py +0 -0
  58. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe_data/share/icon.png +0 -0
  59. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe_data/share/icon_recording.png +0 -0
  60. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe_data/share/icon_writing.png +0 -0
  61. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe_data/templates/scribe.desktop +0 -0
  62. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scripts/test_python_versions_install.sh +0 -0
  63. {scribe_cli-0.16.0 → scribe_cli-0.17.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: scribe-cli
3
- Version: 0.16.0
3
+ Version: 0.17.1
4
4
  Summary: Speech-to-text CLI and system-tray app for dictating into any focused window. Local (vosk, faster-whisper) or cloud (groq, openai) backends, batch or streaming.
5
5
  Author-email: Mahé Perrette <mahe.perrette@gmail.com>
6
6
  License: MIT License
@@ -50,11 +50,14 @@ Requires-Dist: requests
50
50
  Requires-Dist: pyperclip
51
51
  Requires-Dist: unidecode
52
52
  Requires-Dist: termcolor
53
+ Requires-Dist: platformdirs
53
54
  Requires-Dist: desktop-ai-core>=0.2.0
54
55
  Provides-Extra: keyboard
55
56
  Requires-Dist: pynput; extra == "keyboard"
56
57
  Provides-Extra: whisper
57
58
  Requires-Dist: faster-whisper; extra == "whisper"
59
+ Provides-Extra: whisper-futo
60
+ Requires-Dist: pywhispercpp; extra == "whisper-futo"
58
61
  Provides-Extra: vosk
59
62
  Requires-Dist: vosk; extra == "vosk"
60
63
  Provides-Extra: app
@@ -69,6 +72,7 @@ Requires-Dist: soundfile; extra == "groq"
69
72
  Provides-Extra: all
70
73
  Requires-Dist: pynput; extra == "all"
71
74
  Requires-Dist: faster-whisper; extra == "all"
75
+ Requires-Dist: pywhispercpp; extra == "all"
72
76
  Requires-Dist: openai<3,>=2.37.0; extra == "all"
73
77
  Requires-Dist: soundfile; extra == "all"
74
78
  Requires-Dist: vosk; extra == "all"
@@ -98,29 +102,60 @@ cloud-based APIs, batch and streaming workflows.
98
102
  - Cross-platform: tested on Ubuntu (X11 and Wayland), macOS, Windows;
99
103
  works under Termux for clipboard / terminal output.
100
104
 
101
- ## Getting started
105
+ ## Install
102
106
 
103
107
  ```bash
104
108
  sudo apt-get install portaudio19-dev xclip # Ubuntu; macOS: brew install portaudio
105
109
  pip install scribe-cli[all]
106
110
  export GROQ_API_KEY=YOURAPIKEY # or OPENAI_API_KEY, or skip and run local
111
+ ```
112
+
113
+ See documentation below for setting up keyboard input on Ubuntu Wayland.
114
+
115
+
116
+ ## Usage
117
+
118
+ In a terminal:
119
+
120
+ ```bash
107
121
  scribe
108
122
  ```
109
123
 
110
- Scribe picks the first backend whose key / dependency is present, in
111
- order **`groq` → `openai` → `whisper` → `vosk`**, and launches the
112
- tray icon. Press Record, speak, press Stop.
124
+ This launches the system tray icon. Press Record, speak, press Stop —
125
+ the transcription lands in the focused window. Scribe picks the first
126
+ backend whose key / dependency is present, in order **`groq` →
127
+ `openai` → `whisper-futo` → `whisper` → `vosk`**, so with `GROQ_API_KEY` set the
128
+ command above is equivalent to:
113
129
 
114
- See documentation below for setting up keyboard input on Ubuntu Wayland.
130
+ ```bash
131
+ scribe --backend groq --model whisper-large-v3-turbo
132
+ ```
115
133
 
116
- ### Getting an API key
134
+ <img src=https://raw.githubusercontent.com/perrette/scribe/main/docs/app-tray-menu.png width=300px>
117
135
 
118
- Groq is a good cloud backend to start with — very fast, quite accurate, and the
119
- **free tier** is generous enough for everyday dictation. Sign up at
120
- [console.groq.com](https://console.groq.com/), create an API key
121
- under **Settings → API Keys**, and export it as `GROQ_API_KEY`.
136
+ You can override the defaults or drop the tray entirely:
137
+
138
+ ```bash
139
+ scribe --backend openai --model gpt-4o-mini-transcribe # OpenAI sweet spot
140
+ scribe --backend openai --model gpt-realtime-whisper # OpenAI streaming
141
+ scribe --backend whisper --model small # local, no API key
142
+ scribe --frontend terminal # interactive TUI menu
143
+ scribe --frontend terminal --no-interactive # record immediately, no menu
144
+ scribe --mode clipboard # copy to clipboard, no keystroke
145
+ scribe --mode terminal # only print to stdout
146
+ scribe -o transcript.txt # also append to a file
147
+ ```
148
+
149
+ With `--no-interactive` (terminal frontend only), scribe skips the
150
+ interactive menu and starts recording right away — handy for scripted,
151
+ one-shot transcriptions. `--no-prompt` is kept as a deprecated alias.
152
+
153
+ Bias the recogniser toward names, jargon, or a domain glossary with
154
+ `--prompt "free text hint"` and `--words word1 word2 ...` (each also
155
+ accepts a `--prompt-file` / `--words-file` companion). See
156
+ [docs/backends.md › Vocabulary biasing](docs/backends.md#vocabulary-biasing)
157
+ for what each backend does with them.
122
158
 
123
- I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe` as it is also fast and perhaps more accurate for my accent-tainted English.
124
159
 
125
160
  ## Backends at a glance
126
161
 
@@ -135,6 +170,17 @@ Whether a transcription appears live as you speak or all at once when
135
170
  you stop depends on the **model** picked — see
136
171
  [docs/backends.md](docs/backends.md).
137
172
 
173
+
174
+ ### Getting an API key
175
+
176
+ Groq is a good cloud backend to start with — very fast, quite accurate, and the
177
+ **free tier** is generous enough for everyday dictation. Sign up at
178
+ [console.groq.com](https://console.groq.com/), create an API key
179
+ under **Settings → API Keys**, and export it as `GROQ_API_KEY`.
180
+
181
+ I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe` as it is also fast and perhaps more accurate for my accent-tainted English.
182
+
183
+
138
184
  ## Documentation
139
185
 
140
186
  - [Installation & dependencies](docs/installation.md) — PortAudio,
@@ -21,29 +21,60 @@ cloud-based APIs, batch and streaming workflows.
21
21
  - Cross-platform: tested on Ubuntu (X11 and Wayland), macOS, Windows;
22
22
  works under Termux for clipboard / terminal output.
23
23
 
24
- ## Getting started
24
+ ## Install
25
25
 
26
26
  ```bash
27
27
  sudo apt-get install portaudio19-dev xclip # Ubuntu; macOS: brew install portaudio
28
28
  pip install scribe-cli[all]
29
29
  export GROQ_API_KEY=YOURAPIKEY # or OPENAI_API_KEY, or skip and run local
30
+ ```
31
+
32
+ See documentation below for setting up keyboard input on Ubuntu Wayland.
33
+
34
+
35
+ ## Usage
36
+
37
+ In a terminal:
38
+
39
+ ```bash
30
40
  scribe
31
41
  ```
32
42
 
33
- Scribe picks the first backend whose key / dependency is present, in
34
- order **`groq` → `openai` → `whisper` → `vosk`**, and launches the
35
- tray icon. Press Record, speak, press Stop.
43
+ This launches the system tray icon. Press Record, speak, press Stop —
44
+ the transcription lands in the focused window. Scribe picks the first
45
+ backend whose key / dependency is present, in order **`groq` →
46
+ `openai` → `whisper-futo` → `whisper` → `vosk`**, so with `GROQ_API_KEY` set the
47
+ command above is equivalent to:
36
48
 
37
- See documentation below for setting up keyboard input on Ubuntu Wayland.
49
+ ```bash
50
+ scribe --backend groq --model whisper-large-v3-turbo
51
+ ```
38
52
 
39
- ### Getting an API key
53
+ <img src=https://raw.githubusercontent.com/perrette/scribe/main/docs/app-tray-menu.png width=300px>
40
54
 
41
- Groq is a good cloud backend to start with — very fast, quite accurate, and the
42
- **free tier** is generous enough for everyday dictation. Sign up at
43
- [console.groq.com](https://console.groq.com/), create an API key
44
- under **Settings → API Keys**, and export it as `GROQ_API_KEY`.
55
+ You can override the defaults or drop the tray entirely:
56
+
57
+ ```bash
58
+ scribe --backend openai --model gpt-4o-mini-transcribe # OpenAI sweet spot
59
+ scribe --backend openai --model gpt-realtime-whisper # OpenAI streaming
60
+ scribe --backend whisper --model small # local, no API key
61
+ scribe --frontend terminal # interactive TUI menu
62
+ scribe --frontend terminal --no-interactive # record immediately, no menu
63
+ scribe --mode clipboard # copy to clipboard, no keystroke
64
+ scribe --mode terminal # only print to stdout
65
+ scribe -o transcript.txt # also append to a file
66
+ ```
67
+
68
+ With `--no-interactive` (terminal frontend only), scribe skips the
69
+ interactive menu and starts recording right away — handy for scripted,
70
+ one-shot transcriptions. `--no-prompt` is kept as a deprecated alias.
71
+
72
+ Bias the recogniser toward names, jargon, or a domain glossary with
73
+ `--prompt "free text hint"` and `--words word1 word2 ...` (each also
74
+ accepts a `--prompt-file` / `--words-file` companion). See
75
+ [docs/backends.md › Vocabulary biasing](docs/backends.md#vocabulary-biasing)
76
+ for what each backend does with them.
45
77
 
46
- I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe` as it is also fast and perhaps more accurate for my accent-tainted English.
47
78
 
48
79
  ## Backends at a glance
49
80
 
@@ -58,6 +89,17 @@ Whether a transcription appears live as you speak or all at once when
58
89
  you stop depends on the **model** picked — see
59
90
  [docs/backends.md](docs/backends.md).
60
91
 
92
+
93
+ ### Getting an API key
94
+
95
+ Groq is a good cloud backend to start with — very fast, quite accurate, and the
96
+ **free tier** is generous enough for everyday dictation. Sign up at
97
+ [console.groq.com](https://console.groq.com/), create an API key
98
+ under **Settings → API Keys**, and export it as `GROQ_API_KEY`.
99
+
100
+ I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe` as it is also fast and perhaps more accurate for my accent-tainted English.
101
+
102
+
61
103
  ## Documentation
62
104
 
63
105
  - [Installation & dependencies](docs/installation.md) — PortAudio,
Binary file
@@ -0,0 +1,238 @@
1
+ # Backends in detail
2
+
3
+ Scribe ships five speech-to-text backends. They are all picked through
4
+ the same `--backend` / `--model` CLI flags (or the **Model** submenu in
5
+ the tray / terminal frontend). Whether a transcription is *streaming*
6
+ (text appears live as you speak) or *batch* (text arrives at end of
7
+ recording) depends on the **model** chosen — not the backend.
8
+
9
+ ## At a glance
10
+
11
+ | Backend | `--backend` | Default model | Streaming model(s) | Requires |
12
+ |------------------------|-----------------|----------------------------|---------------------------|-----------------------------------------|
13
+ | Groq (cloud) | `groq` | `whisper-large-v3-turbo` | — | `GROQ_API_KEY` |
14
+ | OpenAI (cloud) | `openai` | `gpt-4o-mini-transcribe` | `gpt-realtime-whisper` | `OPENAI_API_KEY` |
15
+ | Whisper FUTO (local) | `whisper-futo` | `small` | — | `pip install scribe-cli[whisper-futo]` |
16
+ | Whisper (local) | `whisper` | `small` | — | `pip install scribe-cli[whisper]` |
17
+ | Vosk (local) | `vosk` | language-dependent | all Vosk models | `pip install scribe-cli[vosk]` |
18
+
19
+ Run `scribe` without arguments and it picks the first backend whose
20
+ dependency / API key is present, preferring cloud over local and the
21
+ faster local option first:
22
+ `groq → openai → whisper-futo → whisper → vosk`.
23
+
24
+ ## `whisper-futo` (local, fast on short dictations)
25
+
26
+ Runs locally via [whisper.cpp](https://github.com/ggml-org/whisper.cpp)
27
+ (through [`pywhispercpp`](https://github.com/absadiki/pywhispercpp))
28
+ using [FUTO's ACFT-finetuned models](https://github.com/futo-org/whisper-acft).
29
+ ACFT (Audio Context Fine-Tuning) lets the encoder run on the actual
30
+ audio length instead of always padding to 30 s — a meaningful speedup
31
+ on short dictations, which is the typical scribe workload.
32
+
33
+ The available models offered in the tray menu are
34
+ `tiny / base / small`. FUTO has not released ACFT weights for
35
+ `medium / large / turbo`; for those sizes use the `whisper` backend.
36
+
37
+ With `--language en` (or `-l en`) scribe auto-substitutes the
38
+ English-only variant (e.g. `small` → `small.en`) when it exists.
39
+
40
+ Models are auto-downloaded on first use from `voiceinput.futo.org`
41
+ to `$XDG_CACHE_HOME/whisper-futo/` (override with
42
+ `--download-folder-whisper-futo`).
43
+
44
+ For audio ≥ 30 s the ACFT speedup tapers off and the encoder window
45
+ collapses to the standard 30 s; quality and speed in that regime are
46
+ similar to the `whisper` backend. Pick `whisper-futo` if most of your
47
+ dictations are short, the `whisper` backend if you regularly do
48
+ multi-minute recordings or need `medium` / `large` / `turbo`.
49
+
50
+ ## `whisper` (local)
51
+
52
+ Runs locally via
53
+ [`faster-whisper`](https://github.com/SYSTRAN/faster-whisper) and
54
+ defaults to the `small` model. Excellent at full-utterance
55
+ transcription in
56
+ [many languages](https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages),
57
+ but it does not stream — text appears at end-of-recording — and
58
+ execution time depends on model size and hardware.
59
+
60
+ The available models offered in the tray menu are
61
+ `tiny / base / small / medium / large-v3 / large-v3-turbo`. Larger
62
+ models trade speed for accuracy.
63
+
64
+ With `--language en` (or `-l en`) scribe auto-substitutes the
65
+ English-only variant (e.g. `small` → `small.en`) when it exists.
66
+
67
+ ## `vosk` (local, streaming)
68
+
69
+ Vosk transcribes in real time and is very good at one language at a
70
+ time, but tends to make more mistakes than Whisper and does not produce
71
+ punctuation. It becomes really useful in longer, interactive sessions
72
+ where the live "appears as you speak" UX matters — see
73
+ [keyboard.md](keyboard.md) for how the keystroke mode interacts with
74
+ streaming models.
75
+
76
+ There are many [Vosk models](https://alphacephei.com/vosk/models)
77
+ available; a handful are pre-mapped to common languages (`en`, `fr`,
78
+ `de`, `it`) in
79
+ [`scribe/models.toml`](../scribe/models.toml). Pick one with
80
+ `-l <lang>` or browse the full list interactively from the menu.
81
+
82
+ ## `openai` (OpenAI cloud)
83
+
84
+ The OpenAI backend supports three models:
85
+
86
+ - `gpt-4o-mini-transcribe` *(default)* — fast, low-cost batch
87
+ transcription.
88
+ - `gpt-4o-transcribe` — higher-quality batch transcription.
89
+ - `gpt-realtime-whisper` *(streaming)* — partial transcripts arrive
90
+ as you speak. Same UX as Vosk but using OpenAI's cloud model.
91
+
92
+ All three share the same `OPENAI_API_KEY` and the `[openai]` extra; no
93
+ additional dependencies are needed. Set the key once:
94
+
95
+ ```bash
96
+ export OPENAI_API_KEY=YOURAPIKEY
97
+ scribe --backend openai # default: gpt-4o-mini-transcribe
98
+ scribe --model gpt-4o-transcribe # batch, higher quality
99
+ scribe --model gpt-realtime-whisper # streaming
100
+ ```
101
+
102
+ `--model` alone auto-routes to the `openai` backend for any of the
103
+ three models above, so `--backend openai` is optional.
104
+
105
+ ### `--realtime-delay` (gpt-realtime-whisper only)
106
+
107
+ The streaming model has a latency-vs-accuracy knob exposed as
108
+ `--realtime-delay {minimal,low,medium,high,xhigh}` (default `medium`).
109
+ Lower values emit partial transcripts sooner — at the cost of more
110
+ revisions arriving in the focused window. Higher values batch tokens
111
+ into longer chunks so what gets pasted is more stable.
112
+
113
+ See OpenAI's
114
+ [gpt-realtime-whisper model card](https://developers.openai.com/api/docs/models/gpt-realtime-whisper)
115
+ for the full picture.
116
+
117
+ ## `groq` (Groq cloud)
118
+
119
+ Talks to Groq's OpenAI-compatible API and defaults to
120
+ `whisper-large-v3-turbo`. Typically the fastest cloud option for
121
+ full-utterance transcription:
122
+
123
+ ```bash
124
+ export GROQ_API_KEY=YOURAPIKEY
125
+ scribe --backend groq
126
+ ```
127
+
128
+ The `groq` backend reuses the `openai` Python client under the hood, so
129
+ installing `[openai]` is enough for both.
130
+
131
+ ## Stopping a recording
132
+
133
+ For batch models (Whisper local, Whisper-via-API, Groq, `gpt-4o-*`) the
134
+ recording continues until you stop it manually (Stop in the tray,
135
+ Ctrl+C in the terminal) or the 2-minute `--duration` cap is reached —
136
+ the transcription happens once, when the recording ends.
137
+
138
+ Streaming models (Vosk, `gpt-realtime-whisper`) emit partials as you
139
+ speak and stop on the same Stop / Ctrl+C action.
140
+
141
+ ## Vocabulary biasing
142
+
143
+ `--prompt TEXT` and `--words W [W ...]` (plus the `--prompt-file` /
144
+ `--words-file` companions) bias the recogniser toward a particular
145
+ style, domain, or word list. The concept is generic across the
146
+ whisper-family backends but each backend exposes it slightly
147
+ differently:
148
+
149
+ | Backend | `--prompt` | `--words` |
150
+ |--------------------------------------|-------------------------------|--------------------------------------------------------|
151
+ | `whisper` (faster-whisper, local) | passed as `initial_prompt=` | passed as `hotwords=` — a **dedicated biasing channel** separate from the prompt |
152
+ | `whisper-futo` (pywhispercpp, local) | passed as `initial_prompt=` | joined onto the prompt string (no separate hotwords channel here) |
153
+ | `openai` batch (`gpt-4o*-transcribe`) | passed as `prompt=` | joined onto the prompt string |
154
+ | `groq` (`whisper-large-v3-turbo`) | passed as `prompt=` | joined onto the prompt string |
155
+ | `openai` realtime (`gpt-realtime-whisper`) | *silently ignored* — the model rejects the prompt parameter server-side (HTTP 400 *"The 'prompt' parameter is not supported for this model."*). The kwarg stays accepted for plumbing compatibility but never reaches the API. | same — joined into the (ignored) prompt |
156
+ | `vosk` | *ignored* (no soft prompt) | *ignored* (Vosk only supports a hard `grammar` allowlist; not yet exposed) |
157
+
158
+ The whisper-family APIs cap the prompt around ~224 tokens; longer
159
+ hints are silently truncated. Faster-whisper's `hotwords` channel is
160
+ the one place a separate "dictionary" really exists — everywhere else
161
+ `--words` is just a convenience to keep your word list out of the
162
+ prompt string in the CLI.
163
+
164
+ Both flags read from the corresponding `*-file` argument when present.
165
+ Inline + file inputs are combined.
166
+
167
+ ```bash
168
+ # Inline
169
+ scribe --prompt "ML systems infra: K8s, etcd, Envoy." \
170
+ --words kubectl envoyproxy etcdctl
171
+
172
+ # From files (handy for long-lived glossaries)
173
+ scribe --prompt-file ~/.config/scribe/prompt.txt \
174
+ --words-file ~/.config/scribe/words.txt
175
+ ```
176
+
177
+ When *no* prompt/words flag is given, scribe also auto-loads
178
+ `prompt.txt` and `words.txt` from the platform user-config dir
179
+ (`~/.config/scribe/` on Linux, `~/Library/Application Support/scribe/`
180
+ on macOS, `%LOCALAPPDATA%\scribe\` on Windows — resolved via
181
+ `platformdirs`) if they exist. To suppress the default for one
182
+ invocation, pass an explicit empty value: `--prompt ""` (or
183
+ `--prompt-file ""`) suppresses the prompt default; `--words` with no
184
+ arguments (or `--words-file ""`) suppresses the words default. Each
185
+ side is independent.
186
+
187
+ ## Pseudo-streaming (experimental)
188
+
189
+ `--pseudo-streaming` makes a batch backend behave streaming-like by
190
+ cutting the running buffer into chunks driven by silence:
191
+
192
+ ```bash
193
+ scribe --pseudo-streaming --streaming-window 5
194
+ ```
195
+
196
+ After `--streaming-window` seconds of buffered audio, scribe cuts at
197
+ the first silence of at least `--silence-duration` and transcribes the
198
+ chunk; if no silence arrives by `2 × --streaming-window`, it
199
+ force-cuts. The session continues until you stop it. Default `5` s
200
+ trades a little Whisper context for snappier "text appears as you
201
+ speak" UX; raise it (10–30 s) if accuracy on long sentences matters
202
+ more than latency.
203
+
204
+ This is experimental and off by default. The tray menu surfaces the
205
+ same toggle under Options ▶ Advanced ▶ Pseudo-streaming.
206
+
207
+ ### Cross-chunk prompt context
208
+
209
+ In pseudo-streaming mode scribe automatically augments each chunk's
210
+ prompt with the trailing ~200 characters of the *previous* chunk's
211
+ transcription. This rolling tail is concatenated onto whatever static
212
+ `--prompt` / `--words` you configured and reaches the backend through
213
+ the same channel as the static prompt (the vocabulary biasing table
214
+ above). The motivation is cross-chunk continuity:
215
+
216
+ - **Capitalization drift** — without context, a chunk that starts
217
+ right after a period might come back lowercased.
218
+ - **Article gender (FR/IT/ES/…)** — `"la nouveau"` → `"le nouveau"`
219
+ once the prior chunk has established the noun.
220
+ - **Language lock** — `whisper.cpp` auto-detects language per call;
221
+ feeding the previous chunk's tokens keeps the language stable
222
+ across cuts.
223
+
224
+ Whisper's prompt window is capped at ~224 tokens; 200 chars of French
225
+ sits well under that and leaves room for your static prompt + words
226
+ list.
227
+
228
+ The rolling tail is **dropped** whenever the pause that triggered the
229
+ chunk cut exceeded 1.5 seconds — a long pause is treated as a new
230
+ sentence/idea boundary, where carrying a possibly-bad prior chunk
231
+ forward biases the next one more than it helps. This mirrors
232
+ `whisper.cpp`'s `--keep-context off` default: prior-text conditioning
233
+ can self-reinforce errors (hallucinations, decoder repetition loops)
234
+ more readily than it provides useful continuity, so we cap it at
235
+ natural sentence boundaries.
236
+
237
+ Short pauses (mid-sentence punctuation) keep the context; the start
238
+ of every new recording also clears it.
@@ -0,0 +1,137 @@
1
+ # Fine tuning & CLI reference
2
+
3
+ For a complete, always-current listing run:
4
+
5
+ ```bash
6
+ scribe --help
7
+ ```
8
+
9
+ The flags are grouped to mirror the source-of-truth in
10
+ [`scribe/app.py`](../scribe/app.py).
11
+
12
+ ## Backend
13
+
14
+ | Flag | Purpose |
15
+ |---------------------------------|-------------------------------------------------------------------------|
16
+ | `--backend {vosk,whisper,whisper-futo,openai,groq}` | Speech-recognition backend (prompted if omitted). |
17
+ | `--model NAME` | Model name for the chosen backend. Auto-routes to the right backend for known model names (e.g. `--model gpt-realtime-whisper` selects `openai`). |
18
+ | `-l, --language LANG` | Language alias selecting a preset Vosk model (`en`/`fr`/`de`/`it`), or `en` for English-only Whisper models. |
19
+ | `--download-folder-whisper DIR` | Folder to store Whisper models. |
20
+ | `--download-folder-vosk DIR` | Folder to store Vosk models. |
21
+
22
+ ## Prompting & vocabulary biasing
23
+
24
+ Bias the model toward particular names, jargon, or topics. Two
25
+ complementary knobs:
26
+
27
+ | Flag | Purpose |
28
+ |--------------------------|--------------------------------------------------------------------------------------------------|
29
+ | `--prompt TEXT` | Free-text style / context hint shown to the model. |
30
+ | `--prompt-file PATH` | Reads the prompt from a file; appended to `--prompt` if both are given. |
31
+ | `--words W [W ...]` | List of words to emphasise. Routed to faster-whisper's dedicated `hotwords` channel on the local `whisper` backend; joined onto the prompt on `whisper-futo` and the cloud backends (`openai` batch, `groq`). |
32
+ | `--words-file PATH` | Whitespace-separated words from a file; merged with `--words`. |
33
+
34
+ The whisper-family APIs cap the prompt around ~224 tokens; longer hints
35
+ are silently truncated. Vosk has no soft prompt and ignores both flags.
36
+ See [backends.md › Vocabulary biasing](backends.md#vocabulary-biasing)
37
+ for the per-backend wiring.
38
+
39
+ **Default files.** When none of the four flags above are given, scribe
40
+ also looks for `prompt.txt` and `words.txt` in the platform user-config
41
+ dir and loads them if they exist — handy for a long-lived personal
42
+ glossary. The path is resolved via `platformdirs`:
43
+
44
+ - Linux: `$XDG_CONFIG_HOME/scribe/` (default `~/.config/scribe/`)
45
+ - macOS: `~/Library/Application Support/scribe/`
46
+ - Windows: `%LOCALAPPDATA%\scribe\`
47
+
48
+ To suppress the default on a single invocation, pass an empty value:
49
+ `--prompt ""`, `--prompt-file ""`, or `--words` with no arguments. Each
50
+ flag suppresses only its own side (giving `--prompt ""` still loads
51
+ `words.txt` if present).
52
+
53
+ ## Audio
54
+
55
+ | Flag | Purpose |
56
+ |-----------------------|----------------------------------------------------------|
57
+ | `--input-device N` | Microphone device index (see `python -m sounddevice`). |
58
+
59
+ ## Output
60
+
61
+ | Flag | Purpose |
62
+ |-----------------------------|---------------------------------------------------------------------------------------------|
63
+ | `-m, --mode {keystroke,clipboard,terminal}` | Where transcribed text goes (default `keystroke`). See [keyboard.md](keyboard.md). |
64
+ | `--typer {auto,eitype,pynput,wtype,ydotool}` | Keystroke-injection backend (default `auto`). |
65
+ | `--type-direct` | In keystroke mode, type the transcription as keystrokes instead of synthesising Ctrl+V. |
66
+ | `-o, --output-file FILE` | Also append the transcription to this file. |
67
+
68
+ ## Silence detection (shared)
69
+
70
+ | Flag | Default | Purpose |
71
+ |----------------------------|---------|------------------------------------------------------------------------|
72
+ | `--duration SECS` | `120` | Max recording duration in seconds. |
73
+ | `--silence-db DB` | `-40` | dBFS volume floor for "this frame is silent". Used by every silence-driven behavior. |
74
+ | `--silence-duration SECS` | `0.6` | How long silence must persist before triggering a backend's silence behavior (realtime auto-commit, pseudo-streaming cut). |
75
+
76
+ ## Realtime (`gpt-realtime-whisper`)
77
+
78
+ | Flag | Default | Purpose |
79
+ |---------------------------------------------------|----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
80
+ | `--realtime-delay {minimal,low,medium,high,xhigh}` | `medium` | Trade off latency vs accuracy on `gpt-realtime-whisper`. Lower = faster partials but more paste churn in the focused window. |
81
+ | `--realtime-gate` / `--no-realtime-gate` | on | Drop silent frames (per `--silence-db`) before sending them over the WebSocket so silent audio isn't billed as input tokens. After `--silence-duration` of silence, also commit mid-session so trailing words flush live. |
82
+
83
+ Streaming models (Vosk, `gpt-realtime-whisper`) ignore the batch
84
+ silence-chunking knobs; they have their own end-of-utterance signal.
85
+
86
+ ## Frontend
87
+
88
+ | Flag | Purpose |
89
+ |-----------------------------|----------------------------------------------------------------------|
90
+ | `--frontend {tray,terminal}` | UI to launch (default `tray`). |
91
+ | `--no-interactive` | In terminal mode, skip the interactive menu and record immediately. (`--no-prompt` is kept as a deprecated alias.) |
92
+ | `--vosk-models M [M ...]` | Vosk models offered in the tray menu. |
93
+ | `--whisper-models M [M ...]` | Whisper models offered in the tray menu. |
94
+
95
+ ## Examples
96
+
97
+ Predefine the tray menu's Whisper / Vosk model lists:
98
+
99
+ ```bash
100
+ scribe --vosk-models vosk-model-fr-0.22 \
101
+ --whisper-models small large-v3-turbo
102
+ ```
103
+
104
+ Stream OpenAI realtime transcripts with the most aggressive latency
105
+ setting:
106
+
107
+ ```bash
108
+ scribe --model gpt-realtime-whisper --realtime-delay minimal
109
+ ```
110
+
111
+ Disable the realtime silence gate (e.g. to A/B against a noisy
112
+ environment) — you'll pay for silent audio while the session is open:
113
+
114
+ ```bash
115
+ scribe --model gpt-realtime-whisper --no-realtime-gate
116
+ ```
117
+
118
+ Run scribe headlessly into a file without touching the clipboard or
119
+ focused window:
120
+
121
+ ```bash
122
+ scribe --frontend terminal --no-interactive --mode terminal -o session.txt
123
+ ```
124
+
125
+ Bias the recogniser toward domain jargon (medical terms, proper names):
126
+
127
+ ```bash
128
+ scribe --prompt "Patient notes from a cardiology consult." \
129
+ --words tachycardia bradycardia echocardiogram metoprolol
130
+ ```
131
+
132
+ Or store the lists in files for reuse across sessions:
133
+
134
+ ```bash
135
+ scribe --prompt-file ~/.config/scribe/prompt.txt \
136
+ --words-file ~/.config/scribe/words.txt
137
+ ```
@@ -167,3 +167,34 @@ If `eitype` is unavailable, two older workarounds also work:
167
167
  Roadmap for native libei integration (eventual Python bindings,
168
168
  expanded compositor support) is tracked in
169
169
  [docs/roadmap-libei.md](roadmap-libei.md).
170
+
171
+ ## Realtime backend: delta coalescing
172
+
173
+ The `gpt-realtime-whisper` backend emits one transcription delta per
174
+ word/subword at ~30–80 ms intervals — much faster than the
175
+ `pyperclip.copy()` + Ctrl+V cycle can settle on Wayland (≥100 ms,
176
+ because `wl-copy` is asynchronous). Pasting every delta led to
177
+ clipboard races where successive copies overwrote each other before
178
+ Ctrl+V landed, manifesting as dropped and duplicated words
179
+ (*"fait fait le mot mot time time…"*).
180
+
181
+ In **paste mode** (default keystroke output) scribe therefore
182
+ coalesces deltas: incoming tokens accumulate into a small buffer and
183
+ are flushed only when *either* ~400 ms have elapsed since the last
184
+ flush, *or* the buffer ends on sentence-final punctuation
185
+ (`. ! ? \n`). A 200 ms floor between any two flushes prevents
186
+ back-to-back punctuation flushes from racing each other through the
187
+ clipboard.
188
+
189
+ With **`--type-direct`** the coalescing is bypassed entirely — each
190
+ delta is typed synchronously as raw keystrokes by the chosen typer
191
+ (uinput / xtest / portal libei), so no clipboard is involved and there
192
+ is no race to avoid. The UX is also snappier: tokens appear one at a time rather
193
+ than in ~400 ms-cadenced bursts.
194
+
195
+ macOS and Windows clipboards are synchronous, so the race that
196
+ motivates coalescing is essentially a Wayland artefact; scribe still
197
+ coalesces in paste mode there for consistency, but it's harmless.
198
+ This whole behaviour is realtime-specific — Vosk's per-phrase commits
199
+ already arrive at a sane cadence, and the pseudo-streaming backends
200
+ emit one chunk per silence cut (already coarse enough).
@@ -58,8 +58,10 @@ Options ▶
58
58
  Keyboard backend ▶ eitype / pynput / ydotool / wtype
59
59
  (rows incompatible with this OS are hidden;
60
60
  submenu hidden entirely when ≤ 1 row left)
61
- Advanced ▶ auto-restart after silence, duration,
62
- silence threshold, output file
61
+ Advanced ▶ silence duration, silence threshold,
62
+ realtime gate, pseudo-streaming
63
+ [experimental], streaming window
64
+ [experimental], output file
63
65
  Quit
64
66
  ```
65
67
 
@@ -20,6 +20,7 @@ dependencies = [
20
20
  "pyperclip",
21
21
  "unidecode",
22
22
  "termcolor",
23
+ "platformdirs",
23
24
  "desktop-ai-core>=0.2.0",
24
25
  ]
25
26
 
@@ -61,11 +62,12 @@ keywords = [
61
62
  [project.optional-dependencies]
62
63
  keyboard = ["pynput"]
63
64
  whisper = ["faster-whisper"]
65
+ whisper-futo = ["pywhispercpp"]
64
66
  vosk = ["vosk"]
65
67
  app = ["pystray", "PyGObject"]
66
68
  openai = ["openai>=2.37.0,<3", "soundfile"]
67
69
  groq = ["openai>=2.37.0,<3", "soundfile"]
68
- all = ["pynput", "faster-whisper", "openai>=2.37.0,<3", "soundfile", "vosk", "pystray"]
70
+ all = ["pynput", "faster-whisper", "pywhispercpp", "openai>=2.37.0,<3", "soundfile", "vosk", "pystray"]
69
71
 
70
72
 
71
73
  [tool.setuptools]