scribe-cli 0.17.1__tar.gz → 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/.gitignore +1 -0
  2. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/PKG-INFO +60 -22
  3. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/README.md +35 -20
  4. scribe_cli-1.0.0/docs/app-tray-menu.png +0 -0
  5. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/docs/backends.md +126 -41
  6. scribe_cli-1.0.0/docs/cli.md +207 -0
  7. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/docs/desktop-install.md +1 -1
  8. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/docs/installation.md +1 -1
  9. scribe_cli-0.17.1/docs/keyboard.md → scribe_cli-1.0.0/docs/output.md +98 -36
  10. scribe_cli-1.0.0/docs/tray.md +127 -0
  11. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/pyproject.toml +38 -6
  12. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/_version.py +3 -3
  13. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/app.py +362 -164
  14. scribe_cli-1.0.0/scribe/audio.py +379 -0
  15. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/backends/groq.py +4 -3
  16. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/backends/openai_api.py +11 -3
  17. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/backends/openai_realtime.py +87 -10
  18. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/backends/vosk.py +20 -4
  19. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/backends/whisper.py +12 -3
  20. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/backends/whisper_futo.py +10 -3
  21. scribe_cli-1.0.0/scribe/dialog.py +56 -0
  22. scribe_cli-1.0.0/scribe/menu.py +1554 -0
  23. scribe_cli-1.0.0/scribe/models.py +403 -0
  24. scribe_cli-1.0.0/scribe/output.py +237 -0
  25. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/session.py +29 -4
  26. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe_cli.egg-info/PKG-INFO +60 -22
  27. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe_cli.egg-info/SOURCES.txt +8 -1
  28. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe_cli.egg-info/requires.txt +3 -0
  29. scribe_cli-1.0.0/scribe_data/silero_vad.LICENSE +21 -0
  30. scribe_cli-1.0.0/scribe_data/silero_vad.onnx +0 -0
  31. scribe_cli-1.0.0/tests/test_backend_matrix.py +295 -0
  32. scribe_cli-1.0.0/tests/test_output.py +165 -0
  33. scribe_cli-1.0.0/tests/test_output_file_picker.py +57 -0
  34. scribe_cli-1.0.0/tests/test_pseudo_streaming.py +490 -0
  35. scribe_cli-0.17.1/docs/app-tray-menu.png +0 -0
  36. scribe_cli-0.17.1/docs/cli.md +0 -137
  37. scribe_cli-0.17.1/docs/tray.md +0 -92
  38. scribe_cli-0.17.1/scribe/audio.py +0 -76
  39. scribe_cli-0.17.1/scribe/menu.py +0 -960
  40. scribe_cli-0.17.1/scribe/models.py +0 -280
  41. scribe_cli-0.17.1/tests/test_pseudo_streaming.py +0 -288
  42. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/.github/FUNDING.yml +0 -0
  43. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/.github/workflows/pypi.yml +0 -0
  44. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/LICENSE +0 -0
  45. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/docs/roadmap-libei.md +0 -0
  46. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/icon.xcf +0 -0
  47. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/__init__.py +0 -0
  48. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/backends/__init__.py +0 -0
  49. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/install_desktop.py +0 -0
  50. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/keyboard.py +0 -0
  51. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/models.toml +0 -0
  52. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/saverecording.py +0 -0
  53. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/testpynput.py +0 -0
  54. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/typers/__init__.py +0 -0
  55. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/typers/base.py +0 -0
  56. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/typers/eitype.py +0 -0
  57. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/typers/pynput.py +0 -0
  58. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/typers/wtype.py +0 -0
  59. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/typers/ydotool.py +0 -0
  60. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/util.py +0 -0
  61. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe_cli.egg-info/dependency_links.txt +0 -0
  62. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe_cli.egg-info/entry_points.txt +0 -0
  63. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe_cli.egg-info/top_level.txt +0 -0
  64. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe_data/__init__.py +0 -0
  65. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe_data/share/icon.png +0 -0
  66. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe_data/share/icon_recording.png +0 -0
  67. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe_data/share/icon_writing.png +0 -0
  68. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe_data/templates/scribe.desktop +0 -0
  69. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scripts/bench_whisper_local.py +0 -0
  70. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scripts/test_python_versions_install.sh +0 -0
  71. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/setup.cfg +0 -0
  72. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/tests/test_openai_realtime_coalesce.py +0 -0
  73. {scribe_cli-0.17.1 → scribe_cli-1.0.0}/tests/test_whisper_futo.py +0 -0
@@ -7,3 +7,4 @@ scribe/_version.py
7
7
 
8
8
  # Autonomous roadmap workflows (local coordination artifacts; never committed)
9
9
  workflows/
10
+ .worktrees/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: scribe-cli
3
- Version: 0.17.1
3
+ Version: 1.0.0
4
4
  Summary: Speech-to-text CLI and system-tray app for dictating into any focused window. Local (vosk, faster-whisper) or cloud (groq, openai) backends, batch or streaming.
5
5
  Author-email: Mahé Perrette <mahe.perrette@gmail.com>
6
6
  License: MIT License
@@ -33,13 +33,34 @@ License: MIT License
33
33
  licenses of all dependencies before using or distributing this software to
34
34
  ensure compliance with their respective terms.
35
35
  Project-URL: Homepage, https://github.com/perrette/scribe
36
- Keywords: speech-to-text,speech recognition,transcription,dictation,voice-typing,voice-to-text,realtime,streaming,language,AI,local,API,cli,tray,vosk,whisper,openai,groq,gpt-4o,linux,wayland,keyboard,clipboard
36
+ Project-URL: Source, https://github.com/perrette/scribe
37
+ Project-URL: Issues, https://github.com/perrette/scribe/issues
38
+ Project-URL: Changelog, https://github.com/perrette/scribe/releases
39
+ Project-URL: Funding, https://github.com/sponsors/perrette
40
+ Keywords: speech-to-text,stt,transcription,dictation,voice-typing,voice-recognition,multilingual,realtime,streaming,cli,tray,vosk,whisper,faster-whisper,openai,groq,gpt-4o,linux,wayland,keyboard,clipboard,microphone,audio
41
+ Classifier: Development Status :: 5 - Production/Stable
42
+ Classifier: Intended Audience :: End Users/Desktop
43
+ Classifier: Intended Audience :: Developers
44
+ Classifier: License :: OSI Approved :: MIT License
37
45
  Classifier: Programming Language :: Python :: 3.9
38
46
  Classifier: Programming Language :: Python :: 3.10
39
47
  Classifier: Programming Language :: Python :: 3.11
40
48
  Classifier: Programming Language :: Python :: 3.12
41
49
  Classifier: Programming Language :: Python :: 3.13
42
50
  Classifier: Operating System :: OS Independent
51
+ Classifier: Environment :: Console
52
+ Classifier: Environment :: X11 Applications
53
+ Classifier: Environment :: MacOS X
54
+ Classifier: Environment :: Win32 (MS Windows)
55
+ Classifier: Natural Language :: English
56
+ Classifier: Natural Language :: French
57
+ Classifier: Natural Language :: German
58
+ Classifier: Natural Language :: Italian
59
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
60
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
61
+ Classifier: Topic :: Office/Business
62
+ Classifier: Topic :: Text Processing :: Linguistic
63
+ Classifier: Topic :: Utilities
43
64
  Requires-Python: >=3.9
44
65
  Description-Content-Type: text/markdown
45
66
  License-File: LICENSE
@@ -52,6 +73,7 @@ Requires-Dist: unidecode
52
73
  Requires-Dist: termcolor
53
74
  Requires-Dist: platformdirs
54
75
  Requires-Dist: desktop-ai-core>=0.2.0
76
+ Requires-Dist: onnxruntime
55
77
  Provides-Extra: keyboard
56
78
  Requires-Dist: pynput; extra == "keyboard"
57
79
  Provides-Extra: whisper
@@ -69,6 +91,7 @@ Requires-Dist: soundfile; extra == "openai"
69
91
  Provides-Extra: groq
70
92
  Requires-Dist: openai<3,>=2.37.0; extra == "groq"
71
93
  Requires-Dist: soundfile; extra == "groq"
94
+ Provides-Extra: vad
72
95
  Provides-Extra: all
73
96
  Requires-Dist: pynput; extra == "all"
74
97
  Requires-Dist: faster-whisper; extra == "all"
@@ -90,11 +113,13 @@ cloud-based APIs, batch and streaming workflows.
90
113
 
91
114
  ## What it does
92
115
 
93
- - Records from your mic and transcribes via one of four backends —
94
- **Vosk** (local, streaming), **Whisper** (local, batch), **OpenAI**
95
- (cloud, batch *or* streaming), **Groq** (cloud, batch).
96
- - Delivers the transcript three ways: paste into the focused window
97
- (default), copy to clipboard, or print to the terminal.
116
+ - Records from your mic and transcribes via one of five backends —
117
+ **Vosk** (local, streaming), **Whisper** (local, batch),
118
+ **Whisper FUTO** (local, batch, ACFT-tuned for short dictations),
119
+ **OpenAI** (cloud, batch *or* streaming), **Groq** (cloud, batch).
120
+ - Delivers the transcript four ways: paste into the focused window
121
+ (default), copy to clipboard, print to the terminal, or write to
122
+ a file.
98
123
  - Runs as a **system tray icon** with a single Record button, or as an
99
124
  interactive **terminal TUI** — same menu in both.
100
125
  - Hooks into your DE's keyboard shortcuts via `SIGUSR1` (toggle
@@ -124,8 +149,8 @@ scribe
124
149
  This launches the system tray icon. Press Record, speak, press Stop —
125
150
  the transcription lands in the focused window. Scribe picks the first
126
151
  backend whose key / dependency is present, in order **`groq` →
127
- `openai` → `whisper` → `vosk`**, so with `GROQ_API_KEY` set the
128
- command above is equivalent to:
152
+ `openai` → `whisper-futo` → `whisper` → `vosk`**, so with `GROQ_API_KEY`
153
+ set the command above is equivalent to:
129
154
 
130
155
  ```bash
131
156
  scribe --backend groq --model whisper-large-v3-turbo
@@ -140,15 +165,17 @@ scribe --backend openai --model gpt-4o-mini-transcribe # OpenAI sweet spot
140
165
  scribe --backend openai --model gpt-realtime-whisper # OpenAI streaming
141
166
  scribe --backend whisper --model small # local, no API key
142
167
  scribe --frontend terminal # interactive TUI menu
143
- scribe --frontend terminal --no-interactive # record immediately, no menu
168
+ scribe --record # start recording immediately on launch (works in tray or terminal)
169
+ scribe --record --frontend terminal --mode file # one-shot batched dictation → file
170
+ scribe --record --frontend terminal --mode file --stream # streamed: chunks appended live as you speak
144
171
  scribe --mode clipboard # copy to clipboard, no keystroke
145
172
  scribe --mode terminal # only print to stdout
146
- scribe -o transcript.txt # also append to a file
173
+ scribe --mode file -o transcript.txt # append to a file (no keystroke / clipboard)
147
174
  ```
148
175
 
149
176
  With `--no-interactive` (terminal frontend only), scribe skips the
150
177
  interactive menu and starts recording right away — handy for scripted,
151
- one-shot transcriptions. `--no-prompt` is kept as a deprecated alias.
178
+ one-shot transcriptions.
152
179
 
153
180
  Bias the recogniser toward names, jargon, or a domain glossary with
154
181
  `--prompt "free text hint"` and `--words word1 word2 ...` (each also
@@ -159,12 +186,13 @@ for what each backend does with them.
159
186
 
160
187
  ## Backends at a glance
161
188
 
162
- | Backend | `--backend` | Default model | Streaming model(s) | Requires |
163
- |-----------------|-------------|----------------------------|---------------------------|-------------------------------------|
164
- | Groq (cloud) | `groq` | `whisper-large-v3-turbo` | — | `GROQ_API_KEY` |
165
- | OpenAI (cloud) | `openai` | `gpt-4o-mini-transcribe` | `gpt-realtime-whisper` | `OPENAI_API_KEY` |
166
- | Whisper (local) | `whisper` | `small` | — | `pip install scribe-cli[whisper]` |
167
- | Vosk (local) | `vosk` | language-dependent | all Vosk models | `pip install scribe-cli[vosk]` |
189
+ | Backend | `--backend` | Default model | Streaming model(s) | Requires |
190
+ |----------------------|-----------------|----------------------------|---------------------------|----------------------------------------|
191
+ | Groq (cloud) | `groq` | `whisper-large-v3-turbo` | — | `GROQ_API_KEY` |
192
+ | OpenAI (cloud) | `openai` | `gpt-4o-mini-transcribe` | `gpt-realtime-whisper` | `OPENAI_API_KEY` |
193
+ | Whisper FUTO (local) | `whisper-futo` | `small` | — | `pip install scribe-cli[whisper-futo]` |
194
+ | Whisper (local) | `whisper` | `small` | — | `pip install scribe-cli[whisper]` |
195
+ | Vosk (local) | `vosk` | language-dependent | all Vosk models | `pip install scribe-cli[vosk]` |
168
196
 
169
197
  Whether a transcription appears live as you speak or all at once when
170
198
  you stop depends on the **model** picked — see
@@ -173,8 +201,11 @@ you stop depends on the **model** picked — see
173
201
 
174
202
  ### Getting an API key
175
203
 
176
- Groq is a good cloud backend to start with very fast, quite accurate, and the
177
- **free tier** is generous enough for everyday dictation. Sign up at
204
+ Groq is the **recommended cloud backend by default** — extremely fast
205
+ (by a wide margin compared to other cloud STT options, especially in
206
+ **Stream** mode where the per-chunk roundtrip latency dominates the
207
+ perceived speed), quite accurate, and the **free tier** is generous
208
+ enough for everyday dictation. Sign up at
178
209
  [console.groq.com](https://console.groq.com/), create an API key
179
210
  under **Settings → API Keys**, and export it as `GROQ_API_KEY`.
180
211
 
@@ -187,7 +218,7 @@ I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe`
187
218
  extras, Ubuntu / GNOME tray libs.
188
219
  - [Backends in detail](docs/backends.md) — model lists, when to pick
189
220
  which, the realtime model.
190
- - [Keyboard modes & typer backends](docs/keyboard.md) — keystroke vs
221
+ - [Output modes & typer backends](docs/output.md) — keystroke vs
191
222
  clipboard, Wayland / `eitype`, `--type-direct`.
192
223
  - [System tray & global hotkeys](docs/tray.md) — menu tree, icon
193
224
  states, `SIGUSR1`/`SIGUSR2`.
@@ -196,10 +227,17 @@ I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe`
196
227
  - [Fine tuning & CLI reference](docs/cli.md) — every `scribe --help`
197
228
  flag with examples.
198
229
 
230
+ ## Related projects
231
+
232
+ - **[bard](https://github.com/perrette/bard)** — TTS sibling of scribe,
233
+ same tray/CLI architecture in reverse: highlight text, hear it
234
+ spoken. Shares the [`desktop-ai-core`](https://github.com/perrette/desktop-ai-core)
235
+ backbone (frontends, providers, dialog helpers).
236
+
199
237
  ## Compatibility
200
238
 
201
239
  Initially developed for Python 3 on Ubuntu 24.04 (GNOME + Wayland);
202
240
  works on macOS and Windows too. Wayland keystroke injection is
203
- convoluted but [solved](docs/keyboard.md). For dependencies of
241
+ convoluted but [solved](docs/output.md). For dependencies of
204
242
  individual subsystems, check `pynput` (keyboard) and `pystray` (tray
205
243
  icon).
@@ -9,11 +9,13 @@ cloud-based APIs, batch and streaming workflows.
9
9
 
10
10
  ## What it does
11
11
 
12
- - Records from your mic and transcribes via one of four backends —
13
- **Vosk** (local, streaming), **Whisper** (local, batch), **OpenAI**
14
- (cloud, batch *or* streaming), **Groq** (cloud, batch).
15
- - Delivers the transcript three ways: paste into the focused window
16
- (default), copy to clipboard, or print to the terminal.
12
+ - Records from your mic and transcribes via one of five backends —
13
+ **Vosk** (local, streaming), **Whisper** (local, batch),
14
+ **Whisper FUTO** (local, batch, ACFT-tuned for short dictations),
15
+ **OpenAI** (cloud, batch *or* streaming), **Groq** (cloud, batch).
16
+ - Delivers the transcript four ways: paste into the focused window
17
+ (default), copy to clipboard, print to the terminal, or write to
18
+ a file.
17
19
  - Runs as a **system tray icon** with a single Record button, or as an
18
20
  interactive **terminal TUI** — same menu in both.
19
21
  - Hooks into your DE's keyboard shortcuts via `SIGUSR1` (toggle
@@ -43,8 +45,8 @@ scribe
43
45
  This launches the system tray icon. Press Record, speak, press Stop —
44
46
  the transcription lands in the focused window. Scribe picks the first
45
47
  backend whose key / dependency is present, in order **`groq` →
46
- `openai` → `whisper` → `vosk`**, so with `GROQ_API_KEY` set the
47
- command above is equivalent to:
48
+ `openai` → `whisper-futo` → `whisper` → `vosk`**, so with `GROQ_API_KEY`
49
+ set the command above is equivalent to:
48
50
 
49
51
  ```bash
50
52
  scribe --backend groq --model whisper-large-v3-turbo
@@ -59,15 +61,17 @@ scribe --backend openai --model gpt-4o-mini-transcribe # OpenAI sweet spot
59
61
  scribe --backend openai --model gpt-realtime-whisper # OpenAI streaming
60
62
  scribe --backend whisper --model small # local, no API key
61
63
  scribe --frontend terminal # interactive TUI menu
62
- scribe --frontend terminal --no-interactive # record immediately, no menu
64
+ scribe --record # start recording immediately on launch (works in tray or terminal)
65
+ scribe --record --frontend terminal --mode file # one-shot batched dictation → file
66
+ scribe --record --frontend terminal --mode file --stream # streamed: chunks appended live as you speak
63
67
  scribe --mode clipboard # copy to clipboard, no keystroke
64
68
  scribe --mode terminal # only print to stdout
65
- scribe -o transcript.txt # also append to a file
69
+ scribe --mode file -o transcript.txt # append to a file (no keystroke / clipboard)
66
70
  ```
67
71
 
68
72
  With `--no-interactive` (terminal frontend only), scribe skips the
69
73
  interactive menu and starts recording right away — handy for scripted,
70
- one-shot transcriptions. `--no-prompt` is kept as a deprecated alias.
74
+ one-shot transcriptions.
71
75
 
72
76
  Bias the recogniser toward names, jargon, or a domain glossary with
73
77
  `--prompt "free text hint"` and `--words word1 word2 ...` (each also
@@ -78,12 +82,13 @@ for what each backend does with them.
78
82
 
79
83
  ## Backends at a glance
80
84
 
81
- | Backend | `--backend` | Default model | Streaming model(s) | Requires |
82
- |-----------------|-------------|----------------------------|---------------------------|-------------------------------------|
83
- | Groq (cloud) | `groq` | `whisper-large-v3-turbo` | — | `GROQ_API_KEY` |
84
- | OpenAI (cloud) | `openai` | `gpt-4o-mini-transcribe` | `gpt-realtime-whisper` | `OPENAI_API_KEY` |
85
- | Whisper (local) | `whisper` | `small` | — | `pip install scribe-cli[whisper]` |
86
- | Vosk (local) | `vosk` | language-dependent | all Vosk models | `pip install scribe-cli[vosk]` |
85
+ | Backend | `--backend` | Default model | Streaming model(s) | Requires |
86
+ |----------------------|-----------------|----------------------------|---------------------------|----------------------------------------|
87
+ | Groq (cloud) | `groq` | `whisper-large-v3-turbo` | — | `GROQ_API_KEY` |
88
+ | OpenAI (cloud) | `openai` | `gpt-4o-mini-transcribe` | `gpt-realtime-whisper` | `OPENAI_API_KEY` |
89
+ | Whisper FUTO (local) | `whisper-futo` | `small` | — | `pip install scribe-cli[whisper-futo]` |
90
+ | Whisper (local) | `whisper` | `small` | — | `pip install scribe-cli[whisper]` |
91
+ | Vosk (local) | `vosk` | language-dependent | all Vosk models | `pip install scribe-cli[vosk]` |
87
92
 
88
93
  Whether a transcription appears live as you speak or all at once when
89
94
  you stop depends on the **model** picked — see
@@ -92,8 +97,11 @@ you stop depends on the **model** picked — see
92
97
 
93
98
  ### Getting an API key
94
99
 
95
- Groq is a good cloud backend to start with very fast, quite accurate, and the
96
- **free tier** is generous enough for everyday dictation. Sign up at
100
+ Groq is the **recommended cloud backend by default** — extremely fast
101
+ (by a wide margin compared to other cloud STT options, especially in
102
+ **Stream** mode where the per-chunk roundtrip latency dominates the
103
+ perceived speed), quite accurate, and the **free tier** is generous
104
+ enough for everyday dictation. Sign up at
97
105
  [console.groq.com](https://console.groq.com/), create an API key
98
106
  under **Settings → API Keys**, and export it as `GROQ_API_KEY`.
99
107
 
@@ -106,7 +114,7 @@ I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe`
106
114
  extras, Ubuntu / GNOME tray libs.
107
115
  - [Backends in detail](docs/backends.md) — model lists, when to pick
108
116
  which, the realtime model.
109
- - [Keyboard modes & typer backends](docs/keyboard.md) — keystroke vs
117
+ - [Output modes & typer backends](docs/output.md) — keystroke vs
110
118
  clipboard, Wayland / `eitype`, `--type-direct`.
111
119
  - [System tray & global hotkeys](docs/tray.md) — menu tree, icon
112
120
  states, `SIGUSR1`/`SIGUSR2`.
@@ -115,10 +123,17 @@ I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe`
115
123
  - [Fine tuning & CLI reference](docs/cli.md) — every `scribe --help`
116
124
  flag with examples.
117
125
 
126
+ ## Related projects
127
+
128
+ - **[bard](https://github.com/perrette/bard)** — TTS sibling of scribe,
129
+ same tray/CLI architecture in reverse: highlight text, hear it
130
+ spoken. Shares the [`desktop-ai-core`](https://github.com/perrette/desktop-ai-core)
131
+ backbone (frontends, providers, dialog helpers).
132
+
118
133
  ## Compatibility
119
134
 
120
135
  Initially developed for Python 3 on Ubuntu 24.04 (GNOME + Wayland);
121
136
  works on macOS and Windows too. Wayland keystroke injection is
122
- convoluted but [solved](docs/keyboard.md). For dependencies of
137
+ convoluted but [solved](docs/output.md). For dependencies of
123
138
  individual subsystems, check `pynput` (keyboard) and `pystray` (tray
124
139
  icon).
Binary file
@@ -70,7 +70,7 @@ Vosk transcribes in real time and is very good at one language at a
70
70
  time, but tends to make more mistakes than Whisper and does not produce
71
71
  punctuation. It becomes really useful in longer, interactive sessions
72
72
  where the live "appears as you speak" UX matters — see
73
- [keyboard.md](keyboard.md) for how the keystroke mode interacts with
73
+ [output.md](output.md) for how the keystroke mode interacts with
74
74
  streaming models.
75
75
 
76
76
  There are many [Vosk models](https://alphacephei.com/vosk/models)
@@ -117,12 +117,15 @@ for the full picture.
117
117
  ## `groq` (Groq cloud)
118
118
 
119
119
  Talks to Groq's OpenAI-compatible API and defaults to
120
- `whisper-large-v3-turbo`. Typically the fastest cloud option for
121
- full-utterance transcription:
120
+ `whisper-large-v3-turbo`. **Extremely fast** thanks to Groq's
121
+ inference hardware — the recommended cloud backend by default, and
122
+ the natural pick for `--stream` mode where per-chunk roundtrip
123
+ latency dominates perceived speed:
122
124
 
123
125
  ```bash
124
126
  export GROQ_API_KEY=YOURAPIKEY
125
- scribe --backend groq
127
+ scribe --backend groq # Clip mode (default)
128
+ scribe --backend groq --stream # live transcription, per-chunk
126
129
  ```
127
130
 
128
131
  The `groq` backend reuses the `openai` Python client under the hood, so
@@ -146,14 +149,14 @@ style, domain, or word list. The concept is generic across the
146
149
  whisper-family backends but each backend exposes it slightly
147
150
  differently:
148
151
 
149
- | Backend | `--prompt` | `--words` |
150
- |--------------------------------------|-------------------------------|--------------------------------------------------------|
151
- | `whisper` (faster-whisper, local) | passed as `initial_prompt=` | passed as `hotwords=` — a **dedicated biasing channel** separate from the prompt |
152
- | `whisper-futo` (pywhispercpp, local) | passed as `initial_prompt=` | joined onto the prompt string (no separate hotwords channel here) |
153
- | `openai` batch (`gpt-4o*-transcribe`) | passed as `prompt=` | joined onto the prompt string |
154
- | `groq` (`whisper-large-v3-turbo`) | passed as `prompt=` | joined onto the prompt string |
155
- | `openai` realtime (`gpt-realtime-whisper`) | *silently ignored* — the model rejects the prompt parameter server-side (HTTP 400 *"The 'prompt' parameter is not supported for this model."*). The kwarg stays accepted for plumbing compatibility but never reaches the API. | same — joined into the (ignored) prompt |
156
- | `vosk` | *ignored* (no soft prompt) | *ignored* (Vosk only supports a hard `grammar` allowlist; not yet exposed) |
152
+ | Backend | `--prompt` | `--words` | `--language` |
153
+ |--------------------------------------|-------------------------------|--------------------------------------------------------|---------------------------------------------------------|
154
+ | `whisper` (faster-whisper, local) | passed as `initial_prompt=` | passed as `hotwords=` — a **dedicated biasing channel** separate from the prompt | passed as `language=` (ISO 639-1); `-l en` also auto-substitutes `small.en` etc. |
155
+ | `whisper-futo` (pywhispercpp, local) | passed as `initial_prompt=` | joined onto the prompt string (no separate hotwords channel here) | passed as `language=` (ISO 639-1); `-l en` auto-substitutes `small.en` etc. |
156
+ | `openai` batch (`gpt-4o*-transcribe`) | passed as `prompt=` | joined onto the prompt string | passed as `language=` hint (ISO 639-1) |
157
+ | `groq` (`whisper-large-v3-turbo`) | passed as `prompt=` | joined onto the prompt string | passed as `language=` hint (ISO 639-1) |
158
+ | `openai` realtime (`gpt-realtime-whisper`) | *silently ignored* — the model rejects the prompt parameter server-side (HTTP 400 *"The 'prompt' parameter is not supported for this model."*). The kwarg stays accepted for plumbing compatibility but never reaches the API. | same — joined into the (ignored) prompt | passed as `language=` (ISO 639-1) |
159
+ | `vosk` | *ignored* (no soft prompt) | *ignored* (Vosk only supports a hard `grammar` allowlist; not yet exposed) | picks a per-language model from `scribe/models.toml`; no runtime parameter |
157
160
 
158
161
  The whisper-family APIs cap the prompt around ~224 tokens; longer
159
162
  hints are silently truncated. Faster-whisper's `hotwords` channel is
@@ -184,34 +187,117 @@ invocation, pass an explicit empty value: `--prompt ""` (or
184
187
  arguments (or `--words-file ""`) suppresses the words default. Each
185
188
  side is independent.
186
189
 
187
- ## Pseudo-streaming (experimental)
188
-
189
- `--pseudo-streaming` makes a batch backend behave streaming-like by
190
- cutting the running buffer into chunks driven by silence:
190
+ ## Language
191
+
192
+ `-l / --language LANG` tells the backend which language to expect.
193
+ What that means in practice varies by backend (see the per-backend
194
+ column in the table above):
195
+
196
+ - **Whisper-family** (`whisper`, `whisper-futo`, `openai` batch +
197
+ realtime, `groq`) — the language is passed to the model as a hard
198
+ lock: the decoder generates that language regardless of what it
199
+ hears acoustically. Accepts any [ISO 639-1 short code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)
200
+ Whisper recognises (~99 languages). When unset, Whisper auto-detects
201
+ per chunk.
202
+ - **English-only model variants** — for `whisper` and `whisper-futo`,
203
+ `-l en` *also* auto-substitutes the English-only model when one
204
+ exists (`small` → `small.en`, etc.). These variants trade
205
+ multilingual coverage for English accuracy.
206
+ - **Vosk** — language isn't a runtime parameter; vosk ships a
207
+ separate model per language. `-l fr` looks up the vosk model
208
+ pre-mapped to French in [`scribe/models.toml`](../scribe/models.toml)
209
+ and instantiates that one. Vosk has no auto-detect path, so the
210
+ Language menu's `Auto` entry on vosk falls back to a sensible
211
+ default — the tray shows `Auto (🇬🇧 en)` to make this explicit
212
+ without mutating the stored `language=None`.
213
+
214
+ The tray's **Language** submenu exposes the four curated languages
215
+ (`en` / `fr` / `de` / `it`) with origin-country flag prefixes
216
+ (🇬🇧 / 🇫🇷 / 🇩🇪 / 🇮🇹). The CLI accepts these plus any other ISO 639-1
217
+ code the active backend recognises.
218
+
219
+ ## Stream mode (works with any backend)
220
+
221
+ `--stream` (or **Mode: Stream** in the tray) emits transcribed text
222
+ **live as you speak**, regardless of which backend you picked. This
223
+ is the headline v1.0.0 improvement: scribe abstracts over the two
224
+ different mechanisms that backends use to deliver live output, so
225
+ `--stream` works uniformly across every supported backend.
226
+
227
+ - **Native streaming backends** (Vosk, `gpt-realtime-whisper`) push
228
+ partial results from the server as audio is received — scribe just
229
+ forwards them to the chosen output (focused window / clipboard /
230
+ terminal / file). These backends are *always* in Stream mode; the
231
+ Mode toggle reads "Mode: Stream (native)" for them and is read-only.
232
+ - **Batch backends** (Whisper local, Whisper FUTO, OpenAI
233
+ `gpt-4o-*-transcribe`, Groq `whisper-large-v3-turbo`) don't accept
234
+ partial audio. scribe instead cuts the recording buffer on
235
+ detected silence and issues a separate transcription request for
236
+ each chunk — internally called *pseudo-streaming*. The user sees
237
+ the same live experience.
191
238
 
192
239
  ```bash
193
- scribe --pseudo-streaming --streaming-window 5
240
+ scribe --stream # any backend, live transcription
241
+ scribe --stream --backend groq # Groq + Stream is the sweet spot
242
+ scribe --stream --backend whisper # local, live, no API key
194
243
  ```
195
244
 
196
- After `--streaming-window` seconds of buffered audio, scribe cuts at
197
- the first silence of at least `--silence-duration` and transcribes the
198
- chunk; if no silence arrives by `2 × --streaming-window`, it
199
- force-cuts. The session continues until you stop it. Default `5` s
200
- trades a little Whisper context for snappier "text appears as you
201
- speak" UX; raise it (10–30 s) if accuracy on long sentences matters
202
- more than latency.
203
-
204
- This is experimental and off by default. The tray menu surfaces the
205
- same toggle under Options ▶ Advanced ▶ Pseudo-streaming.
245
+ ### How pseudo-streaming carves up a recording
246
+
247
+ Once the buffer has grown to at least `--stream-chunk-min` (default
248
+ 1.5 s), silence of at least `--stream-chunk-silence-break` (default
249
+ 0.6 s) triggers a chunk cut. A force-cut fires at `--stream-chunk-max`
250
+ (default 10 s) regardless of silence, to cap latency. The session
251
+ continues until you stop it manually.
252
+
253
+ ### Does pseudo-streaming change the API cost?
254
+
255
+ For cloud backends, going from one big transcription to N chunked
256
+ requests **does not normally change the bill**:
257
+
258
+ - **Groq** (`whisper-large-v3-turbo`) is billed per second of audio.
259
+ Total audio is unchanged → same cost.
260
+ - **OpenAI `whisper-1`** (legacy) is billed per minute of audio. Same
261
+ logic, same cost.
262
+ - **OpenAI `gpt-4o-transcribe` / `gpt-4o-mini-transcribe`** are
263
+ token-billed (audio-in + text-out + prompt-in). Audio and output stay
264
+ identical; the only delta is the rolling cross-chunk *prompt*
265
+ context (~200 chars ≈ 50–60 tokens per chunk after the first).
266
+ At gpt-4o-mini-transcribe input rates this is negligible — well
267
+ under a cent per long session.
268
+
269
+ That said, your real cost depends on your usage and your account's
270
+ pricing tier — **verify on your provider's billing dashboard** if
271
+ cost is a hard constraint.
272
+
273
+ Two special values for `--stream-chunk-silence-break` (set via the
274
+ tray's **Silence break** picker or `--stream-chunk-silence-break 0`
275
+ at the CLI):
276
+
277
+ - **Auto** (`0`) — disables the fixed-threshold trigger. At force-cut
278
+ time scribe picks the *longest* silence interval within the window
279
+ whose start position is at least `--stream-chunk-min` into the chunk,
280
+ re-cutting there for a more natural word boundary. Falls back to a
281
+ brute force-cut if no qualifying silence is found.
282
+ - **Max** — disables silence-based cuts entirely; only the force-cut at
283
+ `--stream-chunk-max` fires. Useful when you want uniform chunk sizes
284
+ regardless of speech patterns. (Only selectable from the tray picker.)
285
+
286
+ Stream mode is off by default — the default `Clip` mode transcribes the
287
+ whole recording at the end (`--clip`). The tray menu surfaces the same
288
+ toggle as the top-level **Mode: Stream / Clip** item. Native
289
+ streamers (vosk, `gpt-realtime-whisper`) are always streaming and the
290
+ menu shows **Mode: Stream (native)** for them.
206
291
 
207
292
  ### Cross-chunk prompt context
208
293
 
209
- In pseudo-streaming mode scribe automatically augments each chunk's
210
- prompt with the trailing ~200 characters of the *previous* chunk's
211
- transcription. This rolling tail is concatenated onto whatever static
212
- `--prompt` / `--words` you configured and reaches the backend through
213
- the same channel as the static prompt (the vocabulary biasing table
214
- above). The motivation is cross-chunk continuity:
294
+ In Stream mode (pseudo-streaming) scribe automatically augments
295
+ each chunk's prompt with the trailing ~200 characters of the
296
+ *previous* chunk's transcription. This rolling tail is concatenated
297
+ onto whatever static `--prompt` / `--words` you configured and
298
+ reaches the backend through the same channel as the static prompt
299
+ (the vocabulary biasing table above). The motivation is cross-chunk
300
+ continuity:
215
301
 
216
302
  - **Capitalization drift** — without context, a chunk that starts
217
303
  right after a period might come back lowercased.
@@ -225,14 +311,13 @@ Whisper's prompt window is capped at ~224 tokens; 200 chars of French
225
311
  sits well under that and leaves room for your static prompt + words
226
312
  list.
227
313
 
228
- The rolling tail is **dropped** whenever the pause that triggered the
229
- chunk cut exceeded 1.5 seconds — a long pause is treated as a new
230
- sentence/idea boundary, where carrying a possibly-bad prior chunk
231
- forward biases the next one more than it helps. This mirrors
232
- `whisper.cpp`'s `--keep-context off` default: prior-text conditioning
233
- can self-reinforce errors (hallucinations, decoder repetition loops)
234
- more readily than it provides useful continuity, so we cap it at
235
- natural sentence boundaries.
314
+ The rolling tail is **dropped** when the silence between two
315
+ utterances exceeds `--stream-context-reset-silence` ×
316
+ `--stream-chunk-silence-break` (default 3 × 0.6 s = 1.8 s) — a long
317
+ pause is treated as a new sentence/idea boundary, where carrying a
318
+ possibly-bad prior chunk forward biases the next one more than it
319
+ helps. Use `--stream-context-reset-silence inf` to keep context across
320
+ arbitrarily long pauses.
236
321
 
237
322
  Short pauses (mid-sentence punctuation) keep the context; the cut at
238
323
  the start of every new recording also clears it.