scribe-cli 0.13.1__tar.gz → 0.17.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. scribe_cli-0.17.0/.github/FUNDING.yml +3 -0
  2. scribe_cli-0.17.0/.gitignore +9 -0
  3. scribe_cli-0.17.0/PKG-INFO +205 -0
  4. scribe_cli-0.17.0/README.md +124 -0
  5. scribe_cli-0.17.0/docs/app-tray-menu.png +0 -0
  6. scribe_cli-0.17.0/docs/backends.md +204 -0
  7. scribe_cli-0.17.0/docs/cli.md +137 -0
  8. scribe_cli-0.17.0/docs/desktop-install.md +44 -0
  9. scribe_cli-0.17.0/docs/installation.md +84 -0
  10. scribe_cli-0.17.0/docs/keyboard.md +169 -0
  11. scribe_cli-0.17.0/docs/roadmap-libei.md +158 -0
  12. scribe_cli-0.17.0/docs/tray.md +92 -0
  13. {scribe_cli-0.13.1 → scribe_cli-0.17.0}/pyproject.toml +19 -4
  14. scribe_cli-0.17.0/scribe/_version.py +24 -0
  15. scribe_cli-0.17.0/scribe/app.py +634 -0
  16. scribe_cli-0.17.0/scribe/backends/__init__.py +51 -0
  17. scribe_cli-0.17.0/scribe/backends/groq.py +28 -0
  18. scribe_cli-0.17.0/scribe/backends/openai_api.py +67 -0
  19. scribe_cli-0.17.0/scribe/backends/openai_realtime.py +333 -0
  20. scribe_cli-0.17.0/scribe/backends/vosk.py +60 -0
  21. scribe_cli-0.17.0/scribe/backends/whisper.py +54 -0
  22. scribe_cli-0.17.0/scribe/backends/whisper_futo.py +140 -0
  23. scribe_cli-0.17.0/scribe/install_desktop.py +135 -0
  24. scribe_cli-0.17.0/scribe/keyboard.py +72 -0
  25. scribe_cli-0.17.0/scribe/menu.py +960 -0
  26. scribe_cli-0.17.0/scribe/models.py +182 -0
  27. scribe_cli-0.17.0/scribe/session.py +150 -0
  28. scribe_cli-0.17.0/scribe/typers/__init__.py +44 -0
  29. scribe_cli-0.17.0/scribe/typers/base.py +18 -0
  30. scribe_cli-0.17.0/scribe/typers/eitype.py +64 -0
  31. scribe_cli-0.17.0/scribe/typers/pynput.py +65 -0
  32. scribe_cli-0.17.0/scribe/typers/wtype.py +78 -0
  33. scribe_cli-0.17.0/scribe/typers/ydotool.py +72 -0
  34. scribe_cli-0.17.0/scribe_cli.egg-info/PKG-INFO +205 -0
  35. {scribe_cli-0.13.1 → scribe_cli-0.17.0}/scribe_cli.egg-info/SOURCES.txt +25 -0
  36. {scribe_cli-0.13.1 → scribe_cli-0.17.0}/scribe_cli.egg-info/requires.txt +14 -4
  37. scribe_cli-0.17.0/scripts/bench_whisper_local.py +156 -0
  38. scribe_cli-0.13.1/.gitignore +0 -6
  39. scribe_cli-0.13.1/PKG-INFO +0 -286
  40. scribe_cli-0.13.1/README.md +0 -214
  41. scribe_cli-0.13.1/scribe/_version.py +0 -21
  42. scribe_cli-0.13.1/scribe/app.py +0 -589
  43. scribe_cli-0.13.1/scribe/install_desktop.py +0 -50
  44. scribe_cli-0.13.1/scribe/keyboard.py +0 -68
  45. scribe_cli-0.13.1/scribe/models.py +0 -278
  46. scribe_cli-0.13.1/scribe_cli.egg-info/PKG-INFO +0 -286
  47. {scribe_cli-0.13.1 → scribe_cli-0.17.0}/.github/workflows/pypi.yml +0 -0
  48. {scribe_cli-0.13.1 → scribe_cli-0.17.0}/LICENSE +0 -0
  49. {scribe_cli-0.13.1 → scribe_cli-0.17.0}/icon.xcf +0 -0
  50. {scribe_cli-0.13.1 → scribe_cli-0.17.0}/scribe/__init__.py +0 -0
  51. {scribe_cli-0.13.1 → scribe_cli-0.17.0}/scribe/audio.py +0 -0
  52. {scribe_cli-0.13.1 → scribe_cli-0.17.0}/scribe/models.toml +0 -0
  53. {scribe_cli-0.13.1 → scribe_cli-0.17.0}/scribe/saverecording.py +0 -0
  54. {scribe_cli-0.13.1 → scribe_cli-0.17.0}/scribe/testpynput.py +0 -0
  55. {scribe_cli-0.13.1 → scribe_cli-0.17.0}/scribe/util.py +0 -0
  56. {scribe_cli-0.13.1 → scribe_cli-0.17.0}/scribe_cli.egg-info/dependency_links.txt +0 -0
  57. {scribe_cli-0.13.1 → scribe_cli-0.17.0}/scribe_cli.egg-info/entry_points.txt +0 -0
  58. {scribe_cli-0.13.1 → scribe_cli-0.17.0}/scribe_cli.egg-info/top_level.txt +0 -0
  59. {scribe_cli-0.13.1 → scribe_cli-0.17.0}/scribe_data/__init__.py +0 -0
  60. {scribe_cli-0.13.1 → scribe_cli-0.17.0}/scribe_data/share/icon.png +0 -0
  61. {scribe_cli-0.13.1 → scribe_cli-0.17.0}/scribe_data/share/icon_recording.png +0 -0
  62. {scribe_cli-0.13.1 → scribe_cli-0.17.0}/scribe_data/share/icon_writing.png +0 -0
  63. {scribe_cli-0.13.1 → scribe_cli-0.17.0}/scribe_data/templates/scribe.desktop +0 -0
  64. {scribe_cli-0.13.1 → scribe_cli-0.17.0}/scripts/test_python_versions_install.sh +0 -0
  65. {scribe_cli-0.13.1 → scribe_cli-0.17.0}/setup.cfg +0 -0
@@ -0,0 +1,3 @@
1
+ # These are supported funding model platforms
2
+
3
+ github: [ perrette ]
@@ -0,0 +1,9 @@
1
+ __pycache__
2
+ *.pyc
3
+ .venv
4
+ build
5
+ dist
6
+ scribe/_version.py
7
+
8
+ # Autonomous roadmap workflows (local coordination artifacts; never committed)
9
+ workflows/
@@ -0,0 +1,205 @@
1
+ Metadata-Version: 2.4
2
+ Name: scribe-cli
3
+ Version: 0.17.0
4
+ Summary: Speech-to-text CLI and system-tray app for dictating into any focused window. Local (vosk, faster-whisper) or cloud (groq, openai) backends, batch or streaming.
5
+ Author-email: Mahé Perrette <mahe.perrette@gmail.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2024 Mahé Perrette
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ ---
29
+
30
+ Note: This project relies on external packages that may have more restrictive
31
+ licenses. For example, the `pynput` package is licensed under LGPLv3, which
32
+ has different requirements compared to the MIT License. Please review the
33
+ licenses of all dependencies before using or distributing this software to
34
+ ensure compliance with their respective terms.
35
+ Project-URL: Homepage, https://github.com/perrette/scribe
36
+ Keywords: speech-to-text,speech recognition,transcription,dictation,voice-typing,voice-to-text,realtime,streaming,language,AI,local,API,cli,tray,vosk,whisper,openai,groq,gpt-4o,linux,wayland,keyboard,clipboard
37
+ Classifier: Programming Language :: Python :: 3.9
38
+ Classifier: Programming Language :: Python :: 3.10
39
+ Classifier: Programming Language :: Python :: 3.11
40
+ Classifier: Programming Language :: Python :: 3.12
41
+ Classifier: Programming Language :: Python :: 3.13
42
+ Classifier: Operating System :: OS Independent
43
+ Requires-Python: >=3.9
44
+ Description-Content-Type: text/markdown
45
+ License-File: LICENSE
46
+ Requires-Dist: numpy
47
+ Requires-Dist: sounddevice
48
+ Requires-Dist: tqdm
49
+ Requires-Dist: requests
50
+ Requires-Dist: pyperclip
51
+ Requires-Dist: unidecode
52
+ Requires-Dist: termcolor
53
+ Requires-Dist: platformdirs
54
+ Requires-Dist: desktop-ai-core>=0.2.0
55
+ Provides-Extra: keyboard
56
+ Requires-Dist: pynput; extra == "keyboard"
57
+ Provides-Extra: whisper
58
+ Requires-Dist: faster-whisper; extra == "whisper"
59
+ Provides-Extra: whisper-futo
60
+ Requires-Dist: pywhispercpp; extra == "whisper-futo"
61
+ Provides-Extra: vosk
62
+ Requires-Dist: vosk; extra == "vosk"
63
+ Provides-Extra: app
64
+ Requires-Dist: pystray; extra == "app"
65
+ Requires-Dist: PyGObject; extra == "app"
66
+ Provides-Extra: openai
67
+ Requires-Dist: openai<3,>=2.37.0; extra == "openai"
68
+ Requires-Dist: soundfile; extra == "openai"
69
+ Provides-Extra: groq
70
+ Requires-Dist: openai<3,>=2.37.0; extra == "groq"
71
+ Requires-Dist: soundfile; extra == "groq"
72
+ Provides-Extra: all
73
+ Requires-Dist: pynput; extra == "all"
74
+ Requires-Dist: faster-whisper; extra == "all"
75
+ Requires-Dist: pywhispercpp; extra == "all"
76
+ Requires-Dist: openai<3,>=2.37.0; extra == "all"
77
+ Requires-Dist: soundfile; extra == "all"
78
+ Requires-Dist: vosk; extra == "all"
79
+ Requires-Dist: pystray; extra == "all"
80
+ Dynamic: license-file
81
+
82
+ [![pypi](https://img.shields.io/pypi/v/scribe-cli)](https://pypi.org/project/scribe-cli)
83
+ ![](https://img.shields.io/python/required-version-toml?tomlFilePath=https%3A%2F%2Fraw.githubusercontent.com%2Fperrette%2Fscribe%2Frefs%2Fheads%2Fmain%2Fpyproject.toml)
84
+
85
+ # Scribe <img src="https://github.com/perrette/scribe/raw/main/scribe_data/share/icon.png" width="48">
86
+
87
+ **Talk. It types.** Scribe is a speech-to-text CLI and tray app that
88
+ pipes transcribed text straight into the focused window. It supports local and
89
+ cloud-based APIs, batch and streaming workflows.
90
+
91
+ ## What it does
92
+
93
+ - Records from your mic and transcribes via one of four backends —
94
+ **Vosk** (local, streaming), **Whisper** (local, batch), **OpenAI**
95
+ (cloud, batch *or* streaming), **Groq** (cloud, batch).
96
+ - Delivers the transcript three ways: paste into the focused window
97
+ (default), copy to clipboard, or print to the terminal.
98
+ - Runs as a **system tray icon** with a single Record button, or as an
99
+ interactive **terminal TUI** — same menu in both.
100
+ - Hooks into your DE's keyboard shortcuts via `SIGUSR1` (toggle
101
+ recording) and `SIGUSR2` (cancel).
102
+ - Cross-platform: tested on Ubuntu (X11 and Wayland), macOS, Windows;
103
+ works under Termux for clipboard / terminal output.
104
+
105
+ ## Install
106
+
107
+ ```bash
108
+ sudo apt-get install portaudio19-dev xclip # Ubuntu; macOS: brew install portaudio
109
+ pip install scribe-cli[all]
110
+ export GROQ_API_KEY=YOURAPIKEY # or OPENAI_API_KEY, or skip and run local
111
+ ```
112
+
113
+ See [Keyboard modes & typer backends](docs/keyboard.md) for setting up keyboard input on Ubuntu Wayland.
114
+
115
+
116
+ ## Usage
117
+
118
+ In a terminal:
119
+
120
+ ```bash
121
+ scribe
122
+ ```
123
+
124
+ This launches the system tray icon. Press Record, speak, press Stop —
125
+ the transcription lands in the focused window. Scribe picks the first
126
+ backend whose key / dependency is present, in order **`groq` →
127
+ `openai` → `whisper` → `vosk`**, so with `GROQ_API_KEY` set the
128
+ command above is equivalent to:
129
+
130
+ ```bash
131
+ scribe --backend groq --model whisper-large-v3-turbo
132
+ ```
133
+
134
+ <img src=https://raw.githubusercontent.com/perrette/scribe/main/docs/app-tray-menu.png width=300px>
135
+
136
+ You can override the defaults or drop the tray entirely:
137
+
138
+ ```bash
139
+ scribe --backend openai --model gpt-4o-mini-transcribe # OpenAI sweet spot
140
+ scribe --backend openai --model gpt-realtime-whisper # OpenAI streaming
141
+ scribe --backend whisper --model small # local, no API key
142
+ scribe --frontend terminal # interactive TUI menu
143
+ scribe --frontend terminal --no-interactive # record immediately, no menu
144
+ scribe --mode clipboard # copy to clipboard, no keystroke
145
+ scribe --mode terminal # only print to stdout
146
+ scribe -o transcript.txt # also append to a file
147
+ ```
148
+
149
+ With `--no-interactive` (terminal frontend only), scribe skips the
150
+ interactive menu and starts recording right away — handy for scripted,
151
+ one-shot transcriptions. `--no-prompt` is kept as a deprecated alias.
152
+
153
+ Bias the recogniser toward names, jargon, or a domain glossary with
154
+ `--prompt "free text hint"` and `--words word1 word2 ...` (each also
155
+ accepts a `--prompt-file` / `--words-file` companion). See
156
+ [docs/backends.md › Vocabulary biasing](docs/backends.md#vocabulary-biasing)
157
+ for what each backend does with them.
158
+
159
+
160
+ ## Backends at a glance
161
+
162
+ | Backend | `--backend` | Default model | Streaming model(s) | Requires |
163
+ |-----------------|-------------|----------------------------|---------------------------|-------------------------------------|
164
+ | Groq (cloud) | `groq` | `whisper-large-v3-turbo` | — | `GROQ_API_KEY` |
165
+ | OpenAI (cloud) | `openai` | `gpt-4o-mini-transcribe` | `gpt-realtime-whisper` | `OPENAI_API_KEY` |
166
+ | Whisper (local) | `whisper` | `small` | — | `pip install scribe-cli[whisper]` |
167
+ | Vosk (local) | `vosk` | language-dependent | all Vosk models | `pip install scribe-cli[vosk]` |
168
+
169
+ Whether a transcription appears live as you speak or all at once when
170
+ you stop depends on the **model** picked — see
171
+ [docs/backends.md](docs/backends.md).
172
+
173
+
174
+ ### Getting an API key
175
+
176
+ Groq is a good cloud backend to start with — very fast, quite accurate, and the
177
+ **free tier** is generous enough for everyday dictation. Sign up at
178
+ [console.groq.com](https://console.groq.com/), create an API key
179
+ under **Settings → API Keys**, and export it as `GROQ_API_KEY`.
180
+
181
+ I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe` as it is also fast and perhaps more accurate for my accent-tainted English.
182
+
183
+
184
+ ## Documentation
185
+
186
+ - [Installation & dependencies](docs/installation.md) — PortAudio,
187
+ extras, Ubuntu / GNOME tray libs.
188
+ - [Backends in detail](docs/backends.md) — model lists, when to pick
189
+ which, the realtime model.
190
+ - [Keyboard modes & typer backends](docs/keyboard.md) — keystroke vs
191
+ clipboard, Wayland / `eitype`, `--type-direct`.
192
+ - [System tray & global hotkeys](docs/tray.md) — menu tree, icon
193
+ states, `SIGUSR1`/`SIGUSR2`.
194
+ - [Desktop entry & autostart (`scribe-install`)](docs/desktop-install.md)
195
+ — GNOME / KDE launcher integration.
196
+ - [Fine tuning & CLI reference](docs/cli.md) — every `scribe --help`
197
+ flag with examples.
198
+
199
+ ## Compatibility
200
+
201
+ Initially developed for Python 3 on Ubuntu 24.04 (GNOME + Wayland);
202
+ works on macOS and Windows too. Wayland keystroke injection is
203
+ convoluted but [solved](docs/keyboard.md). For dependencies of
204
+ individual subsystems, check `pynput` (keyboard) and `pystray` (tray
205
+ icon).
@@ -0,0 +1,124 @@
1
+ [![pypi](https://img.shields.io/pypi/v/scribe-cli)](https://pypi.org/project/scribe-cli)
2
+ ![](https://img.shields.io/python/required-version-toml?tomlFilePath=https%3A%2F%2Fraw.githubusercontent.com%2Fperrette%2Fscribe%2Frefs%2Fheads%2Fmain%2Fpyproject.toml)
3
+
4
+ # Scribe <img src="https://github.com/perrette/scribe/raw/main/scribe_data/share/icon.png" width="48">
5
+
6
+ **Talk. It types.** Scribe is a speech-to-text CLI and tray app that
7
+ pipes transcribed text straight into the focused window. It supports local and
8
+ cloud-based APIs, batch and streaming workflows.
9
+
10
+ ## What it does
11
+
12
+ - Records from your mic and transcribes via one of four backends —
13
+ **Vosk** (local, streaming), **Whisper** (local, batch), **OpenAI**
14
+ (cloud, batch *or* streaming), **Groq** (cloud, batch).
15
+ - Delivers the transcript three ways: paste into the focused window
16
+ (default), copy to clipboard, or print to the terminal.
17
+ - Runs as a **system tray icon** with a single Record button, or as an
18
+ interactive **terminal TUI** — same menu in both.
19
+ - Hooks into your DE's keyboard shortcuts via `SIGUSR1` (toggle
20
+ recording) and `SIGUSR2` (cancel).
21
+ - Cross-platform: tested on Ubuntu (X11 and Wayland), macOS, Windows;
22
+ works under Termux for clipboard / terminal output.
23
+
24
+ ## Install
25
+
26
+ ```bash
27
+ sudo apt-get install portaudio19-dev xclip # Ubuntu; macOS: brew install portaudio
28
+ pip install scribe-cli[all]
29
+ export GROQ_API_KEY=YOURAPIKEY # or OPENAI_API_KEY, or skip and run local
30
+ ```
31
+
32
+ See [Keyboard modes & typer backends](docs/keyboard.md) for setting up keyboard input on Ubuntu Wayland.
33
+
34
+
35
+ ## Usage
36
+
37
+ In a terminal:
38
+
39
+ ```bash
40
+ scribe
41
+ ```
42
+
43
+ This launches the system tray icon. Press Record, speak, press Stop —
44
+ the transcription lands in the focused window. Scribe picks the first
45
+ backend whose key / dependency is present, in order **`groq` →
46
+ `openai` → `whisper` → `vosk`**, so with `GROQ_API_KEY` set the
47
+ command above is equivalent to:
48
+
49
+ ```bash
50
+ scribe --backend groq --model whisper-large-v3-turbo
51
+ ```
52
+
53
+ <img src=https://raw.githubusercontent.com/perrette/scribe/main/docs/app-tray-menu.png width=300px>
54
+
55
+ You can override the defaults or drop the tray entirely:
56
+
57
+ ```bash
58
+ scribe --backend openai --model gpt-4o-mini-transcribe # OpenAI sweet spot
59
+ scribe --backend openai --model gpt-realtime-whisper # OpenAI streaming
60
+ scribe --backend whisper --model small # local, no API key
61
+ scribe --frontend terminal # interactive TUI menu
62
+ scribe --frontend terminal --no-interactive # record immediately, no menu
63
+ scribe --mode clipboard # copy to clipboard, no keystroke
64
+ scribe --mode terminal # only print to stdout
65
+ scribe -o transcript.txt # also append to a file
66
+ ```
67
+
68
+ With `--no-interactive` (terminal frontend only), scribe skips the
69
+ interactive menu and starts recording right away — handy for scripted,
70
+ one-shot transcriptions. `--no-prompt` is kept as a deprecated alias.
71
+
72
+ Bias the recogniser toward names, jargon, or a domain glossary with
73
+ `--prompt "free text hint"` and `--words word1 word2 ...` (each also
74
+ accepts a `--prompt-file` / `--words-file` companion). See
75
+ [docs/backends.md › Vocabulary biasing](docs/backends.md#vocabulary-biasing)
76
+ for what each backend does with them.
77
+
78
+
79
+ ## Backends at a glance
80
+
81
+ | Backend | `--backend` | Default model | Streaming model(s) | Requires |
82
+ |-----------------|-------------|----------------------------|---------------------------|-------------------------------------|
83
+ | Groq (cloud) | `groq` | `whisper-large-v3-turbo` | — | `GROQ_API_KEY` |
84
+ | OpenAI (cloud) | `openai` | `gpt-4o-mini-transcribe` | `gpt-realtime-whisper` | `OPENAI_API_KEY` |
85
+ | Whisper (local) | `whisper` | `small` | — | `pip install scribe-cli[whisper]` |
86
+ | Vosk (local) | `vosk` | language-dependent | all Vosk models | `pip install scribe-cli[vosk]` |
87
+
88
+ Whether a transcription appears live as you speak or all at once when
89
+ you stop depends on the **model** picked — see
90
+ [docs/backends.md](docs/backends.md).
91
+
92
+
93
+ ### Getting an API key
94
+
95
+ Groq is a good cloud backend to start with — very fast, quite accurate, and the
96
+ **free tier** is generous enough for everyday dictation. Sign up at
97
+ [console.groq.com](https://console.groq.com/), create an API key
98
+ under **Settings → API Keys**, and export it as `GROQ_API_KEY`.
99
+
100
+ I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe` as it is also fast and perhaps more accurate for my accent-tainted English.
101
+
102
+
103
+ ## Documentation
104
+
105
+ - [Installation & dependencies](docs/installation.md) — PortAudio,
106
+ extras, Ubuntu / GNOME tray libs.
107
+ - [Backends in detail](docs/backends.md) — model lists, when to pick
108
+ which, the realtime model.
109
+ - [Keyboard modes & typer backends](docs/keyboard.md) — keystroke vs
110
+ clipboard, Wayland / `eitype`, `--type-direct`.
111
+ - [System tray & global hotkeys](docs/tray.md) — menu tree, icon
112
+ states, `SIGUSR1`/`SIGUSR2`.
113
+ - [Desktop entry & autostart (`scribe-install`)](docs/desktop-install.md)
114
+ — GNOME / KDE launcher integration.
115
+ - [Fine tuning & CLI reference](docs/cli.md) — every `scribe --help`
116
+ flag with examples.
117
+
118
+ ## Compatibility
119
+
120
+ Initially developed for Python 3 on Ubuntu 24.04 (GNOME + Wayland);
121
+ works on macOS and Windows too. Wayland keystroke injection is
122
+ convoluted but [solved](docs/keyboard.md). For dependencies of
123
+ individual subsystems, check `pynput` (keyboard) and `pystray` (tray
124
+ icon).
Binary file
@@ -0,0 +1,204 @@
1
+ # Backends in detail
2
+
3
+ Scribe ships five speech-to-text backends. They are all picked through
4
+ the same `--backend` / `--model` CLI flags (or the **Model** submenu in
5
+ the tray / terminal frontend). Whether a transcription is *streaming*
6
+ (text appears live as you speak) or *batch* (text arrives at end of
7
+ recording) depends on the **model** chosen — not the backend.
8
+
9
+ ## At a glance
10
+
11
+ | Backend | `--backend` | Default model | Streaming model(s) | Requires |
12
+ |------------------------|-----------------|----------------------------|---------------------------|-----------------------------------------|
13
+ | Groq (cloud) | `groq` | `whisper-large-v3-turbo` | — | `GROQ_API_KEY` |
14
+ | OpenAI (cloud) | `openai` | `gpt-4o-mini-transcribe` | `gpt-realtime-whisper` | `OPENAI_API_KEY` |
15
+ | Whisper FUTO (local) | `whisper-futo` | `small` | — | `pip install scribe-cli[whisper-futo]` |
16
+ | Whisper (local) | `whisper` | `small` | — | `pip install scribe-cli[whisper]` |
17
+ | Vosk (local) | `vosk` | language-dependent | all Vosk models | `pip install scribe-cli[vosk]` |
18
+
19
+ Run `scribe` without arguments and it picks the first backend whose
20
+ dependency / API key is present, preferring cloud over local and the
21
+ faster local option first:
22
+ `groq → openai → whisper-futo → whisper → vosk`.
23
+
24
+ ## `whisper-futo` (local, fast on short dictations)
25
+
26
+ Runs locally via [whisper.cpp](https://github.com/ggml-org/whisper.cpp)
27
+ (through [`pywhispercpp`](https://github.com/absadiki/pywhispercpp))
28
+ using [FUTO's ACFT-finetuned models](https://github.com/futo-org/whisper-acft).
29
+ ACFT (Audio Context Fine-Tuning) lets the encoder run on the actual
30
+ audio length instead of always padding to 30 s — a meaningful speedup
31
+ on short dictations, which is the typical scribe workload.
32
+
33
+ The available models offered in the tray menu are
34
+ `tiny / base / small`. FUTO has not released ACFT weights for
35
+ `medium / large / turbo`; for those sizes use the `whisper` backend.
36
+
37
+ With `--language en` (or `-l en`) scribe auto-substitutes the
38
+ English-only variant (e.g. `small` → `small.en`) when it exists.
39
+
40
+ Models are auto-downloaded on first use from `voiceinput.futo.org`
41
+ to `$XDG_CACHE_HOME/whisper-futo/` (override with
42
+ `--download-folder-whisper-futo`).
43
+
44
+ For audio ≥ 30 s the ACFT speedup tapers off and the encoder window
45
+ collapses to the standard 30 s; quality and speed in that regime are
46
+ similar to the `whisper` backend. Pick `whisper-futo` if most of your
47
+ dictations are short, the `whisper` backend if you regularly do
48
+ multi-minute recordings or need `medium` / `large` / `turbo`.
49
+
50
+ ## `whisper` (local)
51
+
52
+ Runs locally via
53
+ [`faster-whisper`](https://github.com/SYSTRAN/faster-whisper) and
54
+ defaults to the `small` model. Excellent at full-utterance
55
+ transcription in
56
+ [many languages](https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages),
57
+ but it does not stream — text appears at end-of-recording — and
58
+ execution time depends on model size and hardware.
59
+
60
+ The available models offered in the tray menu are
61
+ `tiny / base / small / medium / large-v3 / large-v3-turbo`. Larger
62
+ models trade speed for accuracy.
63
+
64
+ With `--language en` (or `-l en`) scribe auto-substitutes the
65
+ English-only variant (e.g. `small` → `small.en`) when it exists.
66
+
67
+ ## `vosk` (local, streaming)
68
+
69
+ Vosk transcribes in real time and is very good at one language at a
70
+ time, but tends to make more mistakes than Whisper and does not produce
71
+ punctuation. It becomes really useful in longer, interactive sessions
72
+ where the live "appears as you speak" UX matters — see
73
+ [keyboard.md](keyboard.md) for how the keystroke mode interacts with
74
+ streaming models.
75
+
76
+ There are many [Vosk models](https://alphacephei.com/vosk/models)
77
+ available; a handful are pre-mapped to common languages (`en`, `fr`,
78
+ `de`, `it`) in
79
+ [`scribe/models.toml`](../scribe/models.toml). Pick one with
80
+ `-l <lang>` or browse the full list interactively from the menu.
81
+
82
+ ## `openai` (OpenAI cloud)
83
+
84
+ The OpenAI backend supports three models:
85
+
86
+ - `gpt-4o-mini-transcribe` *(default)* — fast, low-cost batch
87
+ transcription.
88
+ - `gpt-4o-transcribe` — higher-quality batch transcription.
89
+ - `gpt-realtime-whisper` *(streaming)* — partial transcripts arrive
90
+ as you speak. Same UX as Vosk but using OpenAI's cloud model.
91
+
92
+ All three share the same `OPENAI_API_KEY` and the `[openai]` extra; no
93
+ additional dependencies are needed. Set the key once:
94
+
95
+ ```bash
96
+ export OPENAI_API_KEY=YOURAPIKEY
97
+ scribe --backend openai # default: gpt-4o-mini-transcribe
98
+ scribe --model gpt-4o-transcribe # batch, higher quality
99
+ scribe --model gpt-realtime-whisper # streaming
100
+ ```
101
+
102
+ `--model` alone auto-routes to the `openai` backend for any of the
103
+ three models above, so `--backend openai` is optional.
104
+
105
+ ### `--realtime-delay` (gpt-realtime-whisper only)
106
+
107
+ The streaming model has a latency-vs-accuracy knob exposed as
108
+ `--realtime-delay {minimal,low,medium,high,xhigh}` (default `medium`).
109
+ Lower values emit partial transcripts sooner — at the cost of more
110
+ revisions arriving in the focused window. Higher values batch tokens
111
+ into longer chunks so what gets pasted is more stable.
112
+
113
+ See OpenAI's
114
+ [gpt-realtime-whisper model card](https://developers.openai.com/api/docs/models/gpt-realtime-whisper)
115
+ for the full picture.
116
+
117
+ ## `groq` (Groq cloud)
118
+
119
+ Talks to Groq's OpenAI-compatible API and defaults to
120
+ `whisper-large-v3-turbo`. Typically the fastest cloud option for
121
+ full-utterance transcription:
122
+
123
+ ```bash
124
+ export GROQ_API_KEY=YOURAPIKEY
125
+ scribe --backend groq
126
+ ```
127
+
128
+ The `groq` backend reuses the `openai` Python client under the hood, so
129
+ installing `[openai]` is enough for both.
130
+
131
+ ## Stopping a recording
132
+
133
+ For batch models (Whisper local, Whisper-via-API, Groq, `gpt-4o-*`) the
133
+ recording continues until you stop it manually (Stop in the tray,
134
+ Ctrl+C in the terminal), up to a maximum of 2 minutes — the whole
135
+ recording is transcribed once, when you stop.
137
+
138
+ Streaming models (Vosk, `gpt-realtime-whisper`) emit partials as you
139
+ speak and stop on the same Stop / Ctrl+C action.
140
+
141
+ ## Vocabulary biasing
142
+
143
+ `--prompt TEXT` and `--words W [W ...]` (plus the `--prompt-file` /
144
+ `--words-file` companions) bias the recogniser toward a particular
145
+ style, domain, or word list. The concept is generic across the
146
+ whisper-family backends but each backend exposes it slightly
147
+ differently:
148
+
149
+ | Backend | `--prompt` | `--words` |
150
+ |--------------------------------------|-------------------------------|--------------------------------------------------------|
151
+ | `whisper` (faster-whisper, local) | passed as `initial_prompt=` | passed as `hotwords=` — a **dedicated biasing channel** separate from the prompt |
152
+ | `openai` batch (`gpt-4o*-transcribe`) | passed as `prompt=` | joined onto the prompt string |
153
+ | `groq` (`whisper-large-v3-turbo`) | passed as `prompt=` | joined onto the prompt string |
154
+ | `openai` realtime (`gpt-realtime-whisper`) | included in the session config as `transcription.prompt` | joined onto the prompt string |
155
+ | `vosk` | *ignored* (no soft prompt) | *ignored* (Vosk only supports a hard `grammar` allowlist; not yet exposed) |
156
+
157
+ The whisper-family APIs cap the prompt at roughly 224 tokens; longer
158
+ hints are silently truncated. Faster-whisper's `hotwords` channel is
159
+ the one place a separate "dictionary" really exists — everywhere else
160
+ `--words` is just a convenience to keep your word list out of the
161
+ prompt string in the CLI.
162
+
163
+ Both flags read from the corresponding `*-file` argument when present.
164
+ Inline + file inputs are combined.
165
+
166
+ ```bash
167
+ # Inline
168
+ scribe --prompt "ML systems infra: K8s, etcd, Envoy." \
169
+ --words kubectl envoyproxy etcdctl
170
+
171
+ # From files (handy for long-lived glossaries)
172
+ scribe --prompt-file ~/.config/scribe/prompt.txt \
173
+ --words-file ~/.config/scribe/words.txt
174
+ ```
175
+
176
+ When *no* prompt/words flag is given, scribe also auto-loads
177
+ `prompt.txt` and `words.txt` from the platform user-config dir
178
+ (`~/.config/scribe/` on Linux, `~/Library/Application Support/scribe/`
179
+ on macOS, `%LOCALAPPDATA%\scribe\` on Windows — resolved via
180
+ `platformdirs`) if they exist. To suppress the default for one
181
+ invocation, pass an explicit empty value: `--prompt ""` (or
182
+ `--prompt-file ""`) suppresses the prompt default; `--words` with no
183
+ arguments (or `--words-file ""`) suppresses the words default. Each
184
+ side is independent.
185
+
186
+ ## Pseudo-streaming (experimental)
187
+
188
+ `--pseudo-streaming` makes a batch backend behave like a streaming one by
188
+ cutting the running audio buffer into chunks at pauses (silence):
190
+
191
+ ```bash
192
+ scribe --pseudo-streaming --streaming-window 5
193
+ ```
194
+
195
+ After `--streaming-window` seconds of buffered audio, scribe cuts at
196
+ the first silence of at least `--silence-duration` and transcribes the
197
+ chunk; if no silence arrives by `2 × --streaming-window`, it
198
+ force-cuts. The session continues until you stop it. The default of `5` s
199
+ trades a little Whisper context for snappier "text appears as you
200
+ speak" UX; raise it (10–30 s) if accuracy on long sentences matters
201
+ more than latency.
202
+
203
+ This is experimental and off by default. The tray menu surfaces the
204
+ same toggle under Options ▶ Advanced ▶ Pseudo-streaming.