scribe-cli 0.17.1__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/.gitignore +1 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/PKG-INFO +60 -22
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/README.md +35 -20
- scribe_cli-1.0.0/docs/app-tray-menu.png +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/docs/backends.md +126 -41
- scribe_cli-1.0.0/docs/cli.md +207 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/docs/desktop-install.md +1 -1
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/docs/installation.md +1 -1
- scribe_cli-0.17.1/docs/keyboard.md → scribe_cli-1.0.0/docs/output.md +98 -36
- scribe_cli-1.0.0/docs/tray.md +127 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/pyproject.toml +38 -6
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/_version.py +3 -3
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/app.py +362 -164
- scribe_cli-1.0.0/scribe/audio.py +379 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/backends/groq.py +4 -3
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/backends/openai_api.py +11 -3
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/backends/openai_realtime.py +87 -10
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/backends/vosk.py +20 -4
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/backends/whisper.py +12 -3
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/backends/whisper_futo.py +10 -3
- scribe_cli-1.0.0/scribe/dialog.py +56 -0
- scribe_cli-1.0.0/scribe/menu.py +1554 -0
- scribe_cli-1.0.0/scribe/models.py +403 -0
- scribe_cli-1.0.0/scribe/output.py +237 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/session.py +29 -4
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe_cli.egg-info/PKG-INFO +60 -22
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe_cli.egg-info/SOURCES.txt +8 -1
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe_cli.egg-info/requires.txt +3 -0
- scribe_cli-1.0.0/scribe_data/silero_vad.LICENSE +21 -0
- scribe_cli-1.0.0/scribe_data/silero_vad.onnx +0 -0
- scribe_cli-1.0.0/tests/test_backend_matrix.py +295 -0
- scribe_cli-1.0.0/tests/test_output.py +165 -0
- scribe_cli-1.0.0/tests/test_output_file_picker.py +57 -0
- scribe_cli-1.0.0/tests/test_pseudo_streaming.py +490 -0
- scribe_cli-0.17.1/docs/app-tray-menu.png +0 -0
- scribe_cli-0.17.1/docs/cli.md +0 -137
- scribe_cli-0.17.1/docs/tray.md +0 -92
- scribe_cli-0.17.1/scribe/audio.py +0 -76
- scribe_cli-0.17.1/scribe/menu.py +0 -960
- scribe_cli-0.17.1/scribe/models.py +0 -280
- scribe_cli-0.17.1/tests/test_pseudo_streaming.py +0 -288
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/.github/FUNDING.yml +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/.github/workflows/pypi.yml +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/LICENSE +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/docs/roadmap-libei.md +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/icon.xcf +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/__init__.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/backends/__init__.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/install_desktop.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/keyboard.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/models.toml +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/saverecording.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/testpynput.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/typers/__init__.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/typers/base.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/typers/eitype.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/typers/pynput.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/typers/wtype.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/typers/ydotool.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe/util.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe_cli.egg-info/dependency_links.txt +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe_cli.egg-info/entry_points.txt +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe_cli.egg-info/top_level.txt +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe_data/__init__.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe_data/share/icon.png +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe_data/share/icon_recording.png +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe_data/share/icon_writing.png +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scribe_data/templates/scribe.desktop +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scripts/bench_whisper_local.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/scripts/test_python_versions_install.sh +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/setup.cfg +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/tests/test_openai_realtime_coalesce.py +0 -0
- {scribe_cli-0.17.1 → scribe_cli-1.0.0}/tests/test_whisper_futo.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: scribe-cli
|
|
3
|
-
Version: 0.17.1
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: Speech-to-text CLI and system-tray app for dictating into any focused window. Local (vosk, faster-whisper) or cloud (groq, openai) backends, batch or streaming.
|
|
5
5
|
Author-email: Mahé Perrette <mahe.perrette@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -33,13 +33,34 @@ License: MIT License
|
|
|
33
33
|
licenses of all dependencies before using or distributing this software to
|
|
34
34
|
ensure compliance with their respective terms.
|
|
35
35
|
Project-URL: Homepage, https://github.com/perrette/scribe
|
|
36
|
-
|
|
36
|
+
Project-URL: Source, https://github.com/perrette/scribe
|
|
37
|
+
Project-URL: Issues, https://github.com/perrette/scribe/issues
|
|
38
|
+
Project-URL: Changelog, https://github.com/perrette/scribe/releases
|
|
39
|
+
Project-URL: Funding, https://github.com/sponsors/perrette
|
|
40
|
+
Keywords: speech-to-text,stt,transcription,dictation,voice-typing,voice-recognition,multilingual,realtime,streaming,cli,tray,vosk,whisper,faster-whisper,openai,groq,gpt-4o,linux,wayland,keyboard,clipboard,microphone,audio
|
|
41
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
42
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
43
|
+
Classifier: Intended Audience :: Developers
|
|
44
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
37
45
|
Classifier: Programming Language :: Python :: 3.9
|
|
38
46
|
Classifier: Programming Language :: Python :: 3.10
|
|
39
47
|
Classifier: Programming Language :: Python :: 3.11
|
|
40
48
|
Classifier: Programming Language :: Python :: 3.12
|
|
41
49
|
Classifier: Programming Language :: Python :: 3.13
|
|
42
50
|
Classifier: Operating System :: OS Independent
|
|
51
|
+
Classifier: Environment :: Console
|
|
52
|
+
Classifier: Environment :: X11 Applications
|
|
53
|
+
Classifier: Environment :: MacOS X
|
|
54
|
+
Classifier: Environment :: Win32 (MS Windows)
|
|
55
|
+
Classifier: Natural Language :: English
|
|
56
|
+
Classifier: Natural Language :: French
|
|
57
|
+
Classifier: Natural Language :: German
|
|
58
|
+
Classifier: Natural Language :: Italian
|
|
59
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
60
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
61
|
+
Classifier: Topic :: Office/Business
|
|
62
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
63
|
+
Classifier: Topic :: Utilities
|
|
43
64
|
Requires-Python: >=3.9
|
|
44
65
|
Description-Content-Type: text/markdown
|
|
45
66
|
License-File: LICENSE
|
|
@@ -52,6 +73,7 @@ Requires-Dist: unidecode
|
|
|
52
73
|
Requires-Dist: termcolor
|
|
53
74
|
Requires-Dist: platformdirs
|
|
54
75
|
Requires-Dist: desktop-ai-core>=0.2.0
|
|
76
|
+
Requires-Dist: onnxruntime
|
|
55
77
|
Provides-Extra: keyboard
|
|
56
78
|
Requires-Dist: pynput; extra == "keyboard"
|
|
57
79
|
Provides-Extra: whisper
|
|
@@ -69,6 +91,7 @@ Requires-Dist: soundfile; extra == "openai"
|
|
|
69
91
|
Provides-Extra: groq
|
|
70
92
|
Requires-Dist: openai<3,>=2.37.0; extra == "groq"
|
|
71
93
|
Requires-Dist: soundfile; extra == "groq"
|
|
94
|
+
Provides-Extra: vad
|
|
72
95
|
Provides-Extra: all
|
|
73
96
|
Requires-Dist: pynput; extra == "all"
|
|
74
97
|
Requires-Dist: faster-whisper; extra == "all"
|
|
@@ -90,11 +113,13 @@ cloud-based APIs, batch and streaming workflows.
|
|
|
90
113
|
|
|
91
114
|
## What it does
|
|
92
115
|
|
|
93
|
-
- Records from your mic and transcribes via one of
|
|
94
|
-
**Vosk** (local, streaming), **Whisper** (local, batch),
|
|
95
|
-
(
|
|
96
|
-
|
|
97
|
-
|
|
116
|
+
- Records from your mic and transcribes via one of five backends —
|
|
117
|
+
**Vosk** (local, streaming), **Whisper** (local, batch),
|
|
118
|
+
**Whisper FUTO** (local, batch — ACFT-tuned for short dictations),
|
|
119
|
+
**OpenAI** (cloud, batch *or* streaming), **Groq** (cloud, batch).
|
|
120
|
+
- Delivers the transcript four ways: paste into the focused window
|
|
121
|
+
(default), copy to clipboard, print to the terminal, or write to
|
|
122
|
+
a file.
|
|
98
123
|
- Runs as a **system tray icon** with a single Record button, or as an
|
|
99
124
|
interactive **terminal TUI** — same menu in both.
|
|
100
125
|
- Hooks into your DE's keyboard shortcuts via `SIGUSR1` (toggle
|
|
@@ -124,8 +149,8 @@ scribe
|
|
|
124
149
|
This launches the system tray icon. Press Record, speak, press Stop —
|
|
125
150
|
the transcription lands in the focused window. Scribe picks the first
|
|
126
151
|
backend whose key / dependency is present, in order **`groq` →
|
|
127
|
-
`openai` → `whisper` → `vosk`**, so with `GROQ_API_KEY`
|
|
128
|
-
command above is equivalent to:
|
|
152
|
+
`openai` → `whisper-futo` → `whisper` → `vosk`**, so with `GROQ_API_KEY`
|
|
153
|
+
set the command above is equivalent to:
|
|
129
154
|
|
|
130
155
|
```bash
|
|
131
156
|
scribe --backend groq --model whisper-large-v3-turbo
|
|
@@ -140,15 +165,17 @@ scribe --backend openai --model gpt-4o-mini-transcribe # OpenAI sweet spot
|
|
|
140
165
|
scribe --backend openai --model gpt-realtime-whisper # OpenAI streaming
|
|
141
166
|
scribe --backend whisper --model small # local, no API key
|
|
142
167
|
scribe --frontend terminal # interactive TUI menu
|
|
143
|
-
scribe --
|
|
168
|
+
scribe --record # start recording immediately on launch (works in tray or terminal)
|
|
169
|
+
scribe --record --frontend terminal --mode file # one-shot batched dictation → file
|
|
170
|
+
scribe --record --frontend terminal --mode file --stream # streamed: chunks appended live as you speak
|
|
144
171
|
scribe --mode clipboard # copy to clipboard, no keystroke
|
|
145
172
|
scribe --mode terminal # only print to stdout
|
|
146
|
-
scribe -o transcript.txt
|
|
173
|
+
scribe --mode file -o transcript.txt # append to a file (no keystroke / clipboard)
|
|
147
174
|
```
|
|
148
175
|
|
|
149
176
|
With `--no-interactive` (terminal frontend only), scribe skips the
|
|
150
177
|
interactive menu and starts recording right away — handy for scripted,
|
|
151
|
-
one-shot transcriptions.
|
|
178
|
+
one-shot transcriptions.
|
|
152
179
|
|
|
153
180
|
Bias the recogniser toward names, jargon, or a domain glossary with
|
|
154
181
|
`--prompt "free text hint"` and `--words word1 word2 ...` (each also
|
|
@@ -159,12 +186,13 @@ for what each backend does with them.
|
|
|
159
186
|
|
|
160
187
|
## Backends at a glance
|
|
161
188
|
|
|
162
|
-
| Backend
|
|
163
|
-
|
|
164
|
-
| Groq (cloud)
|
|
165
|
-
| OpenAI (cloud)
|
|
166
|
-
| Whisper (local) | `whisper`
|
|
167
|
-
|
|
|
189
|
+
| Backend | `--backend` | Default model | Streaming model(s) | Requires |
|
|
190
|
+
|----------------------|-----------------|----------------------------|---------------------------|----------------------------------------|
|
|
191
|
+
| Groq (cloud) | `groq` | `whisper-large-v3-turbo` | — | `GROQ_API_KEY` |
|
|
192
|
+
| OpenAI (cloud) | `openai` | `gpt-4o-mini-transcribe` | `gpt-realtime-whisper` | `OPENAI_API_KEY` |
|
|
193
|
+
| Whisper FUTO (local) | `whisper-futo` | `small` | — | `pip install scribe-cli[whisper-futo]` |
|
|
194
|
+
| Whisper (local) | `whisper` | `small` | — | `pip install scribe-cli[whisper]` |
|
|
195
|
+
| Vosk (local) | `vosk` | language-dependent | all Vosk models | `pip install scribe-cli[vosk]` |
|
|
168
196
|
|
|
169
197
|
Whether a transcription appears live as you speak or all at once when
|
|
170
198
|
you stop depends on the **model** picked — see
|
|
@@ -173,8 +201,11 @@ you stop depends on the **model** picked — see
|
|
|
173
201
|
|
|
174
202
|
### Getting an API key
|
|
175
203
|
|
|
176
|
-
Groq is
|
|
177
|
-
|
|
204
|
+
Groq is the **recommended cloud backend by default** — extremely fast
|
|
205
|
+
(by a wide margin compared to other cloud STT options, especially in
|
|
206
|
+
**Stream** mode where the per-chunk roundtrip latency dominates the
|
|
207
|
+
perceived speed), quite accurate, and the **free tier** is generous
|
|
208
|
+
enough for everyday dictation. Sign up at
|
|
178
209
|
[console.groq.com](https://console.groq.com/), create an API key
|
|
179
210
|
under **Settings → API Keys**, and export it as `GROQ_API_KEY`.
|
|
180
211
|
|
|
@@ -187,7 +218,7 @@ I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe`
|
|
|
187
218
|
extras, Ubuntu / GNOME tray libs.
|
|
188
219
|
- [Backends in detail](docs/backends.md) — model lists, when to pick
|
|
189
220
|
which, the realtime model.
|
|
190
|
-
- [
|
|
221
|
+
- [Output modes & typer backends](docs/output.md) — keystroke vs
|
|
191
222
|
clipboard, Wayland / `eitype`, `--type-direct`.
|
|
192
223
|
- [System tray & global hotkeys](docs/tray.md) — menu tree, icon
|
|
193
224
|
states, `SIGUSR1`/`SIGUSR2`.
|
|
@@ -196,10 +227,17 @@ I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe`
|
|
|
196
227
|
- [Fine tuning & CLI reference](docs/cli.md) — every `scribe --help`
|
|
197
228
|
flag with examples.
|
|
198
229
|
|
|
230
|
+
## Related projects
|
|
231
|
+
|
|
232
|
+
- **[bard](https://github.com/perrette/bard)** — TTS sibling of scribe,
|
|
233
|
+
same tray/CLI architecture in reverse: highlight text, hear it
|
|
234
|
+
spoken. Shares the [`desktop-ai-core`](https://github.com/perrette/desktop-ai-core)
|
|
235
|
+
backbone (frontends, providers, dialog helpers).
|
|
236
|
+
|
|
199
237
|
## Compatibility
|
|
200
238
|
|
|
201
239
|
Initially developed for Python 3 on Ubuntu 24.04 (GNOME + Wayland);
|
|
202
240
|
works on macOS and Windows too. Wayland keystroke injection is
|
|
203
|
-
convoluted but [solved](docs/keyboard.md). For dependencies of
|
|
241
|
+
convoluted but [solved](docs/output.md). For dependencies of
|
|
204
242
|
individual subsystems, check `pynput` (keyboard) and `pystray` (tray
|
|
205
243
|
icon).
|
|
@@ -9,11 +9,13 @@ cloud-based APIs, batch and streaming workflows.
|
|
|
9
9
|
|
|
10
10
|
## What it does
|
|
11
11
|
|
|
12
|
-
- Records from your mic and transcribes via one of
|
|
13
|
-
**Vosk** (local, streaming), **Whisper** (local, batch),
|
|
14
|
-
(
|
|
15
|
-
|
|
16
|
-
|
|
12
|
+
- Records from your mic and transcribes via one of five backends —
|
|
13
|
+
**Vosk** (local, streaming), **Whisper** (local, batch),
|
|
14
|
+
**Whisper FUTO** (local, batch — ACFT-tuned for short dictations),
|
|
15
|
+
**OpenAI** (cloud, batch *or* streaming), **Groq** (cloud, batch).
|
|
16
|
+
- Delivers the transcript four ways: paste into the focused window
|
|
17
|
+
(default), copy to clipboard, print to the terminal, or write to
|
|
18
|
+
a file.
|
|
17
19
|
- Runs as a **system tray icon** with a single Record button, or as an
|
|
18
20
|
interactive **terminal TUI** — same menu in both.
|
|
19
21
|
- Hooks into your DE's keyboard shortcuts via `SIGUSR1` (toggle
|
|
@@ -43,8 +45,8 @@ scribe
|
|
|
43
45
|
This launches the system tray icon. Press Record, speak, press Stop —
|
|
44
46
|
the transcription lands in the focused window. Scribe picks the first
|
|
45
47
|
backend whose key / dependency is present, in order **`groq` →
|
|
46
|
-
`openai` → `whisper` → `vosk`**, so with `GROQ_API_KEY`
|
|
47
|
-
command above is equivalent to:
|
|
48
|
+
`openai` → `whisper-futo` → `whisper` → `vosk`**, so with `GROQ_API_KEY`
|
|
49
|
+
set the command above is equivalent to:
|
|
48
50
|
|
|
49
51
|
```bash
|
|
50
52
|
scribe --backend groq --model whisper-large-v3-turbo
|
|
@@ -59,15 +61,17 @@ scribe --backend openai --model gpt-4o-mini-transcribe # OpenAI sweet spot
|
|
|
59
61
|
scribe --backend openai --model gpt-realtime-whisper # OpenAI streaming
|
|
60
62
|
scribe --backend whisper --model small # local, no API key
|
|
61
63
|
scribe --frontend terminal # interactive TUI menu
|
|
62
|
-
scribe --
|
|
64
|
+
scribe --record # start recording immediately on launch (works in tray or terminal)
|
|
65
|
+
scribe --record --frontend terminal --mode file # one-shot batched dictation → file
|
|
66
|
+
scribe --record --frontend terminal --mode file --stream # streamed: chunks appended live as you speak
|
|
63
67
|
scribe --mode clipboard # copy to clipboard, no keystroke
|
|
64
68
|
scribe --mode terminal # only print to stdout
|
|
65
|
-
scribe -o transcript.txt
|
|
69
|
+
scribe --mode file -o transcript.txt # append to a file (no keystroke / clipboard)
|
|
66
70
|
```
|
|
67
71
|
|
|
68
72
|
With `--no-interactive` (terminal frontend only), scribe skips the
|
|
69
73
|
interactive menu and starts recording right away — handy for scripted,
|
|
70
|
-
one-shot transcriptions.
|
|
74
|
+
one-shot transcriptions.
|
|
71
75
|
|
|
72
76
|
Bias the recogniser toward names, jargon, or a domain glossary with
|
|
73
77
|
`--prompt "free text hint"` and `--words word1 word2 ...` (each also
|
|
@@ -78,12 +82,13 @@ for what each backend does with them.
|
|
|
78
82
|
|
|
79
83
|
## Backends at a glance
|
|
80
84
|
|
|
81
|
-
| Backend
|
|
82
|
-
|
|
83
|
-
| Groq (cloud)
|
|
84
|
-
| OpenAI (cloud)
|
|
85
|
-
| Whisper (local) | `whisper`
|
|
86
|
-
|
|
|
85
|
+
| Backend | `--backend` | Default model | Streaming model(s) | Requires |
|
|
86
|
+
|----------------------|-----------------|----------------------------|---------------------------|----------------------------------------|
|
|
87
|
+
| Groq (cloud) | `groq` | `whisper-large-v3-turbo` | — | `GROQ_API_KEY` |
|
|
88
|
+
| OpenAI (cloud) | `openai` | `gpt-4o-mini-transcribe` | `gpt-realtime-whisper` | `OPENAI_API_KEY` |
|
|
89
|
+
| Whisper FUTO (local) | `whisper-futo` | `small` | — | `pip install scribe-cli[whisper-futo]` |
|
|
90
|
+
| Whisper (local) | `whisper` | `small` | — | `pip install scribe-cli[whisper]` |
|
|
91
|
+
| Vosk (local) | `vosk` | language-dependent | all Vosk models | `pip install scribe-cli[vosk]` |
|
|
87
92
|
|
|
88
93
|
Whether a transcription appears live as you speak or all at once when
|
|
89
94
|
you stop depends on the **model** picked — see
|
|
@@ -92,8 +97,11 @@ you stop depends on the **model** picked — see
|
|
|
92
97
|
|
|
93
98
|
### Getting an API key
|
|
94
99
|
|
|
95
|
-
Groq is
|
|
96
|
-
|
|
100
|
+
Groq is the **recommended cloud backend by default** — extremely fast
|
|
101
|
+
(by a wide margin compared to other cloud STT options, especially in
|
|
102
|
+
**Stream** mode where the per-chunk roundtrip latency dominates the
|
|
103
|
+
perceived speed), quite accurate, and the **free tier** is generous
|
|
104
|
+
enough for everyday dictation. Sign up at
|
|
97
105
|
[console.groq.com](https://console.groq.com/), create an API key
|
|
98
106
|
under **Settings → API Keys**, and export it as `GROQ_API_KEY`.
|
|
99
107
|
|
|
@@ -106,7 +114,7 @@ I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe`
|
|
|
106
114
|
extras, Ubuntu / GNOME tray libs.
|
|
107
115
|
- [Backends in detail](docs/backends.md) — model lists, when to pick
|
|
108
116
|
which, the realtime model.
|
|
109
|
-
- [
|
|
117
|
+
- [Output modes & typer backends](docs/output.md) — keystroke vs
|
|
110
118
|
clipboard, Wayland / `eitype`, `--type-direct`.
|
|
111
119
|
- [System tray & global hotkeys](docs/tray.md) — menu tree, icon
|
|
112
120
|
states, `SIGUSR1`/`SIGUSR2`.
|
|
@@ -115,10 +123,17 @@ I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe`
|
|
|
115
123
|
- [Fine tuning & CLI reference](docs/cli.md) — every `scribe --help`
|
|
116
124
|
flag with examples.
|
|
117
125
|
|
|
126
|
+
## Related projects
|
|
127
|
+
|
|
128
|
+
- **[bard](https://github.com/perrette/bard)** — TTS sibling of scribe,
|
|
129
|
+
same tray/CLI architecture in reverse: highlight text, hear it
|
|
130
|
+
spoken. Shares the [`desktop-ai-core`](https://github.com/perrette/desktop-ai-core)
|
|
131
|
+
backbone (frontends, providers, dialog helpers).
|
|
132
|
+
|
|
118
133
|
## Compatibility
|
|
119
134
|
|
|
120
135
|
Initially developed for Python 3 on Ubuntu 24.04 (GNOME + Wayland);
|
|
121
136
|
works on macOS and Windows too. Wayland keystroke injection is
|
|
122
|
-
convoluted but [solved](docs/keyboard.md). For dependencies of
|
|
137
|
+
convoluted but [solved](docs/output.md). For dependencies of
|
|
123
138
|
individual subsystems, check `pynput` (keyboard) and `pystray` (tray
|
|
124
139
|
icon).
|
|
Binary file
|
|
@@ -70,7 +70,7 @@ Vosk transcribes in real time and is very good at one language at a
|
|
|
70
70
|
time, but tends to make more mistakes than Whisper and does not produce
|
|
71
71
|
punctuation. It becomes really useful in longer, interactive sessions
|
|
72
72
|
where the live "appears as you speak" UX matters — see
|
|
73
|
-
[
|
|
73
|
+
[output.md](output.md) for how the keystroke mode interacts with
|
|
74
74
|
streaming models.
|
|
75
75
|
|
|
76
76
|
There are many [Vosk models](https://alphacephei.com/vosk/models)
|
|
@@ -117,12 +117,15 @@ for the full picture.
|
|
|
117
117
|
## `groq` (Groq cloud)
|
|
118
118
|
|
|
119
119
|
Talks to Groq's OpenAI-compatible API and defaults to
|
|
120
|
-
`whisper-large-v3-turbo`.
|
|
121
|
-
|
|
120
|
+
`whisper-large-v3-turbo`. **Extremely fast** thanks to Groq's
|
|
121
|
+
inference hardware — the recommended cloud backend by default, and
|
|
122
|
+
the natural pick for `--stream` mode where per-chunk roundtrip
|
|
123
|
+
latency dominates perceived speed:
|
|
122
124
|
|
|
123
125
|
```bash
|
|
124
126
|
export GROQ_API_KEY=YOURAPIKEY
|
|
125
|
-
scribe --backend groq
|
|
127
|
+
scribe --backend groq # Clip mode (default)
|
|
128
|
+
scribe --backend groq --stream # live transcription, per-chunk
|
|
126
129
|
```
|
|
127
130
|
|
|
128
131
|
The `groq` backend reuses the `openai` Python client under the hood, so
|
|
@@ -146,14 +149,14 @@ style, domain, or word list. The concept is generic across the
|
|
|
146
149
|
whisper-family backends but each backend exposes it slightly
|
|
147
150
|
differently:
|
|
148
151
|
|
|
149
|
-
| Backend | `--prompt` | `--words` |
|
|
150
|
-
|
|
151
|
-
| `whisper` (faster-whisper, local) | passed as `initial_prompt=` | passed as `hotwords=` — a **dedicated biasing channel** separate from the prompt |
|
|
152
|
-
| `whisper-futo` (pywhispercpp, local) | passed as `initial_prompt=` | joined onto the prompt string (no separate hotwords channel here) |
|
|
153
|
-
| `openai` batch (`gpt-4o*-transcribe`) | passed as `prompt=` | joined onto the prompt string |
|
|
154
|
-
| `groq` (`whisper-large-v3-turbo`) | passed as `prompt=` | joined onto the prompt string |
|
|
155
|
-
| `openai` realtime (`gpt-realtime-whisper`) | *silently ignored* — the model rejects the prompt parameter server-side (HTTP 400 *"The 'prompt' parameter is not supported for this model."*). The kwarg stays accepted for plumbing compatibility but never reaches the API. | same — joined into the (ignored) prompt |
|
|
156
|
-
| `vosk` | *ignored* (no soft prompt) | *ignored* (Vosk only supports a hard `grammar` allowlist; not yet exposed) |
|
|
152
|
+
| Backend | `--prompt` | `--words` | `--language` |
|
|
153
|
+
|--------------------------------------|-------------------------------|--------------------------------------------------------|---------------------------------------------------------|
|
|
154
|
+
| `whisper` (faster-whisper, local) | passed as `initial_prompt=` | passed as `hotwords=` — a **dedicated biasing channel** separate from the prompt | passed as `language=` (ISO 639-1); `-l en` also auto-substitutes `small.en` etc. |
|
|
155
|
+
| `whisper-futo` (pywhispercpp, local) | passed as `initial_prompt=` | joined onto the prompt string (no separate hotwords channel here) | passed as `language=` (ISO 639-1); `-l en` auto-substitutes `small.en` etc. |
|
|
156
|
+
| `openai` batch (`gpt-4o*-transcribe`) | passed as `prompt=` | joined onto the prompt string | passed as `language=` hint (ISO 639-1) |
|
|
157
|
+
| `groq` (`whisper-large-v3-turbo`) | passed as `prompt=` | joined onto the prompt string | passed as `language=` hint (ISO 639-1) |
|
|
158
|
+
| `openai` realtime (`gpt-realtime-whisper`) | *silently ignored* — the model rejects the prompt parameter server-side (HTTP 400 *"The 'prompt' parameter is not supported for this model."*). The kwarg stays accepted for plumbing compatibility but never reaches the API. | same — joined into the (ignored) prompt | passed as `language=` (ISO 639-1) |
|
|
159
|
+
| `vosk` | *ignored* (no soft prompt) | *ignored* (Vosk only supports a hard `grammar` allowlist; not yet exposed) | picks a per-language model from `scribe/models.toml`; no runtime parameter |
|
|
157
160
|
|
|
158
161
|
The whisper-family APIs cap the prompt around ~224 tokens; longer
|
|
159
162
|
hints are silently truncated. Faster-whisper's `hotwords` channel is
|
|
@@ -184,34 +187,117 @@ invocation, pass an explicit empty value: `--prompt ""` (or
|
|
|
184
187
|
arguments (or `--words-file ""`) suppresses the words default. Each
|
|
185
188
|
side is independent.
|
|
186
189
|
|
|
187
|
-
##
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
190
|
+
## Language
|
|
191
|
+
|
|
192
|
+
`-l / --language LANG` tells the backend which language to expect.
|
|
193
|
+
What that means in practice varies by backend (see the per-backend
|
|
194
|
+
column in the table above):
|
|
195
|
+
|
|
196
|
+
- **Whisper-family** (`whisper`, `whisper-futo`, `openai` batch +
|
|
197
|
+
realtime, `groq`) — the language is passed to the model as a hard
|
|
198
|
+
lock: the decoder generates that language regardless of what it
|
|
199
|
+
hears acoustically. Accepts any [ISO 639-1 short code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)
|
|
200
|
+
Whisper recognises (~99 languages). When unset, Whisper auto-detects
|
|
201
|
+
per chunk.
|
|
202
|
+
- **English-only model variants** — for `whisper` and `whisper-futo`,
|
|
203
|
+
`-l en` *also* auto-substitutes the English-only model when one
|
|
204
|
+
exists (`small` → `small.en`, etc.). These variants trade
|
|
205
|
+
multilingual coverage for English accuracy.
|
|
206
|
+
- **Vosk** — language isn't a runtime parameter; vosk ships a
|
|
207
|
+
separate model per language. `-l fr` looks up the vosk model
|
|
208
|
+
pre-mapped to French in [`scribe/models.toml`](../scribe/models.toml)
|
|
209
|
+
and instantiates that one. Vosk has no auto-detect path, so the
|
|
210
|
+
Language menu's `Auto` entry on vosk falls back to a sensible
|
|
211
|
+
default — the tray shows `Auto (🇬🇧 en)` to make this explicit
|
|
212
|
+
without mutating the stored `language=None`.
|
|
213
|
+
|
|
214
|
+
The tray's **Language** submenu exposes the four curated languages
|
|
215
|
+
(`en` / `fr` / `de` / `it`) with origin-country flag prefixes
|
|
216
|
+
(🇬🇧 / 🇫🇷 / 🇩🇪 / 🇮🇹). The CLI accepts these plus any other ISO 639-1
|
|
217
|
+
code the active backend recognises.
|
|
218
|
+
|
|
219
|
+
## Stream mode (works with any backend)
|
|
220
|
+
|
|
221
|
+
`--stream` (or **Mode: Stream** in the tray) emits transcribed text
|
|
222
|
+
**live as you speak**, regardless of which backend you picked. This
|
|
223
|
+
is the headline v1.0.0 improvement: scribe abstracts over the two
|
|
224
|
+
different mechanisms that backends use to deliver live output, so
|
|
225
|
+
`--stream` works uniformly across every supported backend.
|
|
226
|
+
|
|
227
|
+
- **Native streaming backends** (Vosk, `gpt-realtime-whisper`) push
|
|
228
|
+
partial results from the recogniser as audio is received — scribe just
|
|
229
|
+
forwards them to the chosen output (focused window / clipboard /
|
|
230
|
+
terminal / file). These backends are *always* in Stream mode; the
|
|
231
|
+
Mode toggle reads "Mode: Stream (native)" for them and is read-only.
|
|
232
|
+
- **Batch backends** (Whisper local, Whisper FUTO, OpenAI
|
|
233
|
+
`gpt-4o-*-transcribe`, Groq `whisper-large-v3-turbo`) don't accept
|
|
234
|
+
partial audio. scribe instead cuts the recording buffer on
|
|
235
|
+
detected silence and issues a separate transcription request for
|
|
236
|
+
each chunk — internally called *pseudo-streaming*. The user sees
|
|
237
|
+
the same live experience.
|
|
191
238
|
|
|
192
239
|
```bash
|
|
193
|
-
scribe --
|
|
240
|
+
scribe --stream # any backend, live transcription
|
|
241
|
+
scribe --stream --backend groq # Groq + Stream is the sweet spot
|
|
242
|
+
scribe --stream --backend whisper # local, live, no API key
|
|
194
243
|
```
|
|
195
244
|
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
245
|
+
### How pseudo-streaming carves up a recording
|
|
246
|
+
|
|
247
|
+
Once the buffer has grown to at least `--stream-chunk-min` (default
|
|
248
|
+
1.5 s), silence of at least `--stream-chunk-silence-break` (default
|
|
249
|
+
0.6 s) triggers a chunk cut. A force-cut fires at `--stream-chunk-max`
|
|
250
|
+
(default 10 s) regardless of silence, to cap latency. The session
|
|
251
|
+
continues until you stop it manually.
|
|
252
|
+
|
|
253
|
+
### Does pseudo-streaming change the API cost?
|
|
254
|
+
|
|
255
|
+
For cloud backends, going from one big transcription to N chunked
|
|
256
|
+
requests **does not normally change the bill**:
|
|
257
|
+
|
|
258
|
+
- **Groq** (`whisper-large-v3-turbo`) is billed per second of audio.
|
|
259
|
+
Total audio is unchanged → same cost.
|
|
260
|
+
- **OpenAI `whisper-1`** (legacy) is billed per minute of audio. Same
|
|
261
|
+
logic, same cost.
|
|
262
|
+
- **OpenAI `gpt-4o-transcribe` / `gpt-4o-mini-transcribe`** are
|
|
263
|
+
token-billed (audio-in + text-out + prompt-in). Audio and output stay
|
|
264
|
+
identical; the only delta is the rolling cross-chunk *prompt*
|
|
265
|
+
context (~200 chars ≈ 50–60 tokens per chunk after the first).
|
|
266
|
+
At gpt-4o-mini-transcribe input rates this is negligible — well
|
|
267
|
+
under a cent per long session.
|
|
268
|
+
|
|
269
|
+
That said, your real cost depends on your usage and your account's
|
|
270
|
+
pricing tier — **verify on your provider's billing dashboard** if
|
|
271
|
+
cost is a hard constraint.
|
|
272
|
+
|
|
273
|
+
Two special values for `--stream-chunk-silence-break` (set via the
|
|
274
|
+
tray's **Silence break** picker or `--stream-chunk-silence-break 0`
|
|
275
|
+
at the CLI):
|
|
276
|
+
|
|
277
|
+
- **Auto** (`0`) — disables the fixed-threshold trigger. At force-cut
|
|
278
|
+
time scribe picks the *longest* silence interval within the window
|
|
279
|
+
whose start position is at least `--stream-chunk-min` into the chunk,
|
|
280
|
+
re-cutting there for a more natural word boundary. Falls back to a
|
|
281
|
+
brute force-cut if no qualifying silence is found.
|
|
282
|
+
- **Max** — disables silence-based cuts entirely; only the force-cut at
|
|
283
|
+
`--stream-chunk-max` fires. Useful when you want uniform chunk sizes
|
|
284
|
+
regardless of speech patterns. (Only selectable from the tray picker.)
|
|
285
|
+
|
|
286
|
+
Stream mode is off by default — the default `Clip` mode transcribes the
|
|
287
|
+
whole recording at the end (`--clip`). The tray menu surfaces the same
|
|
288
|
+
toggle as the top-level **Mode: Stream / Clip** item. Native
|
|
289
|
+
streamers (vosk, `gpt-realtime-whisper`) are always streaming and the
|
|
290
|
+
menu shows **Mode: Stream (native)** for them.
|
|
206
291
|
|
|
207
292
|
### Cross-chunk prompt context
|
|
208
293
|
|
|
209
|
-
In pseudo-streaming
|
|
210
|
-
prompt with the trailing ~200 characters of the
|
|
211
|
-
transcription. This rolling tail is concatenated
|
|
212
|
-
`--prompt` / `--words` you configured and
|
|
213
|
-
the same channel as the static prompt
|
|
214
|
-
above). The motivation is cross-chunk
|
|
294
|
+
In Stream mode (pseudo-streaming) scribe automatically augments
|
|
295
|
+
each chunk's prompt with the trailing ~200 characters of the
|
|
296
|
+
*previous* chunk's transcription. This rolling tail is concatenated
|
|
297
|
+
onto whatever static `--prompt` / `--words` you configured and
|
|
298
|
+
reaches the backend through the same channel as the static prompt
|
|
299
|
+
(the vocabulary biasing table above). The motivation is cross-chunk
|
|
300
|
+
continuity:
|
|
215
301
|
|
|
216
302
|
- **Capitalization drift** — without context, a chunk that starts
|
|
217
303
|
right after a period might come back lowercased.
|
|
@@ -225,14 +311,13 @@ Whisper's prompt window is capped at ~224 tokens; 200 chars of French
|
|
|
225
311
|
sits well under that and leaves room for your static prompt + words
|
|
226
312
|
list.
|
|
227
313
|
|
|
228
|
-
The rolling tail is **dropped**
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
natural sentence boundaries.
|
|
314
|
+
The rolling tail is **dropped** when the silence between two
|
|
315
|
+
utterances exceeds `--stream-context-reset-silence` ×
|
|
316
|
+
`--stream-chunk-silence-break` (default 3 × 0.6 s = 1.8 s) — a long
|
|
317
|
+
pause is treated as a new sentence/idea boundary, where carrying a
|
|
318
|
+
possibly-bad prior chunk forward biases the next one more than it
|
|
319
|
+
helps. Use `--stream-context-reset-silence inf` to keep context across
|
|
320
|
+
arbitrarily long pauses.
|
|
236
321
|
|
|
237
322
|
Short pauses (mid-sentence punctuation) keep the context; the cut at
|
|
238
323
|
the start of every new recording also clears it.
|