scribe-cli 0.18.0__tar.gz → 1.0.0__tar.gz
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/.gitignore +1 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/PKG-INFO +58 -22
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/README.md +35 -20
- scribe_cli-1.0.0/docs/app-tray-menu.png +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/docs/backends.md +125 -40
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/docs/cli.md +67 -16
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/docs/desktop-install.md +1 -1
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/docs/installation.md +1 -1
- scribe_cli-0.18.0/docs/keyboard.md → scribe_cli-1.0.0/docs/output.md +98 -36
- scribe_cli-1.0.0/docs/tray.md +127 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/pyproject.toml +27 -6
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/_version.py +3 -3
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/app.py +298 -132
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/backends/groq.py +4 -3
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/backends/openai_api.py +11 -3
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/backends/openai_realtime.py +59 -5
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/backends/vosk.py +20 -4
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/backends/whisper.py +12 -3
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/backends/whisper_futo.py +10 -3
- scribe_cli-1.0.0/scribe/dialog.py +56 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/menu.py +698 -163
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/models.py +145 -75
- scribe_cli-1.0.0/scribe/output.py +237 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/session.py +29 -4
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe_cli.egg-info/PKG-INFO +58 -22
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe_cli.egg-info/SOURCES.txt +6 -1
- scribe_cli-1.0.0/tests/test_backend_matrix.py +295 -0
- scribe_cli-1.0.0/tests/test_output.py +165 -0
- scribe_cli-1.0.0/tests/test_output_file_picker.py +57 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/tests/test_pseudo_streaming.py +120 -43
- scribe_cli-0.18.0/docs/app-tray-menu.png +0 -0
- scribe_cli-0.18.0/docs/tray.md +0 -97
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/.github/FUNDING.yml +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/.github/workflows/pypi.yml +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/LICENSE +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/docs/roadmap-libei.md +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/icon.xcf +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/__init__.py +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/audio.py +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/backends/__init__.py +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/install_desktop.py +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/keyboard.py +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/models.toml +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/saverecording.py +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/testpynput.py +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/typers/__init__.py +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/typers/base.py +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/typers/eitype.py +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/typers/pynput.py +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/typers/wtype.py +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/typers/ydotool.py +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe/util.py +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe_cli.egg-info/dependency_links.txt +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe_cli.egg-info/entry_points.txt +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe_cli.egg-info/requires.txt +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe_cli.egg-info/top_level.txt +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe_data/__init__.py +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe_data/share/icon.png +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe_data/share/icon_recording.png +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe_data/share/icon_writing.png +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe_data/silero_vad.LICENSE +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe_data/silero_vad.onnx +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scribe_data/templates/scribe.desktop +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scripts/bench_whisper_local.py +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/scripts/test_python_versions_install.sh +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/setup.cfg +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/tests/test_openai_realtime_coalesce.py +0 -0
- {scribe_cli-0.18.0 → scribe_cli-1.0.0}/tests/test_whisper_futo.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: scribe-cli
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: Speech-to-text CLI and system-tray app for dictating into any focused window. Local (vosk, faster-whisper) or cloud (groq, openai) backends, batch or streaming.
|
|
5
5
|
Author-email: Mahé Perrette <mahe.perrette@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -33,13 +33,34 @@ License: MIT License
|
|
|
33
33
|
licenses of all dependencies before using or distributing this software to
|
|
34
34
|
ensure compliance with their respective terms.
|
|
35
35
|
Project-URL: Homepage, https://github.com/perrette/scribe
|
|
36
|
-
|
|
36
|
+
Project-URL: Source, https://github.com/perrette/scribe
|
|
37
|
+
Project-URL: Issues, https://github.com/perrette/scribe/issues
|
|
38
|
+
Project-URL: Changelog, https://github.com/perrette/scribe/releases
|
|
39
|
+
Project-URL: Funding, https://github.com/sponsors/perrette
|
|
40
|
+
Keywords: speech-to-text,stt,transcription,dictation,voice-typing,voice-recognition,multilingual,realtime,streaming,cli,tray,vosk,whisper,faster-whisper,openai,groq,gpt-4o,linux,wayland,keyboard,clipboard,microphone,audio
|
|
41
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
42
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
43
|
+
Classifier: Intended Audience :: Developers
|
|
44
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
37
45
|
Classifier: Programming Language :: Python :: 3.9
|
|
38
46
|
Classifier: Programming Language :: Python :: 3.10
|
|
39
47
|
Classifier: Programming Language :: Python :: 3.11
|
|
40
48
|
Classifier: Programming Language :: Python :: 3.12
|
|
41
49
|
Classifier: Programming Language :: Python :: 3.13
|
|
42
50
|
Classifier: Operating System :: OS Independent
|
|
51
|
+
Classifier: Environment :: Console
|
|
52
|
+
Classifier: Environment :: X11 Applications
|
|
53
|
+
Classifier: Environment :: MacOS X
|
|
54
|
+
Classifier: Environment :: Win32 (MS Windows)
|
|
55
|
+
Classifier: Natural Language :: English
|
|
56
|
+
Classifier: Natural Language :: French
|
|
57
|
+
Classifier: Natural Language :: German
|
|
58
|
+
Classifier: Natural Language :: Italian
|
|
59
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
60
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
61
|
+
Classifier: Topic :: Office/Business
|
|
62
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
63
|
+
Classifier: Topic :: Utilities
|
|
43
64
|
Requires-Python: >=3.9
|
|
44
65
|
Description-Content-Type: text/markdown
|
|
45
66
|
License-File: LICENSE
|
|
@@ -92,11 +113,13 @@ cloud-based APIs, batch and streaming workflows.
|
|
|
92
113
|
|
|
93
114
|
## What it does
|
|
94
115
|
|
|
95
|
-
- Records from your mic and transcribes via one of
|
|
96
|
-
**Vosk** (local, streaming), **Whisper** (local, batch),
|
|
97
|
-
(
|
|
98
|
-
|
|
99
|
-
|
|
116
|
+
- Records from your mic and transcribes via one of five backends —
|
|
117
|
+
**Vosk** (local, streaming), **Whisper** (local, batch),
|
|
118
|
+
**Whisper FUTO** (local, batch — ACFT-tuned for short dictations),
|
|
119
|
+
**OpenAI** (cloud, batch *or* streaming), **Groq** (cloud, batch).
|
|
120
|
+
- Delivers the transcript four ways: paste into the focused window
|
|
121
|
+
(default), copy to clipboard, print to the terminal, or write to
|
|
122
|
+
a file.
|
|
100
123
|
- Runs as a **system tray icon** with a single Record button, or as an
|
|
101
124
|
interactive **terminal TUI** — same menu in both.
|
|
102
125
|
- Hooks into your DE's keyboard shortcuts via `SIGUSR1` (toggle
|
|
@@ -126,8 +149,8 @@ scribe
|
|
|
126
149
|
This launches the system tray icon. Press Record, speak, press Stop —
|
|
127
150
|
the transcription lands in the focused window. Scribe picks the first
|
|
128
151
|
backend whose key / dependency is present, in order **`groq` →
|
|
129
|
-
`openai` → `whisper` → `vosk`**, so with `GROQ_API_KEY`
|
|
130
|
-
command above is equivalent to:
|
|
152
|
+
`openai` → `whisper-futo` → `whisper` → `vosk`**, so with `GROQ_API_KEY`
|
|
153
|
+
set the command above is equivalent to:
|
|
131
154
|
|
|
132
155
|
```bash
|
|
133
156
|
scribe --backend groq --model whisper-large-v3-turbo
|
|
@@ -142,15 +165,17 @@ scribe --backend openai --model gpt-4o-mini-transcribe # OpenAI sweet spot
|
|
|
142
165
|
scribe --backend openai --model gpt-realtime-whisper # OpenAI streaming
|
|
143
166
|
scribe --backend whisper --model small # local, no API key
|
|
144
167
|
scribe --frontend terminal # interactive TUI menu
|
|
145
|
-
scribe --
|
|
168
|
+
scribe --record # start recording immediately on launch (works in tray or terminal)
|
|
169
|
+
scribe --record --frontend terminal --mode file # one-shot batched dictation → file
|
|
170
|
+
scribe --record --frontend terminal --mode file --stream # streamed: chunks appended live as you speak
|
|
146
171
|
scribe --mode clipboard # copy to clipboard, no keystroke
|
|
147
172
|
scribe --mode terminal # only print to stdout
|
|
148
|
-
scribe -o transcript.txt
|
|
173
|
+
scribe --mode file -o transcript.txt # append to a file (no keystroke / clipboard)
|
|
149
174
|
```
|
|
150
175
|
|
|
151
176
|
With `--no-interactive` (terminal frontend only), scribe skips the
|
|
152
177
|
interactive menu and starts recording right away — handy for scripted,
|
|
153
|
-
one-shot transcriptions.
|
|
178
|
+
one-shot transcriptions.
|
|
154
179
|
|
|
155
180
|
Bias the recogniser toward names, jargon, or a domain glossary with
|
|
156
181
|
`--prompt "free text hint"` and `--words word1 word2 ...` (each also
|
|
@@ -161,12 +186,13 @@ for what each backend does with them.
|
|
|
161
186
|
|
|
162
187
|
## Backends at a glance
|
|
163
188
|
|
|
164
|
-
| Backend
|
|
165
|
-
|
|
166
|
-
| Groq (cloud)
|
|
167
|
-
| OpenAI (cloud)
|
|
168
|
-
| Whisper (local) | `whisper`
|
|
169
|
-
|
|
|
189
|
+
| Backend | `--backend` | Default model | Streaming model(s) | Requires |
|
|
190
|
+
|----------------------|-----------------|----------------------------|---------------------------|----------------------------------------|
|
|
191
|
+
| Groq (cloud) | `groq` | `whisper-large-v3-turbo` | — | `GROQ_API_KEY` |
|
|
192
|
+
| OpenAI (cloud) | `openai` | `gpt-4o-mini-transcribe` | `gpt-realtime-whisper` | `OPENAI_API_KEY` |
|
|
193
|
+
| Whisper FUTO (local) | `whisper-futo` | `small` | — | `pip install scribe-cli[whisper-futo]` |
|
|
194
|
+
| Whisper (local) | `whisper` | `small` | — | `pip install scribe-cli[whisper]` |
|
|
195
|
+
| Vosk (local) | `vosk` | language-dependent | all Vosk models | `pip install scribe-cli[vosk]` |
|
|
170
196
|
|
|
171
197
|
Whether a transcription appears live as you speak or all at once when
|
|
172
198
|
you stop depends on the **model** picked — see
|
|
@@ -175,8 +201,11 @@ you stop depends on the **model** picked — see
|
|
|
175
201
|
|
|
176
202
|
### Getting an API key
|
|
177
203
|
|
|
178
|
-
Groq is
|
|
179
|
-
|
|
204
|
+
Groq is the **recommended cloud backend by default** — extremely fast
|
|
205
|
+
(by a wide margin compared to other cloud STT options, especially in
|
|
206
|
+
**Stream** mode where the per-chunk roundtrip latency dominates the
|
|
207
|
+
perceived speed), quite accurate, and the **free tier** is generous
|
|
208
|
+
enough for everyday dictation. Sign up at
|
|
180
209
|
[console.groq.com](https://console.groq.com/), create an API key
|
|
181
210
|
under **Settings → API Keys**, and export it as `GROQ_API_KEY`.
|
|
182
211
|
|
|
@@ -189,7 +218,7 @@ I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe`
|
|
|
189
218
|
extras, Ubuntu / GNOME tray libs.
|
|
190
219
|
- [Backends in detail](docs/backends.md) — model lists, when to pick
|
|
191
220
|
which, the realtime model.
|
|
192
|
-
- [
|
|
221
|
+
- [Output modes & typer backends](docs/output.md) — keystroke vs
|
|
193
222
|
clipboard, Wayland / `eitype`, `--type-direct`.
|
|
194
223
|
- [System tray & global hotkeys](docs/tray.md) — menu tree, icon
|
|
195
224
|
states, `SIGUSR1`/`SIGUSR2`.
|
|
@@ -198,10 +227,17 @@ I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe`
|
|
|
198
227
|
- [Fine tuning & CLI reference](docs/cli.md) — every `scribe --help`
|
|
199
228
|
flag with examples.
|
|
200
229
|
|
|
230
|
+
## Related projects
|
|
231
|
+
|
|
232
|
+
- **[bard](https://github.com/perrette/bard)** — TTS sibling of scribe,
|
|
233
|
+
same tray/CLI architecture in reverse: highlight text, hear it
|
|
234
|
+
spoken. Shares the [`desktop-ai-core`](https://github.com/perrette/desktop-ai-core)
|
|
235
|
+
backbone (frontends, providers, dialog helpers).
|
|
236
|
+
|
|
201
237
|
## Compatibility
|
|
202
238
|
|
|
203
239
|
Initially developed for Python 3 on Ubuntu 24.04 (GNOME + Wayland);
|
|
204
240
|
works on macOS and Windows too. Wayland keystroke injection is
|
|
205
|
-
convoluted but [solved](docs/
|
|
241
|
+
convoluted but [solved](docs/output.md). For dependencies of
|
|
206
242
|
individual subsystems, check `pynput` (keyboard) and `pystray` (tray
|
|
207
243
|
icon).
|
|
@@ -9,11 +9,13 @@ cloud-based APIs, batch and streaming workflows.
|
|
|
9
9
|
|
|
10
10
|
## What it does
|
|
11
11
|
|
|
12
|
-
- Records from your mic and transcribes via one of
|
|
13
|
-
**Vosk** (local, streaming), **Whisper** (local, batch),
|
|
14
|
-
(
|
|
15
|
-
|
|
16
|
-
|
|
12
|
+
- Records from your mic and transcribes via one of five backends —
|
|
13
|
+
**Vosk** (local, streaming), **Whisper** (local, batch),
|
|
14
|
+
**Whisper FUTO** (local, batch — ACFT-tuned for short dictations),
|
|
15
|
+
**OpenAI** (cloud, batch *or* streaming), **Groq** (cloud, batch).
|
|
16
|
+
- Delivers the transcript four ways: paste into the focused window
|
|
17
|
+
(default), copy to clipboard, print to the terminal, or write to
|
|
18
|
+
a file.
|
|
17
19
|
- Runs as a **system tray icon** with a single Record button, or as an
|
|
18
20
|
interactive **terminal TUI** — same menu in both.
|
|
19
21
|
- Hooks into your DE's keyboard shortcuts via `SIGUSR1` (toggle
|
|
@@ -43,8 +45,8 @@ scribe
|
|
|
43
45
|
This launches the system tray icon. Press Record, speak, press Stop —
|
|
44
46
|
the transcription lands in the focused window. Scribe picks the first
|
|
45
47
|
backend whose key / dependency is present, in order **`groq` →
|
|
46
|
-
`openai` → `whisper` → `vosk`**, so with `GROQ_API_KEY`
|
|
47
|
-
command above is equivalent to:
|
|
48
|
+
`openai` → `whisper-futo` → `whisper` → `vosk`**, so with `GROQ_API_KEY`
|
|
49
|
+
set the command above is equivalent to:
|
|
48
50
|
|
|
49
51
|
```bash
|
|
50
52
|
scribe --backend groq --model whisper-large-v3-turbo
|
|
@@ -59,15 +61,17 @@ scribe --backend openai --model gpt-4o-mini-transcribe # OpenAI sweet spot
|
|
|
59
61
|
scribe --backend openai --model gpt-realtime-whisper # OpenAI streaming
|
|
60
62
|
scribe --backend whisper --model small # local, no API key
|
|
61
63
|
scribe --frontend terminal # interactive TUI menu
|
|
62
|
-
scribe --
|
|
64
|
+
scribe --record # start recording immediately on launch (works in tray or terminal)
|
|
65
|
+
scribe --record --frontend terminal --mode file # one-shot batched dictation → file
|
|
66
|
+
scribe --record --frontend terminal --mode file --stream # streamed: chunks appended live as you speak
|
|
63
67
|
scribe --mode clipboard # copy to clipboard, no keystroke
|
|
64
68
|
scribe --mode terminal # only print to stdout
|
|
65
|
-
scribe -o transcript.txt
|
|
69
|
+
scribe --mode file -o transcript.txt # append to a file (no keystroke / clipboard)
|
|
66
70
|
```
|
|
67
71
|
|
|
68
72
|
With `--no-interactive` (terminal frontend only), scribe skips the
|
|
69
73
|
interactive menu and starts recording right away — handy for scripted,
|
|
70
|
-
one-shot transcriptions.
|
|
74
|
+
one-shot transcriptions.
|
|
71
75
|
|
|
72
76
|
Bias the recogniser toward names, jargon, or a domain glossary with
|
|
73
77
|
`--prompt "free text hint"` and `--words word1 word2 ...` (each also
|
|
@@ -78,12 +82,13 @@ for what each backend does with them.
|
|
|
78
82
|
|
|
79
83
|
## Backends at a glance
|
|
80
84
|
|
|
81
|
-
| Backend
|
|
82
|
-
|
|
83
|
-
| Groq (cloud)
|
|
84
|
-
| OpenAI (cloud)
|
|
85
|
-
| Whisper (local) | `whisper`
|
|
86
|
-
|
|
|
85
|
+
| Backend | `--backend` | Default model | Streaming model(s) | Requires |
|
|
86
|
+
|----------------------|-----------------|----------------------------|---------------------------|----------------------------------------|
|
|
87
|
+
| Groq (cloud) | `groq` | `whisper-large-v3-turbo` | — | `GROQ_API_KEY` |
|
|
88
|
+
| OpenAI (cloud) | `openai` | `gpt-4o-mini-transcribe` | `gpt-realtime-whisper` | `OPENAI_API_KEY` |
|
|
89
|
+
| Whisper FUTO (local) | `whisper-futo` | `small` | — | `pip install scribe-cli[whisper-futo]` |
|
|
90
|
+
| Whisper (local) | `whisper` | `small` | — | `pip install scribe-cli[whisper]` |
|
|
91
|
+
| Vosk (local) | `vosk` | language-dependent | all Vosk models | `pip install scribe-cli[vosk]` |
|
|
87
92
|
|
|
88
93
|
Whether a transcription appears live as you speak or all at once when
|
|
89
94
|
you stop depends on the **model** picked — see
|
|
@@ -92,8 +97,11 @@ you stop depends on the **model** picked — see
|
|
|
92
97
|
|
|
93
98
|
### Getting an API key
|
|
94
99
|
|
|
95
|
-
Groq is
|
|
96
|
-
|
|
100
|
+
Groq is the **recommended cloud backend by default** — extremely fast
|
|
101
|
+
(by a wide margin compared to other cloud STT options, especially in
|
|
102
|
+
**Stream** mode where the per-chunk roundtrip latency dominates the
|
|
103
|
+
perceived speed), quite accurate, and the **free tier** is generous
|
|
104
|
+
enough for everyday dictation. Sign up at
|
|
97
105
|
[console.groq.com](https://console.groq.com/), create an API key
|
|
98
106
|
under **Settings → API Keys**, and export it as `GROQ_API_KEY`.
|
|
99
107
|
|
|
@@ -106,7 +114,7 @@ I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe`
|
|
|
106
114
|
extras, Ubuntu / GNOME tray libs.
|
|
107
115
|
- [Backends in detail](docs/backends.md) — model lists, when to pick
|
|
108
116
|
which, the realtime model.
|
|
109
|
-
- [
|
|
117
|
+
- [Output modes & typer backends](docs/output.md) — keystroke vs
|
|
110
118
|
clipboard, Wayland / `eitype`, `--type-direct`.
|
|
111
119
|
- [System tray & global hotkeys](docs/tray.md) — menu tree, icon
|
|
112
120
|
states, `SIGUSR1`/`SIGUSR2`.
|
|
@@ -115,10 +123,17 @@ I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe`
|
|
|
115
123
|
- [Fine tuning & CLI reference](docs/cli.md) — every `scribe --help`
|
|
116
124
|
flag with examples.
|
|
117
125
|
|
|
126
|
+
## Related projects
|
|
127
|
+
|
|
128
|
+
- **[bard](https://github.com/perrette/bard)** — TTS sibling of scribe,
|
|
129
|
+
same tray/CLI architecture in reverse: highlight text, hear it
|
|
130
|
+
spoken. Shares the [`desktop-ai-core`](https://github.com/perrette/desktop-ai-core)
|
|
131
|
+
backbone (frontends, providers, dialog helpers).
|
|
132
|
+
|
|
118
133
|
## Compatibility
|
|
119
134
|
|
|
120
135
|
Initially developed for Python 3 on Ubuntu 24.04 (GNOME + Wayland);
|
|
121
136
|
works on macOS and Windows too. Wayland keystroke injection is
|
|
122
|
-
convoluted but [solved](docs/
|
|
137
|
+
convoluted but [solved](docs/output.md). For dependencies of
|
|
123
138
|
individual subsystems, check `pynput` (keyboard) and `pystray` (tray
|
|
124
139
|
icon).
|
|
Binary file
|
|
@@ -70,7 +70,7 @@ Vosk transcribes in real time and is very good at one language at a
|
|
|
70
70
|
time, but tends to make more mistakes than Whisper and does not produce
|
|
71
71
|
punctuation. It becomes really useful in longer, interactive sessions
|
|
72
72
|
where the live "appears as you speak" UX matters — see
|
|
73
|
-
[
|
|
73
|
+
[output.md](output.md) for how the keystroke mode interacts with
|
|
74
74
|
streaming models.
|
|
75
75
|
|
|
76
76
|
There are many [Vosk models](https://alphacephei.com/vosk/models)
|
|
@@ -117,12 +117,15 @@ for the full picture.
|
|
|
117
117
|
## `groq` (Groq cloud)
|
|
118
118
|
|
|
119
119
|
Talks to Groq's OpenAI-compatible API and defaults to
|
|
120
|
-
`whisper-large-v3-turbo`.
|
|
121
|
-
|
|
120
|
+
`whisper-large-v3-turbo`. **Extremely fast** thanks to Groq's
|
|
121
|
+
inference hardware — the recommended cloud backend by default, and
|
|
122
|
+
the natural pick for `--stream` mode where per-chunk roundtrip
|
|
123
|
+
latency dominates perceived speed:
|
|
122
124
|
|
|
123
125
|
```bash
|
|
124
126
|
export GROQ_API_KEY=YOURAPIKEY
|
|
125
|
-
scribe --backend groq
|
|
127
|
+
scribe --backend groq # Clip mode (default)
|
|
128
|
+
scribe --backend groq --stream # live transcription, per-chunk
|
|
126
129
|
```
|
|
127
130
|
|
|
128
131
|
The `groq` backend reuses the `openai` Python client under the hood, so
|
|
@@ -146,14 +149,14 @@ style, domain, or word list. The concept is generic across the
|
|
|
146
149
|
whisper-family backends but each backend exposes it slightly
|
|
147
150
|
differently:
|
|
148
151
|
|
|
149
|
-
| Backend | `--prompt` | `--words` |
|
|
150
|
-
|
|
151
|
-
| `whisper` (faster-whisper, local) | passed as `initial_prompt=` | passed as `hotwords=` — a **dedicated biasing channel** separate from the prompt |
|
|
152
|
-
| `whisper-futo` (pywhispercpp, local) | passed as `initial_prompt=` | joined onto the prompt string (no separate hotwords channel here) |
|
|
153
|
-
| `openai` batch (`gpt-4o*-transcribe`) | passed as `prompt=` | joined onto the prompt string |
|
|
154
|
-
| `groq` (`whisper-large-v3-turbo`) | passed as `prompt=` | joined onto the prompt string |
|
|
155
|
-
| `openai` realtime (`gpt-realtime-whisper`) | *silently ignored* — the model rejects the prompt parameter server-side (HTTP 400 *"The 'prompt' parameter is not supported for this model."*). The kwarg stays accepted for plumbing compatibility but never reaches the API. | same — joined into the (ignored) prompt |
|
|
156
|
-
| `vosk` | *ignored* (no soft prompt) | *ignored* (Vosk only supports a hard `grammar` allowlist; not yet exposed) |
|
|
152
|
+
| Backend | `--prompt` | `--words` | `--language` |
|
|
153
|
+
|--------------------------------------|-------------------------------|--------------------------------------------------------|---------------------------------------------------------|
|
|
154
|
+
| `whisper` (faster-whisper, local) | passed as `initial_prompt=` | passed as `hotwords=` — a **dedicated biasing channel** separate from the prompt | passed as `language=` (ISO 639-1); `-l en` also auto-substitutes `small.en` etc. |
|
|
155
|
+
| `whisper-futo` (pywhispercpp, local) | passed as `initial_prompt=` | joined onto the prompt string (no separate hotwords channel here) | passed as `language=` (ISO 639-1); `-l en` auto-substitutes `small.en` etc. |
|
|
156
|
+
| `openai` batch (`gpt-4o*-transcribe`) | passed as `prompt=` | joined onto the prompt string | passed as `language=` hint (ISO 639-1) |
|
|
157
|
+
| `groq` (`whisper-large-v3-turbo`) | passed as `prompt=` | joined onto the prompt string | passed as `language=` hint (ISO 639-1) |
|
|
158
|
+
| `openai` realtime (`gpt-realtime-whisper`) | *silently ignored* — the model rejects the prompt parameter server-side (HTTP 400 *"The 'prompt' parameter is not supported for this model."*). The kwarg stays accepted for plumbing compatibility but never reaches the API. | same — joined into the (ignored) prompt | passed as `language=` (ISO 639-1) |
|
|
159
|
+
| `vosk` | *ignored* (no soft prompt) | *ignored* (Vosk only supports a hard `grammar` allowlist; not yet exposed) | picks a per-language model from `scribe/models.toml`; no runtime parameter |
|
|
157
160
|
|
|
158
161
|
The whisper-family APIs cap the prompt around ~224 tokens; longer
|
|
159
162
|
hints are silently truncated. Faster-whisper's `hotwords` channel is
|
|
@@ -184,34 +187,117 @@ invocation, pass an explicit empty value: `--prompt ""` (or
|
|
|
184
187
|
arguments (or `--words-file ""`) suppresses the words default. Each
|
|
185
188
|
side is independent.
|
|
186
189
|
|
|
187
|
-
##
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
190
|
+
## Language
|
|
191
|
+
|
|
192
|
+
`-l / --language LANG` tells the backend which language to expect.
|
|
193
|
+
What that means in practice varies by backend (see the per-backend
|
|
194
|
+
column in the table above):
|
|
195
|
+
|
|
196
|
+
- **Whisper-family** (`whisper`, `whisper-futo`, `openai` batch +
|
|
197
|
+
realtime, `groq`) — the language is passed to the model as a hard
|
|
198
|
+
lock: the decoder generates that language regardless of what it
|
|
199
|
+
hears acoustically. Accepts any [ISO 639-1 short code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)
|
|
200
|
+
Whisper recognises (~99 languages). When unset, Whisper auto-detects
|
|
201
|
+
per chunk.
|
|
202
|
+
- **English-only model variants** — for `whisper` and `whisper-futo`,
|
|
203
|
+
`-l en` *also* auto-substitutes the English-only model when one
|
|
204
|
+
exists (`small` → `small.en`, etc.). These variants trade
|
|
205
|
+
multilingual coverage for English accuracy.
|
|
206
|
+
- **Vosk** — language isn't a runtime parameter; vosk ships a
|
|
207
|
+
separate model per language. `-l fr` looks up the vosk model
|
|
208
|
+
pre-mapped to French in [`scribe/models.toml`](../scribe/models.toml)
|
|
209
|
+
and instantiates that one. Vosk has no auto-detect path, so the
|
|
210
|
+
Language menu's `Auto` entry on vosk falls back to a sensible
|
|
211
|
+
default — the tray shows `Auto (🇬🇧 en)` to make this explicit
|
|
212
|
+
without mutating the stored `language=None`.
|
|
213
|
+
|
|
214
|
+
The tray's **Language** submenu exposes the four curated languages
|
|
215
|
+
(`en` / `fr` / `de` / `it`) with origin-country flag prefixes
|
|
216
|
+
(🇬🇧 / 🇫🇷 / 🇩🇪 / 🇮🇹). The CLI accepts these plus any other ISO 639-1
|
|
217
|
+
code the active backend recognises.
|
|
218
|
+
|
|
219
|
+
## Stream mode (works with any backend)
|
|
220
|
+
|
|
221
|
+
`--stream` (or **Mode: Stream** in the tray) emits transcribed text
|
|
222
|
+
**live as you speak**, regardless of which backend you picked. This
|
|
223
|
+
is the headline v1.0.0 improvement: scribe abstracts over the two
|
|
224
|
+
different mechanisms that backends use to deliver live output, so
|
|
225
|
+
`--stream` works uniformly across every supported backend.
|
|
226
|
+
|
|
227
|
+
- **Native streaming backends** (Vosk, `gpt-realtime-whisper`) push
|
|
228
|
+
partial results from the server as audio is received — scribe just
|
|
229
|
+
forwards them to the chosen output (focused window / clipboard /
|
|
230
|
+
terminal / file). These backends are *always* in Stream mode; the
|
|
231
|
+
Mode toggle reads "Mode: Stream (native)" for them and is read-only.
|
|
232
|
+
- **Batch backends** (Whisper local, Whisper FUTO, OpenAI
|
|
233
|
+
`gpt-4o-*-transcribe`, Groq `whisper-large-v3-turbo`) don't accept
|
|
234
|
+
partial audio. scribe instead cuts the recording buffer on
|
|
235
|
+
detected silence and issues a separate transcription request for
|
|
236
|
+
each chunk — internally called *pseudo-streaming*. The user sees
|
|
237
|
+
the same live experience.
|
|
191
238
|
|
|
192
239
|
```bash
|
|
193
|
-
scribe --
|
|
240
|
+
scribe --stream # any backend, live transcription
|
|
241
|
+
scribe --stream --backend groq # Groq + Stream is the sweet spot
|
|
242
|
+
scribe --stream --backend whisper # local, live, no API key
|
|
194
243
|
```
|
|
195
244
|
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
245
|
+
### How pseudo-streaming carves up a recording
|
|
246
|
+
|
|
247
|
+
Once the buffer has grown to at least `--stream-chunk-min` (default
|
|
248
|
+
1.5 s), silence of at least `--stream-chunk-silence-break` (default
|
|
249
|
+
0.6 s) triggers a chunk cut. A force-cut fires at `--stream-chunk-max`
|
|
250
|
+
(default 10 s) regardless of silence, to cap latency. The session
|
|
251
|
+
continues until you stop it manually.
|
|
252
|
+
|
|
253
|
+
### Does pseudo-streaming change the API cost?
|
|
254
|
+
|
|
255
|
+
For cloud backends, going from one big transcription to N chunked
|
|
256
|
+
requests **does not normally change the bill**:
|
|
257
|
+
|
|
258
|
+
- **Groq** (`whisper-large-v3-turbo`) is billed per second of audio.
|
|
259
|
+
Total audio is unchanged → same cost.
|
|
260
|
+
- **OpenAI `whisper-1`** (legacy) is billed per minute of audio. Same
|
|
261
|
+
logic, same cost.
|
|
262
|
+
- **OpenAI `gpt-4o-transcribe` / `gpt-4o-mini-transcribe`** are
|
|
263
|
+
token-billed (audio-in + text-out + prompt-in). Audio and output stay
|
|
264
|
+
identical; the only delta is the rolling cross-chunk *prompt*
|
|
265
|
+
context (~200 chars ≈ 50–60 tokens per chunk after the first).
|
|
266
|
+
At gpt-4o-mini-transcribe input rates this is negligible — well
|
|
267
|
+
under a cent per long session.
|
|
268
|
+
|
|
269
|
+
That said, your real cost depends on your usage and your account's
|
|
270
|
+
pricing tier — **verify on your provider's billing dashboard** if
|
|
271
|
+
cost is a hard constraint.
|
|
272
|
+
|
|
273
|
+
Two special values for `--stream-chunk-silence-break` (set via the
|
|
274
|
+
tray's **Silence break** picker or `--stream-chunk-silence-break 0`
|
|
275
|
+
at the CLI):
|
|
276
|
+
|
|
277
|
+
- **Auto** (`0`) — disables the fixed-threshold trigger. At force-cut
|
|
278
|
+
time scribe picks the *longest* silence interval within the window
|
|
279
|
+
whose start position is at least `--stream-chunk-min` into the chunk,
|
|
280
|
+
re-cutting there for a more natural word boundary. Falls back to a
|
|
281
|
+
brute force-cut if no qualifying silence is found.
|
|
282
|
+
- **Max** — disables silence-based cuts entirely; only the force-cut at
|
|
283
|
+
`--stream-chunk-max` fires. Useful when you want uniform chunk sizes
|
|
284
|
+
regardless of speech patterns. (Only selectable from the tray picker.)
|
|
285
|
+
|
|
286
|
+
Stream mode is off by default — the default `Clip` mode transcribes the
|
|
287
|
+
whole recording at the end (`--clip`). The tray menu surfaces the same
|
|
288
|
+
toggle as the top-level **Mode: Stream / Clip** item. Native
|
|
289
|
+
streamers (vosk, `gpt-realtime-whisper`) are always streaming and the
|
|
290
|
+
menu shows **Mode: Stream (native)** for them.
|
|
206
291
|
|
|
207
292
|
### Cross-chunk prompt context
|
|
208
293
|
|
|
209
|
-
In pseudo-streaming
|
|
210
|
-
prompt with the trailing ~200 characters of the
|
|
211
|
-
transcription. This rolling tail is concatenated
|
|
212
|
-
`--prompt` / `--words` you configured and
|
|
213
|
-
the same channel as the static prompt
|
|
214
|
-
above). The motivation is cross-chunk
|
|
294
|
+
In Stream mode (pseudo-streaming) scribe automatically augments
|
|
295
|
+
each chunk's prompt with the trailing ~200 characters of the
|
|
296
|
+
*previous* chunk's transcription. This rolling tail is concatenated
|
|
297
|
+
onto whatever static `--prompt` / `--words` you configured and
|
|
298
|
+
reaches the backend through the same channel as the static prompt
|
|
299
|
+
(the vocabulary biasing table above). The motivation is cross-chunk
|
|
300
|
+
continuity:
|
|
215
301
|
|
|
216
302
|
- **Capitalization drift** — without context, a chunk that starts
|
|
217
303
|
right after a period might come back lowercased.
|
|
@@ -226,13 +312,12 @@ sits well under that and leaves room for your static prompt + words
|
|
|
226
312
|
list.
|
|
227
313
|
|
|
228
314
|
The rolling tail is **dropped** when the silence between two
|
|
229
|
-
utterances exceeds
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
natural sentence boundaries.
|
|
315
|
+
utterances exceeds `--stream-context-reset-silence` ×
|
|
316
|
+
`--stream-chunk-silence-break` (default 3 × 0.6 s = 1.8 s) — a long
|
|
317
|
+
pause is treated as a new sentence/idea boundary, where carrying a
|
|
318
|
+
possibly-bad prior chunk forward biases the next one more than it
|
|
319
|
+
helps. Use `--stream-context-reset-silence inf` to keep context across
|
|
320
|
+
arbitrarily long pauses.
|
|
236
321
|
|
|
237
322
|
Short pauses (mid-sentence punctuation) keep the context; the cut at
|
|
238
323
|
the start of every new recording also clears it.
|
|
@@ -13,10 +13,11 @@ The flags are grouped to mirror the source-of-truth in
|
|
|
13
13
|
|
|
14
14
|
| Flag | Purpose |
|
|
15
15
|
|---------------------------------|-------------------------------------------------------------------------|
|
|
16
|
-
| `--backend {vosk,whisper,openai,groq}` | Speech-recognition backend (prompted if omitted).
|
|
16
|
+
| `--backend {vosk,whisper,whisper-futo,openai,groq}` | Speech-recognition backend (prompted if omitted). |
|
|
17
17
|
| `--model NAME` | Model name for the chosen backend. Auto-routes to the right backend for known model names (e.g. `--model gpt-realtime-whisper` selects `openai`). |
|
|
18
|
-
| `-l, --language LANG` | Language alias selecting a preset Vosk model (`en`/`fr`/`de`/`it`), or `en` for English-only Whisper models. |
|
|
18
|
+
| `-l, --language LANG` | Language alias selecting a preset Vosk model (`en`/`fr`/`de`/`it`), or `en` for English-only Whisper / Whisper-FUTO models. |
|
|
19
19
|
| `--download-folder-whisper DIR` | Folder to store Whisper models. |
|
|
20
|
+
| `--download-folder-whisper-futo DIR` | Folder to store Whisper-FUTO ACFT ggml models (default: `$XDG_CACHE_HOME/whisper-futo`). |
|
|
20
21
|
| `--download-folder-vosk DIR` | Folder to store Vosk models. |
|
|
21
22
|
|
|
22
23
|
## Prompting & vocabulary biasing
|
|
@@ -55,22 +56,23 @@ flag suppresses only its own side (giving `--prompt ""` still loads
|
|
|
55
56
|
| Flag | Purpose |
|
|
56
57
|
|-----------------------|----------------------------------------------------------|
|
|
57
58
|
| `--input-device N` | Microphone device index (see `python -m sounddevice`). |
|
|
59
|
+
| `--dry-run` | Short-circuit the STT request boundary in every backend: model load is skipped and the network/SDK call returns a canned `[dry-run transcript]`. Used by the backend × mode smoke-test matrix; handy for testing the plumbing without network access. |
|
|
58
60
|
|
|
59
61
|
## Output
|
|
60
62
|
|
|
61
63
|
| Flag | Purpose |
|
|
62
64
|
|-----------------------------|---------------------------------------------------------------------------------------------|
|
|
63
|
-
| `-m, --mode {keystroke,clipboard,terminal}` | Where transcribed text goes (default `keystroke`). See [
|
|
65
|
+
| `-m, --mode {keystroke,clipboard,terminal,file}` | Where transcribed text goes (default `keystroke`). `file` routes the transcript exclusively to `--output-file` and suppresses keyboard/clipboard output. See [output.md](output.md). |
|
|
64
66
|
| `--typer {auto,eitype,pynput,wtype,ydotool}` | Keystroke-injection backend (default `auto`). |
|
|
65
67
|
| `--type-direct` | In keystroke mode, type the transcription as keystrokes instead of synthesising Ctrl+V. |
|
|
66
|
-
| `-o, --output-file FILE` |
|
|
68
|
+
| `-o, --output-file FILE` | Path the transcription is appended to when `--mode file` is selected. Defaults to `<user-desktop>/scribe-notes.txt` (the platform Desktop folder — `~/Desktop` on Linux/macOS, `%USERPROFILE%\Desktop` on Windows; falls back to the home directory if Desktop is absent). Ignored when `--mode` is anything other than `file` (the four output modes are mutually exclusive). |
|
|
67
69
|
|
|
68
70
|
## Silence detection
|
|
69
71
|
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
72
|
+
> **Deprecated aliases** (still accepted, hidden from `--help`):
|
|
73
|
+
> `--duration N` maps to `--clip-timeout N`; `--silence-duration N`
|
|
74
|
+
> sets both `--stream-chunk-silence-break` and `--realtime-commit-silence`
|
|
75
|
+
> to `N`. Existing scripts using these flags continue to work.
|
|
74
76
|
|
|
75
77
|
## Voice activity detection
|
|
76
78
|
|
|
@@ -94,22 +96,53 @@ mode's knobs are ignored.
|
|
|
94
96
|
|
|
95
97
|
## Realtime (`gpt-realtime-whisper`)
|
|
96
98
|
|
|
97
|
-
| Flag | Default
|
|
98
|
-
|
|
99
|
-
| `--realtime-delay {minimal,low,medium,high,xhigh}` | `medium`
|
|
100
|
-
| `--realtime-gate` / `--no-realtime-gate` | on
|
|
99
|
+
| Flag | Default | Purpose |
|
|
100
|
+
|---------------------------------------------------|--------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|
101
|
+
| `--realtime-delay {minimal,low,medium,high,xhigh}` | `medium` | Trade off latency vs accuracy on `gpt-realtime-whisper`. Lower = faster partials but more paste churn in the focused window. |
|
|
102
|
+
| `--realtime-gate` / `--no-realtime-gate` | on | Drop silent frames (per the active `--vad-mode`) before sending them over the WebSocket so silent audio isn't billed as input tokens. |
|
|
103
|
+
| `--realtime-commit-silence SECS` | `0.6` | Seconds of silence before a mid-session commit flushes trailing words to the server (default `0.6`). Set to `0` to rely solely on the server's turn detection. |
|
|
104
|
+
|
|
105
|
+
The tray's **Stream (advanced) › Stream** picker unifies `--realtime-gate`
|
|
106
|
+
and `--realtime-commit-silence` into a single choice: **Live** (gate
|
|
107
|
+
off, commit disabled — server turn detection only) or **Offline after
|
|
108
|
+
Xs** (gate on, commit after X seconds of silence). At the CLI level the
|
|
109
|
+
two flags remain independent. The auto-stop timeout is documented under
|
|
110
|
+
**Listening mode → `--stream-timeout`** below (covers both native
|
|
111
|
+
streamers and pseudo-streaming on batch backends).
|
|
101
112
|
|
|
102
113
|
Streaming models (Vosk, `gpt-realtime-whisper`) ignore the batch
|
|
103
114
|
silence-chunking knobs; they have their own end-of-utterance signal.
|
|
104
115
|
|
|
116
|
+
## Listening mode
|
|
117
|
+
|
|
118
|
+
| Flag | Default | Purpose |
|
|
119
|
+
|-----------------------------------|---------|-------------------------------------------------------------------------------------------|
|
|
120
|
+
| `--stream` | — | Force a batch backend (whisper, whisper-futo, openai non-realtime, groq) into pseudo-streaming — live chunks driven by `--stream-chunk-max` and `--stream-chunk-silence-break`. Same as the tray's **Mode: Stream**. |
|
|
121
|
+
| `--clip` | default | Transcribe the whole recording at the end. Same as the tray's **Mode: Clip**. |
|
|
122
|
+
| `--stream-chunk-max SECS` | `10` | Maximum chunk duration in seconds. Force-cut fires at this threshold when no silence pause has been detected (default `10`). |
|
|
123
|
+
| `--stream-chunk-min SECS` | `1.5` | Minimum chunk size before a silence-cut is allowed (default `1.5`). Prevents very short clips that cause Whisper hallucinations. |
|
|
124
|
+
| `--stream-chunk-silence-break SECS` | `0.6` | Silence duration that triggers a chunk cut (default `0.6`). Special value `0` enables Auto mode (best-silence-in-window at force-cut time). |
|
|
125
|
+
| `--stream-context-reset-silence X` | `3.0` | Multiplier of `--stream-chunk-silence-break` above which the rolling cross-chunk prompt context is discarded (default `3.0`, i.e. 1.8 s at default silence-break). Use `inf` to never reset. |
|
|
126
|
+
| `--clip-timeout SECS` | `120` | Auto-stop after this many seconds in Clip mode (default `120`). |
|
|
127
|
+
| `--stream-timeout SECS` | `None` | Auto-stop after this many seconds in Stream mode (`None` = Always On, no auto-stop). Tray equivalent: **Stream timeout** in the Stream (advanced) submenu. |
|
|
128
|
+
|
|
129
|
+
Native streamers (vosk, `gpt-realtime-whisper`) are always streaming
|
|
130
|
+
and ignore `--clip`. `--realtime`, `--pseudo-streaming`,
|
|
131
|
+
`--streaming-window`, and `--realtime-timeout` are kept as hidden
|
|
132
|
+
back-compat aliases (`--streaming-window N` maps to
|
|
133
|
+
`--stream-chunk-max 2N` to preserve the old effective force-cut
|
|
134
|
+
threshold; `--realtime-timeout` maps to `--stream-timeout`).
|
|
135
|
+
|
|
105
136
|
## Frontend
|
|
106
137
|
|
|
107
138
|
| Flag | Purpose |
|
|
108
139
|
|-----------------------------|----------------------------------------------------------------------|
|
|
109
140
|
| `--frontend {tray,terminal}` | UI to launch (default `tray`). |
|
|
110
|
-
| `--no-interactive` | In terminal mode, skip the interactive menu and record immediately.
|
|
141
|
+
| `--no-interactive` | In terminal mode, skip the interactive menu and record immediately. |
|
|
142
|
+
| `--record` | Start recording immediately on launch, regardless of frontend. With the terminal frontend it's a shortcut for `--no-interactive`; with the tray frontend it auto-fires the Record action ~0.5 s after the icon comes up. Useful for hotkey bindings (`scribe --record` triggers a recording from anywhere) and batched / scripted invocations. |
|
|
111
143
|
| `--vosk-models M [M ...]` | Vosk models offered in the tray menu. |
|
|
112
144
|
| `--whisper-models M [M ...]` | Whisper models offered in the tray menu. |
|
|
145
|
+
| `--whisper-futo-models M [M ...]` | Whisper-FUTO ACFT models offered in the tray menu. |
|
|
113
146
|
|
|
114
147
|
## Examples
|
|
115
148
|
|
|
@@ -134,13 +167,31 @@ environment) — you'll pay for silent audio while the session is open:
|
|
|
134
167
|
scribe --model gpt-realtime-whisper --no-realtime-gate
|
|
135
168
|
```
|
|
136
169
|
|
|
137
|
-
|
|
138
|
-
|
|
170
|
+
**Batched / scripted use** — record one dictation headlessly, write
|
|
171
|
+
it where you want, exit. No tray, no menu, no clipboard:
|
|
139
172
|
|
|
140
173
|
```bash
|
|
141
|
-
|
|
174
|
+
# Append to a file (default <Desktop>/scribe-notes.txt — override with -o)
|
|
175
|
+
scribe --record --frontend terminal --mode file
|
|
176
|
+
|
|
177
|
+
# Same with a custom path
|
|
178
|
+
scribe --record --frontend terminal --mode file -o /tmp/notes.txt
|
|
179
|
+
|
|
180
|
+
# Pipe-friendly: transcript on stdout
|
|
181
|
+
scribe --record --frontend terminal --mode terminal
|
|
182
|
+
|
|
183
|
+
# Streamed: chunks appended live (as you speak) instead of all-at-once
|
|
184
|
+
# at end-of-recording. Useful for long dictations and tail-following:
|
|
185
|
+
# tail -f /tmp/notes.txt
|
|
186
|
+
scribe --record --frontend terminal --mode file --stream -o /tmp/notes.txt
|
|
142
187
|
```
|
|
143
188
|
|
|
189
|
+
`--record` starts the recording immediately, `--frontend terminal`
|
|
190
|
+
skips the tray icon, `--mode file` (or `terminal`) picks where the
|
|
191
|
+
transcript lands, `--stream` (optional) emits chunks live instead of
|
|
192
|
+
the default Clip-mode all-at-once output. Combine with a hotkey or cron for
|
|
193
|
+
one-shot capture.
|
|
194
|
+
|
|
144
195
|
Bias the recogniser toward domain jargon (medical terms, proper names):
|
|
145
196
|
|
|
146
197
|
```bash
|