scribe-cli 0.16.0__tar.gz → 0.17.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/PKG-INFO +58 -12
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/README.md +53 -11
- scribe_cli-0.17.1/docs/app-tray-menu.png +0 -0
- scribe_cli-0.17.1/docs/backends.md +238 -0
- scribe_cli-0.17.1/docs/cli.md +137 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/docs/keyboard.md +31 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/docs/tray.md +4 -2
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/pyproject.toml +3 -1
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/_version.py +3 -3
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/app.py +230 -40
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/backends/__init__.py +3 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/backends/groq.py +4 -2
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/backends/openai_api.py +7 -2
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/backends/openai_realtime.py +167 -32
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/backends/vosk.py +6 -4
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/backends/whisper.py +11 -1
- scribe_cli-0.17.1/scribe/backends/whisper_futo.py +201 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/menu.py +210 -15
- scribe_cli-0.17.1/scribe/models.py +280 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/session.py +10 -1
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe_cli.egg-info/PKG-INFO +58 -12
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe_cli.egg-info/SOURCES.txt +6 -1
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe_cli.egg-info/requires.txt +5 -0
- scribe_cli-0.17.1/scripts/bench_whisper_local.py +156 -0
- scribe_cli-0.17.1/tests/test_openai_realtime_coalesce.py +221 -0
- scribe_cli-0.17.1/tests/test_pseudo_streaming.py +288 -0
- scribe_cli-0.17.1/tests/test_whisper_futo.py +245 -0
- scribe_cli-0.16.0/docs/app-tray-menu.png +0 -0
- scribe_cli-0.16.0/docs/backends.md +0 -122
- scribe_cli-0.16.0/docs/cli.md +0 -95
- scribe_cli-0.16.0/scribe/models.py +0 -144
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/.github/FUNDING.yml +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/.github/workflows/pypi.yml +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/.gitignore +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/LICENSE +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/docs/desktop-install.md +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/docs/installation.md +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/docs/roadmap-libei.md +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/icon.xcf +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/__init__.py +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/audio.py +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/install_desktop.py +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/keyboard.py +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/models.toml +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/saverecording.py +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/testpynput.py +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/typers/__init__.py +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/typers/base.py +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/typers/eitype.py +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/typers/pynput.py +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/typers/wtype.py +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/typers/ydotool.py +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe/util.py +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe_cli.egg-info/dependency_links.txt +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe_cli.egg-info/entry_points.txt +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe_cli.egg-info/top_level.txt +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe_data/__init__.py +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe_data/share/icon.png +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe_data/share/icon_recording.png +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe_data/share/icon_writing.png +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scribe_data/templates/scribe.desktop +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/scripts/test_python_versions_install.sh +0 -0
- {scribe_cli-0.16.0 → scribe_cli-0.17.1}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: scribe-cli
|
|
3
|
-
Version: 0.16.0
|
|
3
|
+
Version: 0.17.1
|
|
4
4
|
Summary: Speech-to-text CLI and system-tray app for dictating into any focused window. Local (vosk, faster-whisper) or cloud (groq, openai) backends, batch or streaming.
|
|
5
5
|
Author-email: Mahé Perrette <mahe.perrette@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -50,11 +50,14 @@ Requires-Dist: requests
|
|
|
50
50
|
Requires-Dist: pyperclip
|
|
51
51
|
Requires-Dist: unidecode
|
|
52
52
|
Requires-Dist: termcolor
|
|
53
|
+
Requires-Dist: platformdirs
|
|
53
54
|
Requires-Dist: desktop-ai-core>=0.2.0
|
|
54
55
|
Provides-Extra: keyboard
|
|
55
56
|
Requires-Dist: pynput; extra == "keyboard"
|
|
56
57
|
Provides-Extra: whisper
|
|
57
58
|
Requires-Dist: faster-whisper; extra == "whisper"
|
|
59
|
+
Provides-Extra: whisper-futo
|
|
60
|
+
Requires-Dist: pywhispercpp; extra == "whisper-futo"
|
|
58
61
|
Provides-Extra: vosk
|
|
59
62
|
Requires-Dist: vosk; extra == "vosk"
|
|
60
63
|
Provides-Extra: app
|
|
@@ -69,6 +72,7 @@ Requires-Dist: soundfile; extra == "groq"
|
|
|
69
72
|
Provides-Extra: all
|
|
70
73
|
Requires-Dist: pynput; extra == "all"
|
|
71
74
|
Requires-Dist: faster-whisper; extra == "all"
|
|
75
|
+
Requires-Dist: pywhispercpp; extra == "all"
|
|
72
76
|
Requires-Dist: openai<3,>=2.37.0; extra == "all"
|
|
73
77
|
Requires-Dist: soundfile; extra == "all"
|
|
74
78
|
Requires-Dist: vosk; extra == "all"
|
|
@@ -98,29 +102,60 @@ cloud-based APIs, batch and streaming workflows.
|
|
|
98
102
|
- Cross-platform: tested on Ubuntu (X11 and Wayland), macOS, Windows;
|
|
99
103
|
works under Termux for clipboard / terminal output.
|
|
100
104
|
|
|
101
|
-
##
|
|
105
|
+
## Install
|
|
102
106
|
|
|
103
107
|
```bash
|
|
104
108
|
sudo apt-get install portaudio19-dev xclip # Ubuntu; macOS: brew install portaudio
|
|
105
109
|
pip install scribe-cli[all]
|
|
106
110
|
export GROQ_API_KEY=YOURAPIKEY # or OPENAI_API_KEY, or skip and run local
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
See the documentation below for setting up keyboard input on Ubuntu Wayland.
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
## Usage
|
|
117
|
+
|
|
118
|
+
In a terminal:
|
|
119
|
+
|
|
120
|
+
```bash
|
|
107
121
|
scribe
|
|
108
122
|
```
|
|
109
123
|
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
124
|
+
This launches the system tray icon. Press Record, speak, press Stop —
|
|
125
|
+
the transcription lands in the focused window. Scribe picks the first
|
|
126
|
+
backend whose key / dependency is present, in order **`groq` →
|
|
127
|
+
`openai` → `whisper-futo` → `whisper` → `vosk`**, so with `GROQ_API_KEY` set the
|
|
128
|
+
command above is equivalent to:
|
|
113
129
|
|
|
114
|
-
|
|
130
|
+
```bash
|
|
131
|
+
scribe --backend groq --model whisper-large-v3-turbo
|
|
132
|
+
```
|
|
115
133
|
|
|
116
|
-
|
|
134
|
+
<img src=https://raw.githubusercontent.com/perrette/scribe/main/docs/app-tray-menu.png width=300px>
|
|
117
135
|
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
136
|
+
You can override the defaults or drop the tray entirely:
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
scribe --backend openai --model gpt-4o-mini-transcribe # OpenAI sweet spot
|
|
140
|
+
scribe --backend openai --model gpt-realtime-whisper # OpenAI streaming
|
|
141
|
+
scribe --backend whisper --model small # local, no API key
|
|
142
|
+
scribe --frontend terminal # interactive TUI menu
|
|
143
|
+
scribe --frontend terminal --no-interactive # record immediately, no menu
|
|
144
|
+
scribe --mode clipboard # copy to clipboard, no keystroke
|
|
145
|
+
scribe --mode terminal # only print to stdout
|
|
146
|
+
scribe -o transcript.txt # also append to a file
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
With `--no-interactive` (terminal frontend only), scribe skips the
|
|
150
|
+
interactive menu and starts recording right away — handy for scripted,
|
|
151
|
+
one-shot transcriptions. `--no-prompt` is kept as a deprecated alias.
|
|
152
|
+
|
|
153
|
+
Bias the recogniser toward names, jargon, or a domain glossary with
|
|
154
|
+
`--prompt "free text hint"` and `--words word1 word2 ...` (each also
|
|
155
|
+
accepts a `--prompt-file` / `--words-file` companion). See
|
|
156
|
+
[docs/backends.md › Vocabulary biasing](docs/backends.md#vocabulary-biasing)
|
|
157
|
+
for what each backend does with them.
|
|
122
158
|
|
|
123
|
-
I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe` as it is also fast and perhaps more accurate for my accent-tainted English.
|
|
124
159
|
|
|
125
160
|
## Backends at a glance
|
|
126
161
|
|
|
@@ -135,6 +170,17 @@ Whether a transcription appears live as you speak or all at once when
|
|
|
135
170
|
you stop depends on the **model** picked — see
|
|
136
171
|
[docs/backends.md](docs/backends.md).
|
|
137
172
|
|
|
173
|
+
|
|
174
|
+
### Getting an API key
|
|
175
|
+
|
|
176
|
+
Groq is a good cloud backend to start with — very fast, quite accurate, and the
|
|
177
|
+
**free tier** is generous enough for everyday dictation. Sign up at
|
|
178
|
+
[console.groq.com](https://console.groq.com/), create an API key
|
|
179
|
+
under **Settings → API Keys**, and export it as `GROQ_API_KEY`.
|
|
180
|
+
|
|
181
|
+
I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe` as it is also fast and perhaps more accurate for my accent-tainted English.
|
|
182
|
+
|
|
183
|
+
|
|
138
184
|
## Documentation
|
|
139
185
|
|
|
140
186
|
- [Installation & dependencies](docs/installation.md) — PortAudio,
|
|
@@ -21,29 +21,60 @@ cloud-based APIs, batch and streaming workflows.
|
|
|
21
21
|
- Cross-platform: tested on Ubuntu (X11 and Wayland), macOS, Windows;
|
|
22
22
|
works under Termux for clipboard / terminal output.
|
|
23
23
|
|
|
24
|
-
##
|
|
24
|
+
## Install
|
|
25
25
|
|
|
26
26
|
```bash
|
|
27
27
|
sudo apt-get install portaudio19-dev xclip # Ubuntu; macOS: brew install portaudio
|
|
28
28
|
pip install scribe-cli[all]
|
|
29
29
|
export GROQ_API_KEY=YOURAPIKEY # or OPENAI_API_KEY, or skip and run local
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
See the documentation below for setting up keyboard input on Ubuntu Wayland.
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
## Usage
|
|
36
|
+
|
|
37
|
+
In a terminal:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
30
40
|
scribe
|
|
31
41
|
```
|
|
32
42
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
43
|
+
This launches the system tray icon. Press Record, speak, press Stop —
|
|
44
|
+
the transcription lands in the focused window. Scribe picks the first
|
|
45
|
+
backend whose key / dependency is present, in order **`groq` →
|
|
46
|
+
`openai` → `whisper-futo` → `whisper` → `vosk`**, so with `GROQ_API_KEY` set the
|
|
47
|
+
command above is equivalent to:
|
|
36
48
|
|
|
37
|
-
|
|
49
|
+
```bash
|
|
50
|
+
scribe --backend groq --model whisper-large-v3-turbo
|
|
51
|
+
```
|
|
38
52
|
|
|
39
|
-
|
|
53
|
+
<img src=https://raw.githubusercontent.com/perrette/scribe/main/docs/app-tray-menu.png width=300px>
|
|
40
54
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
55
|
+
You can override the defaults or drop the tray entirely:
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
scribe --backend openai --model gpt-4o-mini-transcribe # OpenAI sweet spot
|
|
59
|
+
scribe --backend openai --model gpt-realtime-whisper # OpenAI streaming
|
|
60
|
+
scribe --backend whisper --model small # local, no API key
|
|
61
|
+
scribe --frontend terminal # interactive TUI menu
|
|
62
|
+
scribe --frontend terminal --no-interactive # record immediately, no menu
|
|
63
|
+
scribe --mode clipboard # copy to clipboard, no keystroke
|
|
64
|
+
scribe --mode terminal # only print to stdout
|
|
65
|
+
scribe -o transcript.txt # also append to a file
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
With `--no-interactive` (terminal frontend only), scribe skips the
|
|
69
|
+
interactive menu and starts recording right away — handy for scripted,
|
|
70
|
+
one-shot transcriptions. `--no-prompt` is kept as a deprecated alias.
|
|
71
|
+
|
|
72
|
+
Bias the recogniser toward names, jargon, or a domain glossary with
|
|
73
|
+
`--prompt "free text hint"` and `--words word1 word2 ...` (each also
|
|
74
|
+
accepts a `--prompt-file` / `--words-file` companion). See
|
|
75
|
+
[docs/backends.md › Vocabulary biasing](docs/backends.md#vocabulary-biasing)
|
|
76
|
+
for what each backend does with them.
|
|
45
77
|
|
|
46
|
-
I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe` as it is also fast and perhaps more accurate for my accent-tainted English.
|
|
47
78
|
|
|
48
79
|
## Backends at a glance
|
|
49
80
|
|
|
@@ -58,6 +89,17 @@ Whether a transcription appears live as you speak or all at once when
|
|
|
58
89
|
you stop depends on the **model** picked — see
|
|
59
90
|
[docs/backends.md](docs/backends.md).
|
|
60
91
|
|
|
92
|
+
|
|
93
|
+
### Getting an API key
|
|
94
|
+
|
|
95
|
+
Groq is a good cloud backend to start with — very fast, quite accurate, and the
|
|
96
|
+
**free tier** is generous enough for everyday dictation. Sign up at
|
|
97
|
+
[console.groq.com](https://console.groq.com/), create an API key
|
|
98
|
+
under **Settings → API Keys**, and export it as `GROQ_API_KEY`.
|
|
99
|
+
|
|
100
|
+
I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe` as it is also fast and perhaps more accurate for my accent-tainted English.
|
|
101
|
+
|
|
102
|
+
|
|
61
103
|
## Documentation
|
|
62
104
|
|
|
63
105
|
- [Installation & dependencies](docs/installation.md) — PortAudio,
|
|
Binary file
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
# Backends in detail
|
|
2
|
+
|
|
3
|
+
Scribe ships five speech-to-text backends. They are all picked through
|
|
4
|
+
the same `--backend` / `--model` CLI flags (or the **Model** submenu in
|
|
5
|
+
the tray / terminal frontend). Whether a transcription is *streaming*
|
|
6
|
+
(text appears live as you speak) or *batch* (text arrives at end of
|
|
7
|
+
recording) depends on the **model** chosen — not the backend.
|
|
8
|
+
|
|
9
|
+
## At a glance
|
|
10
|
+
|
|
11
|
+
| Backend | `--backend` | Default model | Streaming model(s) | Requires |
|
|
12
|
+
|------------------------|-----------------|----------------------------|---------------------------|-----------------------------------------|
|
|
13
|
+
| Groq (cloud) | `groq` | `whisper-large-v3-turbo` | — | `GROQ_API_KEY` |
|
|
14
|
+
| OpenAI (cloud) | `openai` | `gpt-4o-mini-transcribe` | `gpt-realtime-whisper` | `OPENAI_API_KEY` |
|
|
15
|
+
| Whisper FUTO (local) | `whisper-futo` | `small` | — | `pip install scribe-cli[whisper-futo]` |
|
|
16
|
+
| Whisper (local) | `whisper` | `small` | — | `pip install scribe-cli[whisper]` |
|
|
17
|
+
| Vosk (local) | `vosk` | language-dependent | all Vosk models | `pip install scribe-cli[vosk]` |
|
|
18
|
+
|
|
19
|
+
Run `scribe` without arguments and it picks the first backend whose
|
|
20
|
+
dependency / API key is present, preferring cloud over local and the
|
|
21
|
+
faster local option first:
|
|
22
|
+
`groq → openai → whisper-futo → whisper → vosk`.
|
|
23
|
+
|
|
24
|
+
## `whisper-futo` (local, fast on short dictations)
|
|
25
|
+
|
|
26
|
+
Runs locally via [whisper.cpp](https://github.com/ggml-org/whisper.cpp)
|
|
27
|
+
(through [`pywhispercpp`](https://github.com/absadiki/pywhispercpp))
|
|
28
|
+
using [FUTO's ACFT-finetuned models](https://github.com/futo-org/whisper-acft).
|
|
29
|
+
ACFT (Audio Context Fine-Tuning) lets the encoder run on the actual
|
|
30
|
+
audio length instead of always padding to 30 s — a meaningful speedup
|
|
31
|
+
on short dictations, which is the typical scribe workload.
|
|
32
|
+
|
|
33
|
+
The available models offered in the tray menu are
|
|
34
|
+
`tiny / base / small`. FUTO has not released ACFT weights for
|
|
35
|
+
`medium / large / turbo`; for those sizes use the `whisper` backend.
|
|
36
|
+
|
|
37
|
+
With `--language en` (or `-l en`) scribe auto-substitutes the
|
|
38
|
+
English-only variant (e.g. `small` → `small.en`) when it exists.
|
|
39
|
+
|
|
40
|
+
Models are auto-downloaded on first use from `voiceinput.futo.org`
|
|
41
|
+
to `$XDG_CACHE_HOME/whisper-futo/` (override with
|
|
42
|
+
`--download-folder-whisper-futo`).
|
|
43
|
+
|
|
44
|
+
For audio ≥ 30 s the ACFT speedup tapers off and the encoder window
|
|
45
|
+
collapses to the standard 30 s; quality and speed in that regime are
|
|
46
|
+
similar to the `whisper` backend. Pick `whisper-futo` if most of your
|
|
47
|
+
dictations are short, the `whisper` backend if you regularly do
|
|
48
|
+
multi-minute recordings or need `medium` / `large` / `turbo`.
|
|
49
|
+
|
|
50
|
+
## `whisper` (local)
|
|
51
|
+
|
|
52
|
+
Runs locally via
|
|
53
|
+
[`faster-whisper`](https://github.com/SYSTRAN/faster-whisper) and
|
|
54
|
+
defaults to the `small` model. Excellent at full-utterance
|
|
55
|
+
transcription in
|
|
56
|
+
[many languages](https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages),
|
|
57
|
+
but it does not stream — text appears at end-of-recording — and
|
|
58
|
+
execution time depends on model size and hardware.
|
|
59
|
+
|
|
60
|
+
The available models offered in the tray menu are
|
|
61
|
+
`tiny / base / small / medium / large-v3 / large-v3-turbo`. Larger
|
|
62
|
+
models trade speed for accuracy.
|
|
63
|
+
|
|
64
|
+
With `--language en` (or `-l en`) scribe auto-substitutes the
|
|
65
|
+
English-only variant (e.g. `small` → `small.en`) when it exists.
|
|
66
|
+
|
|
67
|
+
## `vosk` (local, streaming)
|
|
68
|
+
|
|
69
|
+
Vosk transcribes in real time and is very good at one language at a
|
|
70
|
+
time, but tends to make more mistakes than Whisper and does not produce
|
|
71
|
+
punctuation. It becomes really useful in longer, interactive sessions
|
|
72
|
+
where the live "appears as you speak" UX matters — see
|
|
73
|
+
[keyboard.md](keyboard.md) for how the keystroke mode interacts with
|
|
74
|
+
streaming models.
|
|
75
|
+
|
|
76
|
+
There are many [Vosk models](https://alphacephei.com/vosk/models)
|
|
77
|
+
available; a handful are pre-mapped to common languages (`en`, `fr`,
|
|
78
|
+
`de`, `it`) in
|
|
79
|
+
[`scribe/models.toml`](../scribe/models.toml). Pick one with
|
|
80
|
+
`-l <lang>` or browse the full list interactively from the menu.
|
|
81
|
+
|
|
82
|
+
## `openai` (OpenAI cloud)
|
|
83
|
+
|
|
84
|
+
The OpenAI backend supports three models:
|
|
85
|
+
|
|
86
|
+
- `gpt-4o-mini-transcribe` *(default)* — fast, low-cost batch
|
|
87
|
+
transcription.
|
|
88
|
+
- `gpt-4o-transcribe` — higher-quality batch transcription.
|
|
89
|
+
- `gpt-realtime-whisper` *(streaming)* — partial transcripts arrive
|
|
90
|
+
as you speak. Same UX as Vosk but using OpenAI's cloud model.
|
|
91
|
+
|
|
92
|
+
All three share the same `OPENAI_API_KEY` and the `[openai]` extra; no
|
|
93
|
+
additional dependencies are needed. Set the key once:
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
export OPENAI_API_KEY=YOURAPIKEY
|
|
97
|
+
scribe --backend openai # default: gpt-4o-mini-transcribe
|
|
98
|
+
scribe --model gpt-4o-transcribe # batch, higher quality
|
|
99
|
+
scribe --model gpt-realtime-whisper # streaming
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
`--model` alone auto-routes to the `openai` backend for any of the
|
|
103
|
+
three models above, so `--backend openai` is optional.
|
|
104
|
+
|
|
105
|
+
### `--realtime-delay` (gpt-realtime-whisper only)
|
|
106
|
+
|
|
107
|
+
The streaming model has a latency-vs-accuracy knob exposed as
|
|
108
|
+
`--realtime-delay {minimal,low,medium,high,xhigh}` (default `medium`).
|
|
109
|
+
Lower values emit partial transcripts sooner — at the cost of more
|
|
110
|
+
revisions arriving in the focused window. Higher values batch tokens
|
|
111
|
+
into longer chunks so what gets pasted is more stable.
|
|
112
|
+
|
|
113
|
+
See OpenAI's
|
|
114
|
+
[gpt-realtime-whisper model card](https://developers.openai.com/api/docs/models/gpt-realtime-whisper)
|
|
115
|
+
for the full picture.
|
|
116
|
+
|
|
117
|
+
## `groq` (Groq cloud)
|
|
118
|
+
|
|
119
|
+
Talks to Groq's OpenAI-compatible API and defaults to
|
|
120
|
+
`whisper-large-v3-turbo`. Typically the fastest cloud option for
|
|
121
|
+
full-utterance transcription:
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
export GROQ_API_KEY=YOURAPIKEY
|
|
125
|
+
scribe --backend groq
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
The `groq` backend reuses the `openai` Python client under the hood, so
|
|
129
|
+
installing `[openai]` is enough for both.
|
|
130
|
+
|
|
131
|
+
## Stopping a recording
|
|
132
|
+
|
|
133
|
+
For batch models (Whisper local, Whisper-via-API, Groq, `gpt-4o-*`) the
|
|
134
|
+
recording continues (up to `--duration`, 2 minutes by default) until you stop it manually
|
|
135
|
+
(Stop in the tray, Ctrl+C in the terminal) — the transcription happens
|
|
136
|
+
once when you stop.
|
|
137
|
+
|
|
138
|
+
Streaming models (Vosk, `gpt-realtime-whisper`) emit partials as you
|
|
139
|
+
speak and stop on the same Stop / Ctrl+C action.
|
|
140
|
+
|
|
141
|
+
## Vocabulary biasing
|
|
142
|
+
|
|
143
|
+
`--prompt TEXT` and `--words W [W ...]` (plus the `--prompt-file` /
|
|
144
|
+
`--words-file` companions) bias the recogniser toward a particular
|
|
145
|
+
style, domain, or word list. The concept is generic across the
|
|
146
|
+
whisper-family backends but each backend exposes it slightly
|
|
147
|
+
differently:
|
|
148
|
+
|
|
149
|
+
| Backend | `--prompt` | `--words` |
|
|
150
|
+
|--------------------------------------|-------------------------------|--------------------------------------------------------|
|
|
151
|
+
| `whisper` (faster-whisper, local) | passed as `initial_prompt=` | passed as `hotwords=` — a **dedicated biasing channel** separate from the prompt |
|
|
152
|
+
| `whisper-futo` (pywhispercpp, local) | passed as `initial_prompt=` | joined onto the prompt string (no separate hotwords channel here) |
|
|
153
|
+
| `openai` batch (`gpt-4o*-transcribe`) | passed as `prompt=` | joined onto the prompt string |
|
|
154
|
+
| `groq` (`whisper-large-v3-turbo`) | passed as `prompt=` | joined onto the prompt string |
|
|
155
|
+
| `openai` realtime (`gpt-realtime-whisper`) | *silently ignored* — the model rejects the prompt parameter server-side (HTTP 400 *"The 'prompt' parameter is not supported for this model."*). The kwarg stays accepted for plumbing compatibility but never reaches the API. | same — joined into the (ignored) prompt |
|
|
156
|
+
| `vosk` | *ignored* (no soft prompt) | *ignored* (Vosk only supports a hard `grammar` allowlist; not yet exposed) |
|
|
157
|
+
|
|
158
|
+
The whisper-family APIs cap the prompt at ~224 tokens; longer
|
|
159
|
+
hints are silently truncated. Faster-whisper's `hotwords` channel is
|
|
160
|
+
the one place a separate "dictionary" really exists — everywhere else
|
|
161
|
+
`--words` is just a convenience to keep your word list out of the
|
|
162
|
+
prompt string in the CLI.
|
|
163
|
+
|
|
164
|
+
Both flags read from the corresponding `*-file` argument when present.
|
|
165
|
+
Inline + file inputs are combined.
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
# Inline
|
|
169
|
+
scribe --prompt "ML systems infra: K8s, etcd, Envoy." \
|
|
170
|
+
--words kubectl envoyproxy etcdctl
|
|
171
|
+
|
|
172
|
+
# From files (handy for long-lived glossaries)
|
|
173
|
+
scribe --prompt-file ~/.config/scribe/prompt.txt \
|
|
174
|
+
--words-file ~/.config/scribe/words.txt
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
When *no* prompt/words flag is given, scribe also auto-loads
|
|
178
|
+
`prompt.txt` and `words.txt` from the platform user-config dir
|
|
179
|
+
(`~/.config/scribe/` on Linux, `~/Library/Application Support/scribe/`
|
|
180
|
+
on macOS, `%LOCALAPPDATA%\scribe\` on Windows — resolved via
|
|
181
|
+
`platformdirs`) if they exist. To suppress the default for one
|
|
182
|
+
invocation, pass an explicit empty value: `--prompt ""` (or
|
|
183
|
+
`--prompt-file ""`) suppresses the prompt default; `--words` with no
|
|
184
|
+
arguments (or `--words-file ""`) suppresses the words default. Each
|
|
185
|
+
side is independent.
|
|
186
|
+
|
|
187
|
+
## Pseudo-streaming (experimental)
|
|
188
|
+
|
|
189
|
+
`--pseudo-streaming` makes a batch backend behave streaming-like by
|
|
190
|
+
cutting the running buffer into chunks driven by silence:
|
|
191
|
+
|
|
192
|
+
```bash
|
|
193
|
+
scribe --pseudo-streaming --streaming-window 5
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
After `--streaming-window` seconds of buffered audio, scribe cuts at
|
|
197
|
+
the first silence of at least `--silence-duration` and transcribes the
|
|
198
|
+
chunk; if no silence arrives by `2 × --streaming-window`, it
|
|
199
|
+
force-cuts. The session continues until you stop it. Default `5` s
|
|
200
|
+
trades a little Whisper context for snappier "text appears as you
|
|
201
|
+
speak" UX; raise it (10–30 s) if accuracy on long sentences matters
|
|
202
|
+
more than latency.
|
|
203
|
+
|
|
204
|
+
This is experimental and off by default. The tray menu surfaces the
|
|
205
|
+
same toggle under Options ▶ Advanced ▶ Pseudo-streaming.
|
|
206
|
+
|
|
207
|
+
### Cross-chunk prompt context
|
|
208
|
+
|
|
209
|
+
In pseudo-streaming mode scribe automatically augments each chunk's
|
|
210
|
+
prompt with the trailing ~200 characters of the *previous* chunk's
|
|
211
|
+
transcription. This rolling tail is concatenated onto whatever static
|
|
212
|
+
`--prompt` / `--words` you configured and reaches the backend through
|
|
213
|
+
the same channel as the static prompt (the vocabulary biasing table
|
|
214
|
+
above). The motivation is cross-chunk continuity:
|
|
215
|
+
|
|
216
|
+
- **Capitalization drift** — without context, a chunk that starts
|
|
217
|
+
right after a period might come back lowercased.
|
|
218
|
+
- **Article gender (FR/IT/ES/…)** — `"la nouveau"` → `"le nouveau"`
|
|
219
|
+
once the prior chunk has established the noun.
|
|
220
|
+
- **Language lock** — `whisper.cpp` auto-detects language per call;
|
|
221
|
+
feeding the previous chunk's tokens keeps the language stable
|
|
222
|
+
across cuts.
|
|
223
|
+
|
|
224
|
+
Whisper's prompt window is capped at ~224 tokens; 200 chars of French
|
|
225
|
+
sits well under that and leaves room for your static prompt + words
|
|
226
|
+
list.
|
|
227
|
+
|
|
228
|
+
The rolling tail is **dropped** whenever the pause that triggered the
|
|
229
|
+
chunk cut exceeded 1.5 seconds — a long pause is treated as a new
|
|
230
|
+
sentence/idea boundary, where carrying a possibly-bad prior chunk
|
|
231
|
+
forward biases the next one more than it helps. This mirrors
|
|
232
|
+
`whisper.cpp`'s `--keep-context off` default: prior-text conditioning
|
|
233
|
+
can self-reinforce errors (hallucinations, decoder repetition loops)
|
|
234
|
+
more readily than it provides useful continuity, so we cap it at
|
|
235
|
+
natural sentence boundaries.
|
|
236
|
+
|
|
237
|
+
Short pauses (mid-sentence punctuation) keep the context; the cut at
|
|
238
|
+
the start of every new recording also clears it.
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
# Fine tuning & CLI reference
|
|
2
|
+
|
|
3
|
+
For a complete, always-current listing run:
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
scribe --help
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
The flags are grouped to mirror the source-of-truth in
|
|
10
|
+
[`scribe/app.py`](../scribe/app.py).
|
|
11
|
+
|
|
12
|
+
## Backend
|
|
13
|
+
|
|
14
|
+
| Flag | Purpose |
|
|
15
|
+
|---------------------------------|-------------------------------------------------------------------------|
|
|
16
|
+
| `--backend {vosk,whisper,whisper-futo,openai,groq}` | Speech-recognition backend (prompted if omitted). |
|
|
17
|
+
| `--model NAME` | Model name for the chosen backend. Auto-routes to the right backend for known model names (e.g. `--model gpt-realtime-whisper` selects `openai`). |
|
|
18
|
+
| `-l, --language LANG` | Language alias selecting a preset Vosk model (`en`/`fr`/`de`/`it`), or `en` for English-only Whisper models. |
|
|
19
|
+
| `--download-folder-whisper DIR` | Folder to store Whisper models. |
|
|
20
|
+
| `--download-folder-vosk DIR` | Folder to store Vosk models. |
|
|
21
|
+
|
|
22
|
+
## Prompting & vocabulary biasing
|
|
23
|
+
|
|
24
|
+
Bias the model toward particular names, jargon, or topics. Two
|
|
25
|
+
complementary knobs:
|
|
26
|
+
|
|
27
|
+
| Flag | Purpose |
|
|
28
|
+
|--------------------------|--------------------------------------------------------------------------------------------------|
|
|
29
|
+
| `--prompt TEXT` | Free-text style / context hint shown to the model. |
|
|
30
|
+
| `--prompt-file PATH` | Reads the prompt from a file; appended to `--prompt` if both are given. |
|
|
31
|
+
| `--words W [W ...]`      | List of words to emphasise. Joined onto the prompt for the cloud backends and `whisper-futo`; routed to faster-whisper's dedicated `hotwords` channel for the local `whisper` backend. |
|
|
32
|
+
| `--words-file PATH` | Whitespace-separated words from a file; merged with `--words`. |
|
|
33
|
+
|
|
34
|
+
The whisper-family APIs cap the prompt at ~224 tokens; longer hints
|
|
35
|
+
are silently truncated. Vosk has no soft prompt and ignores both flags.
|
|
36
|
+
See [backends.md › Vocabulary biasing](backends.md#vocabulary-biasing)
|
|
37
|
+
for the per-backend wiring.
|
|
38
|
+
|
|
39
|
+
**Default files.** When none of the four flags above are given, scribe
|
|
40
|
+
also looks for `prompt.txt` and `words.txt` in the platform user-config
|
|
41
|
+
dir and loads them if they exist — handy for a long-lived personal
|
|
42
|
+
glossary. The path is resolved via `platformdirs`:
|
|
43
|
+
|
|
44
|
+
- Linux: `$XDG_CONFIG_HOME/scribe/` (default `~/.config/scribe/`)
|
|
45
|
+
- macOS: `~/Library/Application Support/scribe/`
|
|
46
|
+
- Windows: `%LOCALAPPDATA%\scribe\`
|
|
47
|
+
|
|
48
|
+
To suppress the default on a single invocation, pass an empty value:
|
|
49
|
+
`--prompt ""`, `--prompt-file ""`, or `--words` with no arguments. Each
|
|
50
|
+
flag suppresses only its own side (giving `--prompt ""` still loads
|
|
51
|
+
`words.txt` if present).
|
|
52
|
+
|
|
53
|
+
## Audio
|
|
54
|
+
|
|
55
|
+
| Flag | Purpose |
|
|
56
|
+
|-----------------------|----------------------------------------------------------|
|
|
57
|
+
| `--input-device N` | Microphone device index (see `python -m sounddevice`). |
|
|
58
|
+
|
|
59
|
+
## Output
|
|
60
|
+
|
|
61
|
+
| Flag | Purpose |
|
|
62
|
+
|-----------------------------|---------------------------------------------------------------------------------------------|
|
|
63
|
+
| `-m, --mode {keystroke,clipboard,terminal}` | Where transcribed text goes (default `keystroke`). See [keyboard.md](keyboard.md). |
|
|
64
|
+
| `--typer {auto,eitype,pynput,wtype,ydotool}` | Keystroke-injection backend (default `auto`). |
|
|
65
|
+
| `--type-direct` | In keystroke mode, type the transcription as keystrokes instead of synthesising Ctrl+V. |
|
|
66
|
+
| `-o, --output-file FILE` | Also append the transcription to this file. |
|
|
67
|
+
|
|
68
|
+
## Silence detection (shared)
|
|
69
|
+
|
|
70
|
+
| Flag | Default | Purpose |
|
|
71
|
+
|----------------------------|---------|------------------------------------------------------------------------|
|
|
72
|
+
| `--duration SECS` | `120` | Max recording duration in seconds. |
|
|
73
|
+
| `--silence-db DB` | `-40` | dBFS volume floor for "this frame is silent". Used by every silence-driven behavior. |
|
|
74
|
+
| `--silence-duration SECS` | `0.6` | How long silence must persist before triggering a backend's silence behavior (realtime auto-commit, pseudo-streaming cut). |
|
|
75
|
+
|
|
76
|
+
## Realtime (`gpt-realtime-whisper`)
|
|
77
|
+
|
|
78
|
+
| Flag | Default | Purpose |
|
|
79
|
+
|---------------------------------------------------|----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|
80
|
+
| `--realtime-delay {minimal,low,medium,high,xhigh}` | `medium` | Trade off latency vs accuracy on `gpt-realtime-whisper`. Lower = faster partials but more paste churn in the focused window. |
|
|
81
|
+
| `--realtime-gate` / `--no-realtime-gate` | on | Drop silent frames (per `--silence-db`) before sending them over the WebSocket so silent audio isn't billed as input tokens. After `--silence-duration` of silence, also commit mid-session so trailing words flush live. |
|
|
82
|
+
|
|
83
|
+
Streaming models (Vosk, `gpt-realtime-whisper`) ignore the batch
|
|
84
|
+
silence-chunking knobs; they have their own end-of-utterance signal.
|
|
85
|
+
|
|
86
|
+
## Frontend
|
|
87
|
+
|
|
88
|
+
| Flag | Purpose |
|
|
89
|
+
|-----------------------------|----------------------------------------------------------------------|
|
|
90
|
+
| `--frontend {tray,terminal}` | UI to launch (default `tray`). |
|
|
91
|
+
| `--no-interactive` | In terminal mode, skip the interactive menu and record immediately. (`--no-prompt` is kept as a deprecated alias.) |
|
|
92
|
+
| `--vosk-models M [M ...]` | Vosk models offered in the tray menu. |
|
|
93
|
+
| `--whisper-models M [M ...]` | Whisper models offered in the tray menu. |
|
|
94
|
+
|
|
95
|
+
## Examples
|
|
96
|
+
|
|
97
|
+
Predefine the tray menu's Whisper / Vosk model lists:
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
scribe --vosk-models vosk-model-fr-0.22 \
|
|
101
|
+
--whisper-models small large-v3-turbo
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Stream OpenAI realtime transcripts with the most aggressive latency
|
|
105
|
+
setting:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
scribe --model gpt-realtime-whisper --realtime-delay minimal
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
Disable the realtime silence gate (e.g. to A/B against a noisy
|
|
112
|
+
environment) — you'll pay for silent audio while the session is open:
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
scribe --model gpt-realtime-whisper --no-realtime-gate
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
Run scribe headlessly into a file without touching the clipboard or
|
|
119
|
+
focused window:
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
scribe --frontend terminal --no-interactive --mode terminal -o session.txt
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
Bias the recogniser toward domain jargon (medical terms, proper names):
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
scribe --prompt "Patient notes from a cardiology consult." \
|
|
129
|
+
--words tachycardia bradycardia echocardiogram metoprolol
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
Or store the lists in files for reuse across sessions:
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
scribe --prompt-file ~/.config/scribe/prompt.txt \
|
|
136
|
+
--words-file ~/.config/scribe/words.txt
|
|
137
|
+
```
|
|
@@ -167,3 +167,34 @@ If `eitype` is unavailable, two older workarounds also work:
|
|
|
167
167
|
Roadmap for native libei integration (eventual Python bindings,
|
|
168
168
|
expanded compositor support) is tracked in
|
|
169
169
|
[docs/roadmap-libei.md](roadmap-libei.md).
|
|
170
|
+
|
|
171
|
+
## Realtime backend: delta coalescing
|
|
172
|
+
|
|
173
|
+
The `gpt-realtime-whisper` backend emits one transcription delta per
|
|
174
|
+
word/subword at ~30–80 ms intervals — much faster than the
|
|
175
|
+
`pyperclip.copy()` + Ctrl+V cycle can settle on Wayland (≥100 ms,
|
|
176
|
+
because `wl-copy` is asynchronous). Pasting every delta led to
|
|
177
|
+
clipboard races where successive copies overwrote each other before
|
|
178
|
+
Ctrl+V landed, manifesting as dropped and duplicated words
|
|
179
|
+
(*"fait fait le mot mot time time…"*).
|
|
180
|
+
|
|
181
|
+
In **paste mode** (default keystroke output) scribe therefore
|
|
182
|
+
coalesces deltas: incoming tokens accumulate into a small buffer and
|
|
183
|
+
are flushed only when *either* ~400 ms have elapsed since the last
|
|
184
|
+
flush, *or* the buffer ends on sentence-final punctuation
|
|
185
|
+
(`. ! ? \n`). A 200 ms floor between any two flushes prevents
|
|
186
|
+
back-to-back punctuation flushes from racing each other through the
|
|
187
|
+
clipboard.
|
|
188
|
+
|
|
189
|
+
With **`--type-direct`** the coalescing is bypassed entirely — each
|
|
190
|
+
delta goes synchronously through the chosen typer as raw keystrokes
|
|
191
|
+
(uinput / xtest / portal libei), no clipboard involved, no race to
|
|
192
|
+
defeat. The UX is also snappier: tokens appear one at a time rather
|
|
193
|
+
than in ~400 ms-cadenced bursts.
|
|
194
|
+
|
|
195
|
+
macOS and Windows clipboards are synchronous, so the race that
|
|
196
|
+
motivates coalescing is essentially a Wayland artefact; scribe still
|
|
197
|
+
coalesces in paste mode there for consistency, but it's harmless.
|
|
198
|
+
This whole behaviour is realtime-specific — Vosk's per-phrase commits
|
|
199
|
+
already arrive at a sane cadence, and the pseudo-streaming backends
|
|
200
|
+
emit one chunk per silence cut (already coarse enough).
|
|
@@ -58,8 +58,10 @@ Options ▶
|
|
|
58
58
|
Keyboard backend ▶ eitype / pynput / ydotool / wtype
|
|
59
59
|
(rows incompatible with this OS are hidden;
|
|
60
60
|
submenu hidden entirely when ≤ 1 row left)
|
|
61
|
-
Advanced ▶
|
|
62
|
-
|
|
61
|
+
Advanced ▶ silence duration, silence threshold,
|
|
62
|
+
realtime gate, pseudo-streaming
|
|
63
|
+
[experimental], streaming window
|
|
64
|
+
[experimental], output file
|
|
63
65
|
Quit
|
|
64
66
|
```
|
|
65
67
|
|
|
@@ -20,6 +20,7 @@ dependencies = [
|
|
|
20
20
|
"pyperclip",
|
|
21
21
|
"unidecode",
|
|
22
22
|
"termcolor",
|
|
23
|
+
"platformdirs",
|
|
23
24
|
"desktop-ai-core>=0.2.0",
|
|
24
25
|
]
|
|
25
26
|
|
|
@@ -61,11 +62,12 @@ keywords = [
|
|
|
61
62
|
[project.optional-dependencies]
|
|
62
63
|
keyboard = ["pynput"]
|
|
63
64
|
whisper = ["faster-whisper"]
|
|
65
|
+
whisper-futo = ["pywhispercpp"]
|
|
64
66
|
vosk = ["vosk"]
|
|
65
67
|
app = ["pystray", "PyGObject"]
|
|
66
68
|
openai = ["openai>=2.37.0,<3", "soundfile"]
|
|
67
69
|
groq = ["openai>=2.37.0,<3", "soundfile"]
|
|
68
|
-
all = ["pynput", "faster-whisper", "openai>=2.37.0,<3", "soundfile", "vosk", "pystray"]
|
|
70
|
+
all = ["pynput", "faster-whisper", "pywhispercpp", "openai>=2.37.0,<3", "soundfile", "vosk", "pystray"]
|
|
69
71
|
|
|
70
72
|
|
|
71
73
|
[tool.setuptools]
|