scribe-cli 0.13.1__tar.gz → 0.16.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. scribe_cli-0.16.0/.github/FUNDING.yml +3 -0
  2. scribe_cli-0.16.0/.gitignore +9 -0
  3. scribe_cli-0.16.0/PKG-INFO +159 -0
  4. scribe_cli-0.16.0/README.md +82 -0
  5. scribe_cli-0.16.0/docs/app-tray-menu.png +0 -0
  6. scribe_cli-0.16.0/docs/backends.md +122 -0
  7. scribe_cli-0.16.0/docs/cli.md +95 -0
  8. scribe_cli-0.16.0/docs/desktop-install.md +44 -0
  9. scribe_cli-0.16.0/docs/installation.md +84 -0
  10. scribe_cli-0.16.0/docs/keyboard.md +169 -0
  11. scribe_cli-0.16.0/docs/roadmap-libei.md +158 -0
  12. scribe_cli-0.16.0/docs/tray.md +90 -0
  13. {scribe_cli-0.13.1 → scribe_cli-0.16.0}/pyproject.toml +17 -4
  14. scribe_cli-0.16.0/scribe/_version.py +24 -0
  15. scribe_cli-0.16.0/scribe/app.py +485 -0
  16. scribe_cli-0.16.0/scribe/backends/__init__.py +48 -0
  17. scribe_cli-0.16.0/scribe/backends/groq.py +26 -0
  18. scribe_cli-0.16.0/scribe/backends/openai_api.py +64 -0
  19. scribe_cli-0.16.0/scribe/backends/openai_realtime.py +274 -0
  20. scribe_cli-0.16.0/scribe/backends/vosk.py +58 -0
  21. scribe_cli-0.16.0/scribe/backends/whisper.py +45 -0
  22. scribe_cli-0.16.0/scribe/install_desktop.py +135 -0
  23. scribe_cli-0.16.0/scribe/keyboard.py +72 -0
  24. scribe_cli-0.16.0/scribe/menu.py +765 -0
  25. scribe_cli-0.16.0/scribe/models.py +144 -0
  26. scribe_cli-0.16.0/scribe/session.py +150 -0
  27. scribe_cli-0.16.0/scribe/typers/__init__.py +44 -0
  28. scribe_cli-0.16.0/scribe/typers/base.py +18 -0
  29. scribe_cli-0.16.0/scribe/typers/eitype.py +64 -0
  30. scribe_cli-0.16.0/scribe/typers/pynput.py +65 -0
  31. scribe_cli-0.16.0/scribe/typers/wtype.py +78 -0
  32. scribe_cli-0.16.0/scribe/typers/ydotool.py +72 -0
  33. scribe_cli-0.16.0/scribe_cli.egg-info/PKG-INFO +159 -0
  34. {scribe_cli-0.13.1 → scribe_cli-0.16.0}/scribe_cli.egg-info/SOURCES.txt +23 -0
  35. {scribe_cli-0.13.1 → scribe_cli-0.16.0}/scribe_cli.egg-info/requires.txt +9 -4
  36. scribe_cli-0.13.1/.gitignore +0 -6
  37. scribe_cli-0.13.1/PKG-INFO +0 -286
  38. scribe_cli-0.13.1/README.md +0 -214
  39. scribe_cli-0.13.1/scribe/_version.py +0 -21
  40. scribe_cli-0.13.1/scribe/app.py +0 -589
  41. scribe_cli-0.13.1/scribe/install_desktop.py +0 -50
  42. scribe_cli-0.13.1/scribe/keyboard.py +0 -68
  43. scribe_cli-0.13.1/scribe/models.py +0 -278
  44. scribe_cli-0.13.1/scribe_cli.egg-info/PKG-INFO +0 -286
  45. {scribe_cli-0.13.1 → scribe_cli-0.16.0}/.github/workflows/pypi.yml +0 -0
  46. {scribe_cli-0.13.1 → scribe_cli-0.16.0}/LICENSE +0 -0
  47. {scribe_cli-0.13.1 → scribe_cli-0.16.0}/icon.xcf +0 -0
  48. {scribe_cli-0.13.1 → scribe_cli-0.16.0}/scribe/__init__.py +0 -0
  49. {scribe_cli-0.13.1 → scribe_cli-0.16.0}/scribe/audio.py +0 -0
  50. {scribe_cli-0.13.1 → scribe_cli-0.16.0}/scribe/models.toml +0 -0
  51. {scribe_cli-0.13.1 → scribe_cli-0.16.0}/scribe/saverecording.py +0 -0
  52. {scribe_cli-0.13.1 → scribe_cli-0.16.0}/scribe/testpynput.py +0 -0
  53. {scribe_cli-0.13.1 → scribe_cli-0.16.0}/scribe/util.py +0 -0
  54. {scribe_cli-0.13.1 → scribe_cli-0.16.0}/scribe_cli.egg-info/dependency_links.txt +0 -0
  55. {scribe_cli-0.13.1 → scribe_cli-0.16.0}/scribe_cli.egg-info/entry_points.txt +0 -0
  56. {scribe_cli-0.13.1 → scribe_cli-0.16.0}/scribe_cli.egg-info/top_level.txt +0 -0
  57. {scribe_cli-0.13.1 → scribe_cli-0.16.0}/scribe_data/__init__.py +0 -0
  58. {scribe_cli-0.13.1 → scribe_cli-0.16.0}/scribe_data/share/icon.png +0 -0
  59. {scribe_cli-0.13.1 → scribe_cli-0.16.0}/scribe_data/share/icon_recording.png +0 -0
  60. {scribe_cli-0.13.1 → scribe_cli-0.16.0}/scribe_data/share/icon_writing.png +0 -0
  61. {scribe_cli-0.13.1 → scribe_cli-0.16.0}/scribe_data/templates/scribe.desktop +0 -0
  62. {scribe_cli-0.13.1 → scribe_cli-0.16.0}/scripts/test_python_versions_install.sh +0 -0
  63. {scribe_cli-0.13.1 → scribe_cli-0.16.0}/setup.cfg +0 -0
@@ -0,0 +1,3 @@
1
+ # These are supported funding model platforms
2
+
3
+ github: [ perrette ]
@@ -0,0 +1,9 @@
1
+ __pycache__
2
+ *.pyc
3
+ .venv
4
+ build
5
+ dist
6
+ scribe/_version.py
7
+
8
+ # Autonomous roadmap workflows (local coordination artifacts; never committed)
9
+ workflows/
@@ -0,0 +1,159 @@
1
+ Metadata-Version: 2.4
2
+ Name: scribe-cli
3
+ Version: 0.16.0
4
+ Summary: Speech-to-text CLI and system-tray app for dictating into any focused window. Local (vosk, faster-whisper) or cloud (groq, openai) backends, batch or streaming.
5
+ Author-email: Mahé Perrette <mahe.perrette@gmail.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2024 Mahé Perrette
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ ---
29
+
30
+ Note: This project relies on external packages that may have more restrictive
31
+ licenses. For example, the `pynput` package is licensed under LGPLv3, which
32
+ has different requirements compared to the MIT License. Please review the
33
+ licenses of all dependencies before using or distributing this software to
34
+ ensure compliance with their respective terms.
35
+ Project-URL: Homepage, https://github.com/perrette/scribe
36
+ Keywords: speech-to-text,speech recognition,transcription,dictation,voice-typing,voice-to-text,realtime,streaming,language,AI,local,API,cli,tray,vosk,whisper,openai,groq,gpt-4o,linux,wayland,keyboard,clipboard
37
+ Classifier: Programming Language :: Python :: 3.9
38
+ Classifier: Programming Language :: Python :: 3.10
39
+ Classifier: Programming Language :: Python :: 3.11
40
+ Classifier: Programming Language :: Python :: 3.12
41
+ Classifier: Programming Language :: Python :: 3.13
42
+ Classifier: Operating System :: OS Independent
43
+ Requires-Python: >=3.9
44
+ Description-Content-Type: text/markdown
45
+ License-File: LICENSE
46
+ Requires-Dist: numpy
47
+ Requires-Dist: sounddevice
48
+ Requires-Dist: tqdm
49
+ Requires-Dist: requests
50
+ Requires-Dist: pyperclip
51
+ Requires-Dist: unidecode
52
+ Requires-Dist: termcolor
53
+ Requires-Dist: desktop-ai-core>=0.2.0
54
+ Provides-Extra: keyboard
55
+ Requires-Dist: pynput; extra == "keyboard"
56
+ Provides-Extra: whisper
57
+ Requires-Dist: faster-whisper; extra == "whisper"
58
+ Provides-Extra: vosk
59
+ Requires-Dist: vosk; extra == "vosk"
60
+ Provides-Extra: app
61
+ Requires-Dist: pystray; extra == "app"
62
+ Requires-Dist: PyGObject; extra == "app"
63
+ Provides-Extra: openai
64
+ Requires-Dist: openai<3,>=2.37.0; extra == "openai"
65
+ Requires-Dist: soundfile; extra == "openai"
66
+ Provides-Extra: groq
67
+ Requires-Dist: openai<3,>=2.37.0; extra == "groq"
68
+ Requires-Dist: soundfile; extra == "groq"
69
+ Provides-Extra: all
70
+ Requires-Dist: pynput; extra == "all"
71
+ Requires-Dist: faster-whisper; extra == "all"
72
+ Requires-Dist: openai<3,>=2.37.0; extra == "all"
73
+ Requires-Dist: soundfile; extra == "all"
74
+ Requires-Dist: vosk; extra == "all"
75
+ Requires-Dist: pystray; extra == "all"
76
+ Dynamic: license-file
77
+
78
+ [![pypi](https://img.shields.io/pypi/v/scribe-cli)](https://pypi.org/project/scribe-cli)
79
+ ![](https://img.shields.io/python/required-version-toml?tomlFilePath=https%3A%2F%2Fraw.githubusercontent.com%2Fperrette%2Fscribe%2Frefs%2Fheads%2Fmain%2Fpyproject.toml)
80
+
81
+ # Scribe <img src="https://github.com/perrette/scribe/raw/main/scribe_data/share/icon.png" width="48">
82
+
83
+ **Talk. It types.** Scribe is a speech-to-text CLI and tray app that
84
+ pipes transcribed text straight into the focused window. It supports local and
85
+ cloud-based APIs, batch and streaming workflows.
86
+
87
+ ## What it does
88
+
89
+ - Records from your mic and transcribes via one of four backends —
90
+ **Vosk** (local, streaming), **Whisper** (local, batch), **OpenAI**
91
+ (cloud, batch *or* streaming), **Groq** (cloud, batch).
92
+ - Delivers the transcript three ways: paste into the focused window
93
+ (default), copy to clipboard, or print to the terminal.
94
+ - Runs as a **system tray icon** with a single Record button, or as an
95
+ interactive **terminal TUI** — same menu in both.
96
+ - Hooks into your DE's keyboard shortcuts via `SIGUSR1` (toggle
97
+ recording) and `SIGUSR2` (cancel).
98
+ - Cross-platform: tested on Ubuntu (X11 and Wayland), macOS, Windows;
99
+ works under Termux for clipboard / terminal output.
100
+
101
+ ## Getting started
102
+
103
+ ```bash
104
+ sudo apt-get install portaudio19-dev xclip # Ubuntu; macOS: brew install portaudio
105
+ pip install scribe-cli[all]
106
+ export GROQ_API_KEY=YOURAPIKEY # or OPENAI_API_KEY, or skip and run local
107
+ scribe
108
+ ```
109
+
110
+ Scribe picks the first backend whose key / dependency is present, in
111
+ order **`groq` → `openai` → `whisper` → `vosk`**, and launches the
112
+ tray icon. Press Record, speak, press Stop.
113
+
114
+ See the documentation below for setting up keyboard input on Ubuntu Wayland.
115
+
116
+ ### Getting an API key
117
+
118
+ Groq is a good cloud backend to start with — very fast, quite accurate, and the
119
+ **free tier** is generous enough for everyday dictation. Sign up at
120
+ [console.groq.com](https://console.groq.com/), create an API key
121
+ under **Settings → API Keys**, and export it as `GROQ_API_KEY`.
122
+
123
+ I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe` as it is also fast and perhaps more accurate for my accent-tainted English.
124
+
125
+ ## Backends at a glance
126
+
127
+ | Backend | `--backend` | Default model | Streaming model(s) | Requires |
128
+ |-----------------|-------------|----------------------------|---------------------------|-------------------------------------|
129
+ | Groq (cloud) | `groq` | `whisper-large-v3-turbo` | — | `GROQ_API_KEY` |
130
+ | OpenAI (cloud) | `openai` | `gpt-4o-mini-transcribe` | `gpt-realtime-whisper` | `OPENAI_API_KEY` |
131
+ | Whisper (local) | `whisper` | `small` | — | `pip install scribe-cli[whisper]` |
132
+ | Vosk (local) | `vosk` | language-dependent | all Vosk models | `pip install scribe-cli[vosk]` |
133
+
134
+ Whether a transcription appears live as you speak or all at once when
135
+ you stop depends on the **model** picked — see
136
+ [docs/backends.md](docs/backends.md).
137
+
138
+ ## Documentation
139
+
140
+ - [Installation & dependencies](docs/installation.md) — PortAudio,
141
+ extras, Ubuntu / GNOME tray libs.
142
+ - [Backends in detail](docs/backends.md) — model lists, when to pick
143
+ which, the realtime model.
144
+ - [Keyboard modes & typer backends](docs/keyboard.md) — keystroke vs
145
+ clipboard, Wayland / `eitype`, `--type-direct`.
146
+ - [System tray & global hotkeys](docs/tray.md) — menu tree, icon
147
+ states, `SIGUSR1`/`SIGUSR2`.
148
+ - [Desktop entry & autostart (`scribe-install`)](docs/desktop-install.md)
149
+ — GNOME / KDE launcher integration.
150
+ - [Fine tuning & CLI reference](docs/cli.md) — every `scribe --help`
151
+ flag with examples.
152
+
153
+ ## Compatibility
154
+
155
+ Initially developed for Python 3 on Ubuntu 24.04 (GNOME + Wayland);
156
+ works on macOS and Windows too. Wayland keystroke injection is
157
+ convoluted but [solved](docs/keyboard.md). For dependencies of
158
+ individual subsystems, check `pynput` (keyboard) and `pystray` (tray
159
+ icon).
@@ -0,0 +1,82 @@
1
+ [![pypi](https://img.shields.io/pypi/v/scribe-cli)](https://pypi.org/project/scribe-cli)
2
+ ![](https://img.shields.io/python/required-version-toml?tomlFilePath=https%3A%2F%2Fraw.githubusercontent.com%2Fperrette%2Fscribe%2Frefs%2Fheads%2Fmain%2Fpyproject.toml)
3
+
4
+ # Scribe <img src="https://github.com/perrette/scribe/raw/main/scribe_data/share/icon.png" width="48">
5
+
6
+ **Talk. It types.** Scribe is a speech-to-text CLI and tray app that
7
+ pipes transcribed text straight into the focused window. It supports local and
8
+ cloud-based APIs, batch and streaming workflows.
9
+
10
+ ## What it does
11
+
12
+ - Records from your mic and transcribes via one of four backends —
13
+ **Vosk** (local, streaming), **Whisper** (local, batch), **OpenAI**
14
+ (cloud, batch *or* streaming), **Groq** (cloud, batch).
15
+ - Delivers the transcript three ways: paste into the focused window
16
+ (default), copy to clipboard, or print to the terminal.
17
+ - Runs as a **system tray icon** with a single Record button, or as an
18
+ interactive **terminal TUI** — same menu in both.
19
+ - Hooks into your DE's keyboard shortcuts via `SIGUSR1` (toggle
20
+ recording) and `SIGUSR2` (cancel).
21
+ - Cross-platform: tested on Ubuntu (X11 and Wayland), macOS, Windows;
22
+ works under Termux for clipboard / terminal output.
23
+
24
+ ## Getting started
25
+
26
+ ```bash
27
+ sudo apt-get install portaudio19-dev xclip # Ubuntu; macOS: brew install portaudio
28
+ pip install scribe-cli[all]
29
+ export GROQ_API_KEY=YOURAPIKEY # or OPENAI_API_KEY, or skip and run local
30
+ scribe
31
+ ```
32
+
33
+ Scribe picks the first backend whose key / dependency is present, in
34
+ order **`groq` → `openai` → `whisper` → `vosk`**, and launches the
35
+ tray icon. Press Record, speak, press Stop.
36
+
37
+ See the documentation below for setting up keyboard input on Ubuntu Wayland.
38
+
39
+ ### Getting an API key
40
+
41
+ Groq is a good cloud backend to start with — very fast, quite accurate, and the
42
+ **free tier** is generous enough for everyday dictation. Sign up at
43
+ [console.groq.com](https://console.groq.com/), create an API key
44
+ under **Settings → API Keys**, and export it as `GROQ_API_KEY`.
45
+
46
+ I personally use [OpenAI](https://openai.com/api/) with `gpt-4o-mini-transcribe` as it is also fast and perhaps more accurate for my accent-tainted English.
47
+
48
+ ## Backends at a glance
49
+
50
+ | Backend | `--backend` | Default model | Streaming model(s) | Requires |
51
+ |-----------------|-------------|----------------------------|---------------------------|-------------------------------------|
52
+ | Groq (cloud) | `groq` | `whisper-large-v3-turbo` | — | `GROQ_API_KEY` |
53
+ | OpenAI (cloud) | `openai` | `gpt-4o-mini-transcribe` | `gpt-realtime-whisper` | `OPENAI_API_KEY` |
54
+ | Whisper (local) | `whisper` | `small` | — | `pip install scribe-cli[whisper]` |
55
+ | Vosk (local) | `vosk` | language-dependent | all Vosk models | `pip install scribe-cli[vosk]` |
56
+
57
+ Whether a transcription appears live as you speak or all at once when
58
+ you stop depends on the **model** picked — see
59
+ [docs/backends.md](docs/backends.md).
60
+
61
+ ## Documentation
62
+
63
+ - [Installation & dependencies](docs/installation.md) — PortAudio,
64
+ extras, Ubuntu / GNOME tray libs.
65
+ - [Backends in detail](docs/backends.md) — model lists, when to pick
66
+ which, the realtime model.
67
+ - [Keyboard modes & typer backends](docs/keyboard.md) — keystroke vs
68
+ clipboard, Wayland / `eitype`, `--type-direct`.
69
+ - [System tray & global hotkeys](docs/tray.md) — menu tree, icon
70
+ states, `SIGUSR1`/`SIGUSR2`.
71
+ - [Desktop entry & autostart (`scribe-install`)](docs/desktop-install.md)
72
+ — GNOME / KDE launcher integration.
73
+ - [Fine tuning & CLI reference](docs/cli.md) — every `scribe --help`
74
+ flag with examples.
75
+
76
+ ## Compatibility
77
+
78
+ Initially developed for Python 3 on Ubuntu 24.04 (GNOME + Wayland);
79
+ works on macOS and Windows too. Wayland keystroke injection is
80
+ convoluted but [solved](docs/keyboard.md). For dependencies of
81
+ individual subsystems, check `pynput` (keyboard) and `pystray` (tray
82
+ icon).
Binary file
@@ -0,0 +1,122 @@
1
+ # Backends in detail
2
+
3
+ Scribe ships four speech-to-text backends. They are all picked through
4
+ the same `--backend` / `--model` CLI flags (or the **Model** submenu in
5
+ the tray / terminal frontend). Whether a transcription is *streaming*
6
+ (text appears live as you speak) or *batch* (text arrives at end of
7
+ recording) depends on the **model** chosen — not the backend.
8
+
9
+ ## At a glance
10
+
11
+ | Backend | `--backend` | Default model | Streaming model(s) | Requires |
12
+ |-----------------|-------------|----------------------------|---------------------------|-------------------------------------|
13
+ | Groq (cloud) | `groq` | `whisper-large-v3-turbo` | — | `GROQ_API_KEY` |
14
+ | OpenAI (cloud) | `openai` | `gpt-4o-mini-transcribe` | `gpt-realtime-whisper` | `OPENAI_API_KEY` |
15
+ | Whisper (local) | `whisper` | `small` | — | `pip install scribe-cli[whisper]` |
16
+ | Vosk (local) | `vosk` | language-dependent | all Vosk models | `pip install scribe-cli[vosk]` |
17
+
18
+ Run `scribe` without arguments and it picks the first backend whose
19
+ dependency / API key is present, preferring cloud over local:
20
+ `groq → openai → whisper → vosk`.
21
+
22
+ ## `whisper` (local)
23
+
24
+ Runs locally via
25
+ [`faster-whisper`](https://github.com/SYSTRAN/faster-whisper) and
26
+ defaults to the `small` model. Excellent at full-utterance
27
+ transcription in
28
+ [many languages](https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages),
29
+ but it does not stream — text appears at end-of-recording — and
30
+ execution time depends on model size and hardware.
31
+
32
+ The available models offered in the tray menu are
33
+ `tiny / base / small / medium / large-v3 / large-v3-turbo`. Larger
34
+ models trade speed for accuracy.
35
+
36
+ With `--language en` (or `-l en`) scribe auto-substitutes the
37
+ English-only variant (e.g. `small` → `small.en`) when it exists.
38
+
39
+ ## `vosk` (local, streaming)
40
+
41
+ Vosk transcribes in real time and is very good at one language at a
42
+ time, but tends to make more mistakes than Whisper and does not produce
43
+ punctuation. It becomes really useful in longer, interactive sessions
44
+ where the live "appears as you speak" UX matters — see
45
+ [keyboard.md](keyboard.md) for how the keystroke mode interacts with
46
+ streaming models.
47
+
48
+ There are many [Vosk models](https://alphacephei.com/vosk/models)
49
+ available; a handful are pre-mapped to common languages (`en`, `fr`,
50
+ `de`, `it`) in
51
+ [`scribe/models.toml`](../scribe/models.toml). Pick one with
52
+ `-l <lang>` or browse the full list interactively from the menu.
53
+
54
+ ## `openai` (OpenAI cloud)
55
+
56
+ The OpenAI backend supports three models:
57
+
58
+ - `gpt-4o-mini-transcribe` *(default)* — fast, low-cost batch
59
+ transcription.
60
+ - `gpt-4o-transcribe` — higher-quality batch transcription.
61
+ - `gpt-realtime-whisper` *(streaming)* — partial transcripts arrive
62
+ as you speak. Same UX as Vosk but using OpenAI's cloud model.
63
+
64
+ All three share the same `OPENAI_API_KEY` and the `[openai]` extra;
65
+ nothing else needs to be installed. Set the key once:
66
+
67
+ ```bash
68
+ export OPENAI_API_KEY=YOURAPIKEY
69
+ scribe --backend openai # default: gpt-4o-mini-transcribe
70
+ scribe --model gpt-4o-transcribe # batch, higher quality
71
+ scribe --model gpt-realtime-whisper # streaming
72
+ ```
73
+
74
+ `--model` alone auto-routes to the `openai` backend for any of the
75
+ three models above, so `--backend openai` is optional.
76
+
77
+ ### `--realtime-delay` (gpt-realtime-whisper only)
78
+
79
+ The streaming model has a latency-vs-accuracy knob exposed as
80
+ `--realtime-delay {minimal,low,medium,high,xhigh}` (default `medium`).
81
+ Lower values emit partial transcripts sooner — at the cost of more
82
+ revisions arriving in the focused window. Higher values batch tokens
83
+ into longer chunks so what gets pasted is more stable.
84
+
85
+ See OpenAI's
86
+ [gpt-realtime-whisper model card](https://developers.openai.com/api/docs/models/gpt-realtime-whisper)
87
+ for the full picture.
88
+
89
+ ## `groq` (Groq cloud)
90
+
91
+ Talks to Groq's OpenAI-compatible API and defaults to
92
+ `whisper-large-v3-turbo`. Typically the fastest cloud option for
93
+ full-utterance transcription:
94
+
95
+ ```bash
96
+ export GROQ_API_KEY=YOURAPIKEY
97
+ scribe --backend groq
98
+ ```
99
+
100
+ The `groq` backend reuses the `openai` Python client under the hood, so
101
+ installing `[openai]` is enough for both.
102
+
103
+ ## Stopping a recording
104
+
105
+ For batch models (Whisper local, Whisper-via-API, Groq, `gpt-4o-*`) the
106
+ recording continues until you stop it manually (Stop in the tray,
107
+ Ctrl+C in the terminal) or the 2-minute limit (`--duration`) is
108
+ reached — the transcription happens once, when the recording stops.
109
+
110
+ You can also auto-cut on silence:
111
+
112
+ ```bash
113
+ scribe --silence-db -40 --silence 2
114
+ ```
115
+
116
+ cuts the recording when a silence below −40 dB lasts more than 2
117
+ seconds. The defaults (`--silence-db -200`, `--silence 120`) effectively
118
+ disable this and keep full manual control.
119
+
120
+ Streaming models (Vosk, `gpt-realtime-whisper`) emit partials as you
121
+ speak and stop on the same Stop / Ctrl+C action — there is no silence
122
+ threshold to tune.
@@ -0,0 +1,95 @@
1
+ # Fine tuning & CLI reference
2
+
3
+ For a complete, always-current listing run:
4
+
5
+ ```bash
6
+ scribe --help
7
+ ```
8
+
9
+ The flags are grouped to mirror the source-of-truth in
10
+ [`scribe/app.py`](../scribe/app.py).
11
+
12
+ ## Backend
13
+
14
+ | Flag | Purpose |
15
+ |---------------------------------|-------------------------------------------------------------------------|
16
+ | `--backend {vosk,whisper,openai,groq}` | Speech-recognition backend (prompted if omitted). |
17
+ | `--model NAME` | Model name for the chosen backend. Auto-routes to the right backend for known model names (e.g. `--model gpt-realtime-whisper` selects `openai`). |
18
+ | `-l, --language LANG` | Language alias selecting a preset Vosk model (`en`/`fr`/`de`/`it`), or `en` for English-only Whisper models. |
19
+ | `--api-key KEY` | API key for cloud backends; falls back to `OPENAI_API_KEY` / `GROQ_API_KEY` env. |
20
+ | `--download-folder-whisper DIR` | Folder to store Whisper models. |
21
+ | `--download-folder-vosk DIR` | Folder to store Vosk models. |
22
+
23
+ ## Audio
24
+
25
+ | Flag | Purpose |
26
+ |-----------------------|----------------------------------------------------------|
27
+ | `--input-device N` | Microphone device index (see `python -m sounddevice`). |
28
+
29
+ ## Output
30
+
31
+ | Flag | Purpose |
32
+ |-----------------------------|---------------------------------------------------------------------------------------------|
33
+ | `-m, --mode {keystroke,clipboard,terminal}` | Where transcribed text goes (default `keystroke`). See [keyboard.md](keyboard.md). |
34
+ | `--typer {auto,eitype,pynput,wtype,ydotool}` | Keystroke-injection backend (default `auto`). |
35
+ | `--type-direct` | In keystroke mode, type the transcription as keystrokes instead of synthesising Ctrl+V. |
36
+ | `-o, --output-file FILE` | Also append the transcription to this file. |
37
+
38
+ ## Realtime (`gpt-realtime-whisper`)
39
+
40
+ | Flag | Purpose |
41
+ |---------------------------------------------------|----------------------------------------------------------------------------------------------|
42
+ | `--realtime-delay {minimal,low,medium,high,xhigh}` | Trade off latency vs accuracy on `gpt-realtime-whisper` (default `medium`). Lower = faster partials but more paste churn in the focused window. |
43
+
44
+ This flag only affects the OpenAI realtime model; the other backends
45
+ ignore it.
46
+
47
+ ## Silence detection (whisper, openai batch, groq)
48
+
49
+ | Flag | Default | Purpose |
50
+ |-------------------------------|---------|---------------------------------------------------------------|
51
+ | `--duration SECS` | `120` | Max recording duration in seconds. |
52
+ | `--silence SECS` | `120` | Silence duration in seconds that triggers a cut (default effectively disables it). |
53
+ | `--silence-db DB` | `-200` | Silence threshold in dB (default effectively disables it). |
54
+ | `-a, --restart-after-silence` | off | Resume recording after a silence-triggered transcription. |
55
+
56
+ Streaming models (Vosk, `gpt-realtime-whisper`) ignore these — they
57
+ have their own end-of-utterance signal.
58
+
59
+ ## Frontend
60
+
61
+ | Flag | Purpose |
62
+ |-----------------------------|----------------------------------------------------------------------|
63
+ | `--frontend {tray,terminal}` | UI to launch (default `tray`). |
64
+ | `--no-prompt` | In terminal mode, skip the interactive menu and record immediately. |
65
+ | `--vosk-models M [M ...]` | Vosk models offered in the tray menu. |
66
+ | `--whisper-models M [M ...]` | Whisper models offered in the tray menu. |
67
+
68
+ ## Examples
69
+
70
+ Predefine the tray menu's Whisper / Vosk model lists:
71
+
72
+ ```bash
73
+ scribe --vosk-models vosk-model-fr-0.22 \
74
+ --whisper-models small large-v3-turbo
75
+ ```
76
+
77
+ Cut on 2 s of silence below −40 dB and auto-restart afterwards:
78
+
79
+ ```bash
80
+ scribe --silence-db -40 --silence 2 -a
81
+ ```
82
+
83
+ Stream OpenAI realtime transcripts with the most aggressive latency
84
+ setting:
85
+
86
+ ```bash
87
+ scribe --model gpt-realtime-whisper --realtime-delay minimal
88
+ ```
89
+
90
+ Run scribe headlessly into a file without touching the clipboard or
91
+ focused window:
92
+
93
+ ```bash
94
+ scribe --frontend terminal --no-prompt --mode terminal -o session.txt
95
+ ```
@@ -0,0 +1,44 @@
1
+ # Desktop entry & autostart (`scribe-install`)
2
+
3
+ On Linux (GNOME, KDE, anything supporting the freedesktop
4
+ [`.desktop`](https://specifications.freedesktop.org/desktop-entry-spec/)
5
+ spec), the `scribe-install` command generates a `scribe.desktop` file
6
+ under `$HOME/.local/share/applications` so scribe shows up in your
7
+ launcher / dash.
8
+
9
+ Any extra arguments are passed straight through to `scribe`. In addition,
10
+ `scribe-install` accepts two install-only options: `--name` (the
11
+ human-readable label) and `--frontend {tray,terminal}` (default: `tray`).
12
+
13
+ ## Two common flavors
14
+
15
+ ```bash
16
+ scribe-install --name "Scribe"
17
+ scribe-install --name "Scribe Terminal" --frontend terminal
18
+ ```
19
+
20
+ - The first creates an app named **Scribe** that runs in tray mode
21
+ (no terminal window), with the tray icon as the only mode of
22
+ interaction.
23
+ - The second creates an app named **Scribe Terminal** that opens a
24
+ terminal window and runs the interactive TUI.
25
+
26
+ Keyboard mode defaults to `keystroke` — pass `--mode clipboard` or
27
+ `--mode terminal` if you want a different default for the installed
28
+ app.
29
+
30
+ ## Wayland / eitype auto-prompt
31
+
32
+ After writing the desktop file, `scribe-install` checks whether you're
33
+ on a Wayland session without `eitype` (the recommended typer backend
34
+ for GNOME / KDE / Hyprland — see [keyboard.md](keyboard.md)). If so:
35
+
36
+ - If `cargo` is already on your `$PATH`, it asks whether to run
37
+ `cargo install --git https://github.com/Adam-D-Lewis/eitype` for you
38
+ (~1–2 min, no `sudo`, writes only to `~/.cargo/bin`).
39
+ - If `cargo` is missing, it prints the rustup + cargo-install recipe
40
+ so you can run it manually.
41
+
42
+ `ydotool` is never auto-installed: enabling it grants kernel-level
43
+ input access (via the `input` group or a setuid daemon) and ought to
44
+ be a conscious choice. See its package docs if you need it.
@@ -0,0 +1,84 @@
1
+ # Installation & dependencies
2
+
3
+ Scribe is a Python package distributed on PyPI as
4
+ [`scribe-cli`](https://pypi.org/project/scribe-cli) (the `-cli` suffix
5
+ disambiguates it from an unrelated package). It runs on Linux (X11 and
6
+ Wayland), macOS, and Windows; on Android it works under Termux for
7
+ clipboard / terminal output.
8
+
9
+ ## System dependencies
10
+
11
+ Scribe records audio via PortAudio (through `sounddevice`) and reads /
12
+ writes the clipboard via `xclip` on Linux. On Ubuntu:
13
+
14
+ ```bash
15
+ sudo apt-get install portaudio19-dev xclip
16
+ ```
17
+
18
+ On macOS use Homebrew:
19
+
20
+ ```bash
21
+ brew install portaudio
22
+ ```
23
+
24
+ (On Windows, the Python wheels bundle everything needed.)
25
+
26
+ ## Python package
27
+
28
+ The simplest install pulls every optional dependency:
29
+
30
+ ```bash
31
+ pip install scribe-cli[all]
32
+ ```
33
+
34
+ For local development from a clone:
35
+
36
+ ```bash
37
+ git clone https://github.com/perrette/scribe.git
38
+ cd scribe
39
+ pip install -e .[all]
40
+ ```
41
+
42
+ ## Pick-and-choose extras
43
+
44
+ If you don't want everything, `scribe-cli` ships granular extras matching
45
+ the four backends and the tray UI:
46
+
47
+ | Extra | Pulls in | Needed for |
48
+ |--------------|----------------------------------------------------|-----------------------------------------|
49
+ | `[whisper]` | `faster-whisper` | local Whisper backend |
50
+ | `[vosk]` | `vosk` | local Vosk backend (streaming) |
51
+ | `[openai]` | `openai`, `soundfile` | OpenAI cloud backend (incl. realtime) |
52
+ | `[groq]` | `openai`, `soundfile` | Groq cloud backend |
53
+ | `[keyboard]` | `pynput` | the `pynput` typer (XTest/Quartz/WinAPI)|
54
+ | `[app]` | `pystray`, `PyGObject` | system tray icon |
55
+ | `[all]` | all of the above | one-shot setup |
56
+
57
+ You need at least one backend extra (or none if you only plan to use
58
+ cloud backends *and* already have the `openai` package). The `groq`
59
+ backend reuses the `openai` client, so `[openai]` covers both.
60
+
61
+ ## Ubuntu / GNOME tray dependencies
62
+
63
+ The tray icon needs system libraries for the AppIndicator stack:
64
+
65
+ ```bash
66
+ sudo apt install libcairo-dev libgirepository1.0-dev gir1.2-appindicator3-0.1
67
+ pip install PyGObject pystray
68
+ ```
69
+
70
+ `pystray` and `PyGObject` both come with the `[app]` extra (`[all]` pulls
71
+ in `pystray` only), but the apt packages must be installed first so `PyGObject` can compile.
72
+
73
+ ## Keyboard injection backends
74
+
75
+ The Python `pynput` package is the default typer and is pulled in by
76
+ `[keyboard]` / `[all]`. The other typer backends (`eitype`, `wtype`,
77
+ `ydotool`) are OS-level binaries — see [keyboard.md](keyboard.md) for
78
+ when you need them and how to install each.
79
+
80
+ ## Model cache
81
+
82
+ Local backends (Vosk, Whisper) download their model files on first use
83
+ to `$XDG_CACHE_HOME/<backend>` (defaults to `$HOME/.cache/<backend>`).
84
+ Override with `--download-folder-vosk` / `--download-folder-whisper`.