python-voiceio 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. python_voiceio-0.2.0/LICENSE +21 -0
  2. python_voiceio-0.2.0/PKG-INFO +260 -0
  3. python_voiceio-0.2.0/README.md +225 -0
  4. python_voiceio-0.2.0/pyproject.toml +54 -0
  5. python_voiceio-0.2.0/python_voiceio.egg-info/PKG-INFO +260 -0
  6. python_voiceio-0.2.0/python_voiceio.egg-info/SOURCES.txt +57 -0
  7. python_voiceio-0.2.0/python_voiceio.egg-info/dependency_links.txt +1 -0
  8. python_voiceio-0.2.0/python_voiceio.egg-info/entry_points.txt +6 -0
  9. python_voiceio-0.2.0/python_voiceio.egg-info/requires.txt +20 -0
  10. python_voiceio-0.2.0/python_voiceio.egg-info/top_level.txt +1 -0
  11. python_voiceio-0.2.0/setup.cfg +4 -0
  12. python_voiceio-0.2.0/tests/test_app_wiring.py +135 -0
  13. python_voiceio-0.2.0/tests/test_backend_probes.py +90 -0
  14. python_voiceio-0.2.0/tests/test_config.py +63 -0
  15. python_voiceio-0.2.0/tests/test_fallback.py +101 -0
  16. python_voiceio-0.2.0/tests/test_health.py +73 -0
  17. python_voiceio-0.2.0/tests/test_ibus_typer.py +153 -0
  18. python_voiceio-0.2.0/tests/test_platform.py +74 -0
  19. python_voiceio-0.2.0/tests/test_prebuffer.py +105 -0
  20. python_voiceio-0.2.0/tests/test_recorder_integration.py +148 -0
  21. python_voiceio-0.2.0/tests/test_streaming.py +481 -0
  22. python_voiceio-0.2.0/tests/test_transcriber.py +69 -0
  23. python_voiceio-0.2.0/voiceio/__init__.py +1 -0
  24. python_voiceio-0.2.0/voiceio/__main__.py +3 -0
  25. python_voiceio-0.2.0/voiceio/app.py +415 -0
  26. python_voiceio-0.2.0/voiceio/backends.py +13 -0
  27. python_voiceio-0.2.0/voiceio/cli.py +475 -0
  28. python_voiceio-0.2.0/voiceio/config.py +136 -0
  29. python_voiceio-0.2.0/voiceio/feedback.py +78 -0
  30. python_voiceio-0.2.0/voiceio/health.py +194 -0
  31. python_voiceio-0.2.0/voiceio/hotkeys/__init__.py +22 -0
  32. python_voiceio-0.2.0/voiceio/hotkeys/base.py +27 -0
  33. python_voiceio-0.2.0/voiceio/hotkeys/chain.py +83 -0
  34. python_voiceio-0.2.0/voiceio/hotkeys/evdev.py +134 -0
  35. python_voiceio-0.2.0/voiceio/hotkeys/pynput_backend.py +80 -0
  36. python_voiceio-0.2.0/voiceio/hotkeys/socket_backend.py +77 -0
  37. python_voiceio-0.2.0/voiceio/ibus/__init__.py +8 -0
  38. python_voiceio-0.2.0/voiceio/ibus/engine.py +268 -0
  39. python_voiceio-0.2.0/voiceio/platform.py +139 -0
  40. python_voiceio-0.2.0/voiceio/recorder.py +208 -0
  41. python_voiceio-0.2.0/voiceio/service.py +234 -0
  42. python_voiceio-0.2.0/voiceio/sounds/__init__.py +0 -0
  43. python_voiceio-0.2.0/voiceio/sounds/commit.wav +0 -0
  44. python_voiceio-0.2.0/voiceio/sounds/start.wav +0 -0
  45. python_voiceio-0.2.0/voiceio/sounds/stop.wav +0 -0
  46. python_voiceio-0.2.0/voiceio/streaming.py +202 -0
  47. python_voiceio-0.2.0/voiceio/transcriber.py +165 -0
  48. python_voiceio-0.2.0/voiceio/tray.py +54 -0
  49. python_voiceio-0.2.0/voiceio/typers/__init__.py +31 -0
  50. python_voiceio-0.2.0/voiceio/typers/base.py +44 -0
  51. python_voiceio-0.2.0/voiceio/typers/chain.py +79 -0
  52. python_voiceio-0.2.0/voiceio/typers/clipboard.py +110 -0
  53. python_voiceio-0.2.0/voiceio/typers/ibus.py +389 -0
  54. python_voiceio-0.2.0/voiceio/typers/pynput_type.py +51 -0
  55. python_voiceio-0.2.0/voiceio/typers/wtype.py +57 -0
  56. python_voiceio-0.2.0/voiceio/typers/xdotool.py +45 -0
  57. python_voiceio-0.2.0/voiceio/typers/ydotool.py +115 -0
  58. python_voiceio-0.2.0/voiceio/wizard.py +882 -0
  59. python_voiceio-0.2.0/voiceio/worker.py +39 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Hugo Montenegro
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,260 @@
1
+ Metadata-Version: 2.4
2
+ Name: python-voiceio
3
+ Version: 0.2.0
4
+ Summary: Push-to-talk voice-to-text for Linux. Press a hotkey, speak, press again - text appears at your cursor.
5
+ Author: Hugo Montenegro
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/Hugo0/voiceio
8
+ Project-URL: Issues, https://github.com/Hugo0/voiceio/issues
9
+ Keywords: voice,speech-to-text,whisper,linux,dictation,wayland,ibus
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Environment :: X11 Applications
12
+ Classifier: Intended Audience :: End Users/Desktop
13
+ Classifier: Operating System :: POSIX :: Linux
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
16
+ Requires-Python: >=3.11
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Requires-Dist: faster-whisper>=1.0.0
20
+ Requires-Dist: sounddevice>=0.4.6
21
+ Requires-Dist: numpy>=1.24.0
22
+ Provides-Extra: linux
23
+ Requires-Dist: evdev>=1.6.0; extra == "linux"
24
+ Provides-Extra: x11
25
+ Requires-Dist: pynput>=1.7.6; extra == "x11"
26
+ Provides-Extra: mac
27
+ Requires-Dist: pynput>=1.7.6; extra == "mac"
28
+ Provides-Extra: tray
29
+ Requires-Dist: pystray>=0.19; extra == "tray"
30
+ Requires-Dist: Pillow>=10.0; extra == "tray"
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest>=7.0; extra == "dev"
33
+ Requires-Dist: pytest-mock; extra == "dev"
34
+ Dynamic: license-file
35
+
36
+ # voiceio
37
+
38
+ [![CI](https://github.com/Hugo0/voiceio/actions/workflows/ci.yml/badge.svg)](https://github.com/Hugo0/voiceio/actions/workflows/ci.yml)
39
+ [![PyPI](https://img.shields.io/pypi/v/python-voiceio)](https://pypi.org/project/python-voiceio/)
40
+ [![Python](https://img.shields.io/pypi/pyversions/python-voiceio)](https://pypi.org/project/python-voiceio/)
41
+ [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
42
+
43
+ Push-to-talk voice-to-text for Linux and macOS, on any app. Press a hotkey, speak, press again - text appears at your cursor.
44
+
45
+ 100% local and open source. No API keys, no cloud, no telemetry. Use and modify it as you wish.
46
+
47
+ <!-- demo video -->
48
+ <p align="center">
49
+ <a href="https://www.tella.tv/video/YOUR_VIDEO_ID">
50
+ <img src="https://github.com/Hugo0/voiceio/raw/main/assets/demo-thumbnail.png" alt="voiceio demo" width="600">
51
+ </a>
52
+ <br>
53
+ <em>Click to watch the demo</em>
54
+ </p>
55
+
56
+ ## Quick start
57
+
58
+ ```bash
59
+ # 1. Install system dependencies (Ubuntu/Debian)
60
+ sudo apt install pipx ibus gir1.2-ibus-1.0 python3-gi portaudio19-dev
61
+
62
+ # 2. Install voiceio
63
+ pipx install python-voiceio
64
+
65
+ # 3. Run the setup wizard
66
+ voiceio setup
67
+ ```
68
+
69
+ That's it. Press **Ctrl+Alt+V** (or your chosen hotkey) to start dictating.
70
+
71
+ <details>
72
+ <summary><strong>Fedora</strong></summary>
73
+
74
+ ```bash
75
+ sudo dnf install pipx ibus python3-gobject portaudio-devel
76
+ pipx install python-voiceio
77
+ voiceio setup
78
+ ```
79
+ </details>
80
+
81
+ <details>
82
+ <summary><strong>Arch Linux</strong></summary>
83
+
84
+ ```bash
85
+ sudo pacman -S python-pipx ibus python-gobject portaudio
86
+ pipx install python-voiceio
87
+ voiceio setup
88
+ ```
89
+ </details>
90
+
91
+ <details>
92
+ <summary><strong>Build from source</strong></summary>
93
+
94
+ Use this route if you want the source code locally to hack on or customize for personal use. PRs are welcome!
95
+
96
+ ```bash
97
+ git clone https://github.com/Hugo0/voiceio
98
+ cd voiceio
99
+ pip install -e ".[linux,dev]"
100
+ voiceio setup
101
+ ```
102
+ </details>
103
+
104
+ > You can also install with `uv tool install python-voiceio` or `pip install python-voiceio`.
105
+
106
+ ## How it works
107
+
108
+ ```
109
+ hotkey → mic capture → whisper (local) → text at cursor
110
+ pre-buffered streaming IBus / clipboard
111
+ ```
112
+
113
+ 1. Press your hotkey: voiceio starts recording (with a 1-second pre-buffer, so it catches the beginning even if you start speaking before pressing)
114
+ 2. Speak naturally: text streams into the focused app in real-time as an underlined preview
115
+ 3. Press the hotkey again: the final transcription replaces the preview and is committed
116
+
117
+ Transcription runs locally via [faster-whisper](https://github.com/SYSTRAN/faster-whisper). Text is injected through IBus (works in any GTK/Qt app: browsers, Telegram, editors) with an automatic clipboard fallback for terminals.
118
+
119
+ ## Features
120
+
121
+ - **Streaming**: text appears as you speak, not after you stop
122
+ - **Works everywhere**: IBus input method for GUI apps, clipboard for terminals
123
+ - **Wayland + X11**: evdev hotkeys work on both, no root required
124
+ - **Pre-buffer**: never miss the first syllable
125
+ - **Auto-healing**: falls back to the next working backend if one fails
126
+ - **Autostart**: optional systemd service, restarts on crash
127
+ - **Self-diagnosing**: `voiceio doctor` checks everything, `--fix` repairs it
128
+
129
+ ## Models
130
+
131
+ | Model | Size | Speed | Accuracy | Good for |
132
+ |-------|------|-------|----------|----------|
133
+ | `tiny` | 75 MB | ~10x realtime | Basic | Quick notes, low-end hardware |
134
+ | `base` | 150 MB | ~7x realtime | Good | Daily use (default) |
135
+ | `small` | 500 MB | ~4x realtime | Better | Longer dictation |
136
+ | `medium` | 1.5 GB | ~2x realtime | Great | Accuracy-sensitive work |
137
+ | `large-v3` | 3 GB | ~1x realtime | Best | Maximum quality, GPU recommended |
138
+
139
+ Models download automatically on first use. Switch anytime: `voiceio --model small`.
140
+
141
+ ## Commands
142
+
143
+ ```
144
+ voiceio Start the daemon
145
+ voiceio setup Interactive setup wizard
146
+ voiceio doctor Health check (--fix to auto-repair)
147
+ voiceio test Test microphone + live transcription
148
+ voiceio toggle Toggle recording on a running daemon
149
+ voiceio service install Autostart on login via systemd
150
+ voiceio logs View recent logs
151
+ voiceio uninstall Remove all system integrations
152
+ ```
153
+
154
+ ## Configuration
155
+
156
+ `voiceio setup` handles everything interactively. To tweak later, edit `~/.config/voiceio/config.toml` or override at runtime:
157
+
158
+ ```bash
159
+ voiceio --model large-v3 --language auto -v
160
+ ```
161
+
162
+ See [config.example.toml](config.example.toml) for all options.
163
+
164
+ ## Troubleshooting
165
+
166
+ ```bash
167
+ voiceio doctor # see what's working
168
+ voiceio doctor --fix # auto-fix issues
169
+ voiceio logs # check debug output
170
+ ```
171
+
172
+ | Problem | Fix |
173
+ |---------|-----|
174
+ | No text appears | `voiceio doctor --fix` - usually a missing IBus component or GNOME input source |
175
+ | Hotkey doesn't work on Wayland | `sudo usermod -aG input $USER` then log out and back in |
176
+ | Transcription too slow | Use a smaller model: `voiceio --model tiny` |
177
+ | Want to start fresh | `voiceio uninstall` then `voiceio setup` |
178
+ | Doesn't work on macOS | I haven't added proper macOS support yet. Either use https://aquavoice.com/ or open a PR. |
179
+
180
+ ## Platform support
181
+
182
+ | Platform | Status | Text injection | Hotkeys | Streaming preview |
183
+ |----------|--------|---------------|---------|-------------------|
184
+ | Ubuntu / Debian (GNOME, Wayland) | **Tested daily** | IBus | evdev / GNOME shortcut | Yes |
185
+ | Ubuntu / Debian (GNOME, X11) | Supported | IBus | evdev / pynput | Yes |
186
+ | Fedora (GNOME) | Supported | IBus | evdev / GNOME shortcut | Yes |
187
+ | Arch Linux | Supported | IBus | evdev | Yes |
188
+ | KDE / Sway / Hyprland | Should work | IBus / ydotool / wtype | evdev | Yes |
189
+ | macOS | Experimental | pynput / clipboard | pynput | Type-and-correct (no preedit) |
190
+
191
+ voiceio auto-detects your platform and picks the best available backends. Run `voiceio doctor` to see what's working on your system.
192
+
193
+ ## Uninstall
194
+
195
+ ```bash
196
+ voiceio uninstall # removes service, IBus, shortcuts, symlinks
197
+ pipx uninstall python-voiceio # removes the package
198
+ ```
199
+
200
+ ## TODO
201
+
202
+ **Launch**
203
+ - [ ] Publish to PyPI
204
+ - [ ] Record demo video + thumbnail
205
+ - [ ] Test clean install on a fresh VM/container
206
+ - [ ] GitHub repo: description, topics, social preview image
207
+ - [ ] Bump version to 0.2.0
208
+
209
+ **Code quality**
210
+ - [ ] IBus activation on non-GNOME desktops (KDE, Sway, Hyprland), currently GNOME-only via gsettings
211
+ - [ ] `voiceio doctor --json` for machine-readable output
212
+ - [ ] Shell completions (`voiceio completion bash/zsh/fish`)
213
+ - [ ] Refactor wizard.py (882 lines) into smaller, testable modules
214
+ - [ ] Socket protocol versioning (e.g. `v1:preedit:text`)
215
+ - [ ] Configurable log file path
216
+
217
+ ## Wishlist
218
+
219
+ Contributions welcome! Open an issue to discuss before starting.
220
+
221
+ **High impact**
222
+ - [ ] **Text-to-speech (voice output)**: select text, press a hotkey, hear it spoken aloud. Completes the "io" in voiceio. Use a local TTS engine (Piper, Coqui, espeak-ng), same philosophy: no cloud, no API keys
223
+ - [ ] **Wake word**: "Hey voiceio" hands-free activation (no hotkey needed). Use a small always-on keyword model (e.g. openWakeWord, Porcupine)
224
+ - [ ] **Custom vocabulary / hot words**: user-defined word list for names, jargon, technical terms that Whisper gets wrong. Boost via `initial_prompt` or fine-tuned logit bias
225
+ - [ ] **Per-app profiles**: different language/model/output settings per application (e.g. formal writing in docs, casual in chat)
226
+ - [ ] **Voice commands**: "select all", "new line", "undo that", "delete last sentence". Parse transcribed text for command patterns before injecting
227
+ - [ ] **Punctuation & formatting commands**: "period", "comma", "new paragraph", "capitalize that"
228
+ - [ ] **Auto-punctuation model**: post-process Whisper output with a small punctuation/capitalization model for cleaner text
229
+
230
+ **Platform expansion**
231
+ - [ ] **macOS Input Method (IMKit)**: native streaming preedit on macOS, matching IBus quality on Linux
232
+ - [ ] **Windows support**: Text Services Framework (TSF) for text injection, global hotkeys via win32api
233
+ - [ ] **Flatpak / Snap packaging**: sandboxed distribution for Linux
234
+ - [ ] **AUR package**: community package for Arch Linux
235
+
236
+ **UX polish**
237
+ - [ ] **System tray icon with recording animation**: pulsing/colored icon showing recording state, quick menu for model/language switching
238
+ - [ ] **Desktop notifications with transcribed text**: show what was typed, with an undo button
239
+ - [ ] **Confidence indicator**: visual hint when Whisper is uncertain (maybe highlight low-confidence words)
240
+ - [ ] **Recording timeout**: auto-stop after N seconds of silence or max duration, preventing forgotten recordings
241
+ - [ ] **Sound themes**: bundled sound packs (subtle, mechanical, sci-fi, none)
242
+ - [ ] **First-run onboarding overlay**: lightweight "press Ctrl+Alt+V to start" hint on first launch
243
+
244
+ **Power features**
245
+ - [ ] **Multi-language in one session**: auto-detect language switches mid-dictation (Whisper supports this but needs tuning)
246
+ - [ ] **Speaker diarization**: "Person 1: ... Person 2: ..." for meeting notes (via pyannote or whisperX)
247
+ - [ ] **LLM post-processing**: pipe transcription through a local LLM (Ollama) for grammar correction, summarization, or reformatting
248
+ - [ ] **Clipboard history**: keep last N transcriptions, quick-paste from history
249
+ - [ ] **Transcription log / journal**: searchable history of everything you've dictated, with timestamps
250
+ - [ ] **API / webhook**: expose a local API so other tools can trigger recording or receive transcriptions
251
+ - [ ] **Browser extension**: inject text into web apps that don't work with IBus (e.g. some Electron apps)
252
+
253
+ **Developer experience**
254
+ - [ ] **Plugin system**: hooks for pre/post processing (e.g. custom formatters, translators, text transforms)
255
+ - [ ] **Alternative STT backends**: support Whisper.cpp, Deepgram, AssemblyAI, OpenAI Whisper API as optional backends
256
+ - [ ] **GPU acceleration docs**: CUDA/ROCm setup guide for faster transcription on large models
257
+
258
+ ## License
259
+
260
+ MIT
@@ -0,0 +1,225 @@
1
+ # voiceio
2
+
3
+ [![CI](https://github.com/Hugo0/voiceio/actions/workflows/ci.yml/badge.svg)](https://github.com/Hugo0/voiceio/actions/workflows/ci.yml)
4
+ [![PyPI](https://img.shields.io/pypi/v/python-voiceio)](https://pypi.org/project/python-voiceio/)
5
+ [![Python](https://img.shields.io/pypi/pyversions/python-voiceio)](https://pypi.org/project/python-voiceio/)
6
+ [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
7
+
8
+ Push-to-talk voice-to-text for Linux and macOS, on any app. Press a hotkey, speak, press again - text appears at your cursor.
9
+
10
+ 100% local and open source. No API keys, no cloud, no telemetry. Use and modify it as you wish.
11
+
12
+ <!-- demo video -->
13
+ <p align="center">
14
+ <a href="https://www.tella.tv/video/YOUR_VIDEO_ID">
15
+ <img src="https://github.com/Hugo0/voiceio/raw/main/assets/demo-thumbnail.png" alt="voiceio demo" width="600">
16
+ </a>
17
+ <br>
18
+ <em>Click to watch the demo</em>
19
+ </p>
20
+
21
+ ## Quick start
22
+
23
+ ```bash
24
+ # 1. Install system dependencies (Ubuntu/Debian)
25
+ sudo apt install pipx ibus gir1.2-ibus-1.0 python3-gi portaudio19-dev
26
+
27
+ # 2. Install voiceio
28
+ pipx install python-voiceio
29
+
30
+ # 3. Run the setup wizard
31
+ voiceio setup
32
+ ```
33
+
34
+ That's it. Press **Ctrl+Alt+V** (or your chosen hotkey) to start dictating.
35
+
36
+ <details>
37
+ <summary><strong>Fedora</strong></summary>
38
+
39
+ ```bash
40
+ sudo dnf install pipx ibus python3-gobject portaudio-devel
41
+ pipx install python-voiceio
42
+ voiceio setup
43
+ ```
44
+ </details>
45
+
46
+ <details>
47
+ <summary><strong>Arch Linux</strong></summary>
48
+
49
+ ```bash
50
+ sudo pacman -S python-pipx ibus python-gobject portaudio
51
+ pipx install python-voiceio
52
+ voiceio setup
53
+ ```
54
+ </details>
55
+
56
+ <details>
57
+ <summary><strong>Build from source</strong></summary>
58
+
59
+ Use this route if you want the source code locally to hack on or customize for personal use. PRs are welcome!
60
+
61
+ ```bash
62
+ git clone https://github.com/Hugo0/voiceio
63
+ cd voiceio
64
+ pip install -e ".[linux,dev]"
65
+ voiceio setup
66
+ ```
67
+ </details>
68
+
69
+ > You can also install with `uv tool install python-voiceio` or `pip install python-voiceio`.
70
+
71
+ ## How it works
72
+
73
+ ```
74
+ hotkey → mic capture → whisper (local) → text at cursor
75
+ pre-buffered streaming IBus / clipboard
76
+ ```
77
+
78
+ 1. Press your hotkey: voiceio starts recording (with a 1-second pre-buffer, so it catches the beginning even if you start speaking before pressing)
79
+ 2. Speak naturally: text streams into the focused app in real-time as an underlined preview
80
+ 3. Press the hotkey again: the final transcription replaces the preview and is committed
81
+
82
+ Transcription runs locally via [faster-whisper](https://github.com/SYSTRAN/faster-whisper). Text is injected through IBus (works in any GTK/Qt app: browsers, Telegram, editors) with an automatic clipboard fallback for terminals.
83
+
84
+ ## Features
85
+
86
+ - **Streaming**: text appears as you speak, not after you stop
87
+ - **Works everywhere**: IBus input method for GUI apps, clipboard for terminals
88
+ - **Wayland + X11**: evdev hotkeys work on both, no root required
89
+ - **Pre-buffer**: never miss the first syllable
90
+ - **Auto-healing**: falls back to the next working backend if one fails
91
+ - **Autostart**: optional systemd service, restarts on crash
92
+ - **Self-diagnosing**: `voiceio doctor` checks everything, `--fix` repairs it
93
+
94
+ ## Models
95
+
96
+ | Model | Size | Speed | Accuracy | Good for |
97
+ |-------|------|-------|----------|----------|
98
+ | `tiny` | 75 MB | ~10x realtime | Basic | Quick notes, low-end hardware |
99
+ | `base` | 150 MB | ~7x realtime | Good | Daily use (default) |
100
+ | `small` | 500 MB | ~4x realtime | Better | Longer dictation |
101
+ | `medium` | 1.5 GB | ~2x realtime | Great | Accuracy-sensitive work |
102
+ | `large-v3` | 3 GB | ~1x realtime | Best | Maximum quality, GPU recommended |
103
+
104
+ Models download automatically on first use. Switch anytime: `voiceio --model small`.
105
+
106
+ ## Commands
107
+
108
+ ```
109
+ voiceio Start the daemon
110
+ voiceio setup Interactive setup wizard
111
+ voiceio doctor Health check (--fix to auto-repair)
112
+ voiceio test Test microphone + live transcription
113
+ voiceio toggle Toggle recording on a running daemon
114
+ voiceio service install Autostart on login via systemd
115
+ voiceio logs View recent logs
116
+ voiceio uninstall Remove all system integrations
117
+ ```
118
+
119
+ ## Configuration
120
+
121
+ `voiceio setup` handles everything interactively. To tweak later, edit `~/.config/voiceio/config.toml` or override at runtime:
122
+
123
+ ```bash
124
+ voiceio --model large-v3 --language auto -v
125
+ ```
126
+
127
+ See [config.example.toml](config.example.toml) for all options.
128
+
129
+ ## Troubleshooting
130
+
131
+ ```bash
132
+ voiceio doctor # see what's working
133
+ voiceio doctor --fix # auto-fix issues
134
+ voiceio logs # check debug output
135
+ ```
136
+
137
+ | Problem | Fix |
138
+ |---------|-----|
139
+ | No text appears | `voiceio doctor --fix` - usually a missing IBus component or GNOME input source |
140
+ | Hotkey doesn't work on Wayland | `sudo usermod -aG input $USER` then log out and back in |
141
+ | Transcription too slow | Use a smaller model: `voiceio --model tiny` |
142
+ | Want to start fresh | `voiceio uninstall` then `voiceio setup` |
143
+ | Doesn't work on macOS | I haven't added proper macOS support yet. Either use https://aquavoice.com/ or open a PR. |
144
+
145
+ ## Platform support
146
+
147
+ | Platform | Status | Text injection | Hotkeys | Streaming preview |
148
+ |----------|--------|---------------|---------|-------------------|
149
+ | Ubuntu / Debian (GNOME, Wayland) | **Tested daily** | IBus | evdev / GNOME shortcut | Yes |
150
+ | Ubuntu / Debian (GNOME, X11) | Supported | IBus | evdev / pynput | Yes |
151
+ | Fedora (GNOME) | Supported | IBus | evdev / GNOME shortcut | Yes |
152
+ | Arch Linux | Supported | IBus | evdev | Yes |
153
+ | KDE / Sway / Hyprland | Should work | IBus / ydotool / wtype | evdev | Yes |
154
+ | macOS | Experimental | pynput / clipboard | pynput | Type-and-correct (no preedit) |
155
+
156
+ voiceio auto-detects your platform and picks the best available backends. Run `voiceio doctor` to see what's working on your system.
157
+
158
+ ## Uninstall
159
+
160
+ ```bash
161
+ voiceio uninstall # removes service, IBus, shortcuts, symlinks
162
+ pipx uninstall python-voiceio # removes the package
163
+ ```
164
+
165
+ ## TODO
166
+
167
+ **Launch**
168
+ - [ ] Publish to PyPI
169
+ - [ ] Record demo video + thumbnail
170
+ - [ ] Test clean install on a fresh VM/container
171
+ - [ ] GitHub repo: description, topics, social preview image
172
+ - [ ] Bump version to 0.2.0
173
+
174
+ **Code quality**
175
+ - [ ] IBus activation on non-GNOME desktops (KDE, Sway, Hyprland), currently GNOME-only via gsettings
176
+ - [ ] `voiceio doctor --json` for machine-readable output
177
+ - [ ] Shell completions (`voiceio completion bash/zsh/fish`)
178
+ - [ ] Refactor wizard.py (882 lines) into smaller, testable modules
179
+ - [ ] Socket protocol versioning (e.g. `v1:preedit:text`)
180
+ - [ ] Configurable log file path
181
+
182
+ ## Wishlist
183
+
184
+ Contributions welcome! Open an issue to discuss before starting.
185
+
186
+ **High impact**
187
+ - [ ] **Text-to-speech (voice output)**: select text, press a hotkey, hear it spoken aloud. Completes the "io" in voiceio. Use a local TTS engine (Piper, Coqui, espeak-ng), same philosophy: no cloud, no API keys
188
+ - [ ] **Wake word**: "Hey voiceio" hands-free activation (no hotkey needed). Use a small always-on keyword model (e.g. openWakeWord, Porcupine)
189
+ - [ ] **Custom vocabulary / hot words**: user-defined word list for names, jargon, technical terms that Whisper gets wrong. Boost via `initial_prompt` or fine-tuned logit bias
190
+ - [ ] **Per-app profiles**: different language/model/output settings per application (e.g. formal writing in docs, casual in chat)
191
+ - [ ] **Voice commands**: "select all", "new line", "undo that", "delete last sentence". Parse transcribed text for command patterns before injecting
192
+ - [ ] **Punctuation & formatting commands**: "period", "comma", "new paragraph", "capitalize that"
193
+ - [ ] **Auto-punctuation model**: post-process Whisper output with a small punctuation/capitalization model for cleaner text
194
+
195
+ **Platform expansion**
196
+ - [ ] **macOS Input Method (IMKit)**: native streaming preedit on macOS, matching IBus quality on Linux
197
+ - [ ] **Windows support**: Text Services Framework (TSF) for text injection, global hotkeys via win32api
198
+ - [ ] **Flatpak / Snap packaging**: sandboxed distribution for Linux
199
+ - [ ] **AUR package**: community package for Arch Linux
200
+
201
+ **UX polish**
202
+ - [ ] **System tray icon with recording animation**: pulsing/colored icon showing recording state, quick menu for model/language switching
203
+ - [ ] **Desktop notifications with transcribed text**: show what was typed, with an undo button
204
+ - [ ] **Confidence indicator**: visual hint when Whisper is uncertain (maybe highlight low-confidence words)
205
+ - [ ] **Recording timeout**: auto-stop after N seconds of silence or max duration, preventing forgotten recordings
206
+ - [ ] **Sound themes**: bundled sound packs (subtle, mechanical, sci-fi, none)
207
+ - [ ] **First-run onboarding overlay**: lightweight "press Ctrl+Alt+V to start" hint on first launch
208
+
209
+ **Power features**
210
+ - [ ] **Multi-language in one session**: auto-detect language switches mid-dictation (Whisper supports this but needs tuning)
211
+ - [ ] **Speaker diarization**: "Person 1: ... Person 2: ..." for meeting notes (via pyannote or whisperX)
212
+ - [ ] **LLM post-processing**: pipe transcription through a local LLM (Ollama) for grammar correction, summarization, or reformatting
213
+ - [ ] **Clipboard history**: keep last N transcriptions, quick-paste from history
214
+ - [ ] **Transcription log / journal**: searchable history of everything you've dictated, with timestamps
215
+ - [ ] **API / webhook**: expose a local API so other tools can trigger recording or receive transcriptions
216
+ - [ ] **Browser extension**: inject text into web apps that don't work with IBus (e.g. some Electron apps)
217
+
218
+ **Developer experience**
219
+ - [ ] **Plugin system**: hooks for pre/post processing (e.g. custom formatters, translators, text transforms)
220
+ - [ ] **Alternative STT backends**: support Whisper.cpp, Deepgram, AssemblyAI, OpenAI Whisper API as optional backends
221
+ - [ ] **GPU acceleration docs**: CUDA/ROCm setup guide for faster transcription on large models
222
+
223
+ ## License
224
+
225
+ MIT
@@ -0,0 +1,54 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "python-voiceio"
7
+ version = "0.2.0"
8
+ description = "Push-to-talk voice-to-text for Linux. Press a hotkey, speak, press again - text appears at your cursor."
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.11"
12
+ authors = [{ name = "Hugo Montenegro" }]
13
+ keywords = ["voice", "speech-to-text", "whisper", "linux", "dictation", "wayland", "ibus"]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Environment :: X11 Applications",
17
+ "Intended Audience :: End Users/Desktop",
18
+ "Operating System :: POSIX :: Linux",
19
+ "Programming Language :: Python :: 3",
20
+ "Topic :: Multimedia :: Sound/Audio :: Speech",
21
+ ]
22
+ dependencies = [
23
+ "faster-whisper>=1.0.0",
24
+ "sounddevice>=0.4.6",
25
+ "numpy>=1.24.0",
26
+ ]
27
+
28
+ [project.optional-dependencies]
29
+ linux = ["evdev>=1.6.0"]
30
+ x11 = ["pynput>=1.7.6"]
31
+ mac = ["pynput>=1.7.6"]
32
+ tray = ["pystray>=0.19", "Pillow>=10.0"]
33
+ dev = ["pytest>=7.0", "pytest-mock"]
34
+
35
+ [project.urls]
36
+ Homepage = "https://github.com/Hugo0/voiceio"
37
+ Issues = "https://github.com/Hugo0/voiceio/issues"
38
+
39
+ [project.scripts]
40
+ voiceio = "voiceio.cli:main"
41
+ # Legacy aliases (prefer: voiceio toggle/doctor/setup/test)
42
+ voiceio-toggle = "voiceio.cli:_cmd_toggle"
43
+ voiceio-doctor = "voiceio.cli:_cmd_doctor_legacy"
44
+ voiceio-setup = "voiceio.cli:_cmd_setup"
45
+ voiceio-test = "voiceio.cli:_cmd_test"
46
+
47
+ [tool.ruff]
48
+ lint.per-file-ignores."voiceio/ibus/engine.py" = ["E402"]
49
+
50
+ [tool.setuptools.packages.find]
51
+ include = ["voiceio*"]
52
+
53
+ [tool.setuptools.package-data]
54
+ "voiceio.sounds" = ["*.wav"]