python-voiceio 0.2.4__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. {python_voiceio-0.2.4/python_voiceio.egg-info → python_voiceio-0.3.1}/PKG-INFO +89 -56
  2. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/README.md +72 -55
  3. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/pyproject.toml +14 -2
  4. {python_voiceio-0.2.4 → python_voiceio-0.3.1/python_voiceio.egg-info}/PKG-INFO +89 -56
  5. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/python_voiceio.egg-info/SOURCES.txt +44 -1
  6. python_voiceio-0.3.1/python_voiceio.egg-info/requires.txt +41 -0
  7. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/tests/test_app_wiring.py +92 -25
  8. python_voiceio-0.3.1/tests/test_autocorrect.py +264 -0
  9. python_voiceio-0.3.1/tests/test_clipboard_read.py +91 -0
  10. python_voiceio-0.3.1/tests/test_commands.py +173 -0
  11. python_voiceio-0.3.1/tests/test_corrections.py +148 -0
  12. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/tests/test_fallback.py +10 -0
  13. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/tests/test_health.py +6 -6
  14. python_voiceio-0.3.1/tests/test_hints.py +74 -0
  15. python_voiceio-0.3.1/tests/test_history.py +90 -0
  16. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/tests/test_ibus_typer.py +7 -0
  17. python_voiceio-0.3.1/tests/test_llm.py +217 -0
  18. python_voiceio-0.3.1/tests/test_llm_api.py +176 -0
  19. python_voiceio-0.3.1/tests/test_numbers.py +101 -0
  20. python_voiceio-0.3.1/tests/test_postprocess.py +107 -0
  21. python_voiceio-0.3.1/tests/test_prompt.py +111 -0
  22. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/tests/test_streaming.py +2 -2
  23. python_voiceio-0.3.1/tests/test_tts.py +158 -0
  24. python_voiceio-0.3.1/tests/test_vad.py +118 -0
  25. python_voiceio-0.3.1/tests/test_vocabulary.py +71 -0
  26. python_voiceio-0.3.1/tests/test_wordfreq.py +80 -0
  27. python_voiceio-0.3.1/voiceio/__init__.py +1 -0
  28. python_voiceio-0.3.1/voiceio/app.py +655 -0
  29. python_voiceio-0.3.1/voiceio/autocorrect.py +284 -0
  30. python_voiceio-0.3.1/voiceio/cli.py +1094 -0
  31. python_voiceio-0.3.1/voiceio/clipboard_read.py +69 -0
  32. python_voiceio-0.3.1/voiceio/commands.py +130 -0
  33. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/config.py +58 -2
  34. python_voiceio-0.3.1/voiceio/corrections.py +160 -0
  35. python_voiceio-0.3.1/voiceio/demo.py +199 -0
  36. python_voiceio-0.3.1/voiceio/feedback.py +162 -0
  37. python_voiceio-0.3.1/voiceio/health.py +408 -0
  38. python_voiceio-0.3.1/voiceio/hints.py +58 -0
  39. python_voiceio-0.3.1/voiceio/history.py +64 -0
  40. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/hotkeys/chain.py +1 -0
  41. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/hotkeys/pynput_backend.py +23 -0
  42. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/hotkeys/socket_backend.py +35 -12
  43. python_voiceio-0.3.1/voiceio/llm.py +258 -0
  44. python_voiceio-0.3.1/voiceio/llm_api.py +183 -0
  45. python_voiceio-0.3.1/voiceio/models/silero_vad.onnx +0 -0
  46. python_voiceio-0.3.1/voiceio/numbers.py +228 -0
  47. python_voiceio-0.3.1/voiceio/pidlock.py +22 -0
  48. python_voiceio-0.3.1/voiceio/platform.py +272 -0
  49. python_voiceio-0.3.1/voiceio/postprocess.py +84 -0
  50. python_voiceio-0.3.1/voiceio/prompt.py +73 -0
  51. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/recorder.py +53 -13
  52. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/service.py +73 -9
  53. python_voiceio-0.3.1/voiceio/sounds/__init__.py +0 -0
  54. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/sounds/commit.wav +0 -0
  55. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/sounds/start.wav +0 -0
  56. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/sounds/stop.wav +0 -0
  57. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/streaming.py +84 -19
  58. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/transcriber.py +14 -2
  59. python_voiceio-0.3.1/voiceio/tray/__init__.py +277 -0
  60. python_voiceio-0.3.1/voiceio/tray/_icons.py +125 -0
  61. python_voiceio-0.3.1/voiceio/tray/_indicator.py +181 -0
  62. python_voiceio-0.3.1/voiceio/tray/_pystray.py +123 -0
  63. python_voiceio-0.3.1/voiceio/tts/__init__.py +11 -0
  64. python_voiceio-0.3.1/voiceio/tts/base.py +29 -0
  65. python_voiceio-0.3.1/voiceio/tts/chain.py +79 -0
  66. python_voiceio-0.3.1/voiceio/tts/edge_engine.py +74 -0
  67. python_voiceio-0.3.1/voiceio/tts/espeak.py +47 -0
  68. python_voiceio-0.3.1/voiceio/tts/piper_engine.py +90 -0
  69. python_voiceio-0.3.1/voiceio/tts/player.py +62 -0
  70. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/typers/chain.py +1 -0
  71. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/typers/clipboard.py +49 -6
  72. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/typers/ibus.py +3 -2
  73. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/typers/pynput_type.py +9 -0
  74. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/typers/wtype.py +2 -1
  75. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/typers/xdotool.py +2 -1
  76. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/typers/ydotool.py +2 -1
  77. python_voiceio-0.3.1/voiceio/vad.py +122 -0
  78. python_voiceio-0.3.1/voiceio/vocabulary.py +59 -0
  79. python_voiceio-0.3.1/voiceio/wizard.py +1463 -0
  80. python_voiceio-0.3.1/voiceio/wordfreq.py +69 -0
  81. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/worker.py +16 -1
  82. python_voiceio-0.2.4/python_voiceio.egg-info/requires.txt +0 -20
  83. python_voiceio-0.2.4/voiceio/__init__.py +0 -1
  84. python_voiceio-0.2.4/voiceio/app.py +0 -414
  85. python_voiceio-0.2.4/voiceio/cli.py +0 -512
  86. python_voiceio-0.2.4/voiceio/feedback.py +0 -78
  87. python_voiceio-0.2.4/voiceio/health.py +0 -194
  88. python_voiceio-0.2.4/voiceio/platform.py +0 -139
  89. python_voiceio-0.2.4/voiceio/tray.py +0 -54
  90. python_voiceio-0.2.4/voiceio/wizard.py +0 -883
  91. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/LICENSE +0 -0
  92. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/python_voiceio.egg-info/dependency_links.txt +0 -0
  93. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/python_voiceio.egg-info/entry_points.txt +0 -0
  94. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/python_voiceio.egg-info/top_level.txt +0 -0
  95. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/setup.cfg +0 -0
  96. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/tests/test_backend_probes.py +0 -0
  97. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/tests/test_config.py +0 -0
  98. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/tests/test_platform.py +0 -0
  99. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/tests/test_prebuffer.py +0 -0
  100. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/tests/test_recorder_integration.py +0 -0
  101. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/tests/test_transcriber.py +0 -0
  102. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/__main__.py +0 -0
  103. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/backends.py +0 -0
  104. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/hotkeys/__init__.py +0 -0
  105. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/hotkeys/base.py +0 -0
  106. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/hotkeys/evdev.py +0 -0
  107. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/ibus/__init__.py +0 -0
  108. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/ibus/engine.py +0 -0
  109. {python_voiceio-0.2.4/voiceio/sounds → python_voiceio-0.3.1/voiceio/models}/__init__.py +0 -0
  110. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/typers/__init__.py +0 -0
  111. {python_voiceio-0.2.4 → python_voiceio-0.3.1}/voiceio/typers/base.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-voiceio
3
- Version: 0.2.4
3
+ Version: 0.3.1
4
4
  Summary: Speak → text, locally, instantly.
5
5
  Author: Hugo Montenegro
6
6
  License-Expression: MIT
@@ -8,11 +8,13 @@ Project-URL: Homepage, https://github.com/Hugo0/voiceio
8
8
  Project-URL: Repository, https://github.com/Hugo0/voiceio
9
9
  Project-URL: Issues, https://github.com/Hugo0/voiceio/issues
10
10
  Project-URL: Changelog, https://github.com/Hugo0/voiceio/releases
11
- Keywords: voice,speech-to-text,whisper,linux,dictation,wayland,ibus
11
+ Keywords: voice,speech-to-text,whisper,linux,windows,dictation,wayland,ibus
12
12
  Classifier: Development Status :: 4 - Beta
13
13
  Classifier: Environment :: X11 Applications
14
14
  Classifier: Intended Audience :: End Users/Desktop
15
15
  Classifier: Operating System :: POSIX :: Linux
16
+ Classifier: Operating System :: Microsoft :: Windows
17
+ Classifier: Operating System :: MacOS
16
18
  Classifier: Programming Language :: Python :: 3
17
19
  Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
18
20
  Requires-Python: >=3.11
@@ -21,14 +23,28 @@ License-File: LICENSE
21
23
  Requires-Dist: faster-whisper>=1.0.0
22
24
  Requires-Dist: sounddevice>=0.4.6
23
25
  Requires-Dist: numpy>=1.24.0
26
+ Requires-Dist: onnxruntime>=1.16.0
27
+ Requires-Dist: wordfreq>=3.0
24
28
  Requires-Dist: evdev>=1.6.0; sys_platform == "linux"
29
+ Requires-Dist: pynput>=1.7.6; sys_platform == "win32"
30
+ Requires-Dist: pynput>=1.7.6; sys_platform == "darwin"
31
+ Requires-Dist: pyperclip>=1.8.0; sys_platform == "win32"
32
+ Requires-Dist: win11toast>=0.36; sys_platform == "win32"
25
33
  Provides-Extra: x11
26
34
  Requires-Dist: pynput>=1.7.6; extra == "x11"
27
35
  Provides-Extra: mac
28
36
  Requires-Dist: pynput>=1.7.6; extra == "mac"
37
+ Provides-Extra: win
38
+ Requires-Dist: pynput>=1.7.6; extra == "win"
39
+ Requires-Dist: pyperclip>=1.8.0; extra == "win"
40
+ Requires-Dist: win11toast>=0.36; extra == "win"
29
41
  Provides-Extra: tray
30
42
  Requires-Dist: pystray>=0.19; extra == "tray"
31
43
  Requires-Dist: Pillow>=10.0; extra == "tray"
44
+ Provides-Extra: tts
45
+ Requires-Dist: piper-tts>=1.2.0; extra == "tts"
46
+ Provides-Extra: tts-cloud
47
+ Requires-Dist: edge-tts>=6.1.0; extra == "tts-cloud"
32
48
  Provides-Extra: dev
33
49
  Requires-Dist: pytest>=7.0; extra == "dev"
34
50
  Requires-Dist: pytest-mock; extra == "dev"
@@ -78,6 +94,31 @@ voiceio setup
78
94
  ```
79
95
  </details>
80
96
 
97
+ <details>
98
+ <summary><strong>Windows</strong></summary>
99
+
100
+ ```powershell
101
+ # Option A: Install with pip (requires Python 3.11+)
102
+ pip install python-voiceio
103
+ voiceio setup
104
+
105
+ # Option B: Download the installer from GitHub Releases (no Python needed)
106
+ # https://github.com/Hugo0/voiceio/releases
107
+ # Also available as a portable .zip if you prefer no installation.
108
+ ```
109
+
110
+ Windows uses pynput for hotkeys and text injection. No extra system dependencies required.
111
+ </details>
112
+
113
+ <details>
114
+ <summary><strong>macOS</strong></summary>
115
+
116
+ ```bash
117
+ pipx install python-voiceio
118
+ voiceio setup
119
+ ```
120
+ </details>
121
+
81
122
  <details>
82
123
  <summary><strong>Build from source</strong></summary>
83
124
 
@@ -86,9 +127,13 @@ If you want the source code locally to hack on or customize for personal use. PR
86
127
  ```bash
87
128
  git clone https://github.com/Hugo0/voiceio
88
129
  cd voiceio
89
- pip install -e ".[linux,dev]"
90
- voiceio setup
130
+ uv pip install -e ".[linux,dev]"
131
+
132
+ # Bootstrap CLI commands onto PATH (creates ~/.local/bin/voiceio)
133
+ uv run voiceio setup
91
134
  ```
135
+
136
+ > **Note:** Source installs live inside a virtualenv, so `voiceio` isn't on PATH until setup creates symlinks in `~/.local/bin/`. If `voiceio` isn't found after setup, restart your terminal or run `export PATH="$HOME/.local/bin:$PATH"`.
92
137
  </details>
93
138
 
94
139
  > You can also install with `uv tool install python-voiceio` or `pip install python-voiceio`.
@@ -100,11 +145,7 @@ hotkey → mic capture → whisper (local) → text at cursor
100
145
  pre-buffered streaming IBus / clipboard
101
146
  ```
102
147
 
103
- 1. Press your hotkey: voiceio starts recording (with a 1-second pre-buffer, so it catches the beginning even if you start speaking before pressing)
104
- 2. Speak naturally: text streams into the focused app in real-time as an underlined preview
105
- 3. Press the hotkey again: the final transcription replaces the preview and is committed
106
-
107
- Transcription runs locally via [faster-whisper](https://github.com/SYSTRAN/faster-whisper). Text is injected through IBus (works in any GTK/Qt app: browsers, Telegram, editors) with an automatic clipboard fallback for terminals.
148
+ Press your hotkey to start recording (1s pre-buffer catches the first syllable). Text streams into the focused app as an underlined preview. Press again to commit. Transcription runs locally via [faster-whisper](https://github.com/SYSTRAN/faster-whisper); text is injected through IBus (any GTK/Qt app) with clipboard fallback for terminals.
108
149
 
109
150
  ## Features
110
151
 
@@ -137,14 +178,17 @@ voiceio doctor Health check (--fix to auto-repair)
137
178
  voiceio test Test microphone + live transcription
138
179
  voiceio toggle Toggle recording on a running daemon
139
180
  voiceio update Update to latest version
140
- voiceio service install Autostart on login via systemd
181
+ voiceio service install Autostart on login (systemd / Windows Startup)
141
182
  voiceio logs View recent logs
142
183
  voiceio uninstall Remove all system integrations
143
184
  ```
144
185
 
145
186
  ## Configuration
146
187
 
147
- `voiceio setup` handles everything interactively. To tweak later, edit `~/.config/voiceio/config.toml` or override at runtime:
188
+ `voiceio setup` handles everything interactively. To tweak later, edit the config file or override at runtime:
189
+
190
+ - Linux/macOS: `~/.config/voiceio/config.toml`
191
+ - Windows: `%LOCALAPPDATA%\voiceio\config\config.toml`
148
192
 
149
193
  ```bash
150
194
  voiceio --model large-v3 --language auto -v
@@ -166,7 +210,9 @@ voiceio logs # check debug output
166
210
  | Hotkey doesn't work on Wayland | `sudo usermod -aG input $USER` then log out and back in |
167
211
  | Transcription too slow | Use a smaller model: `voiceio --model tiny` |
168
212
  | Want to start fresh | `voiceio uninstall` then `voiceio setup` |
169
- | Doesn't work on MacOS | I haven't added proper support for apple yet. either use https://aquavoice.com/ or make a PR |
213
+ | Windows: antivirus blocks hotkeys | pynput uses global keyboard hooks — add an exception for voiceio |
214
+ | Windows: no sound feedback | Check `voiceio logs` for audio device info |
215
+ | macOS issues | Experimental — consider [aquavoice.com](https://aquavoice.com/) or contribute a PR |
170
216
 
171
217
  ## Platform support
172
218
 
@@ -177,6 +223,7 @@ voiceio logs # check debug output
177
223
  | Fedora (GNOME) | Supported | IBus | evdev / GNOME shortcut | Yes |
178
224
  | Arch Linux | Supported | IBus | evdev | Yes |
179
225
  | KDE / Sway / Hyprland | Should work | IBus / ydotool / wtype | evdev | Yes |
226
+ | Windows 10/11 | Experimental | pynput / clipboard | pynput | Type-and-correct (no preedit) |
180
227
  | macOS | Experimental | pynput / clipboard | pynput | Type-and-correct (no preedit) |
181
228
 
182
229
  voiceio auto-detects your platform and picks the best available backends. Run `voiceio doctor` to see what's working on your system.
@@ -188,50 +235,36 @@ voiceio uninstall # removes service, IBus, shortcuts, symlinks
188
235
  pipx uninstall python-voiceio # removes the package
189
236
  ```
190
237
 
191
- ## Wishlist
192
-
193
- Contributions welcome! See [CONTRIBUTING.md](CONTRIBUTING.md). Open an issue to discuss before starting.
194
-
195
- **High impact**
196
- - [ ] **macOS support**: test and polish pynput hotkey + typer backends
197
- - [ ] **Silence filtering**: VAD-based trimming to prevent Whisper hallucinations on silence
198
- - [ ] **distil-whisper models**: better speed/accuracy tradeoffs
199
- - [ ] **IBus on non-GNOME desktops**: KDE, Sway, Hyprland activation (currently GNOME-only via gsettings)
200
- - [ ] **Text-to-speech (voice output)**: select text, press a hotkey, hear it spoken aloud. Completes the "io" in voiceio. Use a local TTS engine (Piper, Coqui, espeak-ng), same philosophy: no cloud, no API keys
201
- - [ ] **Wake word**: "Hey voiceio" hands-free activation (no hotkey needed). Use a small always-on keyword model (e.g. openWakeWord, Porcupine)
202
- - [ ] **Custom vocabulary / hot words**: user-defined word list for names, jargon, technical terms that Whisper gets wrong. Boost via `initial_prompt` or fine-tuned logit bias
203
- - [ ] **Per-app profiles**: different language/model/output settings per application (e.g. formal writing in docs, casual in chat)
204
- - [ ] **Voice commands**: "select all", "new line", "undo that", "delete last sentence". Parse transcribed text for command patterns before injecting
205
- - [ ] **Punctuation & formatting commands**: "period", "comma", "new paragraph", "capitalize that"
206
- - [ ] **Auto-punctuation model**: post-process Whisper output with a small punctuation/capitalization model for cleaner text
207
-
208
- **Platform expansion**
209
- - [ ] **macOS Input Method (IMKit)**: native streaming preedit on macOS, matching IBus quality on Linux
210
- - [ ] **Windows support**: Text Services Framework (TSF) for text injection, global hotkeys via win32api
211
- - [ ] **Flatpak / Snap packaging**: sandboxed distribution for Linux
212
- - [ ] **AUR package**: community package for Arch Linux
213
-
214
- **UX polish**
215
- - [ ] **System tray icon with recording animation**: pulsing/colored icon showing recording state, quick menu for model/language switching
216
- - [ ] **Desktop notifications with transcribed text**: show what was typed, with an undo button
217
- - [ ] **Confidence indicator**: visual hint when Whisper is uncertain (maybe highlight low-confidence words)
218
- - [ ] **Recording timeout**: auto-stop after N seconds of silence or max duration, preventing forgotten recordings
219
- - [ ] **Sound themes**: bundled sound packs (subtle, mechanical, sci-fi, none)
220
- - [ ] **First-run onboarding overlay**: lightweight "press Ctrl+Alt+V to start" hint on first launch
221
-
222
- **Power features**
223
- - [ ] **Multi-language in one session**: auto-detect language switches mid-dictation (Whisper supports this but needs tuning)
224
- - [ ] **Speaker diarization**: "Person 1: ... Person 2: ..." for meeting notes (via pyannote or whisperX)
225
- - [ ] **LLM post-processing**: pipe transcription through a local LLM (Ollama) for grammar correction, summarization, or reformatting
226
- - [ ] **Clipboard history**: keep last N transcriptions, quick-paste from history
227
- - [ ] **Transcription log / journal**: searchable history of everything you've dictated, with timestamps
228
- - [ ] **API / webhook**: expose a local API so other tools can trigger recording or receive transcriptions
229
- - [ ] **Browser extension**: inject text into web apps that don't work with IBus (e.g. some Electron apps)
230
-
231
- **Developer experience**
232
- - [ ] **Plugin system**: hooks for pre/post processing (e.g. custom formatters, translators, text transforms)
233
- - [ ] **Alternative STT backends**: support Whisper.cpp, Deepgram, AssemblyAI, OpenAI Whisper API as optional backends
234
- - [ ] **GPU acceleration docs**: CUDA/ROCm setup guide for faster transcription on large models
238
+ ## Roadmap
239
+
240
+ Contributions welcome! See [CONTRIBUTING.md](CONTRIBUTING.md) and [open issues](https://github.com/Hugo0/voiceio/issues).
241
+
242
+ **Now**
243
+ - [ ] macOS polish (IMKit for native preedit, Accessibility API for text injection)
244
+
245
+ **Soon**
246
+ - [ ] Per-app context awareness (detect focused app, adapt formatting/behavior)
247
+ - [ ] File/audio transcription mode (`voiceio transcribe recording.mp3`)
248
+
249
+ **Backlog**
250
+ - [ ] Multiple engine backends (whisper.cpp for Vulkan/AMD, VOSK for low-end hardware)
251
+ - [ ] Echo cancellation (filter system audio for meeting use)
252
+ - [ ] Wake word activation ("Hey voiceio")
253
+ - [ ] Text-to-speech output (Piper/espeak-ng completes the "io")
254
+
255
+ **Done**
256
+ - [x] LLM auto-audit dictionary (`voiceio correct --auto`: scan history with LLM, interactive correction)
257
+ - [x] LLM post-processing via Ollama (grammar cleanup, spelling fixes on final pass)
258
+ - [x] Corrections dictionary: auto-replace misheard words, "correct that" voice command
259
+ - [x] Transcription history: searchable log of everything you've dictated
260
+ - [x] Number-to-digit conversion ("three hundred forty two" → "342")
261
+ - [x] VAD-based silence filtering (Silero VAD, prevents Whisper hallucinations)
262
+ - [x] Voice commands: "new line", "new paragraph", "scratch that", punctuation by name
263
+ - [x] Custom vocabulary / personal dictionary (bias Whisper via `initial_prompt`)
264
+ - [x] Smart punctuation & capitalization post-processing
265
+ - [x] Windows support
266
+ - [x] System tray icon with animated states
267
+ - [x] Auto-stop on silence
235
268
 
236
269
  ## License
237
270
 
@@ -42,6 +42,31 @@ voiceio setup
42
42
  ```
43
43
  </details>
44
44
 
45
+ <details>
46
+ <summary><strong>Windows</strong></summary>
47
+
48
+ ```powershell
49
+ # Option A: Install with pip (requires Python 3.11+)
50
+ pip install python-voiceio
51
+ voiceio setup
52
+
53
+ # Option B: Download the installer from GitHub Releases (no Python needed)
54
+ # https://github.com/Hugo0/voiceio/releases
55
+ # Also available as a portable .zip if you prefer no installation.
56
+ ```
57
+
58
+ Windows uses pynput for hotkeys and text injection. No extra system dependencies required.
59
+ </details>
60
+
61
+ <details>
62
+ <summary><strong>macOS</strong></summary>
63
+
64
+ ```bash
65
+ pipx install python-voiceio
66
+ voiceio setup
67
+ ```
68
+ </details>
69
+
45
70
  <details>
46
71
  <summary><strong>Build from source</strong></summary>
47
72
 
@@ -50,9 +75,13 @@ If you want the source code locally to hack on or customize for personal use. PR
50
75
  ```bash
51
76
  git clone https://github.com/Hugo0/voiceio
52
77
  cd voiceio
53
- pip install -e ".[linux,dev]"
54
- voiceio setup
78
+ uv pip install -e ".[linux,dev]"
79
+
80
+ # Bootstrap CLI commands onto PATH (creates ~/.local/bin/voiceio)
81
+ uv run voiceio setup
55
82
  ```
83
+
84
+ > **Note:** Source installs live inside a virtualenv, so `voiceio` isn't on PATH until setup creates symlinks in `~/.local/bin/`. If `voiceio` isn't found after setup, restart your terminal or run `export PATH="$HOME/.local/bin:$PATH"`.
56
85
  </details>
57
86
 
58
87
  > You can also install with `uv tool install python-voiceio` or `pip install python-voiceio`.
@@ -64,11 +93,7 @@ hotkey → mic capture → whisper (local) → text at cursor
64
93
  pre-buffered streaming IBus / clipboard
65
94
  ```
66
95
 
67
- 1. Press your hotkey: voiceio starts recording (with a 1-second pre-buffer, so it catches the beginning even if you start speaking before pressing)
68
- 2. Speak naturally: text streams into the focused app in real-time as an underlined preview
69
- 3. Press the hotkey again: the final transcription replaces the preview and is committed
70
-
71
- Transcription runs locally via [faster-whisper](https://github.com/SYSTRAN/faster-whisper). Text is injected through IBus (works in any GTK/Qt app: browsers, Telegram, editors) with an automatic clipboard fallback for terminals.
96
+ Press your hotkey to start recording (1s pre-buffer catches the first syllable). Text streams into the focused app as an underlined preview. Press again to commit. Transcription runs locally via [faster-whisper](https://github.com/SYSTRAN/faster-whisper); text is injected through IBus (any GTK/Qt app) with clipboard fallback for terminals.
72
97
 
73
98
  ## Features
74
99
 
@@ -101,14 +126,17 @@ voiceio doctor Health check (--fix to auto-repair)
101
126
  voiceio test Test microphone + live transcription
102
127
  voiceio toggle Toggle recording on a running daemon
103
128
  voiceio update Update to latest version
104
- voiceio service install Autostart on login via systemd
129
+ voiceio service install Autostart on login (systemd / Windows Startup)
105
130
  voiceio logs View recent logs
106
131
  voiceio uninstall Remove all system integrations
107
132
  ```
108
133
 
109
134
  ## Configuration
110
135
 
111
- `voiceio setup` handles everything interactively. To tweak later, edit `~/.config/voiceio/config.toml` or override at runtime:
136
+ `voiceio setup` handles everything interactively. To tweak later, edit the config file or override at runtime:
137
+
138
+ - Linux/macOS: `~/.config/voiceio/config.toml`
139
+ - Windows: `%LOCALAPPDATA%\voiceio\config\config.toml`
112
140
 
113
141
  ```bash
114
142
  voiceio --model large-v3 --language auto -v
@@ -130,7 +158,9 @@ voiceio logs # check debug output
130
158
  | Hotkey doesn't work on Wayland | `sudo usermod -aG input $USER` then log out and back in |
131
159
  | Transcription too slow | Use a smaller model: `voiceio --model tiny` |
132
160
  | Want to start fresh | `voiceio uninstall` then `voiceio setup` |
133
- | Doesn't work on MacOS | I haven't added proper support for apple yet. either use https://aquavoice.com/ or make a PR |
161
+ | Windows: antivirus blocks hotkeys | pynput uses global keyboard hooks — add an exception for voiceio |
162
+ | Windows: no sound feedback | Check `voiceio logs` for audio device info |
163
+ | macOS issues | Experimental — consider [aquavoice.com](https://aquavoice.com/) or contribute a PR |
134
164
 
135
165
  ## Platform support
136
166
 
@@ -141,6 +171,7 @@ voiceio logs # check debug output
141
171
  | Fedora (GNOME) | Supported | IBus | evdev / GNOME shortcut | Yes |
142
172
  | Arch Linux | Supported | IBus | evdev | Yes |
143
173
  | KDE / Sway / Hyprland | Should work | IBus / ydotool / wtype | evdev | Yes |
174
+ | Windows 10/11 | Experimental | pynput / clipboard | pynput | Type-and-correct (no preedit) |
144
175
  | macOS | Experimental | pynput / clipboard | pynput | Type-and-correct (no preedit) |
145
176
 
146
177
  voiceio auto-detects your platform and picks the best available backends. Run `voiceio doctor` to see what's working on your system.
@@ -152,51 +183,37 @@ voiceio uninstall # removes service, IBus, shortcuts, symlinks
152
183
  pipx uninstall python-voiceio # removes the package
153
184
  ```
154
185
 
155
- ## Wishlist
156
-
157
- Contributions welcome! See [CONTRIBUTING.md](CONTRIBUTING.md). Open an issue to discuss before starting.
158
-
159
- **High impact**
160
- - [ ] **macOS support**: test and polish pynput hotkey + typer backends
161
- - [ ] **Silence filtering**: VAD-based trimming to prevent Whisper hallucinations on silence
162
- - [ ] **distil-whisper models**: better speed/accuracy tradeoffs
163
- - [ ] **IBus on non-GNOME desktops**: KDE, Sway, Hyprland activation (currently GNOME-only via gsettings)
164
- - [ ] **Text-to-speech (voice output)**: select text, press a hotkey, hear it spoken aloud. Completes the "io" in voiceio. Use a local TTS engine (Piper, Coqui, espeak-ng), same philosophy: no cloud, no API keys
165
- - [ ] **Wake word**: "Hey voiceio" hands-free activation (no hotkey needed). Use a small always-on keyword model (e.g. openWakeWord, Porcupine)
166
- - [ ] **Custom vocabulary / hot words**: user-defined word list for names, jargon, technical terms that Whisper gets wrong. Boost via `initial_prompt` or fine-tuned logit bias
167
- - [ ] **Per-app profiles**: different language/model/output settings per application (e.g. formal writing in docs, casual in chat)
168
- - [ ] **Voice commands**: "select all", "new line", "undo that", "delete last sentence". Parse transcribed text for command patterns before injecting
169
- - [ ] **Punctuation & formatting commands**: "period", "comma", "new paragraph", "capitalize that"
170
- - [ ] **Auto-punctuation model**: post-process Whisper output with a small punctuation/capitalization model for cleaner text
171
-
172
- **Platform expansion**
173
- - [ ] **macOS Input Method (IMKit)**: native streaming preedit on macOS, matching IBus quality on Linux
174
- - [ ] **Windows support**: Text Services Framework (TSF) for text injection, global hotkeys via win32api
175
- - [ ] **Flatpak / Snap packaging**: sandboxed distribution for Linux
176
- - [ ] **AUR package**: community package for Arch Linux
177
-
178
- **UX polish**
179
- - [ ] **System tray icon with recording animation**: pulsing/colored icon showing recording state, quick menu for model/language switching
180
- - [ ] **Desktop notifications with transcribed text**: show what was typed, with an undo button
181
- - [ ] **Confidence indicator**: visual hint when Whisper is uncertain (maybe highlight low-confidence words)
182
- - [ ] **Recording timeout**: auto-stop after N seconds of silence or max duration, preventing forgotten recordings
183
- - [ ] **Sound themes**: bundled sound packs (subtle, mechanical, sci-fi, none)
184
- - [ ] **First-run onboarding overlay**: lightweight "press Ctrl+Alt+V to start" hint on first launch
185
-
186
- **Power features**
187
- - [ ] **Multi-language in one session**: auto-detect language switches mid-dictation (Whisper supports this but needs tuning)
188
- - [ ] **Speaker diarization**: "Person 1: ... Person 2: ..." for meeting notes (via pyannote or whisperX)
189
- - [ ] **LLM post-processing**: pipe transcription through a local LLM (Ollama) for grammar correction, summarization, or reformatting
190
- - [ ] **Clipboard history**: keep last N transcriptions, quick-paste from history
191
- - [ ] **Transcription log / journal**: searchable history of everything you've dictated, with timestamps
192
- - [ ] **API / webhook**: expose a local API so other tools can trigger recording or receive transcriptions
193
- - [ ] **Browser extension**: inject text into web apps that don't work with IBus (e.g. some Electron apps)
194
-
195
- **Developer experience**
196
- - [ ] **Plugin system**: hooks for pre/post processing (e.g. custom formatters, translators, text transforms)
197
- - [ ] **Alternative STT backends**: support Whisper.cpp, Deepgram, AssemblyAI, OpenAI Whisper API as optional backends
198
- - [ ] **GPU acceleration docs**: CUDA/ROCm setup guide for faster transcription on large models
186
+ ## Roadmap
187
+
188
+ Contributions welcome! See [CONTRIBUTING.md](CONTRIBUTING.md) and [open issues](https://github.com/Hugo0/voiceio/issues).
189
+
190
+ **Now**
191
+ - [ ] macOS polish (IMKit for native preedit, Accessibility API for text injection)
192
+
193
+ **Soon**
194
+ - [ ] Per-app context awareness (detect focused app, adapt formatting/behavior)
195
+ - [ ] File/audio transcription mode (`voiceio transcribe recording.mp3`)
196
+
197
+ **Backlog**
198
+ - [ ] Multiple engine backends (whisper.cpp for Vulkan/AMD, VOSK for low-end hardware)
199
+ - [ ] Echo cancellation (filter system audio for meeting use)
200
+ - [ ] Wake word activation ("Hey voiceio")
201
+ - [ ] Text-to-speech output (Piper/espeak-ng completes the "io")
202
+
203
+ **Done**
204
+ - [x] LLM auto-audit dictionary (`voiceio correct --auto`: scan history with LLM, interactive correction)
205
+ - [x] LLM post-processing via Ollama (grammar cleanup, spelling fixes on final pass)
206
+ - [x] Corrections dictionary: auto-replace misheard words, "correct that" voice command
207
+ - [x] Transcription history: searchable log of everything you've dictated
208
+ - [x] Number-to-digit conversion ("three hundred forty two" → "342")
209
+ - [x] VAD-based silence filtering (Silero VAD, prevents Whisper hallucinations)
210
+ - [x] Voice commands: "new line", "new paragraph", "scratch that", punctuation by name
211
+ - [x] Custom vocabulary / personal dictionary (bias Whisper via `initial_prompt`)
212
+ - [x] Smart punctuation & capitalization post-processing
213
+ - [x] Windows support
214
+ - [x] System tray icon with animated states
215
+ - [x] Auto-stop on silence
199
216
 
200
217
  ## License
201
218
 
202
- MIT
219
+ MIT
@@ -4,18 +4,20 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "python-voiceio"
7
- version = "0.2.4"
7
+ version = "0.3.1"
8
8
  description = "Speak → text, locally, instantly."
9
9
  readme = "README.md"
10
10
  license = "MIT"
11
11
  requires-python = ">=3.11"
12
12
  authors = [{ name = "Hugo Montenegro" }]
13
- keywords = ["voice", "speech-to-text", "whisper", "linux", "dictation", "wayland", "ibus"]
13
+ keywords = ["voice", "speech-to-text", "whisper", "linux", "windows", "dictation", "wayland", "ibus"]
14
14
  classifiers = [
15
15
  "Development Status :: 4 - Beta",
16
16
  "Environment :: X11 Applications",
17
17
  "Intended Audience :: End Users/Desktop",
18
18
  "Operating System :: POSIX :: Linux",
19
+ "Operating System :: Microsoft :: Windows",
20
+ "Operating System :: MacOS",
19
21
  "Programming Language :: Python :: 3",
20
22
  "Topic :: Multimedia :: Sound/Audio :: Speech",
21
23
  ]
@@ -23,13 +25,22 @@ dependencies = [
23
25
  "faster-whisper>=1.0.0",
24
26
  "sounddevice>=0.4.6",
25
27
  "numpy>=1.24.0",
28
+ "onnxruntime>=1.16.0",
29
+ "wordfreq>=3.0",
26
30
  "evdev>=1.6.0; sys_platform == 'linux'",
31
+ "pynput>=1.7.6; sys_platform == 'win32'",
32
+ "pynput>=1.7.6; sys_platform == 'darwin'",
33
+ "pyperclip>=1.8.0; sys_platform == 'win32'",
34
+ "win11toast>=0.36; sys_platform == 'win32'",
27
35
  ]
28
36
 
29
37
  [project.optional-dependencies]
30
38
  x11 = ["pynput>=1.7.6"]
31
39
  mac = ["pynput>=1.7.6"]
40
+ win = ["pynput>=1.7.6", "pyperclip>=1.8.0", "win11toast>=0.36"]
32
41
  tray = ["pystray>=0.19", "Pillow>=10.0"]
42
+ tts = ["piper-tts>=1.2.0"]
43
+ tts-cloud = ["edge-tts>=6.1.0"]
33
44
  dev = ["pytest>=7.0", "pytest-mock"]
34
45
 
35
46
  [project.urls]
@@ -54,3 +65,4 @@ include = ["voiceio*"]
54
65
 
55
66
  [tool.setuptools.package-data]
56
67
  "voiceio.sounds" = ["*.wav"]
68
+ "voiceio.models" = ["*.onnx"]