pushtotype 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. pushtotype-0.1.0/.claude/settings.local.json +11 -0
  2. pushtotype-0.1.0/.github/ISSUE_TEMPLATE/bug_report.md +34 -0
  3. pushtotype-0.1.0/.github/ISSUE_TEMPLATE/feature_request.md +17 -0
  4. pushtotype-0.1.0/.github/workflows/ci.yml +28 -0
  5. pushtotype-0.1.0/.gitignore +46 -0
  6. pushtotype-0.1.0/.python-version +1 -0
  7. pushtotype-0.1.0/CONTRIBUTING.md +107 -0
  8. pushtotype-0.1.0/LICENSE +21 -0
  9. pushtotype-0.1.0/PKG-INFO +260 -0
  10. pushtotype-0.1.0/README.md +225 -0
  11. pushtotype-0.1.0/assets/.gitkeep +0 -0
  12. pushtotype-0.1.0/claude.md +19 -0
  13. pushtotype-0.1.0/dogfood_test.md +68 -0
  14. pushtotype-0.1.0/learnings/xclip_injection_fix.md +54 -0
  15. pushtotype-0.1.0/main.py +6 -0
  16. pushtotype-0.1.0/memory/MEMORY.md +4 -0
  17. pushtotype-0.1.0/memory/project_efficiency_benchmark.md +15 -0
  18. pushtotype-0.1.0/project-plan.md +340 -0
  19. pushtotype-0.1.0/pyproject.toml +56 -0
  20. pushtotype-0.1.0/src/pushtotype/__init__.py +1 -0
  21. pushtotype-0.1.0/src/pushtotype/__main__.py +1 -0
  22. pushtotype-0.1.0/src/pushtotype/audio.py +56 -0
  23. pushtotype-0.1.0/src/pushtotype/cli.py +269 -0
  24. pushtotype-0.1.0/src/pushtotype/config.py +144 -0
  25. pushtotype-0.1.0/src/pushtotype/daemon.py +274 -0
  26. pushtotype-0.1.0/src/pushtotype/feedback.py +175 -0
  27. pushtotype-0.1.0/src/pushtotype/hotkey.py +183 -0
  28. pushtotype-0.1.0/src/pushtotype/injector.py +72 -0
  29. pushtotype-0.1.0/src/pushtotype/session.py +57 -0
  30. pushtotype-0.1.0/src/pushtotype/sounds/start.wav +0 -0
  31. pushtotype-0.1.0/src/pushtotype/sounds/stop.wav +0 -0
  32. pushtotype-0.1.0/src/pushtotype/transcriber.py +62 -0
  33. pushtotype-0.1.0/tasks/M0_PROJECT_SETUP.md +181 -0
  34. pushtotype-0.1.0/tasks/M1_AUDIO_TRANSCRIPTION.md +207 -0
  35. pushtotype-0.1.0/tasks/M2_HOTKEY_PUSH_TO_TALK.md +246 -0
  36. pushtotype-0.1.0/tasks/M3_TEXT_INJECTION.md +249 -0
  37. pushtotype-0.1.0/tasks/M4_CONFIG_POLISH.md +306 -0
  38. pushtotype-0.1.0/tasks/M5_DISTRIBUTION_DOCS.md +329 -0
  39. pushtotype-0.1.0/tests/test_audio.py +95 -0
  40. pushtotype-0.1.0/tests/test_config.py +169 -0
  41. pushtotype-0.1.0/tests/test_feedback.py +63 -0
  42. pushtotype-0.1.0/tests/test_hotkey.py +119 -0
  43. pushtotype-0.1.0/tests/test_injector.py +104 -0
  44. pushtotype-0.1.0/tests/test_session.py +27 -0
  45. pushtotype-0.1.0/tests/test_smoke.py +9 -0
  46. pushtotype-0.1.0/tests/test_transcriber.py +75 -0
  47. pushtotype-0.1.0/uv.lock +1049 -0
@@ -0,0 +1,11 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(gh auth:*)",
5
+ "Bash(pip install:*)",
6
+ "Bash(uv run:*)",
7
+ "Bash(uv add:*)",
8
+ "Bash(uv pip:*)"
9
+ ]
10
+ }
11
+ }
@@ -0,0 +1,34 @@
1
+ ---
2
+ name: Bug report
3
+ about: Something isn't working
4
+ labels: bug
5
+ ---
6
+
7
+ ## What happened
8
+
9
+ <!-- A clear description of the bug -->
10
+
11
+ ## Steps to reproduce
12
+
13
+ 1.
14
+ 2.
15
+ 3.
16
+
17
+ ## Expected behavior
18
+
19
+ <!-- What you expected to happen -->
20
+
21
+ ## Startup output (`pushtotype -v`)
22
+
23
+ ```
24
+ paste output here
25
+ ```
26
+
27
+ ## Environment
28
+
29
+ - OS:
30
+ - Display server (X11 / Wayland):
31
+ - Python version (`python --version`):
32
+ - PushToType version (`pushtotype --version`):
33
+ - GPU / CPU:
34
+ - Install method (pip / pipx / uv / source):
@@ -0,0 +1,17 @@
1
+ ---
2
+ name: Feature request
3
+ about: Suggest an idea or improvement
4
+ labels: enhancement
5
+ ---
6
+
7
+ ## Problem
8
+
9
+ <!-- What problem does this solve? Who is it for? -->
10
+
11
+ ## Proposed solution
12
+
13
+ <!-- What would you like to see? -->
14
+
15
+ ## Alternatives considered
16
+
17
+ <!-- Any other approaches you thought about? -->
@@ -0,0 +1,28 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ pull_request:
6
+
7
+ jobs:
8
+ lint-and-test:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - uses: actions/checkout@v4
12
+
13
+ - name: Set up Python
14
+ uses: actions/setup-python@v5
15
+ with:
16
+ python-version: "3.10"
17
+
18
+ - name: Install dependencies
19
+ run: pip install -e ".[dev]"
20
+
21
+ - name: Lint
22
+ run: ruff check .
23
+
24
+ - name: Format check
25
+ run: ruff format --check .
26
+
27
+ - name: Test
28
+ run: pytest
@@ -0,0 +1,46 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.pyo
5
+ *.pyd
6
+ .Python
7
+ *.egg
8
+ *.egg-info/
9
+ dist/
10
+ build/
11
+ eggs/
12
+ parts/
13
+ var/
14
+ sdist/
15
+ wheels/
16
+ .installed.cfg
17
+ lib/
18
+ lib64/
19
+
20
+ # Virtual environments
21
+ .venv/
22
+ venv/
23
+ env/
24
+ ENV/
25
+
26
+ # Testing
27
+ .pytest_cache/
28
+ .coverage
29
+ htmlcov/
30
+
31
+ # Ruff
32
+ .ruff_cache/
33
+
34
+ # IDE
35
+ .idea/
36
+ .vscode/
37
+ *.swp
38
+ *.swo
39
+
40
+ # OS
41
+ .DS_Store
42
+ Thumbs.db
43
+
44
+ # Distribution
45
+ *.tar.gz
46
+ *.whl
@@ -0,0 +1 @@
1
+ 3.13
@@ -0,0 +1,107 @@
1
+ # Contributing to PushToType
2
+
3
+ Thanks for your interest in contributing. PushToType is a focused tool — the goal is to keep it simple, fast, and reliable.
4
+
5
+ ---
6
+
7
+ ## Development Setup
8
+
9
+ ```bash
10
+ # Clone the repo
11
+ git clone https://github.com/danielgraviet/pushtotype.git
12
+ cd pushtotype
13
+
14
+ # Install with dev dependencies
15
+ uv pip install -e ".[dev]"
16
+
17
+ # Verify setup
18
+ uv run pytest tests/
19
+ uv run pushtotype test --duration 3
20
+ ```
21
+
22
+ **System dependencies required for manual testing:**
23
+ ```bash
24
+ sudo apt install libportaudio2 xdotool
25
+ sudo usermod -aG input $USER # log out/in after
26
+ ```
27
+
28
+ ---
29
+
30
+ ## Running Tests
31
+
32
+ ```bash
33
+ uv run pytest tests/ # all tests
34
+ uv run pytest tests/ -v # verbose
35
+ uv run pytest tests/ -q # quiet
36
+ ```
37
+
38
+ Tests use mocks for hardware (audio, evdev, GPU) so they run in CI without any devices attached.
39
+
40
+ ---
41
+
42
+ ## Code Style
43
+
44
+ PushToType uses [ruff](https://docs.astral.sh/ruff/) for linting and formatting.
45
+
46
+ ```bash
47
+ uv run ruff check src/ tests/ # lint
48
+ uv run ruff format src/ tests/ # format
49
+ uv run ruff format --check src/ tests/ # check without changing
50
+ ```
51
+
52
+ CI will fail if either check fails. Run both before submitting a PR.
53
+
54
+ ---
55
+
56
+ ## Architecture
57
+
58
+ ```
59
+ src/pushtotype/
60
+ ├── cli.py Entry point — click commands, config wiring, wizard
61
+ ├── daemon.py Main loop — hotkey → record → transcribe → inject
62
+ ├── config.py TOML config loading, saving, validation, defaults
63
+ ├── hotkey.py evdev-based global hotkey listener (async)
64
+ ├── transcriber.py faster-whisper wrapper
65
+ ├── injector.py xdotool type (X11) / wtype (Wayland)
66
+ ├── audio.py sounddevice audio capture
67
+ ├── feedback.py Start/stop/error beep sounds
68
+ └── session.py X11 / Wayland detection
69
+ ```
70
+
71
+ **Data flow:**
72
+ 1. `HotkeyListener` (evdev, async) fires `_on_press` / `_on_release` callbacks
73
+ 2. `_on_release` concatenates recorded audio frames and schedules `_transcribe`
74
+ 3. `_transcribe` runs `Transcriber.transcribe()` in a thread pool executor
75
+ 4. Result is passed to `TextInjector.inject()`, which calls `xdotool type` (X11) or `wtype` (Wayland)
76
+
77
+ ---
78
+
79
+ ## Where Help Is Wanted
80
+
81
+ Check the [issues](https://github.com/danielgraviet/pushtotype/issues) page for `good first issue` labels. Some areas:
82
+
83
+ - **Wayland improvements** — better session detection, testing on more compositors
84
+ - **AMD GPU support** — ROCm / DirectML via ctranslate2
85
+ - **Hotkey UX** — better evdev capture fallback for users not in the `input` group
86
+ - **Tests** — more coverage for daemon and CLI integration paths
87
+
88
+ ---
89
+
90
+ ## Submitting a PR
91
+
92
+ 1. Fork the repo and create a branch from `master`
93
+ 2. Make your changes
94
+ 3. Run `uv run pytest tests/` and `uv run ruff check src/ tests/` — both must pass
95
+ 4. Open a PR with a clear description of what changed and why
96
+
97
+ For larger changes, open an issue first to discuss the approach.
98
+
99
+ ---
100
+
101
+ ## Reporting Bugs
102
+
103
+ Use the [bug report template](.github/ISSUE_TEMPLATE/bug_report.md). Include:
104
+ - OS and display server (X11/Wayland)
105
+ - Python version
106
+ - The startup output block printed by `pushtotype -v`
107
+ - Steps to reproduce
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 danielgraviet
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,260 @@
1
+ Metadata-Version: 2.4
2
+ Name: pushtotype
3
+ Version: 0.1.0
4
+ Summary: Real-time speech-to-text for Linux. Hold a hotkey, speak, release — your words appear wherever your cursor is.
5
+ Project-URL: Homepage, https://github.com/danielgraviet/pushtotype
6
+ Project-URL: Repository, https://github.com/danielgraviet/pushtotype
7
+ Project-URL: Issues, https://github.com/danielgraviet/pushtotype/issues
8
+ Author-email: Daniel Graviet <dtgraviet@gmail.com>
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: linux,push-to-talk,speech-to-text,transcription,voice-typing,whisper
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Environment :: X11 Applications
14
+ Classifier: Intended Audience :: End Users/Desktop
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: POSIX :: Linux
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
22
+ Requires-Python: >=3.10
23
+ Requires-Dist: click>=8.0
24
+ Requires-Dist: evdev>=1.6.0
25
+ Requires-Dist: faster-whisper>=1.0.0
26
+ Requires-Dist: numpy>=1.24
27
+ Requires-Dist: sounddevice>=0.4.6
28
+ Requires-Dist: tomli-w>=1.0
29
+ Requires-Dist: tomli>=2.0; python_version < '3.11'
30
+ Provides-Extra: dev
31
+ Requires-Dist: pytest; extra == 'dev'
32
+ Requires-Dist: pytest-asyncio; extra == 'dev'
33
+ Requires-Dist: ruff; extra == 'dev'
34
+ Description-Content-Type: text/markdown
35
+
36
+ # PushToType
37
+
38
+ > Hold a hotkey, speak, release — your words appear wherever your cursor is.
39
+
40
+ [![PyPI version](https://img.shields.io/pypi/v/pushtotype)](https://pypi.org/project/pushtotype/)
41
+ [![Python](https://img.shields.io/pypi/pyversions/pushtotype)](https://pypi.org/project/pushtotype/)
42
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
43
+ [![CI](https://github.com/danielgraviet/pushtotype/actions/workflows/ci.yml/badge.svg)](https://github.com/danielgraviet/pushtotype/actions)
44
+
45
+ PushToType is a local, real-time speech-to-text tool for Linux. It transcribes your voice using a local Whisper model and types the result directly into whatever application has focus — no clipboard, no cloud, no API keys.
46
+
47
+ An open-source alternative to Wispr Flow, which has no Linux support.
48
+
49
+ ---
50
+
51
+ ## Features
52
+
53
+ - **Works everywhere** — types into any focused app: browsers, editors, terminals, search bars
54
+ - **Local-only** — `faster-whisper` runs on your GPU (CUDA) with automatic CPU fallback
55
+ - **No cloud** — no API keys, no network required after the one-time model download
56
+ - **Fast** — ~250ms from hotkey release to text appearing
57
+ - **Configurable** — TOML config file, interactive setup wizard, CLI flags
58
+ - **Wayland + X11** — works on both display servers via `evdev`
59
+
60
+ ---
61
+
62
+ ## Quick Start
63
+
64
+ ```bash
65
+ # Install
66
+ uv tool install pushtotype # or: pip install pushtotype
67
+
68
+ # System dependencies (X11)
69
+ sudo apt install libportaudio2 xdotool
70
+
71
+ # Add yourself to the input group (required for hotkey detection)
72
+ sudo usermod -aG input $USER
73
+ # Log out and back in for this to take effect
74
+
75
+ # Run the setup wizard
76
+ pushtotype config
77
+
78
+ # Start
79
+ pushtotype
80
+ ```
81
+
82
+ Hold your configured hotkey (default: right Ctrl), speak, release. Text appears at the cursor.
83
+
84
+ ---
85
+
86
+ ## How It Works
87
+
88
+ ```
89
+ [Hold hotkey] → [Record audio] → [Whisper transcription] → [Type into focused app]
90
+ evdev sounddevice faster-whisper xdotool type
91
+ ```
92
+
93
+ PushToType runs as a background daemon. A global hotkey listener (via `evdev`, reading directly from `/dev/input/`) fires a recording callback. When you release the hotkey, the audio is sent to `faster-whisper` for transcription, then `xdotool type` (X11) or `wtype` (Wayland) injects the text into whatever window is focused.
94
+
95
+ ---
96
+
97
+ ## Installation
98
+
99
+ ### Recommended: uv
100
+
101
+ ```bash
102
+ uv tool install pushtotype
103
+ ```
104
+
105
+ ### pip / pipx
106
+
107
+ ```bash
108
+ pip install pushtotype
109
+ # or
110
+ pipx install pushtotype
111
+ ```
112
+
113
+ ### From source
114
+
115
+ ```bash
116
+ git clone https://github.com/danielgraviet/pushtotype.git
117
+ cd pushtotype
118
+ uv pip install -e ".[dev]"
119
+ ```
120
+
121
+ ---
122
+
123
+ ## System Requirements
124
+
125
+ | Requirement | Notes |
126
+ |---|---|
127
+ | Linux | X11 or Wayland |
128
+ | Python 3.10+ | |
129
+ | `libportaudio2` | `sudo apt install libportaudio2` |
130
+ | `xdotool` | X11 only — `sudo apt install xdotool` |
131
+ | `wtype` + `wl-clipboard` | Wayland only — `sudo apt install wtype wl-clipboard` |
132
+ | `input` group | `sudo usermod -aG input $USER` |
133
+ | NVIDIA GPU | Recommended for speed — CPU works but is slower |
134
+
135
+ ---
136
+
137
+ ## Configuration
138
+
139
+ Config file lives at `~/.config/pushtotype/config.toml`. Run `pushtotype config` to create it interactively.
140
+
141
+ ```toml
142
+ [hotkey]
143
+ keys = ["KEY_RIGHTCTRL"]
144
+
145
+ [audio]
146
+ device = "default"
147
+ sample_rate = 16000
148
+
149
+ [model]
150
+ name = "base.en"
151
+ device = "auto"
152
+ compute_type = "float16"
153
+
154
+ [feedback]
155
+ enabled = true
156
+ volume = 0.5
157
+
158
+ [output]
159
+ method = "auto" # "auto", "x11", or "wayland"
160
+ ```
161
+
162
+ ### Config priority (highest to lowest)
163
+
164
+ 1. CLI flags (e.g. `--model small.en`)
165
+ 2. Environment variables (e.g. `PUSHTOTYPE_MODEL=small.en`)
166
+ 3. Config file (`~/.config/pushtotype/config.toml`)
167
+ 4. Built-in defaults
168
+
169
+ ### Environment variables
170
+
171
+ | Variable | Config key |
172
+ |---|---|
173
+ | `PUSHTOTYPE_MODEL` | `model.name` |
174
+ | `PUSHTOTYPE_DEVICE` | `model.device` |
175
+ | `PUSHTOTYPE_AUDIO_DEV` | `audio.device` |
176
+ | `PUSHTOTYPE_FEEDBACK` | `feedback.enabled` |
177
+ | `PUSHTOTYPE_HOTKEY` | `hotkey.keys` (comma-separated) |
178
+
179
+ ---
180
+
181
+ ## CLI Reference
182
+
183
+ ```
184
+ pushtotype Start the push-to-talk daemon
185
+ pushtotype config Run the interactive setup wizard
186
+ pushtotype config --show Print the current effective config
187
+ pushtotype devices List available audio input devices
188
+ pushtotype test Record 5 seconds and transcribe (verify setup)
189
+ pushtotype download [MODEL] Pre-download a Whisper model
190
+ ```
191
+
192
+ **Global flags:**
193
+
194
+ ```
195
+ -v, --verbose Enable debug logging (shows per-step timings)
196
+ -q, --quiet Suppress all output except errors
197
+ --log-file PATH Write logs to a file
198
+ --model NAME Override model (e.g. small.en)
199
+ --hotkey COMBO Override hotkey (e.g. ctrl+shift+s)
200
+ --device INDEX Override audio device index
201
+ --no-feedback Disable start/stop beeps
202
+ ```
203
+
204
+ ---
205
+
206
+ ## Troubleshooting
207
+
208
+ **`Permission denied` on `/dev/input/`**
209
+
210
+ You need to be in the `input` group:
211
+ ```bash
212
+ sudo usermod -aG input $USER
213
+ # Log out and back in
214
+ ```
215
+
216
+ **`xdotool not found`**
217
+ ```bash
218
+ sudo apt install xdotool
219
+ ```
220
+
221
+ **Text doesn't appear in my terminal**
222
+
223
+ Terminals use `Ctrl+Shift+V` to paste, but PushToType uses `xdotool type` which bypasses the clipboard entirely — it should work in all terminals without any special config.
224
+
225
+ **CUDA not available**
226
+
227
+ PushToType automatically falls back to CPU. Transcription will be slower (~1-3s per 5s of audio vs ~0.2s on GPU). Check `pushtotype -v` startup output to see which device is being used.
228
+
229
+ **Model download fails / slow**
230
+
231
+ Models are cached in `~/.cache/huggingface/hub/` after the first download. Pre-download manually:
232
+ ```bash
233
+ pushtotype download base.en
234
+ ```
235
+
236
+ **`wtype` or `wl-copy` not found (Wayland)**
237
+ ```bash
238
+ sudo apt install wtype wl-clipboard
239
+ ```
240
+
241
+ ---
242
+
243
+ ## Known Limitations
244
+
245
+ - English only (`base.en` model)
246
+ - No AMD GPU (ROCm) support
247
+ - Wayland session detection relies on `XDG_SESSION_TYPE` or `WAYLAND_DISPLAY`
248
+ - No GUI — terminal only
249
+
250
+ ---
251
+
252
+ ## Contributing
253
+
254
+ See [CONTRIBUTING.md](CONTRIBUTING.md). Issues and PRs welcome.
255
+
256
+ ---
257
+
258
+ ## License
259
+
260
+ [MIT](LICENSE)