voice-command 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
+ # Python
2
+ .venv/
3
+ __pycache__/
4
+ *.pyc
5
+ *.pyo
6
+ *.egg-info/
7
+ dist/
8
+ build/
9
+
10
+ # OS
11
+ .DS_Store
12
+
13
+ # Test fixtures (large audio files)
14
+ tests/fixtures/*.m4a
15
+ tests/fixtures/*.wav
16
+ tests/fixtures/*.mp3
17
+
18
+ # IDE
19
+ .vscode/
20
+ .idea/
21
+
22
+ # Models (downloaded at runtime)
23
+ *.bin
24
+ *.safetensors
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Denis
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,155 @@
1
+ Metadata-Version: 2.4
2
+ Name: voice-command
3
+ Version: 0.1.0
4
+ Summary: VAD-driven streaming voice dictation for macOS with editing commands, tech-term correction, and keyboard output
5
+ Project-URL: Homepage, https://github.com/depoledna/voice-command
6
+ Project-URL: Repository, https://github.com/depoledna/voice-command
7
+ Project-URL: Issues, https://github.com/depoledna/voice-command/issues
8
+ Author: Denis
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: dictation,macos,mlx,speech-to-text,vad,voice,whisper
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Environment :: Console
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: MacOS
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
20
+ Requires-Python: >=3.12
21
+ Requires-Dist: faster-whisper>=1.2.1
22
+ Requires-Dist: jiwer>=4.0.0
23
+ Requires-Dist: librosa>=0.11.0
24
+ Requires-Dist: mlx-lm>=0.30.7
25
+ Requires-Dist: mlx-whisper>=0.4.3
26
+ Requires-Dist: noisereduce>=3.0.3
27
+ Requires-Dist: psutil>=7.2.2
28
+ Requires-Dist: pydantic>=2.12.5
29
+ Requires-Dist: pydub>=0.25.1
30
+ Requires-Dist: pynput>=1.8.1
31
+ Requires-Dist: scipy>=1.17.1
32
+ Requires-Dist: silero-vad-lite>=0.2.1
33
+ Requires-Dist: sounddevice>=0.5.5
34
+ Requires-Dist: transformers>=5.2.0
35
+ Description-Content-Type: text/markdown
36
+
37
+ # voice-command
38
+
39
+ VAD-driven streaming voice dictation for macOS (Apple Silicon). Speak into your mic and text appears in a terminal buffer or gets typed directly into any app.
40
+
41
+ All inference runs locally — Whisper for ASR, Silero for VAD, Qwen for tech-term correction. No cloud APIs.
42
+
43
+ ## Requirements
44
+
45
+ - macOS with Apple Silicon
46
+ - Python 3.12+
47
+ - Microphone access (System Settings > Privacy > Microphone)
48
+ - Accessibility access for `--type` mode (System Settings > Privacy > Accessibility)
49
+
50
+ ## Install
51
+
52
+ ### From PyPI
53
+
54
+ ```bash
55
+ pip install voice-command
56
+ # or
57
+ uv tool install voice-command
58
+ ```
59
+
60
+ ### From source
61
+
62
+ ```bash
63
+ git clone https://github.com/depoledna/voice-command.git
64
+ cd voice-command
65
+ uv sync
66
+ ```
67
+
68
+ Models download automatically on first run (~300MB for Whisper + ~1GB for Qwen).
69
+
70
+ ## Usage
71
+
72
+ ```bash
73
+ # Terminal buffer mode (TUI)
74
+ voice-cmd
75
+
76
+ # Type directly into the focused app
77
+ voice-cmd --type
78
+
79
+ # Transcribe a recording
80
+ voice-cmd --file recording.m4a
81
+
82
+ # List audio devices
83
+ voice-cmd --list-devices
84
+
85
+ # Use a specific device
86
+ voice-cmd --device 1
87
+ ```
88
+
89
+ When running from source, use `uv run python voice_cmd.py` instead of `voice-cmd`.
90
+
91
+ ## Voice Commands
92
+
93
+ | Command | Action |
94
+ |---------|--------|
95
+ | `period` / `comma` / `question mark` | Insert punctuation |
96
+ | `new line` | Line break |
97
+ | `new paragraph` | Double line break |
98
+ | `scratch that` | Delete last ~5 words |
99
+ | `delete last N words` | Delete last N words |
100
+ | `undo` | Undo last action |
101
+ | `clear all` | Clear buffer |
102
+ | `stop listening` | Pause |
103
+ | `start listening` | Resume |
104
+ | `copy all` | Copy to clipboard |
105
+ | `done` | Copy to clipboard and exit |
106
+ | `show commands` | Show command list |
107
+
108
+ Commands can appear inline with dictated text: "Send the email **period** **new line** Don't forget the attachment" produces two lines with proper punctuation.
109
+
110
+ ## Pipeline
111
+
112
+ 1. **Audio** - `sounddevice` captures mic input, resampled to 16kHz
113
+ 2. **VAD** - Silero VAD with hysteresis detects speech boundaries (32ms frames, pre-roll buffering)
114
+ 3. **ASR** - MLX Whisper (small, 8-bit) with dev-vocabulary prompt
115
+ 4. **LLM** - Qwen3 1.7B (4-bit) fixes tech terms: "fast api" -> "FastAPI", "type script" -> "TypeScript"
116
+ 5. **Commands** - Sentence splitting + leading/trailing command extraction
117
+ 6. **Output** - TUI buffer display or keystroke diff-typing via pynput
118
+
119
+ ## Type Mode
120
+
121
+ `--type` mode sends keystrokes to the focused app. It detects when its own terminal is focused and skips typing to avoid feedback loops. A 3-second countdown lets you switch to the target app after launching.
122
+
123
+ ## Benchmarks
124
+
125
+ ```bash
126
+ # Compare ASR models (requires test fixtures in tests/fixtures/)
127
+ uv run python tests/benchmark.py
128
+
129
+ # Pipeline diagnostics
130
+ uv run python tests/diagnose_pipeline.py
131
+ ```
132
+
133
+ ## Releasing
134
+
135
+ 1. Update the version in `pyproject.toml`
136
+ 2. Commit: `git commit -am "chore: bump version to X.Y.Z"`
137
+ 3. Tag: `git tag vX.Y.Z`
138
+ 4. Push: `git push origin main --tags`
139
+
140
+ The GitHub Actions workflow builds and publishes to PyPI automatically via trusted publishers (OIDC).
141
+
142
+ ### First-time PyPI setup
143
+
144
+ 1. Go to https://pypi.org/manage/account/publishing/
145
+ 2. Add a "pending publisher":
146
+ - Package name: `voice-command`
147
+ - Owner: `depoledna`
148
+ - Repository: `voice-command`
149
+ - Workflow: `release.yml`
150
+ - Environment: `pypi`
151
+ 3. In the GitHub repo, go to Settings > Environments > create `pypi`
152
+
153
+ ## License
154
+
155
+ MIT
@@ -0,0 +1,119 @@
1
+ # voice-command
2
+
3
+ VAD-driven streaming voice dictation for macOS (Apple Silicon). Speak into your mic and text appears in a terminal buffer or gets typed directly into any app.
4
+
5
+ All inference runs locally — Whisper for ASR, Silero for VAD, Qwen for tech-term correction. No cloud APIs.
6
+
7
+ ## Requirements
8
+
9
+ - macOS with Apple Silicon
10
+ - Python 3.12+
11
+ - Microphone access (System Settings > Privacy > Microphone)
12
+ - Accessibility access for `--type` mode (System Settings > Privacy > Accessibility)
13
+
14
+ ## Install
15
+
16
+ ### From PyPI
17
+
18
+ ```bash
19
+ pip install voice-command
20
+ # or
21
+ uv tool install voice-command
22
+ ```
23
+
24
+ ### From source
25
+
26
+ ```bash
27
+ git clone https://github.com/depoledna/voice-command.git
28
+ cd voice-command
29
+ uv sync
30
+ ```
31
+
32
+ Models download automatically on first run (~300MB for Whisper + ~1GB for Qwen).
33
+
34
+ ## Usage
35
+
36
+ ```bash
37
+ # Terminal buffer mode (TUI)
38
+ voice-cmd
39
+
40
+ # Type directly into the focused app
41
+ voice-cmd --type
42
+
43
+ # Transcribe a recording
44
+ voice-cmd --file recording.m4a
45
+
46
+ # List audio devices
47
+ voice-cmd --list-devices
48
+
49
+ # Use a specific device
50
+ voice-cmd --device 1
51
+ ```
52
+
53
+ When running from source, use `uv run python voice_cmd.py` instead of `voice-cmd`.
54
+
55
+ ## Voice Commands
56
+
57
+ | Command | Action |
58
+ |---------|--------|
59
+ | `period` / `comma` / `question mark` | Insert punctuation |
60
+ | `new line` | Line break |
61
+ | `new paragraph` | Double line break |
62
+ | `scratch that` | Delete last ~5 words |
63
+ | `delete last N words` | Delete last N words |
64
+ | `undo` | Undo last action |
65
+ | `clear all` | Clear buffer |
66
+ | `stop listening` | Pause |
67
+ | `start listening` | Resume |
68
+ | `copy all` | Copy to clipboard |
69
+ | `done` | Copy to clipboard and exit |
70
+ | `show commands` | Show command list |
71
+
72
+ Commands can appear inline with dictated text: "Send the email **period** **new line** Don't forget the attachment" produces two lines with proper punctuation.
73
+
74
+ ## Pipeline
75
+
76
+ 1. **Audio** - `sounddevice` captures mic input, resampled to 16kHz
77
+ 2. **VAD** - Silero VAD with hysteresis detects speech boundaries (32ms frames, pre-roll buffering)
78
+ 3. **ASR** - MLX Whisper (small, 8-bit) with dev-vocabulary prompt
79
+ 4. **LLM** - Qwen3 1.7B (4-bit) fixes tech terms: "fast api" -> "FastAPI", "type script" -> "TypeScript"
80
+ 5. **Commands** - Sentence splitting + leading/trailing command extraction
81
+ 6. **Output** - TUI buffer display or keystroke diff-typing via pynput
82
+
83
+ ## Type Mode
84
+
85
+ `--type` mode sends keystrokes to the focused app. It detects when its own terminal is focused and skips typing to avoid feedback loops. A 3-second countdown lets you switch to the target app after launching.
86
+
87
+ ## Benchmarks
88
+
89
+ ```bash
90
+ # Compare ASR models (requires test fixtures in tests/fixtures/)
91
+ uv run python tests/benchmark.py
92
+
93
+ # Pipeline diagnostics
94
+ uv run python tests/diagnose_pipeline.py
95
+ ```
96
+
97
+ ## Releasing
98
+
99
+ 1. Update the version in `pyproject.toml`
100
+ 2. Commit: `git commit -am "chore: bump version to X.Y.Z"`
101
+ 3. Tag: `git tag vX.Y.Z`
102
+ 4. Push: `git push origin main --tags`
103
+
104
+ The GitHub Actions workflow builds and publishes to PyPI automatically via trusted publishers (OIDC).
105
+
106
+ ### First-time PyPI setup
107
+
108
+ 1. Go to https://pypi.org/manage/account/publishing/
109
+ 2. Add a "pending publisher":
110
+ - Package name: `voice-command`
111
+ - Owner: `depoledna`
112
+ - Repository: `voice-command`
113
+ - Workflow: `release.yml`
114
+ - Environment: `pypi`
115
+ 3. In the GitHub repo, go to Settings > Environments > create `pypi`
116
+
117
+ ## License
118
+
119
+ MIT
@@ -0,0 +1,54 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "voice-command"
7
+ version = "0.1.0"
8
+ description = "VAD-driven streaming voice dictation for macOS with editing commands, tech-term correction, and keyboard output"
9
+ readme = "README.md"
10
+ requires-python = ">=3.12"
11
+ authors = [{ name = "Denis" }]
12
+ license = "MIT"
13
+ keywords = ["voice", "dictation", "whisper", "mlx", "macos", "speech-to-text", "vad"]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Environment :: Console",
17
+ "Intended Audience :: Developers",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Operating System :: MacOS",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Programming Language :: Python :: 3.13",
22
+ "Topic :: Multimedia :: Sound/Audio :: Speech",
23
+ ]
24
+ dependencies = [
25
+ "faster-whisper>=1.2.1",
26
+ "jiwer>=4.0.0",
27
+ "librosa>=0.11.0",
28
+ "mlx-lm>=0.30.7",
29
+ "mlx-whisper>=0.4.3",
30
+ "noisereduce>=3.0.3",
31
+ "psutil>=7.2.2",
32
+ "pydantic>=2.12.5",
33
+ "pydub>=0.25.1",
34
+ "pynput>=1.8.1",
35
+ "scipy>=1.17.1",
36
+ "silero-vad-lite>=0.2.1",
37
+ "sounddevice>=0.5.5",
38
+ "transformers>=5.2.0",
39
+ ]
40
+
41
+ [project.scripts]
42
+ voice-cmd = "voice_cmd:main"
43
+
44
+ [project.urls]
45
+ Homepage = "https://github.com/depoledna/voice-command"
46
+ Repository = "https://github.com/depoledna/voice-command"
47
+ Issues = "https://github.com/depoledna/voice-command/issues"
48
+
49
+ [tool.hatch.build.targets.wheel]
50
+ packages = ["."]
51
+ include = ["voice_cmd.py"]
52
+
53
+ [tool.hatch.build.targets.sdist]
54
+ include = ["voice_cmd.py", "pyproject.toml", "README.md", "LICENSE"]