voice-command 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- voice_command-0.1.0/.gitignore +24 -0
- voice_command-0.1.0/LICENSE +21 -0
- voice_command-0.1.0/PKG-INFO +155 -0
- voice_command-0.1.0/README.md +119 -0
- voice_command-0.1.0/pyproject.toml +54 -0
- voice_command-0.1.0/voice_cmd.py +1140 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
.venv/
|
|
3
|
+
__pycache__/
|
|
4
|
+
*.pyc
|
|
5
|
+
*.pyo
|
|
6
|
+
*.egg-info/
|
|
7
|
+
dist/
|
|
8
|
+
build/
|
|
9
|
+
|
|
10
|
+
# OS
|
|
11
|
+
.DS_Store
|
|
12
|
+
|
|
13
|
+
# Test fixtures (large audio files)
|
|
14
|
+
tests/fixtures/*.m4a
|
|
15
|
+
tests/fixtures/*.wav
|
|
16
|
+
tests/fixtures/*.mp3
|
|
17
|
+
|
|
18
|
+
# IDE
|
|
19
|
+
.vscode/
|
|
20
|
+
.idea/
|
|
21
|
+
|
|
22
|
+
# Models (downloaded at runtime)
|
|
23
|
+
*.bin
|
|
24
|
+
*.safetensors
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Denis
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: voice-command
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: VAD-driven streaming voice dictation for macOS with editing commands, tech-term correction, and keyboard output
|
|
5
|
+
Project-URL: Homepage, https://github.com/depoledna/voice-command
|
|
6
|
+
Project-URL: Repository, https://github.com/depoledna/voice-command
|
|
7
|
+
Project-URL: Issues, https://github.com/depoledna/voice-command/issues
|
|
8
|
+
Author: Denis
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: dictation,macos,mlx,speech-to-text,vad,voice,whisper
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: MacOS
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
20
|
+
Requires-Python: >=3.12
|
|
21
|
+
Requires-Dist: faster-whisper>=1.2.1
|
|
22
|
+
Requires-Dist: jiwer>=4.0.0
|
|
23
|
+
Requires-Dist: librosa>=0.11.0
|
|
24
|
+
Requires-Dist: mlx-lm>=0.30.7
|
|
25
|
+
Requires-Dist: mlx-whisper>=0.4.3
|
|
26
|
+
Requires-Dist: noisereduce>=3.0.3
|
|
27
|
+
Requires-Dist: psutil>=7.2.2
|
|
28
|
+
Requires-Dist: pydantic>=2.12.5
|
|
29
|
+
Requires-Dist: pydub>=0.25.1
|
|
30
|
+
Requires-Dist: pynput>=1.8.1
|
|
31
|
+
Requires-Dist: scipy>=1.17.1
|
|
32
|
+
Requires-Dist: silero-vad-lite>=0.2.1
|
|
33
|
+
Requires-Dist: sounddevice>=0.5.5
|
|
34
|
+
Requires-Dist: transformers>=5.2.0
|
|
35
|
+
Description-Content-Type: text/markdown
|
|
36
|
+
|
|
37
|
+
# voice-command
|
|
38
|
+
|
|
39
|
+
VAD-driven streaming voice dictation for macOS (Apple Silicon). Speak into your mic, and text appears in a terminal buffer or gets typed directly into any app.
|
|
40
|
+
|
|
41
|
+
All inference runs locally — Whisper for ASR, Silero for VAD, Qwen for tech-term correction. No cloud APIs.
|
|
42
|
+
|
|
43
|
+
## Requirements
|
|
44
|
+
|
|
45
|
+
- macOS with Apple Silicon
|
|
46
|
+
- Python 3.12+
|
|
47
|
+
- Microphone access (System Settings > Privacy > Microphone)
|
|
48
|
+
- Accessibility access for `--type` mode (System Settings > Privacy > Accessibility)
|
|
49
|
+
|
|
50
|
+
## Install
|
|
51
|
+
|
|
52
|
+
### From PyPI
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install voice-command
|
|
56
|
+
# or
|
|
57
|
+
uv tool install voice-command
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### From source
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
git clone https://github.com/depoledna/voice-command.git
|
|
64
|
+
cd voice-command
|
|
65
|
+
uv sync
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Models download automatically on first run (~300MB for Whisper + ~1GB for Qwen).
|
|
69
|
+
|
|
70
|
+
## Usage
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
# Terminal buffer mode (TUI)
|
|
74
|
+
voice-cmd
|
|
75
|
+
|
|
76
|
+
# Type directly into the focused app
|
|
77
|
+
voice-cmd --type
|
|
78
|
+
|
|
79
|
+
# Transcribe a recording
|
|
80
|
+
voice-cmd --file recording.m4a
|
|
81
|
+
|
|
82
|
+
# List audio devices
|
|
83
|
+
voice-cmd --list-devices
|
|
84
|
+
|
|
85
|
+
# Use a specific device
|
|
86
|
+
voice-cmd --device 1
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
When running from source, use `uv run python voice_cmd.py` instead of `voice-cmd`.
|
|
90
|
+
|
|
91
|
+
## Voice Commands
|
|
92
|
+
|
|
93
|
+
| Command | Action |
|
|
94
|
+
|---------|--------|
|
|
95
|
+
| `period` / `comma` / `question mark` | Insert punctuation |
|
|
96
|
+
| `new line` | Line break |
|
|
97
|
+
| `new paragraph` | Double line break |
|
|
98
|
+
| `scratch that` | Delete last ~5 words |
|
|
99
|
+
| `delete last N words` | Delete last N words |
|
|
100
|
+
| `undo` | Undo last action |
|
|
101
|
+
| `clear all` | Clear buffer |
|
|
102
|
+
| `stop listening` | Pause |
|
|
103
|
+
| `start listening` | Resume |
|
|
104
|
+
| `copy all` | Copy to clipboard |
|
|
105
|
+
| `done` | Copy to clipboard and exit |
|
|
106
|
+
| `show commands` | Show command list |
|
|
107
|
+
|
|
108
|
+
Commands can appear inline with dictated text: "Send the email **period** **new line** Don't forget the attachment" produces two lines with proper punctuation.
|
|
109
|
+
|
|
110
|
+
## Pipeline
|
|
111
|
+
|
|
112
|
+
1. **Audio** - `sounddevice` captures mic input, resampled to 16kHz
|
|
113
|
+
2. **VAD** - Silero VAD with hysteresis detects speech boundaries (32ms frames, pre-roll buffering)
|
|
114
|
+
3. **ASR** - MLX Whisper (small, 8-bit) with dev-vocabulary prompt
|
|
115
|
+
4. **LLM** - Qwen3 1.7B (4-bit) fixes tech terms: "fast api" -> "FastAPI", "type script" -> "TypeScript"
|
|
116
|
+
5. **Commands** - Sentence splitting + leading/trailing command extraction
|
|
117
|
+
6. **Output** - TUI buffer display or keystroke diff-typing via pynput
|
|
118
|
+
|
|
119
|
+
## Type Mode
|
|
120
|
+
|
|
121
|
+
`--type` mode sends keystrokes to the focused app. It detects when its own terminal is focused and skips typing to avoid feedback loops. A 3-second countdown lets you switch to the target app after launching.
|
|
122
|
+
|
|
123
|
+
## Benchmarks
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
# Compare ASR models (requires test fixtures in tests/fixtures/)
|
|
127
|
+
uv run python tests/benchmark.py
|
|
128
|
+
|
|
129
|
+
# Pipeline diagnostics
|
|
130
|
+
uv run python tests/diagnose_pipeline.py
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## Releasing
|
|
134
|
+
|
|
135
|
+
1. Update the version in `pyproject.toml`
|
|
136
|
+
2. Commit: `git commit -am "chore: bump version to X.Y.Z"`
|
|
137
|
+
3. Tag: `git tag vX.Y.Z`
|
|
138
|
+
4. Push: `git push origin main --tags`
|
|
139
|
+
|
|
140
|
+
The GitHub Actions workflow builds and publishes to PyPI automatically via trusted publishers (OIDC).
|
|
141
|
+
|
|
142
|
+
### First-time PyPI setup
|
|
143
|
+
|
|
144
|
+
1. Go to https://pypi.org/manage/account/publishing/
|
|
145
|
+
2. Add a "pending publisher":
|
|
146
|
+
- Package name: `voice-command`
|
|
147
|
+
- Owner: `depoledna`
|
|
148
|
+
- Repository: `voice-command`
|
|
149
|
+
- Workflow: `release.yml`
|
|
150
|
+
- Environment: `pypi`
|
|
151
|
+
3. In the GitHub repo, go to Settings > Environments and create an environment named `pypi`
|
|
152
|
+
|
|
153
|
+
## License
|
|
154
|
+
|
|
155
|
+
MIT
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# voice-command
|
|
2
|
+
|
|
3
|
+
VAD-driven streaming voice dictation for macOS (Apple Silicon). Speak into your mic, and text appears in a terminal buffer or gets typed directly into any app.
|
|
4
|
+
|
|
5
|
+
All inference runs locally — Whisper for ASR, Silero for VAD, Qwen for tech-term correction. No cloud APIs.
|
|
6
|
+
|
|
7
|
+
## Requirements
|
|
8
|
+
|
|
9
|
+
- macOS with Apple Silicon
|
|
10
|
+
- Python 3.12+
|
|
11
|
+
- Microphone access (System Settings > Privacy > Microphone)
|
|
12
|
+
- Accessibility access for `--type` mode (System Settings > Privacy > Accessibility)
|
|
13
|
+
|
|
14
|
+
## Install
|
|
15
|
+
|
|
16
|
+
### From PyPI
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install voice-command
|
|
20
|
+
# or
|
|
21
|
+
uv tool install voice-command
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
### From source
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
git clone https://github.com/depoledna/voice-command.git
|
|
28
|
+
cd voice-command
|
|
29
|
+
uv sync
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
Models download automatically on first run (~300MB for Whisper + ~1GB for Qwen).
|
|
33
|
+
|
|
34
|
+
## Usage
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
# Terminal buffer mode (TUI)
|
|
38
|
+
voice-cmd
|
|
39
|
+
|
|
40
|
+
# Type directly into the focused app
|
|
41
|
+
voice-cmd --type
|
|
42
|
+
|
|
43
|
+
# Transcribe a recording
|
|
44
|
+
voice-cmd --file recording.m4a
|
|
45
|
+
|
|
46
|
+
# List audio devices
|
|
47
|
+
voice-cmd --list-devices
|
|
48
|
+
|
|
49
|
+
# Use a specific device
|
|
50
|
+
voice-cmd --device 1
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
When running from source, use `uv run python voice_cmd.py` instead of `voice-cmd`.
|
|
54
|
+
|
|
55
|
+
## Voice Commands
|
|
56
|
+
|
|
57
|
+
| Command | Action |
|
|
58
|
+
|---------|--------|
|
|
59
|
+
| `period` / `comma` / `question mark` | Insert punctuation |
|
|
60
|
+
| `new line` | Line break |
|
|
61
|
+
| `new paragraph` | Double line break |
|
|
62
|
+
| `scratch that` | Delete last ~5 words |
|
|
63
|
+
| `delete last N words` | Delete last N words |
|
|
64
|
+
| `undo` | Undo last action |
|
|
65
|
+
| `clear all` | Clear buffer |
|
|
66
|
+
| `stop listening` | Pause |
|
|
67
|
+
| `start listening` | Resume |
|
|
68
|
+
| `copy all` | Copy to clipboard |
|
|
69
|
+
| `done` | Copy to clipboard and exit |
|
|
70
|
+
| `show commands` | Show command list |
|
|
71
|
+
|
|
72
|
+
Commands can appear inline with dictated text: "Send the email **period** **new line** Don't forget the attachment" produces two lines with proper punctuation.
|
|
73
|
+
|
|
74
|
+
## Pipeline
|
|
75
|
+
|
|
76
|
+
1. **Audio** - `sounddevice` captures mic input, resampled to 16kHz
|
|
77
|
+
2. **VAD** - Silero VAD with hysteresis detects speech boundaries (32ms frames, pre-roll buffering)
|
|
78
|
+
3. **ASR** - MLX Whisper (small, 8-bit) with dev-vocabulary prompt
|
|
79
|
+
4. **LLM** - Qwen3 1.7B (4-bit) fixes tech terms: "fast api" -> "FastAPI", "type script" -> "TypeScript"
|
|
80
|
+
5. **Commands** - Sentence splitting + leading/trailing command extraction
|
|
81
|
+
6. **Output** - TUI buffer display or keystroke diff-typing via pynput
|
|
82
|
+
|
|
83
|
+
## Type Mode
|
|
84
|
+
|
|
85
|
+
`--type` mode sends keystrokes to the focused app. It detects when its own terminal is focused and skips typing to avoid feedback loops. A 3-second countdown lets you switch to the target app after launching.
|
|
86
|
+
|
|
87
|
+
## Benchmarks
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
# Compare ASR models (requires test fixtures in tests/fixtures/)
|
|
91
|
+
uv run python tests/benchmark.py
|
|
92
|
+
|
|
93
|
+
# Pipeline diagnostics
|
|
94
|
+
uv run python tests/diagnose_pipeline.py
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Releasing
|
|
98
|
+
|
|
99
|
+
1. Update the version in `pyproject.toml`
|
|
100
|
+
2. Commit: `git commit -am "chore: bump version to X.Y.Z"`
|
|
101
|
+
3. Tag: `git tag vX.Y.Z`
|
|
102
|
+
4. Push: `git push origin main --tags`
|
|
103
|
+
|
|
104
|
+
The GitHub Actions workflow builds and publishes to PyPI automatically via trusted publishers (OIDC).
|
|
105
|
+
|
|
106
|
+
### First-time PyPI setup
|
|
107
|
+
|
|
108
|
+
1. Go to https://pypi.org/manage/account/publishing/
|
|
109
|
+
2. Add a "pending publisher":
|
|
110
|
+
- Package name: `voice-command`
|
|
111
|
+
- Owner: `depoledna`
|
|
112
|
+
- Repository: `voice-command`
|
|
113
|
+
- Workflow: `release.yml`
|
|
114
|
+
- Environment: `pypi`
|
|
115
|
+
3. In the GitHub repo, go to Settings > Environments and create an environment named `pypi`
|
|
116
|
+
|
|
117
|
+
## License
|
|
118
|
+
|
|
119
|
+
MIT
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "voice-command"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "VAD-driven streaming voice dictation for macOS with editing commands, tech-term correction, and keyboard output"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.12"
|
|
11
|
+
authors = [{ name = "Denis" }]
|
|
12
|
+
license = "MIT"
|
|
13
|
+
keywords = ["voice", "dictation", "whisper", "mlx", "macos", "speech-to-text", "vad"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Environment :: Console",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Operating System :: MacOS",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
"Programming Language :: Python :: 3.13",
|
|
22
|
+
"Topic :: Multimedia :: Sound/Audio :: Speech",
|
|
23
|
+
]
|
|
24
|
+
dependencies = [
|
|
25
|
+
"faster-whisper>=1.2.1",
|
|
26
|
+
"jiwer>=4.0.0",
|
|
27
|
+
"librosa>=0.11.0",
|
|
28
|
+
"mlx-lm>=0.30.7",
|
|
29
|
+
"mlx-whisper>=0.4.3",
|
|
30
|
+
"noisereduce>=3.0.3",
|
|
31
|
+
"psutil>=7.2.2",
|
|
32
|
+
"pydantic>=2.12.5",
|
|
33
|
+
"pydub>=0.25.1",
|
|
34
|
+
"pynput>=1.8.1",
|
|
35
|
+
"scipy>=1.17.1",
|
|
36
|
+
"silero-vad-lite>=0.2.1",
|
|
37
|
+
"sounddevice>=0.5.5",
|
|
38
|
+
"transformers>=5.2.0",
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
[project.scripts]
|
|
42
|
+
voice-cmd = "voice_cmd:main"
|
|
43
|
+
|
|
44
|
+
[project.urls]
|
|
45
|
+
Homepage = "https://github.com/depoledna/voice-command"
|
|
46
|
+
Repository = "https://github.com/depoledna/voice-command"
|
|
47
|
+
Issues = "https://github.com/depoledna/voice-command/issues"
|
|
48
|
+
|
|
49
|
+
[tool.hatch.build.targets.wheel]
|
|
50
|
+
packages = ["."]
|
|
51
|
+
include = ["voice_cmd.py"]
|
|
52
|
+
|
|
53
|
+
[tool.hatch.build.targets.sdist]
|
|
54
|
+
include = ["voice_cmd.py", "pyproject.toml", "README.md", "LICENSE"]
|