vtype 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vtype-0.1.0/.gitignore +98 -0
- vtype-0.1.0/.vscode/launch.json +13 -0
- vtype-0.1.0/.vscode/tasks.json +16 -0
- vtype-0.1.0/.vscodeignore +10 -0
- vtype-0.1.0/DESIGN.md +101 -0
- vtype-0.1.0/LICENSE +21 -0
- vtype-0.1.0/PKG-INFO +74 -0
- vtype-0.1.0/README.md +77 -0
- vtype-0.1.0/README_PYPI.md +55 -0
- vtype-0.1.0/images/icon.png +0 -0
- vtype-0.1.0/package.json +63 -0
- vtype-0.1.0/pyproject.toml +31 -0
- vtype-0.1.0/requirements.txt +5 -0
- vtype-0.1.0/src/extension.ts +135 -0
- vtype-0.1.0/tsconfig.json +12 -0
- vtype-0.1.0/voice_backend.py +172 -0
- vtype-0.1.0/voice_terminal.py +168 -0
- vtype-0.1.0/vtype/__init__.py +3 -0
- vtype-0.1.0/vtype/__main__.py +174 -0
vtype-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# Voice Claude Code - Git Ignore
|
|
2
|
+
|
|
3
|
+
# Python
|
|
4
|
+
__pycache__/
|
|
5
|
+
*.py[cod]
|
|
6
|
+
*$py.class
|
|
7
|
+
*.so
|
|
8
|
+
.Python
|
|
9
|
+
build/
|
|
10
|
+
develop-eggs/
|
|
11
|
+
dist/
|
|
12
|
+
downloads/
|
|
13
|
+
eggs/
|
|
14
|
+
.eggs/
|
|
15
|
+
lib/
|
|
16
|
+
lib64/
|
|
17
|
+
parts/
|
|
18
|
+
sdist/
|
|
19
|
+
var/
|
|
20
|
+
wheels/
|
|
21
|
+
*.egg-info/
|
|
22
|
+
.installed.cfg
|
|
23
|
+
*.egg
|
|
24
|
+
MANIFEST
|
|
25
|
+
|
|
26
|
+
# Virtual Environments
|
|
27
|
+
venv/
|
|
28
|
+
env/
|
|
29
|
+
ENV/
|
|
30
|
+
env.bak/
|
|
31
|
+
venv.bak/
|
|
32
|
+
|
|
33
|
+
# IDE
|
|
34
|
+
.idea/
|
|
35
|
+
# Keep .vscode/launch.json and tasks.json for dev workflow
|
|
36
|
+
.vscode/settings.json
|
|
37
|
+
.vscode/*.code-workspace
|
|
38
|
+
*.swp
|
|
39
|
+
*.swo
|
|
40
|
+
*~
|
|
41
|
+
.DS_Store
|
|
42
|
+
|
|
43
|
+
# Configuration (contains API keys - NEVER COMMIT)
|
|
44
|
+
.env
|
|
45
|
+
.env.local
|
|
46
|
+
|
|
47
|
+
# Whisper Models (large files)
|
|
48
|
+
models/*.bin
|
|
49
|
+
models/*.pt
|
|
50
|
+
*.bin
|
|
51
|
+
*.pt
|
|
52
|
+
|
|
53
|
+
# Audio files (temporary recordings)
|
|
54
|
+
*.wav
|
|
55
|
+
*.mp3
|
|
56
|
+
*.ogg
|
|
57
|
+
recordings/
|
|
58
|
+
temp_audio/
|
|
59
|
+
|
|
60
|
+
# Logs
|
|
61
|
+
*.log
|
|
62
|
+
logs/
|
|
63
|
+
*.log.*
|
|
64
|
+
|
|
65
|
+
# Testing
|
|
66
|
+
.pytest_cache/
|
|
67
|
+
.coverage
|
|
68
|
+
htmlcov/
|
|
69
|
+
.tox/
|
|
70
|
+
|
|
71
|
+
# Node / VSCode extension build artifacts
|
|
72
|
+
node_modules/
|
|
73
|
+
package-lock.json
|
|
74
|
+
out/
|
|
75
|
+
*.vsix
|
|
76
|
+
|
|
77
|
+
# OS
|
|
78
|
+
Thumbs.db
|
|
79
|
+
.DS_Store
|
|
80
|
+
|
|
81
|
+
# Backup files
|
|
82
|
+
*.bak
|
|
83
|
+
*.backup
|
|
84
|
+
*~
|
|
85
|
+
|
|
86
|
+
# Session data (if storing locally)
|
|
87
|
+
sessions/
|
|
88
|
+
*.session
|
|
89
|
+
|
|
90
|
+
# Claude Code project settings
|
|
91
|
+
.claude/
|
|
92
|
+
|
|
93
|
+
# Azure Speech SDK cache
|
|
94
|
+
.azure/
|
|
95
|
+
|
|
96
|
+
# Porcupine models/keys
|
|
97
|
+
*.ppn
|
|
98
|
+
porcupine_key.txt
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": "0.2.0",
|
|
3
|
+
"configurations": [
|
|
4
|
+
{
|
|
5
|
+
"name": "Run Extension",
|
|
6
|
+
"type": "extensionHost",
|
|
7
|
+
"request": "launch",
|
|
8
|
+
"args": ["--extensionDevelopmentPath=${workspaceFolder}"],
|
|
9
|
+
"outFiles": ["${workspaceFolder}/out/**/*.js"],
|
|
10
|
+
"preLaunchTask": "${defaultBuildTask}"
|
|
11
|
+
}
|
|
12
|
+
]
|
|
13
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": "2.0.0",
|
|
3
|
+
"tasks": [
|
|
4
|
+
{
|
|
5
|
+
"label": "npm: compile",
|
|
6
|
+
"type": "npm",
|
|
7
|
+
"script": "compile",
|
|
8
|
+
"group": {
|
|
9
|
+
"kind": "build",
|
|
10
|
+
"isDefault": true
|
|
11
|
+
},
|
|
12
|
+
"problemMatcher": ["$tsc"],
|
|
13
|
+
"detail": "Compile TypeScript extension"
|
|
14
|
+
}
|
|
15
|
+
]
|
|
16
|
+
}
|
vtype-0.1.0/DESIGN.md
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# Voice Claude Code — Design
|
|
2
|
+
|
|
3
|
+
## What it is
|
|
4
|
+
|
|
5
|
+
A VSCode extension that adds voice input to Claude Code. The user presses a keyboard shortcut, speaks, and the transcribed text is pasted into the Claude Code chat input. No webview, no browser, no server — just a status bar button and a Python backend.
|
|
6
|
+
|
|
7
|
+
## Architecture
|
|
8
|
+
|
|
9
|
+
```
|
|
10
|
+
┌─────────────────────────────────────────┐
|
|
11
|
+
│ VSCode Extension (extension.ts) │
|
|
12
|
+
│ ─ Status bar: $(mic) Voice │
|
|
13
|
+
│ ─ Keybinding: Ctrl+Alt+V │
|
|
14
|
+
│ ─ Spawns voice_backend.py via stdin │
|
|
15
|
+
│ ─ IPC: line-delimited text protocol │
|
|
16
|
+
└──────────────┬──────────────────────────┘
|
|
17
|
+
│ stdin/stdout
|
|
18
|
+
▼
|
|
19
|
+
┌─────────────────────────────────────────┐
|
|
20
|
+
│ Python Backend (voice_backend.py) │
|
|
21
|
+
│ │
|
|
22
|
+
│ sounddevice (RMS VAD) │
|
|
23
|
+
│ ↓ speech detected │
|
|
24
|
+
│ openai-whisper (base model) │
|
|
25
|
+
│ ↓ transcribed text │
|
|
26
|
+
│ pyperclip → clipboard │
|
|
27
|
+
│ ↓ PASTE: signal to extension │
|
|
28
|
+
│ extension calls claude-vscode.focus │
|
|
29
|
+
│ ↓ DO_PASTE signal back (150ms later) │
|
|
30
|
+
│ pynput → Ctrl+V at OS level │
|
|
31
|
+
└─────────────────────────────────────────┘
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## IPC Protocol
|
|
35
|
+
|
|
36
|
+
**Extension → Backend (stdin):**
|
|
37
|
+
| Command | Meaning |
|
|
38
|
+
|---|---|
|
|
39
|
+
| `START` | Begin listening |
|
|
40
|
+
| `STOP` | Stop listening, go idle |
|
|
41
|
+
| `DO_PASTE` | Focus confirmed, paste now |
|
|
42
|
+
| `QUIT` | Shut down |
|
|
43
|
+
|
|
44
|
+
**Backend → Extension (stdout):**
|
|
45
|
+
| Message | Meaning |
|
|
46
|
+
|---|---|
|
|
47
|
+
| `READY` | Whisper model loaded |
|
|
48
|
+
| `LISTENING` | Waiting for speech |
|
|
49
|
+
| `SPEAKING` | Recording |
|
|
50
|
+
| `TRANSCRIBING` | Whisper processing |
|
|
51
|
+
| `PASTE:<text>` | Text ready, requesting focus |
|
|
52
|
+
| `PASTED:<text>` | Paste complete |
|
|
53
|
+
| `IDLE` | Stopped |
|
|
54
|
+
| `ERROR:<msg>` | Something went wrong |
|
|
55
|
+
|
|
56
|
+
## Focus management
|
|
57
|
+
|
|
58
|
+
The tricky part: pynput simulates Ctrl+V at the OS level, so the correct window must have focus at the exact moment of paste. If the user opened a file editor while speaking, a naive paste would land in the editor.
|
|
59
|
+
|
|
60
|
+
**Solution — round-trip before paste:**
|
|
61
|
+
1. Backend copies text to clipboard, emits `PASTE:<text>`, then blocks on a threading.Event
|
|
62
|
+
2. Extension receives `PASTE:`, calls `claude-vscode.focus` (the same command as `Ctrl+Esc`)
|
|
63
|
+
3. After 150ms (enough for webview focus to propagate), extension sends `DO_PASTE`
|
|
64
|
+
4. Backend unblocks, runs pynput Ctrl+V
|
|
65
|
+
|
|
66
|
+
This ensures the Claude Code input always has OS-level focus at paste time, regardless of what the user did while speaking. Timeout of 2s prevents blocking indefinitely if the extension crashes.
|
|
67
|
+
|
|
68
|
+
## Key decisions
|
|
69
|
+
|
|
70
|
+
**No LLM cleanup layer** — an earlier iteration piped Whisper output through a local qwen2.5:1.5b model (via Ollama) to strip filler words and resolve self-corrections. Removed because Claude Code's AI is more than capable of understanding natural speech patterns, and the cleanup added ~2.8s latency per utterance with no real benefit.
|
|
71
|
+
|
|
72
|
+
**No TTS** — not needed. The user reads Claude's responses normally.
|
|
73
|
+
|
|
74
|
+
**No wake word** — user activates manually via Ctrl+Alt+V. Simpler, no always-on mic.
|
|
75
|
+
|
|
76
|
+
**No webview** — earlier design used a browser-based UI with WebSockets. Scrapped in favour of a status bar item that works with whatever text area has focus.
|
|
77
|
+
|
|
78
|
+
**openai-whisper not whisper.cpp** — Python bindings are easier to integrate. `base` model gives acceptable accuracy and speed on CPU.
|
|
79
|
+
|
|
80
|
+
**pynput for paste** — OS-level Ctrl+V simulation works with any input, including VSCode webview panels that don't support the VSCode clipboard paste command.
|
|
81
|
+
|
|
82
|
+
## VAD parameters
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
SPEECH_RMS = 0.015 # RMS threshold to start recording
|
|
86
|
+
SILENCE_RMS = 0.010 # RMS threshold to detect silence
|
|
87
|
+
SILENCE_SECS = 1.2 # Seconds of silence to end utterance
|
|
88
|
+
MIN_SPEECH = 0.4 # Minimum speech duration (seconds)
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## File structure
|
|
92
|
+
|
|
93
|
+
```
|
|
94
|
+
voice-claude-code/
|
|
95
|
+
├── src/extension.ts — VSCode extension (TypeScript)
|
|
96
|
+
├── voice_backend.py — Python: VAD + Whisper + paste
|
|
97
|
+
├── package.json — Extension manifest, keybindings, settings
|
|
98
|
+
├── tsconfig.json — TypeScript config
|
|
99
|
+
├── requirements.txt — Python dependencies
|
|
100
|
+
└── out/extension.js — Compiled output (gitignored)
|
|
101
|
+
```
|
vtype-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Voice Claude Code
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
vtype-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vtype
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Offline voice input — speak into any focused window, no cloud required
|
|
5
|
+
License: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Keywords: dictation,offline,speech,voice,voice-input,whisper
|
|
8
|
+
Classifier: Environment :: Console
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Topic :: Utilities
|
|
13
|
+
Requires-Python: >=3.9
|
|
14
|
+
Requires-Dist: numpy
|
|
15
|
+
Requires-Dist: openai-whisper
|
|
16
|
+
Requires-Dist: pynput
|
|
17
|
+
Requires-Dist: sounddevice
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
# vtype
|
|
21
|
+
|
|
22
|
+
**Offline voice input for any focused window.** Press `Ctrl+Alt+V`, speak, and your words are typed instantly — no cloud, no API key, no subscription.
|
|
23
|
+
|
|
24
|
+
Works in Claude Code terminal, VS Code, any text editor, browser, chat app — whatever window is focused.
|
|
25
|
+
|
|
26
|
+
---
|
|
27
|
+
|
|
28
|
+
## Install
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install vtype
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
> **First run** downloads the Whisper speech model (~150 MB). Subsequent runs reuse the cached model and need no internet connection.
|
|
35
|
+
|
|
36
|
+
## Usage
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
vtype
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
- Press `Ctrl+Alt+V` — starts listening
|
|
43
|
+
- Speak naturally
|
|
44
|
+
- Silence for ~1 second → transcribes and types into the focused window
|
|
45
|
+
- Press `Ctrl+Alt+V` again to cancel
|
|
46
|
+
- `Ctrl+C` to quit
|
|
47
|
+
|
|
48
|
+
## Requirements
|
|
49
|
+
|
|
50
|
+
- Python 3.9+
|
|
51
|
+
- A microphone
|
|
52
|
+
- FFmpeg (for audio processing)
|
|
53
|
+
|
|
54
|
+
**FFmpeg install:**
|
|
55
|
+
|
|
56
|
+
| Platform | Command |
|
|
57
|
+
|----------|---------|
|
|
58
|
+
| Windows | `winget install ffmpeg` |
|
|
59
|
+
| macOS | `brew install ffmpeg` |
|
|
60
|
+
| Linux | `sudo apt install ffmpeg` |
|
|
61
|
+
|
|
62
|
+
## How it works
|
|
63
|
+
|
|
64
|
+
- Speech detection runs locally using RMS-based VAD (no network calls)
|
|
65
|
+
- Transcription uses [OpenAI Whisper](https://github.com/openai/whisper) (base model, runs on CPU)
|
|
66
|
+
- Text is typed via OS-level keyboard simulation — works in any window including terminals and browser tabs
|
|
67
|
+
|
|
68
|
+
## Privacy
|
|
69
|
+
|
|
70
|
+
Everything runs on your machine. Audio never leaves your device.
|
|
71
|
+
|
|
72
|
+
## License
|
|
73
|
+
|
|
74
|
+
MIT
|
vtype-0.1.0/README.md
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# Claude Voice
|
|
2
|
+
|
|
3
|
+
**Talk to Claude instead of typing.** Press `Ctrl+Alt+V`, speak your prompt, and it appears in Claude Code's chat input — ready to send.
|
|
4
|
+
|
|
5
|
+
**No API key. No cloud. No subscription. Runs 100% offline on your machine.**
|
|
6
|
+
|
|
7
|
+

|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## Why Claude Voice?
|
|
12
|
+
|
|
13
|
+
- **No API key required** — just install and speak
|
|
14
|
+
- **Your voice never leaves your computer** — speech is processed locally on your machine
|
|
15
|
+
- **Works offline** — no internet connection needed after the initial model download
|
|
16
|
+
- **Free to use** — no per-request costs, no subscriptions
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## Setup
|
|
21
|
+
|
|
22
|
+
**1. Install Python dependencies**
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install openai-whisper sounddevice pynput pyperclip numpy
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
> The first use downloads a ~140 MB speech recognition model. After that, everything runs fully offline with no internet connection required.
|
|
29
|
+
|
|
30
|
+
**2. Reload VS Code**
|
|
31
|
+
|
|
32
|
+
`Ctrl+Shift+P` → **Developer: Reload Window**
|
|
33
|
+
|
|
34
|
+
That's it. Look for the **🎤 Voice** button in your status bar.
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## How to use
|
|
39
|
+
|
|
40
|
+
1. Press `Ctrl+Alt+V` (Windows/Linux) or `Cmd+Alt+V` (Mac) — or click **🎤 Voice** in the status bar
|
|
41
|
+
2. Speak your prompt naturally
|
|
42
|
+
3. Pause for ~1 second when done — your words appear in Claude Code's chat input
|
|
43
|
+
4. Press `Enter` to send
|
|
44
|
+
|
|
45
|
+
Press `Ctrl+Alt+V` again at any time to cancel.
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## Requirements
|
|
50
|
+
|
|
51
|
+
- [Claude Code](https://marketplace.visualstudio.com/items?itemName=Anthropic.claude-code) extension installed
|
|
52
|
+
- Python 3.9 or higher
|
|
53
|
+
- A microphone
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## Settings
|
|
58
|
+
|
|
59
|
+
| Setting | Default | Description |
|
|
60
|
+
|---|---|---|
|
|
61
|
+
| `claude-voice.focusTarget` | `claude` | `claude` = always focus Claude Code input before pasting. `none` = paste wherever focus already is (works with any input field). |
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
## Troubleshooting
|
|
66
|
+
|
|
67
|
+
**"Missing Python dependencies"** — Run `pip install openai-whisper sounddevice pynput pyperclip numpy` in your terminal.
|
|
68
|
+
|
|
69
|
+
**Nothing appears after speaking** — Make sure the Claude Code panel is open. The extension automatically focuses the Claude Code input before pasting.
|
|
70
|
+
|
|
71
|
+
**Text goes to the wrong app** — The extension checks that VS Code is the active window before pasting. If you switch away while it's transcribing, the paste is safely discarded.
|
|
72
|
+
|
|
73
|
+
**macOS** — `pynput` requires Accessibility permissions: System Settings → Privacy & Security → Accessibility → enable VS Code.
|
|
74
|
+
|
|
75
|
+
**Linux** — Requires `xdotool`: `sudo apt install xdotool`
|
|
76
|
+
|
|
77
|
+
**Slow first transcription** — Whisper loads on first use (~5s). Subsequent transcriptions are faster.
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# vtype
|
|
2
|
+
|
|
3
|
+
**Offline voice input for any focused window.** Press `Ctrl+Alt+V`, speak, and your words are typed instantly — no cloud, no API key, no subscription.
|
|
4
|
+
|
|
5
|
+
Works in Claude Code terminal, VS Code, any text editor, browser, chat app — whatever window is focused.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Install
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install vtype
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
> **First run** downloads the Whisper speech model (~150 MB). Subsequent runs reuse the cached model and need no internet connection.
|
|
16
|
+
|
|
17
|
+
## Usage
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
vtype
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
- Press `Ctrl+Alt+V` — starts listening
|
|
24
|
+
- Speak naturally
|
|
25
|
+
- Silence for ~1 second → transcribes and types into the focused window
|
|
26
|
+
- Press `Ctrl+Alt+V` again to cancel
|
|
27
|
+
- `Ctrl+C` to quit
|
|
28
|
+
|
|
29
|
+
## Requirements
|
|
30
|
+
|
|
31
|
+
- Python 3.9+
|
|
32
|
+
- A microphone
|
|
33
|
+
- FFmpeg (for audio processing)
|
|
34
|
+
|
|
35
|
+
**FFmpeg install:**
|
|
36
|
+
|
|
37
|
+
| Platform | Command |
|
|
38
|
+
|----------|---------|
|
|
39
|
+
| Windows | `winget install ffmpeg` |
|
|
40
|
+
| macOS | `brew install ffmpeg` |
|
|
41
|
+
| Linux | `sudo apt install ffmpeg` |
|
|
42
|
+
|
|
43
|
+
## How it works
|
|
44
|
+
|
|
45
|
+
- Speech detection runs locally using RMS-based VAD (no network calls)
|
|
46
|
+
- Transcription uses [OpenAI Whisper](https://github.com/openai/whisper) (base model, runs on CPU)
|
|
47
|
+
- Text is typed via OS-level keyboard simulation — works in any window including terminals and browser tabs
|
|
48
|
+
|
|
49
|
+
## Privacy
|
|
50
|
+
|
|
51
|
+
Everything runs on your machine. Audio never leaves your device.
|
|
52
|
+
|
|
53
|
+
## License
|
|
54
|
+
|
|
55
|
+
MIT
|
|
Binary file
|
vtype-0.1.0/package.json
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "claude-voice",
|
|
3
|
+
"displayName": "Claude Voice",
|
|
4
|
+
"description": "Talk to Claude instead of typing. Press a shortcut, speak your prompt, and it appears in Claude Code ready to send.",
|
|
5
|
+
"version": "0.1.4",
|
|
6
|
+
"publisher": "jsaluja",
|
|
7
|
+
"license": "MIT",
|
|
8
|
+
"repository": {
|
|
9
|
+
"type": "git",
|
|
10
|
+
"url": "https://github.com/jsaluja/claude-voice"
|
|
11
|
+
},
|
|
12
|
+
"keywords": ["claude", "voice", "speech", "whisper", "ai", "assistant", "dictation"],
|
|
13
|
+
"engines": {
|
|
14
|
+
"vscode": "^1.85.0"
|
|
15
|
+
},
|
|
16
|
+
"icon": "images/icon.png",
|
|
17
|
+
"galleryBanner": {
|
|
18
|
+
"color": "#18181f",
|
|
19
|
+
"theme": "dark"
|
|
20
|
+
},
|
|
21
|
+
"categories": ["AI", "Other"],
|
|
22
|
+
"activationEvents": ["onStartupFinished"],
|
|
23
|
+
"main": "./out/extension.js",
|
|
24
|
+
"contributes": {
|
|
25
|
+
"commands": [
|
|
26
|
+
{
|
|
27
|
+
"command": "claude-voice.toggle",
|
|
28
|
+
"title": "Toggle Voice Input",
|
|
29
|
+
"category": "Claude Voice"
|
|
30
|
+
}
|
|
31
|
+
],
|
|
32
|
+
"configuration": {
|
|
33
|
+
"title": "Claude Voice",
|
|
34
|
+
"properties": {
|
|
35
|
+
"claude-voice.focusTarget": {
|
|
36
|
+
"type": "string",
|
|
37
|
+
"enum": ["claude", "none"],
|
|
38
|
+
"default": "claude",
|
|
39
|
+
"description": "Where to focus before pasting. 'claude' = always focus Claude Code input. 'none' = paste wherever focus already is."
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
},
|
|
43
|
+
"keybindings": [
|
|
44
|
+
{
|
|
45
|
+
"command": "claude-voice.toggle",
|
|
46
|
+
"key": "ctrl+alt+v",
|
|
47
|
+
"mac": "cmd+alt+v"
|
|
48
|
+
}
|
|
49
|
+
]
|
|
50
|
+
},
|
|
51
|
+
"scripts": {
|
|
52
|
+
"compile": "tsc -p ./",
|
|
53
|
+
"watch": "tsc -watch -p ./",
|
|
54
|
+
"vscode:prepublish": "npm run compile",
|
|
55
|
+
"package": "vsce package"
|
|
56
|
+
},
|
|
57
|
+
"devDependencies": {
|
|
58
|
+
"@types/vscode": "^1.85.0",
|
|
59
|
+
"@types/node": "^20.0.0",
|
|
60
|
+
"typescript": "^5.3.0",
|
|
61
|
+
"@vscode/vsce": "^2.22.0"
|
|
62
|
+
}
|
|
63
|
+
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "vtype"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Offline voice input — speak into any focused window, no cloud required"
|
|
9
|
+
readme = "README_PYPI.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
keywords = ["voice", "speech", "whisper", "dictation", "offline", "voice-input"]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Programming Language :: Python :: 3",
|
|
15
|
+
"License :: OSI Approved :: MIT License",
|
|
16
|
+
"Operating System :: OS Independent",
|
|
17
|
+
"Topic :: Utilities",
|
|
18
|
+
"Environment :: Console",
|
|
19
|
+
]
|
|
20
|
+
dependencies = [
|
|
21
|
+
"openai-whisper",
|
|
22
|
+
"sounddevice",
|
|
23
|
+
"pynput",
|
|
24
|
+
"numpy",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.scripts]
|
|
28
|
+
vtype = "vtype.__main__:main"
|
|
29
|
+
|
|
30
|
+
[tool.hatch.build.targets.wheel]
|
|
31
|
+
packages = ["vtype"]
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
import * as vscode from 'vscode';
|
|
2
|
+
import * as cp from 'child_process';
|
|
3
|
+
import * as path from 'path';
|
|
4
|
+
|
|
5
|
+
// Handle to the spawned Python backend process (null when not running).
let backend: cp.ChildProcess | null = null;
// Status bar item that shows voice state and acts as the toggle button.
let statusBar: vscode.StatusBarItem;
// True while we believe the backend is listening (flipped by toggle/IDLE).
let listening = false;
// Extension install path — used to locate voice_backend.py.
let extPath = '';
// Set in deactivate() so the exit handler doesn't auto-restart the backend.
let deactivating = false;
|
|
10
|
+
|
|
11
|
+
export function activate(context: vscode.ExtensionContext) {
|
|
12
|
+
statusBar = vscode.window.createStatusBarItem(vscode.StatusBarAlignment.Right, 1000);
|
|
13
|
+
statusBar.text = '$(sync~spin) Voice';
|
|
14
|
+
statusBar.tooltip = 'Voice: loading...';
|
|
15
|
+
statusBar.command = 'claude-voice.toggle';
|
|
16
|
+
statusBar.backgroundColor = new vscode.ThemeColor('statusBarItem.warningBackground');
|
|
17
|
+
statusBar.show();
|
|
18
|
+
context.subscriptions.push(statusBar);
|
|
19
|
+
|
|
20
|
+
vscode.window.showInformationMessage('Click 🎤 Voice in the status bar or press Ctrl+Alt+V to talk to the agent');
|
|
21
|
+
|
|
22
|
+
extPath = context.extensionPath;
|
|
23
|
+
startBackend();
|
|
24
|
+
|
|
25
|
+
context.subscriptions.push(
|
|
26
|
+
vscode.commands.registerCommand('claude-voice.toggle', toggle)
|
|
27
|
+
);
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
function startBackend() {
    // Spawn the Python voice backend. IPC is a line-delimited text protocol:
    // commands go in via stdin, status messages come back via stdout.
    const script = path.join(extPath, 'voice_backend.py');
    const python = process.platform === 'win32' ? 'python' : 'python3';
    backend = cp.spawn(python, [script], { cwd: extPath });

    // Each non-empty stdout line is one protocol message (READY, PASTE:<text>, ...).
    // A single 'data' chunk may carry several lines, so split before dispatching.
    backend.stdout?.on('data', (data: Buffer) => {
        for (const line of data.toString().split('\n')) {
            const msg = line.trim();
            if (msg) { onMessage(msg); }
        }
    });

    // Accumulate stderr so the exit handler can recognise missing-dependency
    // failures; it is also mirrored to the extension host console for debugging.
    let stderrBuf = '';
    backend.stderr?.on('data', (d: Buffer) => {
        const text = d.toString();
        stderrBuf += text;
        console.log('[voice]', text.trim());
    });

    backend.on('exit', (code) => {
        backend = null;
        listening = false;
        // Expected shutdown (deactivate) — do not restart or show errors.
        if (deactivating) { return; }
        statusBar.text = '$(mic-filled) Voice (error)';
        statusBar.color = new vscode.ThemeColor('statusBarItem.errorForeground');
        // Missing Python deps would just crash again on restart — tell the user
        // how to fix it instead of entering a restart loop.
        if (stderrBuf.includes('ModuleNotFoundError') || stderrBuf.includes('No module named')) {
            vscode.window.showErrorMessage(
                'Claude Voice: Missing Python dependencies. Run: pip install -r requirements.txt',
                'Copy Command'
            ).then(sel => {
                if (sel === 'Copy Command') { vscode.env.clipboard.writeText('pip install openai-whisper sounddevice pynput pyperclip numpy'); }
            });
            return;
        }
        stderrBuf = '';
        // Any other crash: auto-restart after a short delay.
        vscode.window.showWarningMessage(`Voice backend crashed (code ${code}) — restarting in 3s...`);
        setTimeout(() => {
            if (!deactivating) {
                statusBar.text = '$(mic) Voice';
                statusBar.color = undefined;
                startBackend();
            }
        }, 3000);
    });
}
|
|
75
|
+
|
|
76
|
+
function onMessage(msg: string) {
|
|
77
|
+
if (msg === 'READY') {
|
|
78
|
+
statusBar.text = '$(mic) Voice';
|
|
79
|
+
statusBar.tooltip = 'Click to start voice input';
|
|
80
|
+
statusBar.color = undefined;
|
|
81
|
+
} else if (msg === 'LISTENING') {
|
|
82
|
+
statusBar.text = '$(radio-tower) Listening...';
|
|
83
|
+
statusBar.tooltip = 'Click to stop';
|
|
84
|
+
statusBar.color = new vscode.ThemeColor('statusBarItem.warningForeground');
|
|
85
|
+
} else if (msg === 'TRANSCRIBING') {
|
|
86
|
+
statusBar.text = '$(loading~spin) Transcribing...';
|
|
87
|
+
statusBar.tooltip = 'Click to stop';
|
|
88
|
+
statusBar.color = undefined;
|
|
89
|
+
} else if (msg === 'IDLE') {
|
|
90
|
+
statusBar.text = '$(mic) Voice';
|
|
91
|
+
statusBar.tooltip = 'Click to start voice input';
|
|
92
|
+
statusBar.color = undefined;
|
|
93
|
+
listening = false;
|
|
94
|
+
} else if (msg.startsWith('PASTE:')) {
|
|
95
|
+
const text = msg.slice(6);
|
|
96
|
+
const target = vscode.workspace.getConfiguration('claude-voice').get<string>('focusTarget', 'claude');
|
|
97
|
+
const windowFocused = vscode.window.state.focused;
|
|
98
|
+
const inEditor = vscode.window.activeTextEditor !== undefined;
|
|
99
|
+
// Only focus-redirect when VS Code is already focused and user is in a text editor.
|
|
100
|
+
// If VS Code is not focused, don't call claude-vscode.focus — it snaps to conversation 1,
|
|
101
|
+
// disrupting multi-conversation workflows. The backend will discard the paste and stay listening.
|
|
102
|
+
const needsFocus = target === 'claude' && windowFocused && inEditor;
|
|
103
|
+
if (needsFocus) { vscode.commands.executeCommand('claude-vscode.focus'); }
|
|
104
|
+
// Always wait at least 100ms — even without focus redirect, the shortcut
|
|
105
|
+
// momentarily pulls focus from the webview input and it needs time to recover
|
|
106
|
+
setTimeout(() => { backend?.stdin?.write('DO_PASTE\n'); }, needsFocus ? 150 : 100);
|
|
107
|
+
vscode.window.setStatusBarMessage(`Voice: "${text.slice(0, 60)}"`, 3000);
|
|
108
|
+
} else if (msg.startsWith('PASTED:')) {
|
|
109
|
+
} else if (msg.startsWith('ERROR:')) {
|
|
110
|
+
vscode.window.showWarningMessage(msg.slice(6));
|
|
111
|
+
statusBar.text = '$(mic) Voice';
|
|
112
|
+
statusBar.color = undefined;
|
|
113
|
+
listening = false;
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
function toggle() {
|
|
118
|
+
if (!backend?.stdin) {
|
|
119
|
+
vscode.window.showWarningMessage('Voice backend not running.');
|
|
120
|
+
return;
|
|
121
|
+
}
|
|
122
|
+
if (!listening) {
|
|
123
|
+
listening = true;
|
|
124
|
+
backend.stdin.write('START\n');
|
|
125
|
+
} else {
|
|
126
|
+
listening = false;
|
|
127
|
+
backend.stdin.write('STOP\n');
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
export function deactivate() {
|
|
132
|
+
deactivating = true;
|
|
133
|
+
backend?.stdin?.write('QUIT\n');
|
|
134
|
+
backend?.kill();
|
|
135
|
+
}
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Voice backend — VAD + Whisper + paste.
|
|
4
|
+
Commands from stdin: START, STOP, QUIT, DO_PASTE
|
|
5
|
+
Status to stdout: READY, LISTENING, SPEAKING, TRANSCRIBING, PASTE:<text>, PASTED:<text>, IDLE, ERROR:<msg>
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import sys
|
|
9
|
+
import platform
|
|
10
|
+
import threading
|
|
11
|
+
import numpy as np
|
|
12
|
+
import sounddevice as sd
|
|
13
|
+
import whisper
|
|
14
|
+
import pyperclip
|
|
15
|
+
from pynput.keyboard import Controller as KbController, Key
|
|
16
|
+
|
|
17
|
+
def _is_vscode_focused():
    """Return True when VSCode owns the OS foreground window.

    Fails open (returns True) on unknown platforms or when the
    platform-specific query tool is missing, so a paste is never blocked
    merely because the check could not run.
    """
    try:
        os_name = platform.system()
        if os_name == 'Windows':
            import ctypes
            user32 = ctypes.windll.user32
            hwnd = user32.GetForegroundWindow()
            title_len = user32.GetWindowTextLengthW(hwnd)
            title = ctypes.create_unicode_buffer(title_len + 1)
            user32.GetWindowTextW(hwnd, title, title_len + 1)
            return 'visual studio code' in title.value.lower()
        if os_name == 'Darwin':
            import subprocess
            script = 'tell application "System Events" to get name of first application process whose frontmost is true'
            proc = subprocess.run(
                ['osascript', '-e', script],
                capture_output=True, text=True, timeout=1,
            )
            return 'code' in proc.stdout.strip().lower()
        if os_name == 'Linux':
            import subprocess
            proc = subprocess.run(
                ['xdotool', 'getactivewindow', 'getwindowname'],
                capture_output=True, text=True, timeout=1,
            )
            return 'visual studio code' in proc.stdout.strip().lower()
    except Exception:
        pass
    return True  # fail open: unknown platform or tool missing — allow paste
|
|
46
|
+
|
|
47
|
+
SAMPLE_RATE = 16000  # Whisper operates on 16 kHz mono audio
CHUNK_SIZE = int(SAMPLE_RATE * 0.1)  # 100ms chunks

# RMS-based VAD tuning (see DESIGN.md "VAD parameters")
SPEECH_RMS = 0.015   # chunk RMS above this starts a recording
SILENCE_RMS = 0.010  # chunk RMS below this counts as silence
SILENCE_SECS = 1.2   # consecutive silence that ends an utterance
MIN_SPEECH = 0.4     # utterances shorter than this (seconds) are discarded

# State machine values for the capture loop
IDLE = 'idle'            # not capturing
LISTENING = 'listening'  # waiting for speech to start
SPEAKING = 'speaking'    # recording an utterance
|
|
58
|
+
|
|
59
|
+
# ── Load Whisper ─────────────────────────────────────────────────────────────
# Progress messages go to stderr (mirrored to the extension host console);
# stdout is reserved for the IPC protocol, so READY is the first protocol line.
sys.stderr.write("Loading Whisper model...\n")
sys.stderr.flush()
_model = whisper.load_model("base")
sys.stderr.write("Whisper ready\n")
sys.stderr.flush()
print("READY", flush=True)
|
|
66
|
+
|
|
67
|
+
# ── State ─────────────────────────────────────────────────────────────────────
_state = IDLE     # current capture state (IDLE / LISTENING / SPEAKING)
_speech_buf = []  # audio chunks accumulated for the in-progress utterance
_silence_cnt = 0  # consecutive quiet chunks observed while SPEAKING
_lock = threading.Lock()  # guards the three mutable globals above
_do_paste_event = threading.Event()  # set by DO_PASTE command from extension
|
|
73
|
+
|
|
74
|
+
def _emit(msg):
|
|
75
|
+
print(msg, flush=True)
|
|
76
|
+
|
|
77
|
+
# ── Audio callback ────────────────────────────────────────────────────────────
|
|
78
|
+
def _audio_cb(indata, frames, t, status):
    """sounddevice callback: RMS voice-activity detection state machine.

    LISTENING → SPEAKING when a 100ms chunk exceeds SPEECH_RMS; while
    SPEAKING, SILENCE_SECS worth of consecutive quiet chunks ends the
    utterance and hands the buffered audio to a transcription thread.
    """
    global _state, _speech_buf, _silence_cnt

    # Cheap pre-check outside the lock; IDLE chunks are dropped immediately.
    if _state not in (LISTENING, SPEAKING):
        return

    loudness = float(np.sqrt(np.mean(indata ** 2)))

    with _lock:
        if _state == LISTENING:
            if loudness > SPEECH_RMS:
                # Speech just started — open a fresh utterance buffer.
                _speech_buf = [indata.copy()]
                _silence_cnt = 0
                _state = SPEAKING
            return

        if _state != SPEAKING:
            return

        _speech_buf.append(indata.copy())
        if loudness >= SILENCE_RMS:
            _silence_cnt = 0
            return

        _silence_cnt += 1
        if _silence_cnt * 0.1 < SILENCE_SECS:
            return

        # Utterance complete: snapshot the buffer, reset, transcribe off-thread
        # so this audio callback never blocks on Whisper.
        utterance = list(_speech_buf)
        _speech_buf = []
        _silence_cnt = 0
        _state = LISTENING
        _emit('TRANSCRIBING')
        threading.Thread(target=_process, args=(utterance,), daemon=True).start()
|
|
106
|
+
|
|
107
|
+
# Continuous microphone capture: 100ms float32 mono chunks fed to the VAD
# callback. The stream runs for the process lifetime; _state decides whether
# a chunk is processed or dropped.
_stream = sd.InputStream(
    samplerate=SAMPLE_RATE, channels=1, dtype='float32',
    blocksize=CHUNK_SIZE, callback=_audio_cb
)
_stream.start()
|
|
112
|
+
|
|
113
|
+
# ── Transcribe + paste ────────────────────────────────────────────────────────
|
|
114
|
+
def _process(chunks):
    """Transcribe one utterance and paste it into VSCode via the clipboard.

    Runs on a worker thread spawned by the audio callback.  Messages sent
    to the extension over stdout:
      PASTE:<text>  -> extension should focus the input and reply DO_PASTE
      PASTED:<text> -> the paste keystroke was actually synthesized
      LISTENING     -> backend is ready for the next utterance
    """
    audio = np.concatenate(chunks).flatten()
    # Discard blips shorter than MIN_SPEECH seconds (clicks, coughs).
    if len(audio) / SAMPLE_RATE < MIN_SPEECH:
        _emit('LISTENING')
        return

    result = _model.transcribe(audio, language='en', fp16=False, verbose=False)
    text = result['text'].strip()

    if not text:
        _emit('LISTENING')
        return

    # Signal extension to focus Claude input, then wait for DO_PASTE ack
    pyperclip.copy(text)
    _do_paste_event.clear()
    _emit(f'PASTE:{text}')
    got_ack = _do_paste_event.wait(timeout=2.0)

    if not got_ack:
        # Extension never acknowledged — drop this utterance, keep listening.
        _emit('LISTENING')
        return

    if not _is_vscode_focused():
        # Foreground window is not VSCode — discard paste but keep listening
        _emit('LISTENING')
        return

    # Synthesize Cmd+V (macOS) / Ctrl+V (elsewhere) to paste the clipboard text.
    kb = KbController()
    paste_key = Key.cmd if platform.system() == 'Darwin' else Key.ctrl
    kb.press(paste_key)
    kb.press('v')
    kb.release('v')
    kb.release(paste_key)

    _emit(f'PASTED:{text}')
    _emit('LISTENING')
|
|
151
|
+
|
|
152
|
+
# ── Command loop ──────────────────────────────────────────────────────────────
|
|
153
|
+
# ── Command loop ──────────────────────────────────────────────────────────────
# One command per line arrives from the VSCode extension on stdin:
#   START    -> begin listening           STOP -> go idle
#   DO_PASTE -> ack for a PASTE:<text> message (unblocks _process)
#   QUIT     -> leave the loop and stop the audio stream
for line in sys.stdin:
    cmd = line.strip()
    if cmd == 'START':
        with _lock:
            _state = LISTENING
            _speech_buf = []
            _silence_cnt = 0
            _emit('LISTENING')
    elif cmd == 'DO_PASTE':
        _do_paste_event.set()
    elif cmd == 'STOP':
        with _lock:
            _state = IDLE
            _speech_buf = []
            _silence_cnt = 0
            _emit('IDLE')
    elif cmd == 'QUIT':
        break

_stream.stop()
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
voice_terminal.py — Voice input for Claude Code terminal (standalone).
|
|
4
|
+
|
|
5
|
+
Press Ctrl+Alt+V to start listening.
|
|
6
|
+
Speak. Silence auto-triggers transcription.
|
|
7
|
+
Transcribed text is typed into the focused terminal window.
|
|
8
|
+
Press Ctrl+Alt+V again to cancel.
|
|
9
|
+
|
|
10
|
+
Run with: python voice_terminal.py
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import sys
|
|
14
|
+
import platform # used for OS notifications
|
|
15
|
+
import threading
|
|
16
|
+
import numpy as np
|
|
17
|
+
import sounddevice as sd
|
|
18
|
+
import whisper
|
|
19
|
+
from pynput import keyboard
|
|
20
|
+
from pynput.keyboard import Controller as KbController
|
|
21
|
+
|
|
22
|
+
SAMPLE_RATE = 16000
CHUNK_SIZE = int(SAMPLE_RATE * 0.1)  # 100ms chunks

# Thresholds for the RMS-energy voice activity detection.
SPEECH_RMS = 0.015   # chunk louder than this while LISTENING => speech began
SILENCE_RMS = 0.010  # chunk quieter than this while SPEAKING => silent chunk
SILENCE_SECS = 1.2   # this much continuous silence ends the utterance
MIN_SPEECH = 0.4     # seconds; shorter captures are discarded

# State-machine labels.
IDLE = 'idle'
LISTENING = 'listening'
SPEAKING = 'speaking'
TRANSCRIBING = 'transcribing'
|
|
34
|
+
|
|
35
|
+
# ── Terminal window detection ─────────────────────────────────────────────────
|
|
36
|
+
|
|
37
|
+
# ── Notification ──────────────────────────────────────────────────────────────
|
|
38
|
+
def _notify(title, message=''):
|
|
39
|
+
"""Show a brief OS notification."""
|
|
40
|
+
try:
|
|
41
|
+
system = platform.system()
|
|
42
|
+
if system == 'Darwin':
|
|
43
|
+
import subprocess
|
|
44
|
+
subprocess.run(
|
|
45
|
+
['osascript', '-e',
|
|
46
|
+
f'display notification "{message}" with title "{title}"'],
|
|
47
|
+
timeout=2, capture_output=True
|
|
48
|
+
)
|
|
49
|
+
elif system == 'Windows':
|
|
50
|
+
# Use PowerShell toast notification
|
|
51
|
+
import subprocess
|
|
52
|
+
script = (
|
|
53
|
+
f'[Windows.UI.Notifications.ToastNotificationManager, Windows.UI.Notifications, ContentType=WindowsRuntime] | Out-Null;'
|
|
54
|
+
f'$template = [Windows.UI.Notifications.ToastNotificationManager]::GetTemplateContent([Windows.UI.Notifications.ToastTemplateType]::ToastText02);'
|
|
55
|
+
f'$template.SelectSingleNode("//text[@id=1]").InnerText = "{title}";'
|
|
56
|
+
f'$template.SelectSingleNode("//text[@id=2]").InnerText = "{message}";'
|
|
57
|
+
f'$notifier = [Windows.UI.Notifications.ToastNotificationManager]::CreateToastNotifier("Claude Voice");'
|
|
58
|
+
f'$notifier.Show([Windows.UI.Notifications.ToastNotification]::new($template));'
|
|
59
|
+
)
|
|
60
|
+
subprocess.run(['powershell', '-Command', script], timeout=3, capture_output=True)
|
|
61
|
+
elif system == 'Linux':
|
|
62
|
+
import subprocess
|
|
63
|
+
subprocess.run(['notify-send', title, message], timeout=2, capture_output=True)
|
|
64
|
+
except Exception:
|
|
65
|
+
pass # notifications are best-effort
|
|
66
|
+
|
|
67
|
+
# ── Load Whisper ───────────────────────────────────────────────────────────────
# Loading the "base" model takes a few seconds; announce readiness on the
# console and with a desktop notification.
print("Claude Voice — loading speech model...", flush=True)
_model = whisper.load_model("base")
print("Ready. Press Ctrl+Alt+V to start/stop voice input.", flush=True)
_notify("Claude Voice", "Ready — press Ctrl+Alt+V to speak")

# ── State ──────────────────────────────────────────────────────────────────────
_state = IDLE             # IDLE / LISTENING / SPEAKING / TRANSCRIBING
_speech_buf = []          # audio chunks of the utterance being captured
_silence_cnt = 0          # consecutive quiet chunks while SPEAKING
_lock = threading.Lock()  # guards the three variables above
_kb = KbController()      # synthesizes keystrokes to type transcripts
|
|
79
|
+
|
|
80
|
+
# ── Audio callback ─────────────────────────────────────────────────────────────
|
|
81
|
+
def _audio_cb(indata, frames, t, status):
    """sounddevice InputStream callback; receives 100ms float32 chunks.

    RMS-energy voice-activity state machine: LISTENING -> SPEAKING when a
    chunk exceeds SPEECH_RMS, then -> TRANSCRIBING (spawning a worker
    thread) after SILENCE_SECS worth of chunks below SILENCE_RMS.
    """
    global _state, _speech_buf, _silence_cnt

    # Fast path: drop audio while idle/transcribing (unlocked read; benign).
    if _state not in (LISTENING, SPEAKING):
        return

    rms = float(np.sqrt(np.mean(indata ** 2)))

    with _lock:
        if _state == LISTENING:
            if rms > SPEECH_RMS:
                # Speech onset: start a fresh utterance buffer with this chunk.
                _state = SPEAKING
                _speech_buf = [indata.copy()]
                _silence_cnt = 0

        elif _state == SPEAKING:
            _speech_buf.append(indata.copy())
            if rms < SILENCE_RMS:
                _silence_cnt += 1
                # Chunks are 0.1s each, so count * 0.1 is seconds of silence.
                if _silence_cnt * 0.1 >= SILENCE_SECS:
                    buf = list(_speech_buf)
                    _speech_buf = []
                    _silence_cnt = 0
                    _state = TRANSCRIBING
                    # Transcribe off the audio thread so capture never blocks.
                    threading.Thread(target=_process, args=(buf,), daemon=True).start()
            else:
                # Any loud chunk resets the silence countdown.
                _silence_cnt = 0
|
|
108
|
+
|
|
109
|
+
# Continuously capture mono 16kHz float32 audio in 100ms blocks; _audio_cb
# runs on sounddevice's internal audio thread.
_stream = sd.InputStream(
    samplerate=SAMPLE_RATE, channels=1, dtype='float32',
    blocksize=CHUNK_SIZE, callback=_audio_cb
)
_stream.start()
|
|
114
|
+
|
|
115
|
+
# ── Transcribe + type ──────────────────────────────────────────────────────────
|
|
116
|
+
def _process(chunks):
    """Transcribe one captured utterance and type it into the focused window.

    Runs on a worker thread spawned by the audio callback.  Always returns
    the state machine to LISTENING — including when transcription raises.
    The original set LISTENING only on its success paths, so an exception
    in `transcribe()` left the state stuck at TRANSCRIBING forever and the
    hotkey toggle (which ignores TRANSCRIBING) could never recover.
    """
    global _state

    try:
        audio = np.concatenate(chunks).flatten()
        # Discard blips shorter than MIN_SPEECH seconds (clicks, coughs).
        if len(audio) / SAMPLE_RATE >= MIN_SPEECH:
            result = _model.transcribe(audio, language='en', fp16=False, verbose=False)
            text = result['text'].strip()
            if text:
                _kb.type(text)
    finally:
        with _lock:
            _state = LISTENING
|
|
137
|
+
|
|
138
|
+
# ── Hotkey toggle ──────────────────────────────────────────────────────────────
|
|
139
|
+
def _toggle():
    """Hotkey handler: flip between idle and listening (no-op while busy)."""
    global _state, _speech_buf, _silence_cnt
    with _lock:
        if _state == TRANSCRIBING:
            return  # busy, ignore
        starting = _state == IDLE
        _state = LISTENING if starting else IDLE
        _speech_buf = []
        _silence_cnt = 0
        if starting:
            _notify("Claude Voice", "Listening...")
            print("[voice] Listening...", flush=True)
        else:
            _notify("Claude Voice", "Stopped")
            print("[voice] Stopped.", flush=True)
|
|
156
|
+
|
|
157
|
+
_stop_event = threading.Event()  # never set; the loop exits via KeyboardInterrupt

print("Hotkey active. Ctrl+C to quit.", flush=True)
# System-wide hotkey: Ctrl+Alt+V toggles listening from any window.
with keyboard.GlobalHotKeys({'<ctrl>+<alt>+v': _toggle}) as hotkey:
    try:
        # Idle loop; wait() keeps the main thread responsive to Ctrl+C.
        while not _stop_event.is_set():
            _stop_event.wait(timeout=0.5)
    except KeyboardInterrupt:
        pass

_stream.stop()
print("Claude Voice stopped.", flush=True)
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
vtype — offline voice input for any focused window.
|
|
4
|
+
|
|
5
|
+
Press Ctrl+Alt+V to start/stop listening.
|
|
6
|
+
Speak. Silence auto-triggers transcription.
|
|
7
|
+
Transcribed text is typed into whatever window is currently focused.
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
vtype
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import platform
|
|
14
|
+
import subprocess
|
|
15
|
+
import threading
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
import sounddevice as sd
|
|
19
|
+
import whisper
|
|
20
|
+
from pynput import keyboard
|
|
21
|
+
from pynput.keyboard import Controller as KbController
|
|
22
|
+
|
|
23
|
+
SAMPLE_RATE = 16000
CHUNK_SIZE = int(SAMPLE_RATE * 0.1)  # 100ms chunks

# Thresholds for the RMS-energy voice activity detection.
SPEECH_RMS = 0.015   # chunk louder than this while LISTENING => speech began
SILENCE_RMS = 0.010  # chunk quieter than this while SPEAKING => silent chunk
SILENCE_SECS = 1.2   # this much continuous silence ends the utterance
MIN_SPEECH = 0.4     # seconds; shorter captures are discarded

# State-machine labels.
IDLE = 'idle'
LISTENING = 'listening'
SPEAKING = 'speaking'
TRANSCRIBING = 'transcribing'
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _notify(title, message=''):
|
|
38
|
+
"""Show a brief OS notification (best-effort)."""
|
|
39
|
+
try:
|
|
40
|
+
system = platform.system()
|
|
41
|
+
if system == 'Darwin':
|
|
42
|
+
subprocess.run(
|
|
43
|
+
['osascript', '-e',
|
|
44
|
+
f'display notification "{message}" with title "{title}"'],
|
|
45
|
+
timeout=2, capture_output=True
|
|
46
|
+
)
|
|
47
|
+
elif system == 'Windows':
|
|
48
|
+
script = (
|
|
49
|
+
'[Windows.UI.Notifications.ToastNotificationManager,'
|
|
50
|
+
' Windows.UI.Notifications, ContentType=WindowsRuntime] | Out-Null;'
|
|
51
|
+
'$t = [Windows.UI.Notifications.ToastNotificationManager]::'
|
|
52
|
+
'GetTemplateContent([Windows.UI.Notifications.ToastTemplateType]::ToastText02);'
|
|
53
|
+
f'$t.SelectSingleNode("//text[@id=1]").InnerText = "{title}";'
|
|
54
|
+
f'$t.SelectSingleNode("//text[@id=2]").InnerText = "{message}";'
|
|
55
|
+
'$n = [Windows.UI.Notifications.ToastNotificationManager]::'
|
|
56
|
+
'CreateToastNotifier("vtype");'
|
|
57
|
+
'$n.Show([Windows.UI.Notifications.ToastNotification]::new($t));'
|
|
58
|
+
)
|
|
59
|
+
subprocess.run(['powershell', '-Command', script],
|
|
60
|
+
timeout=3, capture_output=True)
|
|
61
|
+
elif system == 'Linux':
|
|
62
|
+
subprocess.run(['notify-send', title, message],
|
|
63
|
+
timeout=2, capture_output=True)
|
|
64
|
+
except Exception:
|
|
65
|
+
pass
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class VType:
    """Offline push-to-talk dictation.

    A global hotkey (wired up in run()) toggles listening; an RMS-based
    voice-activity detector segments utterances, Whisper transcribes them,
    and the text is typed into whatever window currently has focus.
    """

    def __init__(self, model_size='base'):
        self.model_size = model_size   # Whisper model name, e.g. 'base'
        self.state = IDLE              # IDLE / LISTENING / SPEAKING / TRANSCRIBING
        self.speech_buf = []           # audio chunks of the utterance being captured
        self.silence_cnt = 0           # consecutive quiet chunks while SPEAKING
        self.lock = threading.Lock()   # guards state/speech_buf/silence_cnt
        self.kb = KbController()       # synthesizes keystrokes
        self.model = None              # loaded lazily in run()
        self.stream = None             # sounddevice input stream, created in run()

    # ── Audio callback ────────────────────────────────────────────────────────
    def _audio_cb(self, indata, frames, t, status):
        """Per-chunk VAD: detect speech onset and end-of-utterance silence."""
        # Fast path: drop audio while idle/transcribing (unlocked read; benign).
        if self.state not in (LISTENING, SPEAKING):
            return

        rms = float(np.sqrt(np.mean(indata ** 2)))

        with self.lock:
            if self.state == LISTENING:
                if rms > SPEECH_RMS:
                    # Speech onset: start a fresh utterance buffer.
                    self.state = SPEAKING
                    self.speech_buf = [indata.copy()]
                    self.silence_cnt = 0

            elif self.state == SPEAKING:
                self.speech_buf.append(indata.copy())
                if rms < SILENCE_RMS:
                    self.silence_cnt += 1
                    # Chunks are 0.1s each, so count * 0.1 is seconds of silence.
                    if self.silence_cnt * 0.1 >= SILENCE_SECS:
                        buf = list(self.speech_buf)
                        self.speech_buf = []
                        self.silence_cnt = 0
                        self.state = TRANSCRIBING
                        # Transcribe off the audio thread so capture never blocks.
                        threading.Thread(
                            target=self._process, args=(buf,), daemon=True
                        ).start()
                else:
                    # Any loud chunk resets the silence countdown.
                    self.silence_cnt = 0

    # ── Transcribe + type ─────────────────────────────────────────────────────
    def _process(self, chunks):
        """Transcribe one utterance and type it into the focused window.

        Always returns the state machine to LISTENING — including when
        transcription raises.  Previously LISTENING was restored only on
        success paths, so an exception in transcribe() left the state stuck
        at TRANSCRIBING and _toggle (which ignores TRANSCRIBING) could
        never recover.
        """
        try:
            audio = np.concatenate(chunks).flatten()
            # Discard blips shorter than MIN_SPEECH seconds (clicks, coughs).
            if len(audio) / SAMPLE_RATE >= MIN_SPEECH:
                result = self.model.transcribe(
                    audio, language='en', fp16=False, verbose=False
                )
                text = result['text'].strip()
                if text:
                    self.kb.type(text)
        finally:
            with self.lock:
                self.state = LISTENING

    # ── Hotkey toggle ─────────────────────────────────────────────────────────
    def _toggle(self):
        """Hotkey handler: flip between idle and listening (no-op while busy)."""
        with self.lock:
            if self.state == IDLE:
                self.state = LISTENING
                self.speech_buf = []
                self.silence_cnt = 0
                _notify("vtype", "Listening...")
                print("[vtype] Listening...", flush=True)
            elif self.state == TRANSCRIBING:
                pass  # busy — ignore
            else:
                self.state = IDLE
                self.speech_buf = []
                self.silence_cnt = 0
                _notify("vtype", "Stopped")
                print("[vtype] Stopped.", flush=True)

    # ── Main loop ─────────────────────────────────────────────────────────────
    def run(self):
        """Load the model, start audio capture, and block until Ctrl+C."""
        print(f"vtype — loading speech model ({self.model_size})...", flush=True)
        self.model = whisper.load_model(self.model_size)
        print("Ready. Press Ctrl+Alt+V to start/stop voice input.", flush=True)
        _notify("vtype", "Ready — press Ctrl+Alt+V to speak")

        # Mono 16kHz float32 capture in 100ms blocks; _audio_cb runs on
        # sounddevice's internal audio thread.
        self.stream = sd.InputStream(
            samplerate=SAMPLE_RATE, channels=1, dtype='float32',
            blocksize=CHUNK_SIZE, callback=self._audio_cb
        )
        self.stream.start()

        stop_event = threading.Event()  # never set; loop exits via Ctrl+C
        print("Press Ctrl+C to quit.", flush=True)
        with keyboard.GlobalHotKeys({'<ctrl>+<alt>+v': self._toggle}):
            try:
                while not stop_event.is_set():
                    stop_event.wait(timeout=0.5)
            except KeyboardInterrupt:
                pass

        self.stream.stop()
        print("vtype stopped.", flush=True)
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def main():
    """Console entry point: run vtype with the default ('base') model."""
    VType().run()
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
if __name__ == '__main__':
|
|
174
|
+
main()
|