vtype 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vtype-0.1.0/.gitignore ADDED
@@ -0,0 +1,98 @@
1
+ # Voice Claude Code - Git Ignore
2
+
3
+ # Python
4
+ __pycache__/
5
+ *.py[cod]
6
+ *$py.class
7
+ *.so
8
+ .Python
9
+ build/
10
+ develop-eggs/
11
+ dist/
12
+ downloads/
13
+ eggs/
14
+ .eggs/
15
+ lib/
16
+ lib64/
17
+ parts/
18
+ sdist/
19
+ var/
20
+ wheels/
21
+ *.egg-info/
22
+ .installed.cfg
23
+ *.egg
24
+ MANIFEST
25
+
26
+ # Virtual Environments
27
+ venv/
28
+ env/
29
+ ENV/
30
+ env.bak/
31
+ venv.bak/
32
+
33
+ # IDE
34
+ .idea/
35
+ # Keep .vscode/launch.json and tasks.json for dev workflow
36
+ .vscode/settings.json
37
+ .vscode/*.code-workspace
38
+ *.swp
39
+ *.swo
40
+ *~
41
+ .DS_Store
42
+
43
+ # Configuration (contains API keys - NEVER COMMIT)
44
+ .env
45
+ .env.local
46
+
47
+ # Whisper Models (large files)
48
+ models/*.bin
49
+ models/*.pt
50
+ *.bin
51
+ *.pt
52
+
53
+ # Audio files (temporary recordings)
54
+ *.wav
55
+ *.mp3
56
+ *.ogg
57
+ recordings/
58
+ temp_audio/
59
+
60
+ # Logs
61
+ *.log
62
+ logs/
63
+ *.log.*
64
+
65
+ # Testing
66
+ .pytest_cache/
67
+ .coverage
68
+ htmlcov/
69
+ .tox/
70
+
71
+ # Node / VSCode extension build artifacts
72
+ node_modules/
73
+ package-lock.json
74
+ out/
75
+ *.vsix
76
+
77
+ # OS
78
+ Thumbs.db
79
+ .DS_Store
80
+
81
+ # Backup files
82
+ *.bak
83
+ *.backup
84
+ *~
85
+
86
+ # Session data (if storing locally)
87
+ sessions/
88
+ *.session
89
+
90
+ # Claude Code project settings
91
+ .claude/
92
+
93
+ # Azure Speech SDK cache
94
+ .azure/
95
+
96
+ # Porcupine models/keys
97
+ *.ppn
98
+ porcupine_key.txt
@@ -0,0 +1,13 @@
1
+ {
2
+ "version": "0.2.0",
3
+ "configurations": [
4
+ {
5
+ "name": "Run Extension",
6
+ "type": "extensionHost",
7
+ "request": "launch",
8
+ "args": ["--extensionDevelopmentPath=${workspaceFolder}"],
9
+ "outFiles": ["${workspaceFolder}/out/**/*.js"],
10
+ "preLaunchTask": "${defaultBuildTask}"
11
+ }
12
+ ]
13
+ }
@@ -0,0 +1,16 @@
1
+ {
2
+ "version": "2.0.0",
3
+ "tasks": [
4
+ {
5
+ "label": "npm: compile",
6
+ "type": "npm",
7
+ "script": "compile",
8
+ "group": {
9
+ "kind": "build",
10
+ "isDefault": true
11
+ },
12
+ "problemMatcher": ["$tsc"],
13
+ "detail": "Compile TypeScript extension"
14
+ }
15
+ ]
16
+ }
@@ -0,0 +1,10 @@
1
+ .vscode/**
2
+ .claude/**
3
+ config/**
4
+ node_modules/**
5
+ src/**
6
+ tsconfig.json
7
+ .gitignore
8
+ *.vsix
9
+ voice_terminal.py
10
+ widget.py
vtype-0.1.0/DESIGN.md ADDED
@@ -0,0 +1,101 @@
1
+ # Voice Claude Code — Design
2
+
3
+ ## What it is
4
+
5
+ A VSCode extension that adds voice input to Claude Code. The user presses a keyboard shortcut, speaks, and the transcribed text is pasted into the Claude Code chat input. No webview, no browser, no server — just a status bar button and a Python backend.
6
+
7
+ ## Architecture
8
+
9
+ ```
10
+ ┌─────────────────────────────────────────┐
11
+ │ VSCode Extension (extension.ts) │
12
+ │ ─ Status bar: $(mic) Voice │
13
+ │ ─ Keybinding: Ctrl+Alt+V │
14
+ │ ─ Spawns voice_backend.py via stdin │
15
+ │ ─ IPC: line-delimited text protocol │
16
+ └──────────────┬──────────────────────────┘
17
+ │ stdin/stdout
18
+
19
+ ┌─────────────────────────────────────────┐
20
+ │ Python Backend (voice_backend.py) │
21
+ │ │
22
+ │ sounddevice (RMS VAD) │
23
+ │ ↓ speech detected │
24
+ │ openai-whisper (base model) │
25
+ │ ↓ transcribed text │
26
+ │ pyperclip → clipboard │
27
+ │ ↓ PASTE: signal to extension │
28
+ │ extension calls claude-vscode.focus │
29
+ │ ↓ DO_PASTE signal back (150ms later) │
30
+ │ pynput → Ctrl+V at OS level │
31
+ └─────────────────────────────────────────┘
32
+ ```
33
+
34
+ ## IPC Protocol
35
+
36
+ **Extension → Backend (stdin):**
37
+ | Command | Meaning |
38
+ |---|---|
39
+ | `START` | Begin listening |
40
+ | `STOP` | Stop listening, go idle |
41
+ | `DO_PASTE` | Focus confirmed, paste now |
42
+ | `QUIT` | Shut down |
43
+
44
+ **Backend → Extension (stdout):**
45
+ | Message | Meaning |
46
+ |---|---|
47
+ | `READY` | Whisper model loaded |
48
+ | `LISTENING` | Waiting for speech |
49
+ | `SPEAKING` | Recording |
50
+ | `TRANSCRIBING` | Whisper processing |
51
+ | `PASTE:<text>` | Text ready, requesting focus |
52
+ | `PASTED:<text>` | Paste complete |
53
+ | `IDLE` | Stopped |
54
+ | `ERROR:<msg>` | Something went wrong |
55
+
56
+ ## Focus management
57
+
58
+ The tricky part: pynput simulates Ctrl+V at the OS level, so the correct window must have focus at the exact moment of paste. If the user opened a file editor while speaking, a naive paste would land in the editor.
59
+
60
+ **Solution — round-trip before paste:**
61
+ 1. Backend copies text to clipboard, emits `PASTE:<text>`, then blocks on a threading.Event
62
+ 2. Extension receives `PASTE:`, calls `claude-vscode.focus` (the same command as `Ctrl+Esc`)
63
+ 3. After 150ms (enough for webview focus to propagate), extension sends `DO_PASTE`
64
+ 4. Backend unblocks, runs pynput Ctrl+V
65
+
66
+ This ensures the Claude Code input always has OS-level focus at paste time, regardless of what the user did while speaking. Timeout of 2s prevents blocking indefinitely if the extension crashes.
67
+
68
+ ## Key decisions
69
+
70
+ **No LLM cleanup layer** — an earlier iteration piped Whisper output through a local qwen2.5:1.5b model (via Ollama) to strip filler words and resolve self-corrections. Removed because Claude Code's AI is more than capable of understanding natural speech patterns, and the cleanup added ~2.8s latency per utterance with no real benefit.
71
+
72
+ **No TTS** — not needed. The user reads Claude's responses normally.
73
+
74
+ **No wake word** — user activates manually via Ctrl+Alt+V. Simpler, no always-on mic.
75
+
76
+ **No webview** — earlier design used a browser-based UI with WebSockets. Scrapped in favour of a status bar item that works with whatever text area has focus.
77
+
78
+ **openai-whisper not whisper.cpp** — Python bindings are easier to integrate. `base` model gives acceptable accuracy and speed on CPU.
79
+
80
+ **pynput for paste** — OS-level Ctrl+V simulation works with any input, including VSCode webview panels that don't support the VSCode clipboard paste command.
81
+
82
+ ## VAD parameters
83
+
84
+ ```python
85
+ SPEECH_RMS = 0.015 # RMS threshold to start recording
86
+ SILENCE_RMS = 0.010 # RMS threshold to detect silence
87
+ SILENCE_SECS = 1.2 # Seconds of silence to end utterance
88
+ MIN_SPEECH = 0.4 # Minimum speech duration (seconds)
89
+ ```
90
+
91
+ ## File structure
92
+
93
+ ```
94
+ voice-claude-code/
95
+ ├── src/extension.ts — VSCode extension (TypeScript)
96
+ ├── voice_backend.py — Python: VAD + Whisper + paste
97
+ ├── package.json — Extension manifest, keybindings, settings
98
+ ├── tsconfig.json — TypeScript config
99
+ ├── requirements.txt — Python dependencies
100
+ └── out/extension.js — Compiled output (gitignored)
101
+ ```
vtype-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Voice Claude Code
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
vtype-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,74 @@
1
+ Metadata-Version: 2.4
2
+ Name: vtype
3
+ Version: 0.1.0
4
+ Summary: Offline voice input — speak into any focused window, no cloud required
5
+ License: MIT
6
+ License-File: LICENSE
7
+ Keywords: dictation,offline,speech,voice,voice-input,whisper
8
+ Classifier: Environment :: Console
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Topic :: Utilities
13
+ Requires-Python: >=3.9
14
+ Requires-Dist: numpy
15
+ Requires-Dist: openai-whisper
16
+ Requires-Dist: pynput
17
+ Requires-Dist: sounddevice
18
+ Description-Content-Type: text/markdown
19
+
20
+ # vtype
21
+
22
+ **Offline voice input for any focused window.** Press `Ctrl+Alt+V`, speak, and your words are typed instantly — no cloud, no API key, no subscription.
23
+
24
+ Works in Claude Code terminal, VS Code, any text editor, browser, chat app — whatever window is focused.
25
+
26
+ ---
27
+
28
+ ## Install
29
+
30
+ ```bash
31
+ pip install vtype
32
+ ```
33
+
34
+ > **First run** downloads the Whisper speech model (~150 MB). Subsequent runs are instant.
35
+
36
+ ## Usage
37
+
38
+ ```bash
39
+ vtype
40
+ ```
41
+
42
+ - Press `Ctrl+Alt+V` — starts listening
43
+ - Speak naturally
44
+ - Silence for ~1 second → transcribes and types into the focused window
45
+ - Press `Ctrl+Alt+V` again to cancel
46
+ - `Ctrl+C` to quit
47
+
48
+ ## Requirements
49
+
50
+ - Python 3.9+
51
+ - A microphone
52
+ - FFmpeg (for audio processing)
53
+
54
+ **FFmpeg install:**
55
+
56
+ | Platform | Command |
57
+ |----------|---------|
58
+ | Windows | `winget install ffmpeg` |
59
+ | macOS | `brew install ffmpeg` |
60
+ | Linux | `sudo apt install ffmpeg` |
61
+
62
+ ## How it works
63
+
64
+ - Speech detection runs locally using RMS-based VAD (no network calls)
65
+ - Transcription uses [OpenAI Whisper](https://github.com/openai/whisper) (base model, runs on CPU)
66
+ - Text is typed via OS-level keyboard simulation — works in any window including terminals and browser tabs
67
+
68
+ ## Privacy
69
+
70
+ Everything runs on your machine. Audio never leaves your device.
71
+
72
+ ## License
73
+
74
+ MIT
vtype-0.1.0/README.md ADDED
@@ -0,0 +1,77 @@
1
+ # Claude Voice
2
+
3
+ **Talk to Claude instead of typing.** Press `Ctrl+Alt+V`, speak your prompt, and it appears in Claude Code's chat input — ready to send.
4
+
5
+ **No API key. No cloud. No subscription. Runs 100% offline on your machine.**
6
+
7
+ ![Claude Voice demo](images/demo.gif)
8
+
9
+ ---
10
+
11
+ ## Why Claude Voice?
12
+
13
+ - **No API key required** — just install and speak
14
+ - **Your voice never leaves your computer** — speech is processed locally on your machine
15
+ - **Works offline** — no internet connection needed after the initial model download
16
+ - **Free to use** — no per-request costs, no subscriptions
17
+
18
+ ---
19
+
20
+ ## Setup
21
+
22
+ **1. Install Python dependencies**
23
+
24
+ ```bash
25
+ pip install openai-whisper sounddevice pynput pyperclip numpy
26
+ ```
27
+
28
+ > The first use downloads a ~140 MB speech recognition model. After that, everything runs fully offline with no internet connection required.
29
+
30
+ **2. Reload VS Code**
31
+
32
+ `Ctrl+Shift+P` → **Developer: Reload Window**
33
+
34
+ That's it. Look for the **🎤 Voice** button in your status bar.
35
+
36
+ ---
37
+
38
+ ## How to use
39
+
40
+ 1. Press `Ctrl+Alt+V` (Windows/Linux) or `Cmd+Alt+V` (Mac) — or click **🎤 Voice** in the status bar
41
+ 2. Speak your prompt naturally
42
+ 3. Pause for ~1 second when done — your words appear in Claude Code's chat input
43
+ 4. Press `Enter` to send
44
+
45
+ Press `Ctrl+Alt+V` again at any time to cancel.
46
+
47
+ ---
48
+
49
+ ## Requirements
50
+
51
+ - [Claude Code](https://marketplace.visualstudio.com/items?itemName=Anthropic.claude-code) extension installed
52
+ - Python 3.9 or higher
53
+ - A microphone
54
+
55
+ ---
56
+
57
+ ## Settings
58
+
59
+ | Setting | Default | Description |
60
+ |---|---|---|
61
+ | `claude-voice.focusTarget` | `claude` | `claude` = always focus Claude Code input before pasting. `none` = paste wherever focus already is (works with any input field). |
62
+
63
+ ---
64
+
65
+ ## Troubleshooting
66
+
67
+ **"Missing Python dependencies"** — Run `pip install openai-whisper sounddevice pynput pyperclip numpy` in your terminal.
68
+
69
+ **Nothing appears after speaking** — Make sure the Claude Code panel is open. The extension automatically focuses the Claude Code input before pasting.
70
+
71
+ **Text goes to the wrong app** — The extension checks that VS Code is the active window before pasting. If you switch away while it's transcribing, the paste is safely discarded.
72
+
73
+ **macOS** — `pynput` requires Accessibility permissions: System Settings → Privacy & Security → Accessibility → enable VS Code.
74
+
75
+ **Linux** — Requires `xdotool`: `sudo apt install xdotool`
76
+
77
+ **Slow first transcription** — Whisper loads on first use (~5s). Subsequent transcriptions are faster.
@@ -0,0 +1,55 @@
1
+ # vtype
2
+
3
+ **Offline voice input for any focused window.** Press `Ctrl+Alt+V`, speak, and your words are typed instantly — no cloud, no API key, no subscription.
4
+
5
+ Works in Claude Code terminal, VS Code, any text editor, browser, chat app — whatever window is focused.
6
+
7
+ ---
8
+
9
+ ## Install
10
+
11
+ ```bash
12
+ pip install vtype
13
+ ```
14
+
15
+ > **First run** downloads the Whisper speech model (~150 MB). Subsequent runs are instant.
16
+
17
+ ## Usage
18
+
19
+ ```bash
20
+ vtype
21
+ ```
22
+
23
+ - Press `Ctrl+Alt+V` — starts listening
24
+ - Speak naturally
25
+ - Silence for ~1 second → transcribes and types into the focused window
26
+ - Press `Ctrl+Alt+V` again to cancel
27
+ - `Ctrl+C` to quit
28
+
29
+ ## Requirements
30
+
31
+ - Python 3.9+
32
+ - A microphone
33
+ - FFmpeg (for audio processing)
34
+
35
+ **FFmpeg install:**
36
+
37
+ | Platform | Command |
38
+ |----------|---------|
39
+ | Windows | `winget install ffmpeg` |
40
+ | macOS | `brew install ffmpeg` |
41
+ | Linux | `sudo apt install ffmpeg` |
42
+
43
+ ## How it works
44
+
45
+ - Speech detection runs locally using RMS-based VAD (no network calls)
46
+ - Transcription uses [OpenAI Whisper](https://github.com/openai/whisper) (base model, runs on CPU)
47
+ - Text is typed via OS-level keyboard simulation — works in any window including terminals and browser tabs
48
+
49
+ ## Privacy
50
+
51
+ Everything runs on your machine. Audio never leaves your device.
52
+
53
+ ## License
54
+
55
+ MIT
Binary file
@@ -0,0 +1,63 @@
1
+ {
2
+ "name": "claude-voice",
3
+ "displayName": "Claude Voice",
4
+ "description": "Talk to Claude instead of typing. Press a shortcut, speak your prompt, and it appears in Claude Code ready to send.",
5
+ "version": "0.1.4",
6
+ "publisher": "jsaluja",
7
+ "license": "MIT",
8
+ "repository": {
9
+ "type": "git",
10
+ "url": "https://github.com/jsaluja/claude-voice"
11
+ },
12
+ "keywords": ["claude", "voice", "speech", "whisper", "ai", "assistant", "dictation"],
13
+ "engines": {
14
+ "vscode": "^1.85.0"
15
+ },
16
+ "icon": "images/icon.png",
17
+ "galleryBanner": {
18
+ "color": "#18181f",
19
+ "theme": "dark"
20
+ },
21
+ "categories": ["AI", "Other"],
22
+ "activationEvents": ["onStartupFinished"],
23
+ "main": "./out/extension.js",
24
+ "contributes": {
25
+ "commands": [
26
+ {
27
+ "command": "claude-voice.toggle",
28
+ "title": "Toggle Voice Input",
29
+ "category": "Claude Voice"
30
+ }
31
+ ],
32
+ "configuration": {
33
+ "title": "Claude Voice",
34
+ "properties": {
35
+ "claude-voice.focusTarget": {
36
+ "type": "string",
37
+ "enum": ["claude", "none"],
38
+ "default": "claude",
39
+ "description": "Where to focus before pasting. 'claude' = always focus Claude Code input. 'none' = paste wherever focus already is."
40
+ }
41
+ }
42
+ },
43
+ "keybindings": [
44
+ {
45
+ "command": "claude-voice.toggle",
46
+ "key": "ctrl+alt+v",
47
+ "mac": "cmd+alt+v"
48
+ }
49
+ ]
50
+ },
51
+ "scripts": {
52
+ "compile": "tsc -p ./",
53
+ "watch": "tsc -watch -p ./",
54
+ "vscode:prepublish": "npm run compile",
55
+ "package": "vsce package"
56
+ },
57
+ "devDependencies": {
58
+ "@types/vscode": "^1.85.0",
59
+ "@types/node": "^20.0.0",
60
+ "typescript": "^5.3.0",
61
+ "@vscode/vsce": "^2.22.0"
62
+ }
63
+ }
@@ -0,0 +1,31 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "vtype"
7
+ version = "0.1.0"
8
+ description = "Offline voice input — speak into any focused window, no cloud required"
9
+ readme = "README_PYPI.md"
10
+ requires-python = ">=3.9"
11
+ license = { text = "MIT" }
12
+ keywords = ["voice", "speech", "whisper", "dictation", "offline", "voice-input"]
13
+ classifiers = [
14
+ "Programming Language :: Python :: 3",
15
+ "License :: OSI Approved :: MIT License",
16
+ "Operating System :: OS Independent",
17
+ "Topic :: Utilities",
18
+ "Environment :: Console",
19
+ ]
20
+ dependencies = [
21
+ "openai-whisper",
22
+ "sounddevice",
23
+ "pynput",
24
+ "numpy",
25
+ ]
26
+
27
+ [project.scripts]
28
+ vtype = "vtype.__main__:main"
29
+
30
+ [tool.hatch.build.targets.wheel]
31
+ packages = ["vtype"]
@@ -0,0 +1,5 @@
1
+ openai-whisper
2
+ sounddevice
3
+ pynput
4
+ pyperclip
5
+ numpy
@@ -0,0 +1,135 @@
1
+ import * as vscode from 'vscode';
2
+ import * as cp from 'child_process';
3
+ import * as path from 'path';
4
+
5
// Module-level state shared across the extension's lifetime.
let backend: cp.ChildProcess | null = null;  // spawned voice_backend.py process
let statusBar: vscode.StatusBarItem;
let listening = false;                       // mirrors backend LISTENING/IDLE state
let extPath = '';                            // extension install dir; backend script lives here
let deactivating = false;                    // suppresses crash-restart logic during shutdown
10
+
11
// Extension entry point: create the status bar item, spawn the Python
// backend, and register the toggle command.
export function activate(context: vscode.ExtensionContext) {
    statusBar = vscode.window.createStatusBarItem(vscode.StatusBarAlignment.Right, 1000);
    // Spinner until the backend reports READY (see onMessage).
    statusBar.text = '$(sync~spin) Voice';
    statusBar.tooltip = 'Voice: loading...';
    statusBar.command = 'claude-voice.toggle';
    statusBar.backgroundColor = new vscode.ThemeColor('statusBarItem.warningBackground');
    statusBar.show();
    context.subscriptions.push(statusBar);

    vscode.window.showInformationMessage('Click 🎤 Voice in the status bar or press Ctrl+Alt+V to talk to the agent');

    extPath = context.extensionPath;
    startBackend();

    context.subscriptions.push(
        vscode.commands.registerCommand('claude-voice.toggle', toggle)
    );
}
29
+
30
// Spawn voice_backend.py and wire up its stdio. The backend speaks a
// line-delimited protocol on stdout (handled by onMessage); stderr is
// logged and buffered so a crash can be diagnosed after exit.
function startBackend() {
    const script = path.join(extPath, 'voice_backend.py');
    // Windows installs expose `python`; macOS/Linux typically need `python3`.
    const python = process.platform === 'win32' ? 'python' : 'python3';
    backend = cp.spawn(python, [script], { cwd: extPath });

    backend.stdout?.on('data', (data: Buffer) => {
        // A single chunk may carry several protocol lines — split and trim.
        for (const line of data.toString().split('\n')) {
            const msg = line.trim();
            if (msg) { onMessage(msg); }
        }
    });

    let stderrBuf = '';
    backend.stderr?.on('data', (d: Buffer) => {
        const text = d.toString();
        stderrBuf += text;
        console.log('[voice]', text.trim());
    });

    backend.on('exit', (code) => {
        backend = null;
        listening = false;
        if (deactivating) { return; }
        statusBar.text = '$(mic-filled) Voice (error)';
        statusBar.color = new vscode.ThemeColor('statusBarItem.errorForeground');
        // Missing Python deps won't fix themselves — tell the user instead of
        // entering a restart loop.
        if (stderrBuf.includes('ModuleNotFoundError') || stderrBuf.includes('No module named')) {
            vscode.window.showErrorMessage(
                'Claude Voice: Missing Python dependencies. Run: pip install -r requirements.txt',
                'Copy Command'
            ).then(sel => {
                if (sel === 'Copy Command') { vscode.env.clipboard.writeText('pip install openai-whisper sounddevice pynput pyperclip numpy'); }
            });
            return;
        }
        stderrBuf = '';
        // Any other crash: auto-restart after a short delay.
        vscode.window.showWarningMessage(`Voice backend crashed (code ${code}) — restarting in 3s...`);
        setTimeout(() => {
            if (!deactivating) {
                statusBar.text = '$(mic) Voice';
                statusBar.color = undefined;
                startBackend();
            }
        }, 3000);
    });
}
75
+
76
// Handle one protocol line from the backend and update the UI accordingly.
// Messages: READY, LISTENING, TRANSCRIBING, IDLE, PASTE:<text>,
// PASTED:<text>, ERROR:<msg>.
function onMessage(msg: string) {
    if (msg === 'READY') {
        statusBar.text = '$(mic) Voice';
        statusBar.tooltip = 'Click to start voice input';
        statusBar.color = undefined;
    } else if (msg === 'LISTENING') {
        statusBar.text = '$(radio-tower) Listening...';
        statusBar.tooltip = 'Click to stop';
        statusBar.color = new vscode.ThemeColor('statusBarItem.warningForeground');
    } else if (msg === 'TRANSCRIBING') {
        statusBar.text = '$(loading~spin) Transcribing...';
        statusBar.tooltip = 'Click to stop';
        statusBar.color = undefined;
    } else if (msg === 'IDLE') {
        statusBar.text = '$(mic) Voice';
        statusBar.tooltip = 'Click to start voice input';
        statusBar.color = undefined;
        listening = false;
    } else if (msg.startsWith('PASTE:')) {
        // Backend has the text on the clipboard and is blocked waiting for DO_PASTE.
        const text = msg.slice(6);
        const target = vscode.workspace.getConfiguration('claude-voice').get<string>('focusTarget', 'claude');
        const windowFocused = vscode.window.state.focused;
        const inEditor = vscode.window.activeTextEditor !== undefined;
        // Only focus-redirect when VS Code is already focused and user is in a text editor.
        // If VS Code is not focused, don't call claude-vscode.focus — it snaps to conversation 1,
        // disrupting multi-conversation workflows. The backend will discard the paste and stay listening.
        const needsFocus = target === 'claude' && windowFocused && inEditor;
        if (needsFocus) { vscode.commands.executeCommand('claude-vscode.focus'); }
        // Always wait at least 100ms — even without focus redirect, the shortcut
        // momentarily pulls focus from the webview input and it needs time to recover
        setTimeout(() => { backend?.stdin?.write('DO_PASTE\n'); }, needsFocus ? 150 : 100);
        vscode.window.setStatusBarMessage(`Voice: "${text.slice(0, 60)}"`, 3000);
    } else if (msg.startsWith('PASTED:')) {
        // Paste confirmation — no UI change needed; backend emits LISTENING next.
    } else if (msg.startsWith('ERROR:')) {
        vscode.window.showWarningMessage(msg.slice(6));
        statusBar.text = '$(mic) Voice';
        statusBar.color = undefined;
        listening = false;
    }
}
116
+
117
// Status-bar / keybinding handler: flip between START and STOP commands.
function toggle() {
    if (!backend?.stdin) {
        vscode.window.showWarningMessage('Voice backend not running.');
        return;
    }
    if (!listening) {
        listening = true;
        backend.stdin.write('START\n');
    } else {
        listening = false;
        backend.stdin.write('STOP\n');
    }
}
130
+
131
// Extension shutdown: ask the backend to exit cleanly, then force-kill.
export function deactivate() {
    deactivating = true;              // suppress the crash/restart handler in startBackend
    backend?.stdin?.write('QUIT\n');
    backend?.kill();
}
@@ -0,0 +1,12 @@
1
+ {
2
+ "compilerOptions": {
3
+ "module": "commonjs",
4
+ "target": "ES2020",
5
+ "outDir": "./out",
6
+ "lib": ["ES2020"],
7
+ "sourceMap": true,
8
+ "rootDir": "./src",
9
+ "strict": true
10
+ },
11
+ "exclude": ["node_modules", ".vscode-test"]
12
+ }
@@ -0,0 +1,172 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Voice backend — VAD + Whisper + paste.
4
+ Commands from stdin: START, STOP, QUIT, DO_PASTE
5
+ Status to stdout: READY, LISTENING, SPEAKING, TRANSCRIBING, PASTE:<text>, PASTED:<text>, IDLE, ERROR:<msg>
6
+ """
7
+
8
+ import sys
9
+ import platform
10
+ import threading
11
+ import numpy as np
12
+ import sounddevice as sd
13
+ import whisper
14
+ import pyperclip
15
+ from pynput.keyboard import Controller as KbController, Key
16
+
17
+ def _is_vscode_focused():
18
+ """Returns True if VSCode is the foreground window. Fails open on unsupported platforms."""
19
+ try:
20
+ system = platform.system()
21
+ if system == 'Windows':
22
+ import ctypes
23
+ user32 = ctypes.windll.user32
24
+ hwnd = user32.GetForegroundWindow()
25
+ length = user32.GetWindowTextLengthW(hwnd)
26
+ buf = ctypes.create_unicode_buffer(length + 1)
27
+ user32.GetWindowTextW(hwnd, buf, length + 1)
28
+ return 'visual studio code' in buf.value.lower()
29
+ elif system == 'Darwin':
30
+ import subprocess
31
+ result = subprocess.run(
32
+ ['osascript', '-e', 'tell application "System Events" to get name of first application process whose frontmost is true'],
33
+ capture_output=True, text=True, timeout=1
34
+ )
35
+ return 'code' in result.stdout.strip().lower()
36
+ elif system == 'Linux':
37
+ import subprocess
38
+ result = subprocess.run(
39
+ ['xdotool', 'getactivewindow', 'getwindowname'],
40
+ capture_output=True, text=True, timeout=1
41
+ )
42
+ return 'visual studio code' in result.stdout.strip().lower()
43
+ except Exception:
44
+ pass
45
+ return True # fail open: unknown platform or tool missing — allow paste
46
+
47
SAMPLE_RATE = 16000
CHUNK_SIZE = int(SAMPLE_RATE * 0.1)  # 100ms chunks

# RMS-based VAD tuning (see DESIGN.md "VAD parameters")
SPEECH_RMS = 0.015   # RMS threshold to start recording
SILENCE_RMS = 0.010  # RMS threshold to count a chunk as silence
SILENCE_SECS = 1.2   # seconds of silence that end an utterance
MIN_SPEECH = 0.4     # minimum utterance length in seconds; shorter is discarded

# Values taken by the module-level _state variable
IDLE = 'idle'
LISTENING = 'listening'
SPEAKING = 'speaking'

# ── Load Whisper ─────────────────────────────────────────────────────────────
# Progress goes to stderr (the extension logs it); the READY line on stdout
# is part of the IPC protocol and must stay on stdout.
sys.stderr.write("Loading Whisper model...\n")
sys.stderr.flush()
_model = whisper.load_model("base")
sys.stderr.write("Whisper ready\n")
sys.stderr.flush()
print("READY", flush=True)

# ── State ─────────────────────────────────────────────────────────────────────
_state = IDLE             # current VAD state; transitions happen under _lock
_speech_buf = []          # accumulated audio chunks for the current utterance
_silence_cnt = 0          # consecutive quiet chunks while SPEAKING
_lock = threading.Lock()
_do_paste_event = threading.Event()  # set by DO_PASTE command from extension
73
+
74
+ def _emit(msg):
75
+ print(msg, flush=True)
76
+
77
# ── Audio callback ────────────────────────────────────────────────────────────
def _audio_cb(indata, frames, t, status):
    """sounddevice callback: RMS-based VAD state machine.

    Runs on the audio thread once per CHUNK_SIZE frames (100ms). Buffers
    chunks while the user speaks; after SILENCE_SECS of quiet, hands the
    utterance to a worker thread for transcription and returns to LISTENING.
    """
    global _state, _speech_buf, _silence_cnt

    # Cheap unlocked pre-check; the actual transitions happen under _lock.
    if _state not in (LISTENING, SPEAKING):
        return

    rms = float(np.sqrt(np.mean(indata ** 2)))  # energy of this 100ms chunk

    with _lock:
        if _state == LISTENING:
            if rms > SPEECH_RMS:
                # Speech onset: start buffering from this chunk.
                _state = SPEAKING
                _speech_buf = [indata.copy()]
                _silence_cnt = 0

        elif _state == SPEAKING:
            _speech_buf.append(indata.copy())
            if rms < SILENCE_RMS:
                _silence_cnt += 1
                # Chunks are 0.1s each; end the utterance after SILENCE_SECS of quiet.
                if _silence_cnt * 0.1 >= SILENCE_SECS:
                    buf = list(_speech_buf)
                    _speech_buf = []
                    _silence_cnt = 0
                    _state = LISTENING
                    _emit('TRANSCRIBING')
                    # Transcribe off the audio thread so capture keeps running.
                    threading.Thread(target=_process, args=(buf,), daemon=True).start()
            else:
                _silence_cnt = 0
106
+
107
# 16kHz mono float32 in 100ms blocks — concatenates into the format Whisper expects.
_stream = sd.InputStream(
    samplerate=SAMPLE_RATE, channels=1, dtype='float32',
    blocksize=CHUNK_SIZE, callback=_audio_cb
)
_stream.start()
112
+
113
# ── Transcribe + paste ────────────────────────────────────────────────────────
def _process(chunks):
    """Worker thread: transcribe one utterance and paste it via the extension handshake.

    chunks: list of float32 numpy blocks captured by _audio_cb.

    Flow: copy text to clipboard, emit PASTE:<text>, block until the
    extension confirms focus with DO_PASTE (2s timeout), then simulate an
    OS-level paste. Emits LISTENING whenever the utterance is discarded
    (too short, empty transcription, no ack, or VSCode not in foreground).
    """
    audio = np.concatenate(chunks).flatten()
    # Discard blips shorter than MIN_SPEECH seconds.
    if len(audio) / SAMPLE_RATE < MIN_SPEECH:
        _emit('LISTENING')
        return

    # fp16=False: CPU inference; language pinned to English.
    result = _model.transcribe(audio, language='en', fp16=False, verbose=False)
    text = result['text'].strip()

    if not text:
        _emit('LISTENING')
        return

    # Signal extension to focus Claude input, then wait for DO_PASTE ack
    pyperclip.copy(text)
    _do_paste_event.clear()
    _emit(f'PASTE:{text}')
    # Timeout prevents blocking this worker forever if the extension crashed.
    got_ack = _do_paste_event.wait(timeout=2.0)

    if not got_ack:
        _emit('LISTENING')
        return

    if not _is_vscode_focused():
        # Foreground window is not VSCode — discard paste but keep listening
        _emit('LISTENING')
        return

    # OS-level paste (Cmd+V on macOS, Ctrl+V elsewhere) into whatever has focus now.
    kb = KbController()
    paste_key = Key.cmd if platform.system() == 'Darwin' else Key.ctrl
    kb.press(paste_key)
    kb.press('v')
    kb.release('v')
    kb.release(paste_key)

    _emit(f'PASTED:{text}')
    _emit('LISTENING')
151
+
152
# ── Command loop ──────────────────────────────────────────────────────────────
# Blocks on stdin until the extension sends a command; one command per line.
# EOF (extension killed the process) also ends the loop.
for line in sys.stdin:
    cmd = line.strip()
    if cmd == 'START':
        with _lock:
            _state = LISTENING
            _speech_buf = []
            _silence_cnt = 0
        _emit('LISTENING')
    elif cmd == 'DO_PASTE':
        # Focus confirmed by the extension — unblock the worker in _process().
        _do_paste_event.set()
    elif cmd == 'STOP':
        with _lock:
            _state = IDLE
            _speech_buf = []
            _silence_cnt = 0
        _emit('IDLE')
    elif cmd == 'QUIT':
        break

_stream.stop()
@@ -0,0 +1,168 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ voice_terminal.py — Voice input for Claude Code terminal (standalone).
4
+
5
+ Press Ctrl+Alt+V to start listening.
6
+ Speak. Silence auto-triggers transcription.
7
+ Transcribed text is typed into the focused terminal window.
8
+ Press Ctrl+Alt+V again to cancel.
9
+
10
+ Run with: python voice_terminal.py
11
+ """
12
+
13
+ import sys
14
+ import platform # used for OS notifications
15
+ import threading
16
+ import numpy as np
17
+ import sounddevice as sd
18
+ import whisper
19
+ from pynput import keyboard
20
+ from pynput.keyboard import Controller as KbController
21
+
22
SAMPLE_RATE = 16000
CHUNK_SIZE = int(SAMPLE_RATE * 0.1)  # 100ms chunks

# RMS-based VAD thresholds (same values as voice_backend.py)
SPEECH_RMS = 0.015    # start recording above this RMS
SILENCE_RMS = 0.010   # chunk counts as silence below this RMS
SILENCE_SECS = 1.2    # seconds of silence ending an utterance
MIN_SPEECH = 0.4      # minimum utterance length in seconds

# Values taken by the module-level _state variable
IDLE = 'idle'
LISTENING = 'listening'
SPEAKING = 'speaking'
TRANSCRIBING = 'transcribing'
34
+
35
+ # ── Terminal window detection ─────────────────────────────────────────────────
36
+
37
+ # ── Notification ──────────────────────────────────────────────────────────────
38
+ def _notify(title, message=''):
39
+ """Show a brief OS notification."""
40
+ try:
41
+ system = platform.system()
42
+ if system == 'Darwin':
43
+ import subprocess
44
+ subprocess.run(
45
+ ['osascript', '-e',
46
+ f'display notification "{message}" with title "{title}"'],
47
+ timeout=2, capture_output=True
48
+ )
49
+ elif system == 'Windows':
50
+ # Use PowerShell toast notification
51
+ import subprocess
52
+ script = (
53
+ f'[Windows.UI.Notifications.ToastNotificationManager, Windows.UI.Notifications, ContentType=WindowsRuntime] | Out-Null;'
54
+ f'$template = [Windows.UI.Notifications.ToastNotificationManager]::GetTemplateContent([Windows.UI.Notifications.ToastTemplateType]::ToastText02);'
55
+ f'$template.SelectSingleNode("//text[@id=1]").InnerText = "{title}";'
56
+ f'$template.SelectSingleNode("//text[@id=2]").InnerText = "{message}";'
57
+ f'$notifier = [Windows.UI.Notifications.ToastNotificationManager]::CreateToastNotifier("Claude Voice");'
58
+ f'$notifier.Show([Windows.UI.Notifications.ToastNotification]::new($template));'
59
+ )
60
+ subprocess.run(['powershell', '-Command', script], timeout=3, capture_output=True)
61
+ elif system == 'Linux':
62
+ import subprocess
63
+ subprocess.run(['notify-send', title, message], timeout=2, capture_output=True)
64
+ except Exception:
65
+ pass # notifications are best-effort
66
+
67
# ── Load Whisper ───────────────────────────────────────────────────────────────
# First call downloads the "base" model to the whisper cache; later runs load
# it from disk.
print("Claude Voice — loading speech model...", flush=True)
_model = whisper.load_model("base")
print("Ready. Press Ctrl+Alt+V to start/stop voice input.", flush=True)
_notify("Claude Voice", "Ready — press Ctrl+Alt+V to speak")

# ── State ──────────────────────────────────────────────────────────────────────
_state = IDLE             # current VAD state; transitions happen under _lock
_speech_buf = []          # accumulated audio chunks for the current utterance
_silence_cnt = 0          # consecutive quiet chunks while SPEAKING
_lock = threading.Lock()
_kb = KbController()      # shared keyboard controller used by _process to type text
79
+
80
# ── Audio callback ─────────────────────────────────────────────────────────────
def _audio_cb(indata, frames, t, status):
    """sounddevice callback: RMS-based voice activity detection.

    Buffers chunks while the user speaks; after SILENCE_SECS of quiet,
    marks state TRANSCRIBING and hands the utterance to a worker thread.
    """
    global _state, _speech_buf, _silence_cnt

    # Cheap unlocked pre-check; actual transitions happen under _lock below.
    if _state not in (LISTENING, SPEAKING):
        return

    rms = float(np.sqrt(np.mean(indata ** 2)))  # energy of this 100ms chunk

    with _lock:
        if _state == LISTENING:
            if rms > SPEECH_RMS:
                # Speech onset — start a new utterance buffer.
                _state = SPEAKING
                _speech_buf = [indata.copy()]
                _silence_cnt = 0

        elif _state == SPEAKING:
            _speech_buf.append(indata.copy())
            if rms < SILENCE_RMS:
                _silence_cnt += 1
                # Chunks are 0.1s each.
                if _silence_cnt * 0.1 >= SILENCE_SECS:
                    buf = list(_speech_buf)
                    _speech_buf = []
                    _silence_cnt = 0
                    _state = TRANSCRIBING
                    threading.Thread(target=_process, args=(buf,), daemon=True).start()
            else:
                _silence_cnt = 0
108
+
109
# 16kHz mono float32 in 100ms blocks — ready for Whisper after concatenation.
_stream = sd.InputStream(
    samplerate=SAMPLE_RATE, channels=1, dtype='float32',
    blocksize=CHUNK_SIZE, callback=_audio_cb
)
_stream.start()
114
+
115
# ── Transcribe + type ──────────────────────────────────────────────────────────
def _process(chunks):
    """Worker thread: transcribe one utterance and type it into the focused window.

    chunks: list of float32 numpy blocks captured by _audio_cb. Always
    returns the state machine to LISTENING, whether or not text was typed.
    """
    global _state

    audio = np.concatenate(chunks).flatten()
    # Discard blips shorter than MIN_SPEECH seconds.
    if len(audio) / SAMPLE_RATE < MIN_SPEECH:
        with _lock:
            _state = LISTENING
        return

    # fp16=False: CPU inference; language pinned to English.
    result = _model.transcribe(audio, language='en', fp16=False, verbose=False)
    text = result['text'].strip()

    if not text:
        with _lock:
            _state = LISTENING
        return

    # OS-level typing into whatever window currently has focus.
    _kb.type(text)

    with _lock:
        _state = LISTENING
137
+
138
# ── Hotkey toggle ──────────────────────────────────────────────────────────────
def _toggle():
    """Ctrl+Alt+V handler: IDLE → LISTENING; LISTENING/SPEAKING → IDLE.

    Ignored while TRANSCRIBING so an in-flight utterance is not interrupted.
    """
    global _state, _speech_buf, _silence_cnt
    with _lock:
        if _state == IDLE:
            _state = LISTENING
            _speech_buf = []
            _silence_cnt = 0
            _notify("Claude Voice", "Listening...")
            print("[voice] Listening...", flush=True)
        elif _state == TRANSCRIBING:
            pass  # busy, ignore
        else:
            # Cancel: drop any partially captured speech.
            _state = IDLE
            _speech_buf = []
            _silence_cnt = 0
            _notify("Claude Voice", "Stopped")
            print("[voice] Stopped.", flush=True)
156
+
157
_stop_event = threading.Event()

print("Hotkey active. Ctrl+C to quit.", flush=True)
# GlobalHotKeys runs its listener on its own thread; the main thread just
# idles in short waits so the hotkey and audio stream stay alive until Ctrl+C.
with keyboard.GlobalHotKeys({'<ctrl>+<alt>+v': _toggle}) as hotkey:
    try:
        while not _stop_event.is_set():
            _stop_event.wait(timeout=0.5)
    except KeyboardInterrupt:
        pass

_stream.stop()
print("Claude Voice stopped.", flush=True)
@@ -0,0 +1,3 @@
1
+ """vtype — offline voice input for any focused window."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,174 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ vtype — offline voice input for any focused window.
4
+
5
+ Press Ctrl+Alt+V to start/stop listening.
6
+ Speak. Silence auto-triggers transcription.
7
+ Transcribed text is typed into whatever window is currently focused.
8
+
9
+ Usage:
10
+ vtype
11
+ """
12
+
13
+ import platform
14
+ import subprocess
15
+ import threading
16
+
17
+ import numpy as np
18
+ import sounddevice as sd
19
+ import whisper
20
+ from pynput import keyboard
21
+ from pynput.keyboard import Controller as KbController
22
+
23
SAMPLE_RATE = 16000  # Hz, mono capture rate
CHUNK_SIZE = int(SAMPLE_RATE * 0.1)  # 100ms chunks

# Energy (RMS) detector thresholds — the gap between the two gives hysteresis.
SPEECH_RMS = 0.015   # chunk RMS above this starts/continues an utterance
SILENCE_RMS = 0.010  # chunk RMS below this counts toward the silence timer
SILENCE_SECS = 1.2   # this much continuous silence ends the utterance
MIN_SPEECH = 0.4  # seconds — shorter captures are discarded untranscribed

# State-machine states.
IDLE = 'idle'                  # hotkey off: audio ignored
LISTENING = 'listening'        # waiting for speech onset
SPEAKING = 'speaking'          # buffering audio chunks
TRANSCRIBING = 'transcribing'  # worker thread running the model
35
+
36
+
37
+ def _notify(title, message=''):
38
+ """Show a brief OS notification (best-effort)."""
39
+ try:
40
+ system = platform.system()
41
+ if system == 'Darwin':
42
+ subprocess.run(
43
+ ['osascript', '-e',
44
+ f'display notification "{message}" with title "{title}"'],
45
+ timeout=2, capture_output=True
46
+ )
47
+ elif system == 'Windows':
48
+ script = (
49
+ '[Windows.UI.Notifications.ToastNotificationManager,'
50
+ ' Windows.UI.Notifications, ContentType=WindowsRuntime] | Out-Null;'
51
+ '$t = [Windows.UI.Notifications.ToastNotificationManager]::'
52
+ 'GetTemplateContent([Windows.UI.Notifications.ToastTemplateType]::ToastText02);'
53
+ f'$t.SelectSingleNode("//text[@id=1]").InnerText = "{title}";'
54
+ f'$t.SelectSingleNode("//text[@id=2]").InnerText = "{message}";'
55
+ '$n = [Windows.UI.Notifications.ToastNotificationManager]::'
56
+ 'CreateToastNotifier("vtype");'
57
+ '$n.Show([Windows.UI.Notifications.ToastNotification]::new($t));'
58
+ )
59
+ subprocess.run(['powershell', '-Command', script],
60
+ timeout=3, capture_output=True)
61
+ elif system == 'Linux':
62
+ subprocess.run(['notify-send', title, message],
63
+ timeout=2, capture_output=True)
64
+ except Exception:
65
+ pass
66
+
67
+
68
class VType:
    """Hotkey-toggled voice input: record → detect silence → transcribe → type.

    State machine (transitions guarded by ``self.lock``)::

        IDLE --hotkey--> LISTENING --speech--> SPEAKING
             --silence--> TRANSCRIBING --done--> LISTENING --hotkey--> IDLE
    """

    def __init__(self, model_size='base'):
        self.model_size = model_size    # whisper model name passed to load_model()
        self.state = IDLE               # current state-machine state
        self.speech_buf = []            # float32 chunks of the current utterance
        self.silence_cnt = 0            # consecutive ~100ms chunks below SILENCE_RMS
        self.lock = threading.Lock()    # guards state/buffers across threads
        self.kb = KbController()        # types transcribed text into the focused window
        self.model = None               # loaded lazily in run()
        self.stream = None              # audio input stream, opened in run()

    # ── Audio callback ────────────────────────────────────────────────────────
    def _audio_cb(self, indata, frames, t, status):
        """Per-chunk energy-based speech/silence detection (audio thread)."""
        if self.state not in (LISTENING, SPEAKING):
            return

        rms = float(np.sqrt(np.mean(indata ** 2)))

        with self.lock:
            if self.state == LISTENING:
                if rms > SPEECH_RMS:
                    # Speech onset: start buffering from this chunk.
                    self.state = SPEAKING
                    self.speech_buf = [indata.copy()]
                    self.silence_cnt = 0

            elif self.state == SPEAKING:
                self.speech_buf.append(indata.copy())
                if rms < SILENCE_RMS:
                    self.silence_cnt += 1
                    # Each chunk is ~100ms; enough silent ones end the utterance.
                    if self.silence_cnt * 0.1 >= SILENCE_SECS:
                        buf = list(self.speech_buf)
                        self.speech_buf = []
                        self.silence_cnt = 0
                        self.state = TRANSCRIBING
                        threading.Thread(
                            target=self._process, args=(buf,), daemon=True
                        ).start()
                else:
                    self.silence_cnt = 0

    # ── Transcribe + type ─────────────────────────────────────────────────────
    def _process(self, chunks):
        """Worker thread: transcribe *chunks* and type the text.

        The try/finally guarantees the state returns to LISTENING even if
        transcription or typing raises — otherwise the state would stay
        TRANSCRIBING forever and _toggle would ignore every press.
        """
        try:
            audio = np.concatenate(chunks).flatten()
            # Ignore blips shorter than MIN_SPEECH seconds.
            if len(audio) / SAMPLE_RATE >= MIN_SPEECH:
                result = self.model.transcribe(
                    audio, language='en', fp16=False, verbose=False
                )
                text = result['text'].strip()
                if text:
                    self.kb.type(text)
        finally:
            with self.lock:
                self.state = LISTENING

    # ── Hotkey toggle ─────────────────────────────────────────────────────────
    def _toggle(self):
        """Hotkey handler: IDLE ↔ LISTENING; ignored while TRANSCRIBING."""
        with self.lock:
            if self.state == IDLE:
                self.state = LISTENING
                self.speech_buf = []
                self.silence_cnt = 0
                _notify("vtype", "Listening...")
                print("[vtype] Listening...", flush=True)
            elif self.state == TRANSCRIBING:
                pass  # busy — ignore
            else:
                self.state = IDLE
                self.speech_buf = []
                self.silence_cnt = 0
                _notify("vtype", "Stopped")
                print("[vtype] Stopped.", flush=True)

    # ── Main loop ─────────────────────────────────────────────────────────────
    def run(self):
        """Load the model, open the mic stream, and block until Ctrl+C."""
        print(f"vtype — loading speech model ({self.model_size})...", flush=True)
        self.model = whisper.load_model(self.model_size)
        print("Ready. Press Ctrl+Alt+V to start/stop voice input.", flush=True)
        _notify("vtype", "Ready — press Ctrl+Alt+V to speak")

        self.stream = sd.InputStream(
            samplerate=SAMPLE_RATE, channels=1, dtype='float32',
            blocksize=CHUNK_SIZE, callback=self._audio_cb
        )
        self.stream.start()

        stop_event = threading.Event()
        print("Press Ctrl+C to quit.", flush=True)
        try:
            with keyboard.GlobalHotKeys({'<ctrl>+<alt>+v': self._toggle}):
                # Event is never set; timed wait keeps Ctrl+C responsive.
                while not stop_event.is_set():
                    stop_event.wait(timeout=0.5)
        except KeyboardInterrupt:
            pass
        finally:
            # Release the audio device even if the hotkey listener fails.
            self.stream.stop()
            self.stream.close()

        print("vtype stopped.", flush=True)
167
+
168
+
169
def main():
    """Console entry point: build a VType instance and run it until Ctrl+C."""
    app = VType()
    app.run()
172
+
173
# Allow running the module directly as a script.
if __name__ == '__main__':
    main()