speakandcode 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speakandcode-0.1.0/LICENSE +21 -0
- speakandcode-0.1.0/PKG-INFO +122 -0
- speakandcode-0.1.0/README.md +93 -0
- speakandcode-0.1.0/pyproject.toml +44 -0
- speakandcode-0.1.0/setup.cfg +4 -0
- speakandcode-0.1.0/speakandcode.egg-info/PKG-INFO +122 -0
- speakandcode-0.1.0/speakandcode.egg-info/SOURCES.txt +17 -0
- speakandcode-0.1.0/speakandcode.egg-info/dependency_links.txt +1 -0
- speakandcode-0.1.0/speakandcode.egg-info/entry_points.txt +2 -0
- speakandcode-0.1.0/speakandcode.egg-info/requires.txt +7 -0
- speakandcode-0.1.0/speakandcode.egg-info/top_level.txt +1 -0
- speakandcode-0.1.0/voice_coding/__init__.py +1 -0
- speakandcode-0.1.0/voice_coding/clipboard.py +23 -0
- speakandcode-0.1.0/voice_coding/learn_cmd.py +137 -0
- speakandcode-0.1.0/voice_coding/main.py +137 -0
- speakandcode-0.1.0/voice_coding/memory.py +12 -0
- speakandcode-0.1.0/voice_coding/postprocessor.py +72 -0
- speakandcode-0.1.0/voice_coding/recorder.py +64 -0
- speakandcode-0.1.0/voice_coding/transcriber.py +64 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Johnny Chien
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: speakandcode
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Private voice-to-text for developers. Hold a key, speak, get text pasted into any app.
|
|
5
|
+
Author: Johnny Chien
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/johnny-chien/voice-coding
|
|
8
|
+
Project-URL: Repository, https://github.com/johnny-chien/voice-coding
|
|
9
|
+
Keywords: voice,coding,transcription,developer-tools,voice-to-text
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Environment :: MacOS X
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Operating System :: MacOS
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
18
|
+
Requires-Python: >=3.11
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: sounddevice>=0.5
|
|
22
|
+
Requires-Dist: soundfile>=0.12
|
|
23
|
+
Requires-Dist: numpy>=1.24
|
|
24
|
+
Requires-Dist: pynput>=1.7
|
|
25
|
+
Requires-Dist: google-genai>=1.0
|
|
26
|
+
Requires-Dist: python-dotenv>=1.0
|
|
27
|
+
Requires-Dist: pyobjc-framework-Quartz>=9.0
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
# Voice Coding
|
|
31
|
+
|
|
32
|
+
Private voice-to-text for developers. Hold a hotkey, speak, get text pasted into any app.
|
|
33
|
+
|
|
34
|
+
- **Private** — audio goes to Gemini Flash (your own API key), nowhere else
|
|
35
|
+
- **Fast** — ~1 second transcription via Gemini 3.0 Flash
|
|
36
|
+
- **Universal** — auto-pastes into any focused app: VS Code, Terminal, Slack, browser, etc.
|
|
37
|
+
- **Coding-aware** — "dot env" → `.env`, "camel case foo bar" → `fooBar`
|
|
38
|
+
|
|
39
|
+
## Install
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install speakandcode
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Setup
|
|
46
|
+
|
|
47
|
+
1. Get a [Gemini API key](https://aistudio.google.com/apikey)
|
|
48
|
+
2. Set your API key:
|
|
49
|
+
```bash
|
|
50
|
+
export GEMINI_API_KEY=your_key_here
|
|
51
|
+
```
|
|
52
|
+
Or create a `~/.voice-coding/.env` file:
|
|
53
|
+
```
|
|
54
|
+
GEMINI_API_KEY=your_key_here
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### macOS Permissions
|
|
58
|
+
|
|
59
|
+
Your terminal app (Terminal.app / iTerm / VS Code) needs two permissions in **System Settings → Privacy & Security**:
|
|
60
|
+
|
|
61
|
+
- **Microphone** — for audio recording
|
|
62
|
+
- **Accessibility** — for global hotkey detection and auto-paste keystroke simulation
|
|
63
|
+
|
|
64
|
+
After granting Accessibility, **restart your terminal app** for the permission to take effect.
|
|
65
|
+
|
|
66
|
+
## Learn Project Vocabulary (Optional)
|
|
67
|
+
|
|
68
|
+
Teach Voice Coding the vocabulary of any project you work on:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
cd /path/to/your/project
|
|
72
|
+
voice learn
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
This scans the repo (README, package.json, etc.) and merges its vocabulary into your global memory at `~/.voice-coding/memory.md`. Run it in each repo you work on — terms accumulate across projects.
|
|
76
|
+
|
|
77
|
+
The memory file includes:
|
|
78
|
+
|
|
79
|
+
- **Vocabulary** — project-specific terms with disambiguation hints (e.g., "Claude Code" not "clock code")
|
|
80
|
+
- **Context** — brief descriptions of your projects and tech stacks
|
|
81
|
+
- **Notes** — space for personal customizations (accent, language mixing, corrections you've noticed)
|
|
82
|
+
|
|
83
|
+
Edit `~/.voice-coding/memory.md` anytime to add or fix terms.
|
|
84
|
+
|
|
85
|
+
## Usage
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
voice
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
**Hold Alt (⌥)** to start recording. **Release Alt** to stop, transcribe, and auto-paste into whichever app is focused.
|
|
92
|
+
|
|
93
|
+
Press **Ctrl+C** to quit.
|
|
94
|
+
|
|
95
|
+
### Tips
|
|
96
|
+
|
|
97
|
+
- Speak naturally — filler words (um, uh, like, you know) are automatically removed
|
|
98
|
+
- Minor grammar is corrected while preserving your original wording
|
|
99
|
+
- Recordings shorter than 0.5 seconds are ignored to prevent accidental triggers
|
|
100
|
+
|
|
101
|
+
## Coding Transforms
|
|
102
|
+
|
|
103
|
+
Voice Coding post-processes transcriptions with coding-aware rules:
|
|
104
|
+
|
|
105
|
+
| You say | You get |
|
|
106
|
+
|---------|---------|
|
|
107
|
+
| "dot env" | `.env` |
|
|
108
|
+
| "slash api" | `/api` |
|
|
109
|
+
| "camel case foo bar" | `fooBar` |
|
|
110
|
+
| "snake case my variable" | `my_variable` |
|
|
111
|
+
| "open paren" | `(` |
|
|
112
|
+
| "arrow" | `=>` |
|
|
113
|
+
| "triple equals" | `===` |
|
|
114
|
+
| "new line" | newline character |
|
|
115
|
+
|
|
116
|
+
## How It Works
|
|
117
|
+
|
|
118
|
+
1. A macOS `CGEventTap` listens for the Alt key globally (works in any app, including VS Code)
|
|
119
|
+
2. `sounddevice` captures mic audio at 16kHz mono while the hotkey is held
|
|
120
|
+
3. Audio is sent to Gemini 3.0 Flash for transcription, with vocabulary from `~/.voice-coding/memory.md` if present
|
|
121
|
+
4. Post-processor applies coding-aware text transforms
|
|
122
|
+
5. Result is copied to clipboard via `pbcopy` and pasted via `osascript` Cmd+V simulation
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# Voice Coding
|
|
2
|
+
|
|
3
|
+
Private voice-to-text for developers. Hold a hotkey, speak, get text pasted into any app.
|
|
4
|
+
|
|
5
|
+
- **Private** — audio goes to Gemini Flash (your own API key), nowhere else
|
|
6
|
+
- **Fast** — ~1 second transcription via Gemini 3.0 Flash
|
|
7
|
+
- **Universal** — auto-pastes into any focused app: VS Code, Terminal, Slack, browser, etc.
|
|
8
|
+
- **Coding-aware** — "dot env" → `.env`, "camel case foo bar" → `fooBar`
|
|
9
|
+
|
|
10
|
+
## Install
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
pip install speakandcode
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Setup
|
|
17
|
+
|
|
18
|
+
1. Get a [Gemini API key](https://aistudio.google.com/apikey)
|
|
19
|
+
2. Set your API key:
|
|
20
|
+
```bash
|
|
21
|
+
export GEMINI_API_KEY=your_key_here
|
|
22
|
+
```
|
|
23
|
+
Or create a `~/.voice-coding/.env` file:
|
|
24
|
+
```
|
|
25
|
+
GEMINI_API_KEY=your_key_here
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
### macOS Permissions
|
|
29
|
+
|
|
30
|
+
Your terminal app (Terminal.app / iTerm / VS Code) needs two permissions in **System Settings → Privacy & Security**:
|
|
31
|
+
|
|
32
|
+
- **Microphone** — for audio recording
|
|
33
|
+
- **Accessibility** — for global hotkey detection and auto-paste keystroke simulation
|
|
34
|
+
|
|
35
|
+
After granting Accessibility, **restart your terminal app** for the permission to take effect.
|
|
36
|
+
|
|
37
|
+
## Learn Project Vocabulary (Optional)
|
|
38
|
+
|
|
39
|
+
Teach Voice Coding the vocabulary of any project you work on:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
cd /path/to/your/project
|
|
43
|
+
voice learn
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
This scans the repo (README, package.json, etc.) and merges its vocabulary into your global memory at `~/.voice-coding/memory.md`. Run it in each repo you work on — terms accumulate across projects.
|
|
47
|
+
|
|
48
|
+
The memory file includes:
|
|
49
|
+
|
|
50
|
+
- **Vocabulary** — project-specific terms with disambiguation hints (e.g., "Claude Code" not "clock code")
|
|
51
|
+
- **Context** — brief descriptions of your projects and tech stacks
|
|
52
|
+
- **Notes** — space for personal customizations (accent, language mixing, corrections you've noticed)
|
|
53
|
+
|
|
54
|
+
Edit `~/.voice-coding/memory.md` anytime to add or fix terms.
|
|
55
|
+
|
|
56
|
+
## Usage
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
voice
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
**Hold Alt (⌥)** to start recording. **Release Alt** to stop, transcribe, and auto-paste into whichever app is focused.
|
|
63
|
+
|
|
64
|
+
Press **Ctrl+C** to quit.
|
|
65
|
+
|
|
66
|
+
### Tips
|
|
67
|
+
|
|
68
|
+
- Speak naturally — filler words (um, uh, like, you know) are automatically removed
|
|
69
|
+
- Minor grammar is corrected while preserving your original wording
|
|
70
|
+
- Recordings shorter than 0.5 seconds are ignored to prevent accidental triggers
|
|
71
|
+
|
|
72
|
+
## Coding Transforms
|
|
73
|
+
|
|
74
|
+
Voice Coding post-processes transcriptions with coding-aware rules:
|
|
75
|
+
|
|
76
|
+
| You say | You get |
|
|
77
|
+
|---------|---------|
|
|
78
|
+
| "dot env" | `.env` |
|
|
79
|
+
| "slash api" | `/api` |
|
|
80
|
+
| "camel case foo bar" | `fooBar` |
|
|
81
|
+
| "snake case my variable" | `my_variable` |
|
|
82
|
+
| "open paren" | `(` |
|
|
83
|
+
| "arrow" | `=>` |
|
|
84
|
+
| "triple equals" | `===` |
|
|
85
|
+
| "new line" | newline character |
|
|
86
|
+
|
|
87
|
+
## How It Works
|
|
88
|
+
|
|
89
|
+
1. A macOS `CGEventTap` listens for the Alt key globally (works in any app, including VS Code)
|
|
90
|
+
2. `sounddevice` captures mic audio at 16kHz mono while the hotkey is held
|
|
91
|
+
3. Audio is sent to Gemini 3.0 Flash for transcription, with vocabulary from `~/.voice-coding/memory.md` if present
|
|
92
|
+
4. Post-processor applies coding-aware text transforms
|
|
93
|
+
5. Result is copied to clipboard via `pbcopy` and pasted via `osascript` Cmd+V simulation
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "speakandcode"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Private voice-to-text for developers. Hold a key, speak, get text pasted into any app."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.11"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Johnny Chien" },
|
|
14
|
+
]
|
|
15
|
+
keywords = ["voice", "coding", "transcription", "developer-tools", "voice-to-text"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Environment :: MacOS X",
|
|
19
|
+
"Intended Audience :: Developers",
|
|
20
|
+
"Operating System :: MacOS",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Programming Language :: Python :: 3.13",
|
|
24
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"sounddevice>=0.5",
|
|
28
|
+
"soundfile>=0.12",
|
|
29
|
+
"numpy>=1.24",
|
|
30
|
+
"pynput>=1.7",
|
|
31
|
+
"google-genai>=1.0",
|
|
32
|
+
"python-dotenv>=1.0",
|
|
33
|
+
"pyobjc-framework-Quartz>=9.0",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[project.urls]
|
|
37
|
+
Homepage = "https://github.com/johnny-chien/voice-coding"
|
|
38
|
+
Repository = "https://github.com/johnny-chien/voice-coding"
|
|
39
|
+
|
|
40
|
+
[project.scripts]
|
|
41
|
+
voice = "voice_coding.main:main"
|
|
42
|
+
|
|
43
|
+
[tool.setuptools.packages.find]
|
|
44
|
+
include = ["voice_coding*"]
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: speakandcode
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Private voice-to-text for developers. Hold a key, speak, get text pasted into any app.
|
|
5
|
+
Author: Johnny Chien
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/johnny-chien/voice-coding
|
|
8
|
+
Project-URL: Repository, https://github.com/johnny-chien/voice-coding
|
|
9
|
+
Keywords: voice,coding,transcription,developer-tools,voice-to-text
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Environment :: MacOS X
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Operating System :: MacOS
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
18
|
+
Requires-Python: >=3.11
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: sounddevice>=0.5
|
|
22
|
+
Requires-Dist: soundfile>=0.12
|
|
23
|
+
Requires-Dist: numpy>=1.24
|
|
24
|
+
Requires-Dist: pynput>=1.7
|
|
25
|
+
Requires-Dist: google-genai>=1.0
|
|
26
|
+
Requires-Dist: python-dotenv>=1.0
|
|
27
|
+
Requires-Dist: pyobjc-framework-Quartz>=9.0
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
# Voice Coding
|
|
31
|
+
|
|
32
|
+
Private voice-to-text for developers. Hold a hotkey, speak, get text pasted into any app.
|
|
33
|
+
|
|
34
|
+
- **Private** — audio goes to Gemini Flash (your own API key), nowhere else
|
|
35
|
+
- **Fast** — ~1 second transcription via Gemini 3.0 Flash
|
|
36
|
+
- **Universal** — auto-pastes into any focused app: VS Code, Terminal, Slack, browser, etc.
|
|
37
|
+
- **Coding-aware** — "dot env" → `.env`, "camel case foo bar" → `fooBar`
|
|
38
|
+
|
|
39
|
+
## Install
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install speakandcode
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Setup
|
|
46
|
+
|
|
47
|
+
1. Get a [Gemini API key](https://aistudio.google.com/apikey)
|
|
48
|
+
2. Set your API key:
|
|
49
|
+
```bash
|
|
50
|
+
export GEMINI_API_KEY=your_key_here
|
|
51
|
+
```
|
|
52
|
+
Or create a `~/.voice-coding/.env` file:
|
|
53
|
+
```
|
|
54
|
+
GEMINI_API_KEY=your_key_here
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### macOS Permissions
|
|
58
|
+
|
|
59
|
+
Your terminal app (Terminal.app / iTerm / VS Code) needs two permissions in **System Settings → Privacy & Security**:
|
|
60
|
+
|
|
61
|
+
- **Microphone** — for audio recording
|
|
62
|
+
- **Accessibility** — for global hotkey detection and auto-paste keystroke simulation
|
|
63
|
+
|
|
64
|
+
After granting Accessibility, **restart your terminal app** for the permission to take effect.
|
|
65
|
+
|
|
66
|
+
## Learn Project Vocabulary (Optional)
|
|
67
|
+
|
|
68
|
+
Teach Voice Coding the vocabulary of any project you work on:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
cd /path/to/your/project
|
|
72
|
+
voice learn
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
This scans the repo (README, package.json, etc.) and merges its vocabulary into your global memory at `~/.voice-coding/memory.md`. Run it in each repo you work on — terms accumulate across projects.
|
|
76
|
+
|
|
77
|
+
The memory file includes:
|
|
78
|
+
|
|
79
|
+
- **Vocabulary** — project-specific terms with disambiguation hints (e.g., "Claude Code" not "clock code")
|
|
80
|
+
- **Context** — brief descriptions of your projects and tech stacks
|
|
81
|
+
- **Notes** — space for personal customizations (accent, language mixing, corrections you've noticed)
|
|
82
|
+
|
|
83
|
+
Edit `~/.voice-coding/memory.md` anytime to add or fix terms.
|
|
84
|
+
|
|
85
|
+
## Usage
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
voice
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
**Hold Alt (⌥)** to start recording. **Release Alt** to stop, transcribe, and auto-paste into whichever app is focused.
|
|
92
|
+
|
|
93
|
+
Press **Ctrl+C** to quit.
|
|
94
|
+
|
|
95
|
+
### Tips
|
|
96
|
+
|
|
97
|
+
- Speak naturally — filler words (um, uh, like, you know) are automatically removed
|
|
98
|
+
- Minor grammar is corrected while preserving your original wording
|
|
99
|
+
- Recordings shorter than 0.5 seconds are ignored to prevent accidental triggers
|
|
100
|
+
|
|
101
|
+
## Coding Transforms
|
|
102
|
+
|
|
103
|
+
Voice Coding post-processes transcriptions with coding-aware rules:
|
|
104
|
+
|
|
105
|
+
| You say | You get |
|
|
106
|
+
|---------|---------|
|
|
107
|
+
| "dot env" | `.env` |
|
|
108
|
+
| "slash api" | `/api` |
|
|
109
|
+
| "camel case foo bar" | `fooBar` |
|
|
110
|
+
| "snake case my variable" | `my_variable` |
|
|
111
|
+
| "open paren" | `(` |
|
|
112
|
+
| "arrow" | `=>` |
|
|
113
|
+
| "triple equals" | `===` |
|
|
114
|
+
| "new line" | newline character |
|
|
115
|
+
|
|
116
|
+
## How It Works
|
|
117
|
+
|
|
118
|
+
1. A macOS `CGEventTap` listens for the Alt key globally (works in any app, including VS Code)
|
|
119
|
+
2. `sounddevice` captures mic audio at 16kHz mono while the hotkey is held
|
|
120
|
+
3. Audio is sent to Gemini 3.0 Flash for transcription, with vocabulary from `~/.voice-coding/memory.md` if present
|
|
121
|
+
4. Post-processor applies coding-aware text transforms
|
|
122
|
+
5. Result is copied to clipboard via `pbcopy` and pasted via `osascript` Cmd+V simulation
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
speakandcode.egg-info/PKG-INFO
|
|
5
|
+
speakandcode.egg-info/SOURCES.txt
|
|
6
|
+
speakandcode.egg-info/dependency_links.txt
|
|
7
|
+
speakandcode.egg-info/entry_points.txt
|
|
8
|
+
speakandcode.egg-info/requires.txt
|
|
9
|
+
speakandcode.egg-info/top_level.txt
|
|
10
|
+
voice_coding/__init__.py
|
|
11
|
+
voice_coding/clipboard.py
|
|
12
|
+
voice_coding/learn_cmd.py
|
|
13
|
+
voice_coding/main.py
|
|
14
|
+
voice_coding/memory.py
|
|
15
|
+
voice_coding/postprocessor.py
|
|
16
|
+
voice_coding/recorder.py
|
|
17
|
+
voice_coding/transcriber.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
voice_coding
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Voice Coding — Private voice-to-text for developers."""
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Copy text to clipboard and auto-paste into the focused app."""
|
|
2
|
+
|
|
3
|
+
import subprocess
|
|
4
|
+
import time
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def copy_and_paste(text: str):
    """Put ``text`` on the macOS clipboard and paste it into the focused app.

    The text is encoded and piped to ``pbcopy`` on stdin; after a 50 ms pause
    ``osascript`` asks System Events to send the Cmd+V keystroke.

    Raises:
        subprocess.CalledProcessError: if ``pbcopy`` or ``osascript`` exits
            with a non-zero status (both are run with ``check=True``).
    """
    payload = text.encode()
    paste_script = 'tell application "System Events" to keystroke "v" using command down'

    subprocess.run(["pbcopy"], input=payload, check=True)

    # Give the clipboard a moment to settle before triggering the paste.
    time.sleep(0.05)

    subprocess.run(["osascript", "-e", paste_script], check=True)
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""voice learn — scan a repo and merge its vocabulary into global memory."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from dotenv import load_dotenv
|
|
8
|
+
from google import genai
|
|
9
|
+
from google.genai import types
|
|
10
|
+
|
|
11
|
+
# Model identifier passed as `model=` to generate_content() in run_learn().
LEARN_MODEL = "gemini-3-flash-preview"
|
|
12
|
+
|
|
13
|
+
GLOBAL_MEMORY_DIR = Path.home() / ".voice-coding"
|
|
14
|
+
GLOBAL_MEMORY_FILE = GLOBAL_MEMORY_DIR / "memory.md"
|
|
15
|
+
|
|
16
|
+
LEARN_PROMPT = """\
|
|
17
|
+
You are helping a developer add project-specific vocabulary to their voice-to-text memory file.
|
|
18
|
+
|
|
19
|
+
Given:
|
|
20
|
+
1. The developer's EXISTING memory file (may be empty if this is their first time)
|
|
21
|
+
2. A NEW project's context (README, package files, etc.)
|
|
22
|
+
|
|
23
|
+
Generate an UPDATED memory file that MERGES the new project's vocabulary into the existing one.
|
|
24
|
+
|
|
25
|
+
Rules:
|
|
26
|
+
- Keep ALL existing vocabulary entries — do not remove anything
|
|
27
|
+
- Add new terms from the project that are not already covered
|
|
28
|
+
- If a term already exists, keep the existing entry (don't duplicate)
|
|
29
|
+
- Keep all existing Context and Notes sections, and append new context about this project
|
|
30
|
+
- Focus on terms that a speech-to-text model might get wrong
|
|
31
|
+
- For each new term, add a disambiguation hint explaining what it is and what it should NOT be confused with
|
|
32
|
+
- Example entry: | Claude Code | AI coding assistant CLI, not "clock code" or "cloud code" |
|
|
33
|
+
|
|
34
|
+
The file should have these sections:
|
|
35
|
+
|
|
36
|
+
## Vocabulary
|
|
37
|
+
|
|
38
|
+
A markdown table with columns: Term | Hint
|
|
39
|
+
(merged from existing + new project)
|
|
40
|
+
|
|
41
|
+
## Context
|
|
42
|
+
|
|
43
|
+
Brief descriptions of all projects the developer works on.
|
|
44
|
+
|
|
45
|
+
## Notes
|
|
46
|
+
|
|
47
|
+
Keep any existing notes. If this is the first time, add HTML comment examples:
|
|
48
|
+
<!-- Add your own notes here -->
|
|
49
|
+
<!-- Examples: -->
|
|
50
|
+
<!-- - I have a [nationality] accent -->
|
|
51
|
+
<!-- - When I say "X", I usually mean Y -->
|
|
52
|
+
<!-- - I sometimes mix [language] and English -->
|
|
53
|
+
|
|
54
|
+
Output ONLY the markdown content. Start with "# Voice Coding Memory" as the first line.
|
|
55
|
+
Do NOT wrap in code fences.
|
|
56
|
+
"""
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _gather_repo_context(repo_dir: Path) -> str:
|
|
60
|
+
"""Read key files from the repo to send as context."""
|
|
61
|
+
context_parts = []
|
|
62
|
+
|
|
63
|
+
for filename in [
|
|
64
|
+
"README.md",
|
|
65
|
+
"README",
|
|
66
|
+
"package.json",
|
|
67
|
+
"pyproject.toml",
|
|
68
|
+
"Cargo.toml",
|
|
69
|
+
"go.mod",
|
|
70
|
+
"Gemfile",
|
|
71
|
+
"CLAUDE.md",
|
|
72
|
+
"requirements.txt",
|
|
73
|
+
]:
|
|
74
|
+
filepath = repo_dir / filename
|
|
75
|
+
if filepath.is_file():
|
|
76
|
+
content = filepath.read_text(encoding="utf-8", errors="ignore")[:4000]
|
|
77
|
+
context_parts.append(f"=== {filename} ===\n{content}")
|
|
78
|
+
|
|
79
|
+
try:
|
|
80
|
+
entries = sorted(p.name for p in repo_dir.iterdir() if not p.name.startswith("."))
|
|
81
|
+
context_parts.append(f"=== Directory listing ===\n{chr(10).join(entries)}")
|
|
82
|
+
except OSError:
|
|
83
|
+
pass
|
|
84
|
+
|
|
85
|
+
return "\n\n".join(context_parts)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def run_learn():
    """Scan the current working directory and merge its vocabulary into global memory.

    Steps:
      1. Resolve ``GEMINI_API_KEY``: call ``load_dotenv()``, then fall back to
         ``GLOBAL_MEMORY_DIR/.env`` if the variable is still unset.
      2. Collect project context via ``_gather_repo_context(Path.cwd())``.
      3. Send ``LEARN_PROMPT``, the existing memory file (if any) and the
         project context to ``LEARN_MODEL``.
      4. Write the model's reply to ``GLOBAL_MEMORY_FILE``.

    Exits with status 1 when no API key is available, or when the model
    returns no text (in which case the existing memory file is left as-is).
    """
    load_dotenv()
    global_env = GLOBAL_MEMORY_DIR / ".env"
    if not os.environ.get("GEMINI_API_KEY") and global_env.is_file():
        load_dotenv(global_env)

    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        print("Error: GEMINI_API_KEY not set.")
        print(" Option 1: export GEMINI_API_KEY=your_key")
        print(f" Option 2: echo 'GEMINI_API_KEY=your_key' > {global_env}")
        sys.exit(1)

    repo_dir = Path.cwd()

    print(f"Scanning {repo_dir.name}/ ...")
    repo_context = _gather_repo_context(repo_dir)

    if not repo_context.strip():
        print("Warning: No project files found. Generating generic vocabulary.")

    # Load existing global memory so the model can merge into it.
    existing_memory = ""
    if GLOBAL_MEMORY_FILE.is_file():
        existing_memory = GLOBAL_MEMORY_FILE.read_text(encoding="utf-8")
        print(f"Found existing memory at {GLOBAL_MEMORY_FILE}")

    print("Generating vocabulary with Gemini...")
    client = genai.Client(api_key=api_key)

    response = client.models.generate_content(
        model=LEARN_MODEL,
        contents=[
            LEARN_PROMPT,
            f"=== Existing memory file ===\n\n{existing_memory}" if existing_memory else "=== Existing memory file ===\n\n(empty — first time setup)",
            f"=== New project context ===\n\n{repo_context}",
        ],
        config=types.GenerateContentConfig(
            max_output_tokens=4096,
        ),
    )

    # The reply may carry no text (e.g. a blocked or empty candidate). Never
    # overwrite the accumulated memory file with nothing — bail out instead.
    memory_content = (response.text or "").strip()
    if not memory_content:
        print("Error: Gemini returned no text; existing memory left unchanged.")
        sys.exit(1)

    GLOBAL_MEMORY_DIR.mkdir(exist_ok=True)
    GLOBAL_MEMORY_FILE.write_text(memory_content + "\n", encoding="utf-8")

    print(f"Updated {GLOBAL_MEMORY_FILE}")
    print("Edit this file anytime to add or fix terms.")
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""Entry point — CLI router and global hotkey listener."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
import threading
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import Quartz
|
|
9
|
+
from dotenv import load_dotenv
|
|
10
|
+
|
|
11
|
+
from voice_coding.clipboard import copy_and_paste
|
|
12
|
+
from voice_coding.postprocessor import postprocess
|
|
13
|
+
from voice_coding.recorder import Recorder, SAMPLE_RATE
|
|
14
|
+
from voice_coding.transcriber import transcribe
|
|
15
|
+
|
|
16
|
+
# Fallback dotenv file; _load_env() reads it only when GEMINI_API_KEY is still unset.
GLOBAL_ENV = Path.home() / ".voice-coding" / ".env"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _load_env():
    """Populate the environment from dotenv files.

    Calls ``load_dotenv()`` with no arguments first; if ``GEMINI_API_KEY`` is
    still unset afterwards and ``GLOBAL_ENV`` exists, loads that file too.
    """
    # NOTE(review): the original comment says this picks up ".env from cwd";
    # confirm that matches python-dotenv's default search location.
    load_dotenv()

    key_present = bool(os.environ.get("GEMINI_API_KEY"))
    if key_present or not GLOBAL_ENV.is_file():
        return

    load_dotenv(GLOBAL_ENV)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _run_listener():
    """Run the hold-to-record listener until interrupted.

    Loads the API key (exiting with status 1 if it is missing), installs a
    Quartz event tap that watches modifier-flag changes, and starts a
    recording when Alt goes down. When Alt is released, a daemon thread stops
    the recorder, transcribes the audio, post-processes the text and pastes
    it. Blocks in ``CFRunLoopRun()`` until ``KeyboardInterrupt``.
    """
    _load_env()

    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        print("Error: GEMINI_API_KEY not set.")
        print(" Option 1: export GEMINI_API_KEY=your_key")
        print(f" Option 2: echo 'GEMINI_API_KEY=your_key' > {GLOBAL_ENV}")
        sys.exit(1)

    recorder = Recorder()
    recording = False   # a capture is in progress
    alt_held = False    # last observed state of the Alt modifier
    processing = False  # a worker thread is transcribing/pasting

    def _finish_recording():
        """Worker: stop the recorder, transcribe, post-process and paste."""
        nonlocal recording, processing
        if processing:
            return
        recording = False
        processing = True
        print("\r⏳ Transcribing...", end="", flush=True)

        # Everything after `processing = True` lives inside try/finally so the
        # flag is always cleared — even if recorder.stop() itself raises.
        # Otherwise the hotkey would be ignored for the rest of the session.
        try:
            wav_bytes = recorder.stop()
            if not wav_bytes:
                print("\r⏳ Too short, skipped. ")
                return

            # NOTE(review): the divisor assumes 4 bytes per sample — confirm
            # against the encoding the recorder actually produces.
            duration_secs = len(wav_bytes) / (SAMPLE_RATE * 4)
            print(f"\r⏳ Transcribing {duration_secs:.1f}s of audio ({len(wav_bytes) / 1024:.0f} KB)...", end="", flush=True)

            raw_text = transcribe(wav_bytes, api_key)
            print(f"\n📝 Raw transcription ({len(raw_text)} chars):")
            print(f"--- START ---\n{raw_text}\n--- END ---")

            text = postprocess(raw_text)
            if text != raw_text:
                print(f"📝 After postprocess ({len(text)} chars):")
                print(f"--- START ---\n{text}\n--- END ---")

            copy_and_paste(text)
            print(f"✅ Pasted ({len(text)} chars)")
        except Exception as e:
            # Top-level boundary of the worker thread: report and keep listening.
            print(f"\r❌ Error: {e} ")
        finally:
            processing = False

    def cg_event_callback(proxy, event_type, event, refcon):
        """Event-tap callback: start/stop recording on Alt press/release."""
        nonlocal recording, alt_held

        # macOS disables a tap whose callback is too slow (or on certain user
        # input) and notifies the callback with these pseudo event types.
        # Re-enable it, otherwise the hotkey silently stops working.
        if event_type in (
            Quartz.kCGEventTapDisabledByTimeout,
            Quartz.kCGEventTapDisabledByUserInput,
        ):
            Quartz.CGEventTapEnable(tap, True)
            return event

        if event_type == Quartz.kCGEventFlagsChanged:
            flags = Quartz.CGEventGetFlags(event)
            alt_now = bool(flags & Quartz.kCGEventFlagMaskAlternate)
            if alt_now and not alt_held:
                alt_held = True
                if not recording and not processing:
                    recording = True
                    recorder.start()
                    print("\r🎙 Recording... (release Alt to stop)", end="", flush=True)
            elif not alt_now and alt_held:
                alt_held = False
                if recording:
                    # Transcription is slow; keep it off the event-tap thread.
                    threading.Thread(target=_finish_recording, daemon=True).start()

        # Always pass the event through unmodified.
        return event

    event_mask = (1 << Quartz.kCGEventFlagsChanged)

    tap = Quartz.CGEventTapCreate(
        Quartz.kCGSessionEventTap,
        Quartz.kCGHeadInsertEventTap,
        Quartz.kCGEventTapOptionDefault,
        event_mask,
        cg_event_callback,
        None,
    )

    if tap is None:
        print("Error: Failed to create event tap. Grant Accessibility permission to your terminal app.")
        print(" System Settings → Privacy & Security → Accessibility")
        sys.exit(1)

    run_loop_source = Quartz.CFMachPortCreateRunLoopSource(None, tap, 0)
    Quartz.CFRunLoopAddSource(
        Quartz.CFRunLoopGetCurrent(),
        run_loop_source,
        Quartz.kCFRunLoopCommonModes,
    )
    Quartz.CGEventTapEnable(tap, True)

    print("Voice Coding running. Hold Alt to record. Release Alt to stop. Ctrl+C to quit.")

    try:
        Quartz.CFRunLoopRun()
    except KeyboardInterrupt:
        print("\nStopped.")
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def main():
    """CLI entry point.

    ``voice learn`` dispatches to ``run_learn()``; any other invocation
    (including no arguments) starts the hotkey listener.
    """
    wants_learn = sys.argv[1:2] == ["learn"]
    if not wants_learn:
        _run_listener()
        return

    # Imported lazily so it is only loaded for the `learn` subcommand.
    from voice_coding.learn_cmd import run_learn

    run_learn()
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
if __name__ == "__main__":
|
|
137
|
+
main()
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Load global voice coding memory from ~/.voice-coding/memory.md."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
# Location of the global memory file that load_memory() reads.
GLOBAL_MEMORY_FILE = Path.home() / ".voice-coding" / "memory.md"
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def load_memory() -> str:
    """Return the UTF-8 text of ``GLOBAL_MEMORY_FILE``.

    Returns:
        The file's content, or ``""`` when the path is not an existing file.
    """
    if not GLOBAL_MEMORY_FILE.is_file():
        return ""
    return GLOBAL_MEMORY_FILE.read_text(encoding="utf-8")
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Coding-aware text transforms applied after transcription."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
# Spoken phrase -> replacement text, consumed by _apply_coding_rules().
# Matching there is case-insensitive and longest-phrase-first, so the
# multi-word entries below take precedence over their single-word prefixes.
CODING_RULES: dict[str, str] = {
    # File names and operators
    "dot env": ".env",
    "double equals": "==",
    "triple equals": "===",
    "not equals": "!=",
    "arrow": "=>",
    # Single punctuation characters
    "slash": "/",
    "dot": ".",
    "dash": "-",
    "underscore": "_",
    "equals": "=",
    "hash": "#",
    "at sign": "@",
    "ampersand": "&",
    "pipe": "|",
    "tilde": "~",
    "backtick": "`",
    # Paired delimiters
    "open paren": "(",
    "close paren": ")",
    "open bracket": "[",
    "close bracket": "]",
    "open brace": "{",
    "close brace": "}",
    # Whitespace control characters
    "new line": "\n",
    "tab": "\t",
}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _apply_coding_rules(text: str) -> str:
|
|
34
|
+
"""Replace spoken coding phrases with their symbol equivalents."""
|
|
35
|
+
# Sort by length descending so multi-word rules match first
|
|
36
|
+
for phrase, replacement in sorted(
|
|
37
|
+
CODING_RULES.items(), key=lambda x: len(x[0]), reverse=True
|
|
38
|
+
):
|
|
39
|
+
# Use word boundaries to avoid matching inside words (e.g. "dash" in "Dashboard")
|
|
40
|
+
pattern = r"\b" + re.escape(phrase) + r"\b"
|
|
41
|
+
text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
|
|
42
|
+
return text
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _apply_camel_case(text: str) -> str:
|
|
46
|
+
"""Convert 'camel case foo bar baz' → 'fooBarBaz'."""
|
|
47
|
+
|
|
48
|
+
def _to_camel(m: re.Match) -> str:
|
|
49
|
+
words = m.group(1).strip().split()
|
|
50
|
+
if not words:
|
|
51
|
+
return ""
|
|
52
|
+
return words[0].lower() + "".join(w.capitalize() for w in words[1:])
|
|
53
|
+
|
|
54
|
+
return re.sub(r"camel case\s+((?:\w+\s*){2,})", _to_camel, text, flags=re.IGNORECASE)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _apply_snake_case(text: str) -> str:
|
|
58
|
+
"""Convert 'snake case foo bar baz' → 'foo_bar_baz'."""
|
|
59
|
+
|
|
60
|
+
def _to_snake(m: re.Match) -> str:
|
|
61
|
+
words = m.group(1).strip().split()
|
|
62
|
+
return "_".join(w.lower() for w in words)
|
|
63
|
+
|
|
64
|
+
return re.sub(r"snake case\s+((?:\w+\s*){2,})", _to_snake, text, flags=re.IGNORECASE)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def postprocess(text: str) -> str:
    """Run transcribed text through every coding-aware transform, in order."""
    # Case conversions run before the symbol rules, so the words following
    # "camel case" / "snake case" are consumed before any phrase replacement.
    pipeline = (_apply_camel_case, _apply_snake_case, _apply_coding_rules)
    for transform in pipeline:
        text = transform(text)
    return text
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Microphone capture using sounddevice."""
|
|
2
|
+
|
|
3
|
+
import io
|
|
4
|
+
import threading
|
|
5
|
+
import time
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import sounddevice as sd
|
|
9
|
+
import soundfile as sf
|
|
10
|
+
|
|
11
|
+
SAMPLE_RATE = 16000  # sample rate used for both the input stream and the written WAV
CHANNELS = 1  # channel count requested from the input stream
DTYPE = "float32"  # sample dtype requested from the input stream
MIN_DURATION_SECS = 0.5  # Ignore recordings shorter than this
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Recorder:
    """Capture microphone audio between ``start()`` and ``stop()``.

    Chunks delivered to the input-stream callback are accumulated in memory;
    ``stop()`` concatenates them and encodes the result as WAV bytes.
    """

    def __init__(self):
        self._frames: list[np.ndarray] = []  # chunks copied by _callback
        self._stream: sd.InputStream | None = None  # open stream, None when idle
        self._lock = threading.Lock()  # serialises start() and stop()
        self._start_time: float = 0  # time.monotonic() at the last start()

    def start(self):
        """Open mic stream and start accumulating audio frames.

        A stream still open from an earlier ``start()`` is stopped and closed
        first so it is not leaked. If the new stream fails to start it is
        closed and the error is re-raised, leaving the recorder idle.
        """
        with self._lock:
            self._close_stream()
            self._frames = []
            self._start_time = time.monotonic()
            stream = sd.InputStream(
                samplerate=SAMPLE_RATE,
                channels=CHANNELS,
                dtype=DTYPE,
                callback=self._callback,
            )
            try:
                stream.start()
            except Exception:
                # Do not keep a stream that never started.
                stream.close()
                raise
            self._stream = stream

    def stop(self) -> bytes:
        """Stop recording and return WAV bytes. Returns empty if too short.

        Returns:
            The recording encoded as 32-bit float WAV, or ``b""`` when no
            chunks were captured or the recording lasted less than
            ``MIN_DURATION_SECS``.
        """
        with self._lock:
            duration = time.monotonic() - self._start_time
            self._close_stream()

            # Take ownership of the captured chunks and reset for the next run.
            frames, self._frames = self._frames, []
            if not frames or duration < MIN_DURATION_SECS:
                return b""

            audio = np.concatenate(frames, axis=0)
            num_frames = len(frames)

            buf = io.BytesIO()
            sf.write(buf, audio, SAMPLE_RATE, format="WAV", subtype="FLOAT")
            wav_bytes = buf.getvalue()
            print(f"🎙 Recorded {duration:.1f}s ({num_frames} chunks, {len(audio)} samples, {len(wav_bytes) / 1024:.0f} KB WAV)")
            return wav_bytes

    def _close_stream(self):
        """Stop and close the current stream, if any; caller holds the lock."""
        stream, self._stream = self._stream, None
        if stream is None:
            return
        try:
            stream.stop()
        finally:
            # Close even when stop() raises so the device is always released.
            stream.close()

    def _callback(self, indata: np.ndarray, frames: int, time_info, status):
        """Input-stream callback: keep a copy of each delivered chunk."""
        # `status` (e.g. overflow warnings) is deliberately ignored.
        # A copy is stored rather than the array passed in — presumably the
        # audio library reuses that buffer after the callback returns.
        self._frames.append(indata.copy())
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Gemini Flash transcription."""
|
|
2
|
+
|
|
3
|
+
from google import genai
|
|
4
|
+
from google.genai import types
|
|
5
|
+
|
|
6
|
+
from voice_coding.memory import load_memory
|
|
7
|
+
|
|
8
|
+
# Model name passed to generate_content() in transcribe().
MODEL = "gemini-3-flash-preview"

# Instruction text sent ahead of the audio part. _build_prompt() returns it
# as-is or with the memory file content appended.
BASE_PROMPT = (
    "Transcribe this audio exactly as spoken. Output ONLY the transcription text. "
    "No preamble, no labels, no commentary. Start directly with the spoken content. "
    "Rules:\n"
    "- Remove filler words: um, uh, like, you know, hmm, so, basically, right\n"
    "- Fix minor grammar issues (subject-verb agreement, missing articles) but preserve the speaker's original wording and meaning\n"
    "- Do NOT rephrase, summarize, or add content that was not spoken\n"
    "- Do NOT hallucinate or generate text if the audio is silent or unclear — output nothing instead\n"
    "- Preserve technical terms, code references, and proper nouns exactly as spoken\n"
    "- This is a software developer dictating into a text field\n"
    "- Default developer vocabulary (prefer these spellings when the audio matches):\n"
    " - 'VS Code' (editor, not 'V.S. code')\n"
    " - 'GitHub' (not 'get hub')\n"
    " - 'npm' (package manager)\n"
    " - 'API' (not 'A.P.I.')\n"
    " - 'CLI' (command line interface)\n"
    " - 'JSON' (not 'Jason')\n"
    " - 'regex' (regular expression)\n"
    " - 'localhost' (not 'local host')"
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _build_prompt() -> str:
    """Build the transcription prompt, appending memory.md content if any.

    Returns:
        ``BASE_PROMPT`` alone when ``load_memory()`` yields nothing but
        whitespace; otherwise ``BASE_PROMPT`` followed by a separator header
        and the memory text, unmodified.
    """
    memory = load_memory()
    # A blank or whitespace-only memory file carries no context, so skip the
    # header instead of sending an empty context section to the model.
    if not memory.strip():
        return BASE_PROMPT
    return BASE_PROMPT + "\n\n--- Project-specific context (from .voice-coding/memory.md) ---\n\n" + memory
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def transcribe(wav_bytes: bytes, api_key: str) -> str:
    """Send WAV audio to Gemini Flash and return the transcription text.

    Args:
        wav_bytes: WAV-encoded audio to transcribe.
        api_key: API key used to construct the ``genai.Client``.

    Returns:
        The transcription with surrounding whitespace stripped, or ``""``
        when ``wav_bytes`` is empty or the response contains no text.
    """
    if not wav_bytes:
        # Nothing to transcribe; skip the API round-trip.
        return ""

    client = genai.Client(api_key=api_key)
    prompt = _build_prompt()

    response = client.models.generate_content(
        model=MODEL,
        contents=[
            prompt,
            types.Part.from_bytes(data=wav_bytes, mime_type="audio/wav"),
        ],
        config=types.GenerateContentConfig(
            max_output_tokens=8192,
        ),
    )

    # Log finish reason and token usage for debugging
    if response.candidates:
        candidate = response.candidates[0]
        print(f"🔍 Finish reason: {candidate.finish_reason}")
    if response.usage_metadata:
        meta = response.usage_metadata
        print(f"🔍 Tokens — prompt: {meta.prompt_token_count}, output: {meta.candidates_token_count}")

    # response.text is None when the response has no text parts (for example
    # silent audio, which the prompt tells the model to answer with nothing).
    return (response.text or "").strip()
|