speech-opencode 1.0.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +25 -21
- package/dist/index.d.ts +3 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +40 -117
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
Voice input plugin for [OpenCode](https://opencode.ai) using OpenAI Whisper.
|
|
4
4
|
|
|
5
|
-
Record audio from your microphone and transcribe it to text using OpenAI's Whisper API.
|
|
5
|
+
Record audio from your microphone and transcribe it to text using OpenAI's Whisper API. **Recording automatically stops when you stop talking** - no need to specify a duration!
|
|
6
6
|
|
|
7
7
|
## Installation
|
|
8
8
|
|
|
@@ -26,33 +26,32 @@ export OPENAI_API_KEY=your-api-key
|
|
|
26
26
|
|
|
27
27
|
### Audio Recording Tools
|
|
28
28
|
|
|
29
|
-
**
|
|
29
|
+
**sox** is required for audio recording with silence detection:
|
|
30
|
+
|
|
30
31
|
```bash
|
|
32
|
+
# macOS
|
|
33
|
+
brew install sox
|
|
34
|
+
|
|
31
35
|
# Ubuntu/Debian
|
|
32
|
-
sudo apt install
|
|
36
|
+
sudo apt install sox
|
|
33
37
|
|
|
34
38
|
# Fedora
|
|
35
|
-
sudo dnf install
|
|
39
|
+
sudo dnf install sox
|
|
36
40
|
|
|
37
41
|
# Arch
|
|
38
|
-
sudo pacman -S
|
|
39
|
-
```
|
|
40
|
-
|
|
41
|
-
**macOS:**
|
|
42
|
-
```bash
|
|
43
|
-
brew install sox
|
|
42
|
+
sudo pacman -S sox
|
|
44
43
|
```
|
|
45
44
|
|
|
46
45
|
## Usage
|
|
47
46
|
|
|
48
|
-
Once installed, OpenCode will have access to a `voice` tool.
|
|
47
|
+
Once installed, OpenCode will have access to a `voice` tool. Just ask OpenCode:
|
|
49
48
|
|
|
50
49
|
- "Listen to my voice"
|
|
51
50
|
- "Record what I say"
|
|
52
51
|
- "Use voice input"
|
|
53
|
-
- "
|
|
52
|
+
- "voice"
|
|
54
53
|
|
|
55
|
-
|
|
54
|
+
**Recording automatically stops after 7 seconds of silence** (the default; configurable via `silenceDuration`), so just speak naturally and pause when you're done.
|
|
56
55
|
|
|
57
56
|
## Configuration
|
|
58
57
|
|
|
@@ -66,11 +65,11 @@ export default VoicePlugin({
|
|
|
66
65
|
// Optional: specify language (auto-detects if not set)
|
|
67
66
|
language: "en",
|
|
68
67
|
|
|
69
|
-
// Optional:
|
|
70
|
-
|
|
68
|
+
// Optional: seconds of silence before stopping (default 7)
|
|
69
|
+
silenceDuration: 7,
|
|
71
70
|
|
|
72
|
-
// Optional: maximum recording
|
|
73
|
-
maxDuration:
|
|
71
|
+
// Optional: maximum recording time as safety timeout (default 300 = 5 min)
|
|
72
|
+
maxDuration: 300,
|
|
74
73
|
|
|
75
74
|
// Optional: override API key (defaults to OPENAI_API_KEY env var)
|
|
76
75
|
apiKey: process.env.MY_OPENAI_KEY,
|
|
@@ -92,9 +91,10 @@ Leave `language` unset for automatic detection.
|
|
|
92
91
|
|
|
93
92
|
## How It Works
|
|
94
93
|
|
|
95
|
-
1.
|
|
96
|
-
2.
|
|
97
|
-
3.
|
|
94
|
+
1. Starts recording from your microphone when you begin speaking
|
|
95
|
+
2. Automatically stops after detecting 7 seconds of silence (the default; configurable via `silenceDuration`)
|
|
96
|
+
3. Sends the audio to OpenAI's Whisper API for transcription
|
|
97
|
+
4. Returns the transcribed text to OpenCode
|
|
98
98
|
|
|
99
99
|
## Troubleshooting
|
|
100
100
|
|
|
@@ -103,8 +103,12 @@ Leave `language` unset for automatic detection.
|
|
|
103
103
|
- Verify the correct input device is selected in your system settings
|
|
104
104
|
- On Linux, use `pavucontrol` to check input sources
|
|
105
105
|
|
|
106
|
+
### Recording doesn't stop
|
|
107
|
+
- Make sure you pause speaking for at least 7 seconds (or for your configured `silenceDuration`)
|
|
108
|
+
- Check that background noise isn't being detected as speech
|
|
109
|
+
|
|
106
110
|
### Recording fails
|
|
107
|
-
- Ensure
|
|
111
|
+
- Ensure sox is installed: `which rec`
|
|
108
112
|
- Check that your microphone permissions are granted
|
|
109
113
|
|
|
110
114
|
## License
|
package/dist/index.d.ts
CHANGED
|
@@ -4,9 +4,9 @@ export interface VoicePluginOptions {
|
|
|
4
4
|
apiKey?: string;
|
|
5
5
|
/** Language code for transcription (e.g., "en", "es", "fr"). Auto-detects if not specified */
|
|
6
6
|
language?: string;
|
|
7
|
-
/**
|
|
8
|
-
|
|
9
|
-
/** Maximum
|
|
7
|
+
/** Seconds of silence before stopping recording (default 7) */
|
|
8
|
+
silenceDuration?: number;
|
|
9
|
+
/** Maximum recording duration in seconds as a safety timeout (default 300 = 5 minutes) */
|
|
10
10
|
maxDuration?: number;
|
|
11
11
|
}
|
|
12
12
|
/**
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,MAAM,EAAQ,MAAM,qBAAqB,CAAA;
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,MAAM,EAAQ,MAAM,qBAAqB,CAAA;AAwGvD,MAAM,WAAW,kBAAkB;IACjC,yDAAyD;IACzD,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,8FAA8F;IAC9F,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,+DAA+D;IAC/D,eAAe,CAAC,EAAE,MAAM,CAAA;IACxB,0FAA0F;IAC1F,WAAW,CAAC,EAAE,MAAM,CAAA;CACrB;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,eAAO,MAAM,WAAW,GACrB,UAAS,kBAAuB,KAAG,MA4DnC,CAAA;;AAGH,wBAA4B"}
|
package/dist/index.js
CHANGED
|
@@ -5,127 +5,56 @@ import { unlinkSync, readFileSync } from "fs";
|
|
|
5
5
|
import { tmpdir } from "os";
|
|
6
6
|
import { join } from "path";
|
|
7
7
|
/**
|
|
8
|
-
*
|
|
9
|
-
*
|
|
8
|
+
* Records audio from the microphone with automatic silence detection.
|
|
9
|
+
* Recording stops after the specified silence duration.
|
|
10
|
+
* Uses sox on both Linux and macOS for silence detection.
|
|
11
|
+
*
|
|
12
|
+
* @param maxDurationSeconds - Maximum recording time (safety timeout)
|
|
13
|
+
* @param silenceDuration - Seconds of silence before stopping (default 7)
|
|
10
14
|
*/
|
|
11
|
-
async function
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
pactl.stdout.on("data", (data) => {
|
|
16
|
-
output += data.toString();
|
|
17
|
-
});
|
|
18
|
-
pactl.on("close", () => {
|
|
19
|
-
const lines = output.trim().split("\n");
|
|
20
|
-
for (const line of lines) {
|
|
21
|
-
const parts = line.split("\t");
|
|
22
|
-
if (parts.length >= 2) {
|
|
23
|
-
const name = parts[1];
|
|
24
|
-
// Skip monitor sources and bluetooth (prefer hardware input)
|
|
25
|
-
if (!name.includes(".monitor") && !name.includes("bluez")) {
|
|
26
|
-
resolve(name);
|
|
27
|
-
return;
|
|
28
|
-
}
|
|
29
|
-
}
|
|
30
|
-
}
|
|
31
|
-
resolve(null);
|
|
32
|
-
});
|
|
33
|
-
pactl.on("error", () => resolve(null));
|
|
34
|
-
});
|
|
15
|
+
async function recordAudio(maxDurationSeconds = 300, silenceDuration = 7) {
|
|
16
|
+
const tempFile = join(tmpdir(), `opencode-voice-${Date.now()}.wav`);
|
|
17
|
+
// Use sox with silence detection on all platforms
|
|
18
|
+
return recordWithSilenceDetection(tempFile, maxDurationSeconds, silenceDuration);
|
|
35
19
|
}
|
|
36
20
|
/**
|
|
37
|
-
* Records audio
|
|
38
|
-
*
|
|
39
|
-
*
|
|
21
|
+
* Records audio using sox with silence detection.
|
|
22
|
+
* Recording automatically stops after detecting silence.
|
|
23
|
+
*
|
|
24
|
+
* Sox silence syntax: silence [above_periods] [duration] [threshold] [below_periods] [duration] [threshold]
|
|
25
|
+
* - above_periods 1: need 1 period of audio above threshold to start
|
|
26
|
+
* - 0.1 3%: audio must be above 3% for 0.1s to count as speech start
|
|
27
|
+
* - below_periods 1: need 1 period below threshold to stop
|
|
28
|
+
* - silenceDuration 3%: stop after silenceDuration seconds below 3%
|
|
40
29
|
*/
|
|
41
|
-
async function
|
|
42
|
-
const tempFile = join(tmpdir(), `opencode-voice-${Date.now()}.wav`);
|
|
43
|
-
const platform = process.platform;
|
|
44
|
-
if (platform === "darwin") {
|
|
45
|
-
// macOS: use sox
|
|
46
|
-
return recordWithSox(tempFile, durationSeconds);
|
|
47
|
-
}
|
|
48
|
-
else {
|
|
49
|
-
// Linux: use parecord or arecord
|
|
50
|
-
return recordWithPulseAudio(tempFile, durationSeconds);
|
|
51
|
-
}
|
|
52
|
-
}
|
|
53
|
-
async function recordWithSox(tempFile, durationSeconds) {
|
|
30
|
+
async function recordWithSilenceDetection(tempFile, maxDurationSeconds, silenceDuration) {
|
|
54
31
|
return new Promise((resolve, reject) => {
|
|
55
|
-
|
|
32
|
+
// Use timeout to enforce max duration, sox for silence detection
|
|
33
|
+
const recorder = spawn("timeout", [
|
|
34
|
+
maxDurationSeconds.toString(),
|
|
35
|
+
"rec",
|
|
56
36
|
"-q",
|
|
57
|
-
"-r",
|
|
58
|
-
"
|
|
59
|
-
"-
|
|
60
|
-
"1",
|
|
61
|
-
"-b",
|
|
62
|
-
"16",
|
|
37
|
+
"-r", "16000",
|
|
38
|
+
"-c", "1",
|
|
39
|
+
"-b", "16",
|
|
63
40
|
tempFile,
|
|
64
|
-
"
|
|
65
|
-
"0",
|
|
66
|
-
|
|
41
|
+
"silence",
|
|
42
|
+
"1", "0.1", "3%", // Start recording when speech detected (above 3% for 0.1s)
|
|
43
|
+
"1", `${silenceDuration}.0`, "3%", // Stop after silenceDuration seconds of silence (below 3%)
|
|
67
44
|
]);
|
|
68
45
|
let errorOutput = "";
|
|
69
46
|
recorder.stderr.on("data", (data) => {
|
|
70
47
|
errorOutput += data.toString();
|
|
71
48
|
});
|
|
72
49
|
recorder.on("error", () => {
|
|
73
|
-
reject(new Error("sox not found. Please install it:\n" +
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
}
|
|
79
|
-
else {
|
|
80
|
-
reject(new Error(`Recording failed: ${errorOutput}`));
|
|
81
|
-
}
|
|
82
|
-
});
|
|
83
|
-
});
|
|
84
|
-
}
|
|
85
|
-
async function recordWithPulseAudio(tempFile, durationSeconds) {
|
|
86
|
-
const inputDevice = await getDefaultInputDevice();
|
|
87
|
-
return new Promise((resolve, reject) => {
|
|
88
|
-
const args = [(durationSeconds + 1).toString(), "parecord"];
|
|
89
|
-
if (inputDevice) {
|
|
90
|
-
args.push(`--device=${inputDevice}`);
|
|
91
|
-
}
|
|
92
|
-
args.push("--file-format=wav", tempFile);
|
|
93
|
-
const recorder = spawn("timeout", args);
|
|
94
|
-
let errorOutput = "";
|
|
95
|
-
recorder.stderr.on("data", (data) => {
|
|
96
|
-
errorOutput += data.toString();
|
|
97
|
-
});
|
|
98
|
-
recorder.on("error", () => {
|
|
99
|
-
// Fallback to arecord
|
|
100
|
-
const arecord = spawn("arecord", [
|
|
101
|
-
"-q",
|
|
102
|
-
"-f",
|
|
103
|
-
"S16_LE",
|
|
104
|
-
"-r",
|
|
105
|
-
"16000",
|
|
106
|
-
"-c",
|
|
107
|
-
"1",
|
|
108
|
-
"-d",
|
|
109
|
-
durationSeconds.toString(),
|
|
110
|
-
tempFile,
|
|
111
|
-
]);
|
|
112
|
-
arecord.on("error", () => {
|
|
113
|
-
reject(new Error("No audio recorder found. Please install:\n" +
|
|
114
|
-
" - Ubuntu/Debian: sudo apt install pulseaudio-utils\n" +
|
|
115
|
-
" - Fedora: sudo dnf install pulseaudio-utils\n" +
|
|
116
|
-
" - Arch: sudo pacman -S pulseaudio-utils"));
|
|
117
|
-
});
|
|
118
|
-
arecord.on("close", (code) => {
|
|
119
|
-
if (code === 0) {
|
|
120
|
-
resolve(tempFile);
|
|
121
|
-
}
|
|
122
|
-
else {
|
|
123
|
-
reject(new Error(`arecord failed with code ${code}`));
|
|
124
|
-
}
|
|
125
|
-
});
|
|
50
|
+
reject(new Error("sox not found. Please install it:\n" +
|
|
51
|
+
" - macOS: brew install sox\n" +
|
|
52
|
+
" - Ubuntu/Debian: sudo apt install sox\n" +
|
|
53
|
+
" - Fedora: sudo dnf install sox\n" +
|
|
54
|
+
" - Arch: sudo pacman -S sox"));
|
|
126
55
|
});
|
|
127
56
|
recorder.on("close", (code) => {
|
|
128
|
-
//
|
|
57
|
+
// code 0 = normal exit, 124 = timeout killed it (max duration reached)
|
|
129
58
|
if (code === 0 || code === 124) {
|
|
130
59
|
resolve(tempFile);
|
|
131
60
|
}
|
|
@@ -171,7 +100,7 @@ async function transcribeAudio(audioFilePath, apiKey, language) {
|
|
|
171
100
|
* ```
|
|
172
101
|
*/
|
|
173
102
|
export const VoicePlugin = (options = {}) => async (ctx) => {
|
|
174
|
-
const { apiKey = process.env.OPENAI_API_KEY, language,
|
|
103
|
+
const { apiKey = process.env.OPENAI_API_KEY, language, silenceDuration = 7, maxDuration = 300, } = options;
|
|
175
104
|
if (!apiKey) {
|
|
176
105
|
console.warn("[Voice Plugin] Warning: OPENAI_API_KEY not set. Voice transcription will fail.");
|
|
177
106
|
}
|
|
@@ -180,21 +109,15 @@ export const VoicePlugin = (options = {}) => async (ctx) => {
|
|
|
180
109
|
voice: tool({
|
|
181
110
|
description: "Records audio from the user's microphone and transcribes it using OpenAI Whisper. " +
|
|
182
111
|
"Use this tool when the user wants to provide input via voice or speech. " +
|
|
183
|
-
|
|
184
|
-
args: {
|
|
185
|
-
|
|
186
|
-
.number()
|
|
187
|
-
.optional()
|
|
188
|
-
.describe(`Recording duration in seconds. Default is ${defaultDuration} seconds. Max is ${maxDuration} seconds.`),
|
|
189
|
-
},
|
|
190
|
-
async execute(args) {
|
|
112
|
+
"Recording automatically stops after detecting silence, so the user can speak naturally without specifying a duration.",
|
|
113
|
+
args: {},
|
|
114
|
+
async execute() {
|
|
191
115
|
if (!apiKey) {
|
|
192
116
|
return "Error: OPENAI_API_KEY environment variable is not set. Please set it to use voice transcription.";
|
|
193
117
|
}
|
|
194
|
-
const duration = Math.min(args.duration || defaultDuration, maxDuration);
|
|
195
118
|
let audioFile = null;
|
|
196
119
|
try {
|
|
197
|
-
audioFile = await recordAudio(
|
|
120
|
+
audioFile = await recordAudio(maxDuration, silenceDuration);
|
|
198
121
|
const transcription = await transcribeAudio(audioFile, apiKey, language);
|
|
199
122
|
if (!transcription || transcription.trim() === "") {
|
|
200
123
|
return "No speech detected. Please try again and speak clearly into your microphone.";
|