speech-opencode 1.0.0 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Voice input plugin for [OpenCode](https://opencode.ai) using OpenAI Whisper.
4
4
 
5
- Record audio from your microphone and transcribe it to text using OpenAI's Whisper API.
5
+ Record audio from your microphone and transcribe it to text using OpenAI's Whisper API. **Recording automatically stops when you stop talking** - no need to specify a duration!
6
6
 
7
7
  ## Installation
8
8
 
@@ -26,33 +26,32 @@ export OPENAI_API_KEY=your-api-key
26
26
 
27
27
  ### Audio Recording Tools
28
28
 
29
- **Linux (PulseAudio/PipeWire):**
29
+ **sox** is required for audio recording with silence detection:
30
+
30
31
  ```bash
32
+ # macOS
33
+ brew install sox
34
+
31
35
  # Ubuntu/Debian
32
- sudo apt install pulseaudio-utils
36
+ sudo apt install sox
33
37
 
34
38
  # Fedora
35
- sudo dnf install pulseaudio-utils
39
+ sudo dnf install sox
36
40
 
37
41
  # Arch
38
- sudo pacman -S pulseaudio-utils
39
- ```
40
-
41
- **macOS:**
42
- ```bash
43
- brew install sox
42
+ sudo pacman -S sox
44
43
  ```
45
44
 
46
45
  ## Usage
47
46
 
48
- Once installed, OpenCode will have access to a `voice` tool. You can ask OpenCode to use it:
47
+ Once installed, OpenCode will have access to a `voice` tool. Just ask OpenCode:
49
48
 
50
49
  - "Listen to my voice"
51
50
  - "Record what I say"
52
51
  - "Use voice input"
53
- - "Transcribe my speech for 10 seconds"
52
+ - "voice"
54
53
 
55
- The tool accepts an optional `duration` parameter (default: 5 seconds, max: 60 seconds).
54
+ **Recording automatically stops after 7 seconds of silence**, so just speak naturally and pause when you're done.
56
55
 
57
56
  ## Configuration
58
57
 
@@ -66,11 +65,11 @@ export default VoicePlugin({
66
65
  // Optional: specify language (auto-detects if not set)
67
66
  language: "en",
68
67
 
69
- // Optional: default recording duration in seconds
70
- defaultDuration: 5,
68
+ // Optional: seconds of silence before stopping (default 7)
69
+ silenceDuration: 7,
71
70
 
72
- // Optional: maximum recording duration in seconds
73
- maxDuration: 60,
71
+ // Optional: maximum recording time in seconds, used as a safety timeout (default 300 = 5 min)
72
+ maxDuration: 300,
74
73
 
75
74
  // Optional: override API key (defaults to OPENAI_API_KEY env var)
76
75
  apiKey: process.env.MY_OPENAI_KEY,
@@ -92,9 +91,10 @@ Leave `language` unset for automatic detection.
92
91
 
93
92
  ## How It Works
94
93
 
95
- 1. Records audio from your default microphone using system tools
96
- 2. Sends the audio to OpenAI's Whisper API for transcription
97
- 3. Returns the transcribed text to OpenCode
94
+ 1. Starts recording from your microphone when you begin speaking
95
+ 2. Automatically stops after detecting 7 seconds of silence (the default; configurable via `silenceDuration`)
96
+ 3. Sends the audio to OpenAI's Whisper API for transcription
97
+ 4. Returns the transcribed text to OpenCode
98
98
 
99
99
  ## Troubleshooting
100
100
 
@@ -103,8 +103,12 @@ Leave `language` unset for automatic detection.
103
103
  - Verify the correct input device is selected in your system settings
104
104
  - On Linux, use `pavucontrol` to check input sources
105
105
 
106
+ ### Recording doesn't stop
107
+ - Make sure you pause speaking for at least 7 seconds (or your configured `silenceDuration`)
108
+ - Check that background noise isn't being detected as speech
109
+
106
110
  ### Recording fails
107
- - Ensure you have the required audio tools installed
111
+ - Ensure sox is installed: `which rec`
108
112
  - Check that your microphone permissions are granted
109
113
 
110
114
  ## License
package/dist/index.d.ts CHANGED
@@ -4,32 +4,13 @@ export interface VoicePluginOptions {
4
4
  apiKey?: string;
5
5
  /** Language code for transcription (e.g., "en", "es", "fr"). Auto-detects if not specified */
6
6
  language?: string;
7
- /** Default recording duration in seconds */
8
- defaultDuration?: number;
9
- /** Maximum allowed recording duration in seconds */
7
+ /** Seconds of silence before stopping recording (default 7) */
8
+ silenceDuration?: number;
9
+ /** Maximum recording duration in seconds as a safety timeout (default 300 = 5 minutes) */
10
10
  maxDuration?: number;
11
+ /** Enable wake word trigger file watching (default true) */
12
+ enableWakeWord?: boolean;
11
13
  }
12
- /**
13
- * OpenCode Voice Plugin
14
- *
15
- * Adds a 'voice' tool that records audio from the microphone and transcribes it
16
- * using OpenAI's Whisper API.
17
- *
18
- * @example
19
- * ```ts
20
- * // In opencode.json
21
- * {
22
- * "plugin": ["opencode-voice"]
23
- * }
24
- * ```
25
- *
26
- * @example
27
- * ```ts
28
- * // With options in .opencode/plugin/voice.ts
29
- * import { VoicePlugin } from "opencode-voice"
30
- * export default VoicePlugin({ language: "en", defaultDuration: 10 })
31
- * ```
32
- */
33
14
  export declare const VoicePlugin: (options?: VoicePluginOptions) => Plugin;
34
15
  declare const _default: Plugin;
35
16
  export default _default;
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,MAAM,EAAQ,MAAM,qBAAqB,CAAA;AA6LvD,MAAM,WAAW,kBAAkB;IACjC,yDAAyD;IACzD,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,8FAA8F;IAC9F,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,4CAA4C;IAC5C,eAAe,CAAC,EAAE,MAAM,CAAA;IACxB,oDAAoD;IACpD,WAAW,CAAC,EAAE,MAAM,CAAA;CACrB;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,eAAO,MAAM,WAAW,GACrB,UAAS,kBAAuB,KAAG,MAoEnC,CAAA;;AAGH,wBAA4B"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,MAAM,EAAQ,MAAM,qBAAqB,CAAA;AA2GvD,MAAM,WAAW,kBAAkB;IACjC,yDAAyD;IACzD,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,8FAA8F;IAC9F,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,+DAA+D;IAC/D,eAAe,CAAC,EAAE,MAAM,CAAA;IACxB,0FAA0F;IAC1F,WAAW,CAAC,EAAE,MAAM,CAAA;IACpB,4DAA4D;IAC5D,cAAc,CAAC,EAAE,OAAO,CAAA;CACzB;AAmID,eAAO,MAAM,WAAW,GACrB,UAAS,kBAAuB,KAAG,MAyDnC,CAAA;;AAGH,wBAA4B"}
package/dist/index.js CHANGED
@@ -1,131 +1,62 @@
1
1
  import { tool } from "@opencode-ai/plugin";
2
2
  import OpenAI from "openai";
3
3
  import { spawn } from "child_process";
4
- import { unlinkSync, readFileSync } from "fs";
5
- import { tmpdir } from "os";
6
- import { join } from "path";
4
+ import { unlinkSync, readFileSync, existsSync, watch, mkdirSync } from "fs";
5
+ import { tmpdir, homedir } from "os";
6
+ import { join, dirname } from "path";
7
+ // Trigger file path for wake word integration
8
+ const TRIGGER_FILE = join(homedir(), ".cache", "opencode", "voice_trigger");
7
9
  /**
8
- * Gets the first available non-monitor, non-bluetooth audio input source
9
- * Works with PulseAudio and PipeWire on Linux
10
+ * Records audio from the microphone with automatic silence detection.
11
+ * Recording stops after the specified silence duration.
12
+ * Uses sox on both Linux and macOS for silence detection.
13
+ *
14
+ * @param maxDurationSeconds - Maximum recording time (safety timeout)
15
+ * @param silenceDuration - Seconds of silence before stopping (default 7)
10
16
  */
11
- async function getDefaultInputDevice() {
12
- return new Promise((resolve) => {
13
- const pactl = spawn("pactl", ["list", "sources", "short"]);
14
- let output = "";
15
- pactl.stdout.on("data", (data) => {
16
- output += data.toString();
17
- });
18
- pactl.on("close", () => {
19
- const lines = output.trim().split("\n");
20
- for (const line of lines) {
21
- const parts = line.split("\t");
22
- if (parts.length >= 2) {
23
- const name = parts[1];
24
- // Skip monitor sources and bluetooth (prefer hardware input)
25
- if (!name.includes(".monitor") && !name.includes("bluez")) {
26
- resolve(name);
27
- return;
28
- }
29
- }
30
- }
31
- resolve(null);
32
- });
33
- pactl.on("error", () => resolve(null));
34
- });
17
+ async function recordAudio(maxDurationSeconds = 300, silenceDuration = 7) {
18
+ const tempFile = join(tmpdir(), `opencode-voice-${Date.now()}.wav`);
19
+ // Use sox with silence detection on all platforms
20
+ return recordWithSilenceDetection(tempFile, maxDurationSeconds, silenceDuration);
35
21
  }
36
22
  /**
37
- * Records audio from the microphone
38
- * - Linux: Uses parecord (PulseAudio/PipeWire) or arecord (ALSA)
39
- * - macOS: Uses sox (rec command)
23
+ * Records audio using sox with silence detection.
24
+ * Recording automatically stops after detecting silence.
25
+ *
26
+ * Sox silence syntax: silence [above_periods] [duration] [threshold] [below_periods] [duration] [threshold]
27
+ * - above_periods 1: need 1 period of audio above threshold to start
28
+ * - 0.1 3%: audio must be above 3% for 0.1s to count as speech start
29
+ * - below_periods 1: need 1 period below threshold to stop
30
+ * - silenceDuration 3%: stop after silenceDuration seconds below 3%
40
31
  */
41
- async function recordAudio(durationSeconds = 5) {
42
- const tempFile = join(tmpdir(), `opencode-voice-${Date.now()}.wav`);
43
- const platform = process.platform;
44
- if (platform === "darwin") {
45
- // macOS: use sox
46
- return recordWithSox(tempFile, durationSeconds);
47
- }
48
- else {
49
- // Linux: use parecord or arecord
50
- return recordWithPulseAudio(tempFile, durationSeconds);
51
- }
52
- }
53
- async function recordWithSox(tempFile, durationSeconds) {
32
+ async function recordWithSilenceDetection(tempFile, maxDurationSeconds, silenceDuration) {
54
33
  return new Promise((resolve, reject) => {
55
- const recorder = spawn("rec", [
34
+ // Use timeout to enforce max duration, sox for silence detection
35
+ const recorder = spawn("timeout", [
36
+ maxDurationSeconds.toString(),
37
+ "rec",
56
38
  "-q",
57
- "-r",
58
- "16000",
59
- "-c",
60
- "1",
61
- "-b",
62
- "16",
39
+ "-r", "16000",
40
+ "-c", "1",
41
+ "-b", "16",
63
42
  tempFile,
64
- "trim",
65
- "0",
66
- durationSeconds.toString(),
43
+ "silence",
44
+ "1", "0.1", "3%", // Start recording when speech detected (above 3% for 0.1s)
45
+ "1", `${silenceDuration}.0`, "3%", // Stop after silenceDuration seconds of silence (below 3%)
67
46
  ]);
68
47
  let errorOutput = "";
69
48
  recorder.stderr.on("data", (data) => {
70
49
  errorOutput += data.toString();
71
50
  });
72
51
  recorder.on("error", () => {
73
- reject(new Error("sox not found. Please install it:\n" + " - macOS: brew install sox"));
52
+ reject(new Error("sox not found. Please install it:\n" +
53
+ " - macOS: brew install sox\n" +
54
+ " - Ubuntu/Debian: sudo apt install sox\n" +
55
+ " - Fedora: sudo dnf install sox\n" +
56
+ " - Arch: sudo pacman -S sox"));
74
57
  });
75
58
  recorder.on("close", (code) => {
76
- if (code === 0) {
77
- resolve(tempFile);
78
- }
79
- else {
80
- reject(new Error(`Recording failed: ${errorOutput}`));
81
- }
82
- });
83
- });
84
- }
85
- async function recordWithPulseAudio(tempFile, durationSeconds) {
86
- const inputDevice = await getDefaultInputDevice();
87
- return new Promise((resolve, reject) => {
88
- const args = [(durationSeconds + 1).toString(), "parecord"];
89
- if (inputDevice) {
90
- args.push(`--device=${inputDevice}`);
91
- }
92
- args.push("--file-format=wav", tempFile);
93
- const recorder = spawn("timeout", args);
94
- let errorOutput = "";
95
- recorder.stderr.on("data", (data) => {
96
- errorOutput += data.toString();
97
- });
98
- recorder.on("error", () => {
99
- // Fallback to arecord
100
- const arecord = spawn("arecord", [
101
- "-q",
102
- "-f",
103
- "S16_LE",
104
- "-r",
105
- "16000",
106
- "-c",
107
- "1",
108
- "-d",
109
- durationSeconds.toString(),
110
- tempFile,
111
- ]);
112
- arecord.on("error", () => {
113
- reject(new Error("No audio recorder found. Please install:\n" +
114
- " - Ubuntu/Debian: sudo apt install pulseaudio-utils\n" +
115
- " - Fedora: sudo dnf install pulseaudio-utils\n" +
116
- " - Arch: sudo pacman -S pulseaudio-utils"));
117
- });
118
- arecord.on("close", (code) => {
119
- if (code === 0) {
120
- resolve(tempFile);
121
- }
122
- else {
123
- reject(new Error(`arecord failed with code ${code}`));
124
- }
125
- });
126
- });
127
- recorder.on("close", (code) => {
128
- // timeout returns 124 when it kills the process, which is expected
59
+ // code 0 = normal exit, 124 = timeout killed it (max duration reached)
129
60
  if (code === 0 || code === 124) {
130
61
  resolve(tempFile);
131
62
  }
@@ -170,51 +101,120 @@ async function transcribeAudio(audioFilePath, apiKey, language) {
170
101
  * export default VoicePlugin({ language: "en", defaultDuration: 10 })
171
102
  * ```
172
103
  */
104
+ /**
105
+ * Records and transcribes audio, returning the transcription
106
+ */
107
+ async function recordAndTranscribe(apiKey, maxDuration, silenceDuration, language) {
108
+ let audioFile = null;
109
+ try {
110
+ audioFile = await recordAudio(maxDuration, silenceDuration);
111
+ const transcription = await transcribeAudio(audioFile, apiKey, language);
112
+ if (!transcription || transcription.trim() === "") {
113
+ return "No speech detected. Please try again and speak clearly into your microphone.";
114
+ }
115
+ return transcription;
116
+ }
117
+ finally {
118
+ if (audioFile) {
119
+ try {
120
+ unlinkSync(audioFile);
121
+ }
122
+ catch {
123
+ // Ignore cleanup errors
124
+ }
125
+ }
126
+ }
127
+ }
128
+ /**
129
+ * Clears the wake word trigger file
130
+ */
131
+ function clearTriggerFile() {
132
+ try {
133
+ if (existsSync(TRIGGER_FILE)) {
134
+ unlinkSync(TRIGGER_FILE);
135
+ }
136
+ }
137
+ catch {
138
+ // Ignore errors
139
+ }
140
+ }
141
+ /**
142
+ * Sets up wake word trigger file watching
143
+ * When the trigger file is written, it records audio, transcribes it, appends the text to the TUI prompt, and submits the prompt
144
+ */
145
+ function setupWakeWordWatcher(apiKey, maxDuration, silenceDuration, language, client // OpenCode SDK client
146
+ ) {
147
+ // Ensure the directory exists
148
+ const triggerDir = dirname(TRIGGER_FILE);
149
+ mkdirSync(triggerDir, { recursive: true });
150
+ // Clear any existing trigger
151
+ clearTriggerFile();
152
+ console.log("[Voice Plugin] Wake word watcher enabled");
153
+ console.log(`[Voice Plugin] Watching: ${TRIGGER_FILE}`);
154
+ console.log("[Voice Plugin] Run 'python wakeword/listener.py' to enable 'Hey Jarvis' wake word");
155
+ // Watch the directory for the trigger file
156
+ let isRecording = false;
157
+ watch(triggerDir, async (eventType, filename) => {
158
+ if (filename !== "voice_trigger" || isRecording)
159
+ return;
160
+ if (!existsSync(TRIGGER_FILE))
161
+ return;
162
+ isRecording = true;
163
+ console.log("[Voice Plugin] Wake word triggered! Recording...");
164
+ try {
165
+ // Clear the trigger file immediately
166
+ clearTriggerFile();
167
+ // Record and transcribe
168
+ const transcription = await recordAndTranscribe(apiKey, maxDuration, silenceDuration, language);
169
+ if (transcription && !transcription.startsWith("No speech detected")) {
170
+ console.log(`[Voice Plugin] Transcribed: "${transcription}"`);
171
+ // Append transcription to the TUI prompt
172
+ try {
173
+ await client.tui.appendPrompt({ body: { text: transcription } });
174
+ // Auto-submit the prompt
175
+ await client.tui.submitPrompt();
176
+ }
177
+ catch (err) {
178
+ console.error("[Voice Plugin] Failed to send to TUI:", err);
179
+ }
180
+ }
181
+ }
182
+ catch (error) {
183
+ console.error("[Voice Plugin] Error:", error);
184
+ }
185
+ finally {
186
+ isRecording = false;
187
+ }
188
+ });
189
+ }
173
190
  export const VoicePlugin = (options = {}) => async (ctx) => {
174
- const { apiKey = process.env.OPENAI_API_KEY, language, defaultDuration = 5, maxDuration = 60, } = options;
191
+ const { apiKey = process.env.OPENAI_API_KEY, language, silenceDuration = 7, maxDuration = 300, enableWakeWord = true, } = options;
175
192
  if (!apiKey) {
176
193
  console.warn("[Voice Plugin] Warning: OPENAI_API_KEY not set. Voice transcription will fail.");
177
194
  }
195
+ // Set up wake word watcher if enabled
196
+ if (enableWakeWord && apiKey && ctx.client) {
197
+ setupWakeWordWatcher(apiKey, maxDuration, silenceDuration, language, ctx.client);
198
+ }
178
199
  return {
179
200
  tool: {
180
201
  voice: tool({
181
202
  description: "Records audio from the user's microphone and transcribes it using OpenAI Whisper. " +
182
203
  "Use this tool when the user wants to provide input via voice or speech. " +
183
- `The tool will record for the specified duration (default ${defaultDuration} seconds) and return the transcribed text.`,
184
- args: {
185
- duration: tool.schema
186
- .number()
187
- .optional()
188
- .describe(`Recording duration in seconds. Default is ${defaultDuration} seconds. Max is ${maxDuration} seconds.`),
189
- },
190
- async execute(args) {
204
+ "Recording automatically stops after detecting silence, so the user can speak naturally without specifying a duration.",
205
+ args: {},
206
+ async execute() {
191
207
  if (!apiKey) {
192
208
  return "Error: OPENAI_API_KEY environment variable is not set. Please set it to use voice transcription.";
193
209
  }
194
- const duration = Math.min(args.duration || defaultDuration, maxDuration);
195
- let audioFile = null;
196
210
  try {
197
- audioFile = await recordAudio(duration);
198
- const transcription = await transcribeAudio(audioFile, apiKey, language);
199
- if (!transcription || transcription.trim() === "") {
200
- return "No speech detected. Please try again and speak clearly into your microphone.";
201
- }
211
+ const transcription = await recordAndTranscribe(apiKey, maxDuration, silenceDuration, language);
202
212
  return `Transcribed speech: "${transcription}"`;
203
213
  }
204
214
  catch (error) {
205
215
  const errorMessage = error instanceof Error ? error.message : String(error);
206
216
  return `Voice recording/transcription failed: ${errorMessage}`;
207
217
  }
208
- finally {
209
- if (audioFile) {
210
- try {
211
- unlinkSync(audioFile);
212
- }
213
- catch {
214
- // Ignore cleanup errors
215
- }
216
- }
217
- }
218
218
  },
219
219
  }),
220
220
  },
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "speech-opencode",
3
- "version": "1.0.0",
3
+ "version": "1.1.2",
4
4
  "description": "Voice input plugin for OpenCode using OpenAI Whisper",
5
5
  "keywords": [
6
6
  "opencode",