speech-opencode 1.0.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Voice input plugin for [OpenCode](https://opencode.ai) using OpenAI Whisper.
4
4
 
5
- Record audio from your microphone and transcribe it to text using OpenAI's Whisper API.
5
+ Record audio from your microphone and transcribe it to text using OpenAI's Whisper API. **Recording automatically stops when you stop talking** - no need to specify a duration!
6
6
 
7
7
  ## Installation
8
8
 
@@ -26,33 +26,32 @@ export OPENAI_API_KEY=your-api-key
26
26
 
27
27
  ### Audio Recording Tools
28
28
 
29
- **Linux (PulseAudio/PipeWire):**
29
+ **sox** is required for audio recording with silence detection:
30
+
30
31
  ```bash
32
+ # macOS
33
+ brew install sox
34
+
31
35
  # Ubuntu/Debian
32
- sudo apt install pulseaudio-utils
36
+ sudo apt install sox
33
37
 
34
38
  # Fedora
35
- sudo dnf install pulseaudio-utils
39
+ sudo dnf install sox
36
40
 
37
41
  # Arch
38
- sudo pacman -S pulseaudio-utils
39
- ```
40
-
41
- **macOS:**
42
- ```bash
43
- brew install sox
42
+ sudo pacman -S sox
44
43
  ```
45
44
 
46
45
  ## Usage
47
46
 
48
- Once installed, OpenCode will have access to a `voice` tool. You can ask OpenCode to use it:
47
+ Once installed, OpenCode will have access to a `voice` tool. Just ask OpenCode:
49
48
 
50
49
  - "Listen to my voice"
51
50
  - "Record what I say"
52
51
  - "Use voice input"
53
- - "Transcribe my speech for 10 seconds"
52
+ - "voice"
54
53
 
55
- The tool accepts an optional `duration` parameter (default: 5 seconds, max: 60 seconds).
54
+ **Recording automatically stops after 7 seconds of silence** (the default; configurable via `silenceDuration`), so just speak naturally and pause when you're done.
56
55
 
57
56
  ## Configuration
58
57
 
@@ -66,11 +65,11 @@ export default VoicePlugin({
66
65
  // Optional: specify language (auto-detects if not set)
67
66
  language: "en",
68
67
 
69
- // Optional: default recording duration in seconds
70
- defaultDuration: 5,
68
+ // Optional: seconds of silence before stopping (default 7)
69
+ silenceDuration: 7,
71
70
 
72
- // Optional: maximum recording duration in seconds
73
- maxDuration: 60,
71
+ // Optional: maximum recording time as safety timeout (default 300 = 5 min)
72
+ maxDuration: 300,
74
73
 
75
74
  // Optional: override API key (defaults to OPENAI_API_KEY env var)
76
75
  apiKey: process.env.MY_OPENAI_KEY,
@@ -92,9 +91,10 @@ Leave `language` unset for automatic detection.
92
91
 
93
92
  ## How It Works
94
93
 
95
- 1. Records audio from your default microphone using system tools
96
- 2. Sends the audio to OpenAI's Whisper API for transcription
97
- 3. Returns the transcribed text to OpenCode
94
+ 1. Starts recording from your microphone when you begin speaking
95
+ 2. Automatically stops after detecting 7 seconds of silence (by default)
96
+ 3. Sends the audio to OpenAI's Whisper API for transcription
97
+ 4. Returns the transcribed text to OpenCode
98
98
 
99
99
  ## Troubleshooting
100
100
 
@@ -103,8 +103,12 @@ Leave `language` unset for automatic detection.
103
103
  - Verify the correct input device is selected in your system settings
104
104
  - On Linux, use `pavucontrol` to check input sources
105
105
 
106
+ ### Recording doesn't stop
107
+ - Make sure you stop speaking for at least 7 seconds (or for your configured `silenceDuration`)
108
+ - Check that background noise isn't being detected as speech
109
+
106
110
  ### Recording fails
107
- - Ensure you have the required audio tools installed
111
+ - Ensure sox is installed: `which rec`
108
112
  - Check that your microphone permissions are granted
109
113
 
110
114
  ## License
package/dist/index.d.ts CHANGED
@@ -4,9 +4,9 @@ export interface VoicePluginOptions {
4
4
  apiKey?: string;
5
5
  /** Language code for transcription (e.g., "en", "es", "fr"). Auto-detects if not specified */
6
6
  language?: string;
7
- /** Default recording duration in seconds */
8
- defaultDuration?: number;
9
- /** Maximum allowed recording duration in seconds */
7
+ /** Seconds of silence before stopping recording (default 7) */
8
+ silenceDuration?: number;
9
+ /** Maximum recording duration in seconds as a safety timeout (default 300 = 5 minutes) */
10
10
  maxDuration?: number;
11
11
  }
12
12
  /**
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,MAAM,EAAQ,MAAM,qBAAqB,CAAA;AA6LvD,MAAM,WAAW,kBAAkB;IACjC,yDAAyD;IACzD,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,8FAA8F;IAC9F,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,4CAA4C;IAC5C,eAAe,CAAC,EAAE,MAAM,CAAA;IACxB,oDAAoD;IACpD,WAAW,CAAC,EAAE,MAAM,CAAA;CACrB;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,eAAO,MAAM,WAAW,GACrB,UAAS,kBAAuB,KAAG,MAoEnC,CAAA;;AAGH,wBAA4B"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,MAAM,EAAQ,MAAM,qBAAqB,CAAA;AAwGvD,MAAM,WAAW,kBAAkB;IACjC,yDAAyD;IACzD,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,8FAA8F;IAC9F,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,+DAA+D;IAC/D,eAAe,CAAC,EAAE,MAAM,CAAA;IACxB,0FAA0F;IAC1F,WAAW,CAAC,EAAE,MAAM,CAAA;CACrB;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,eAAO,MAAM,WAAW,GACrB,UAAS,kBAAuB,KAAG,MA4DnC,CAAA;;AAGH,wBAA4B"}
package/dist/index.js CHANGED
@@ -5,127 +5,56 @@ import { unlinkSync, readFileSync } from "fs";
5
5
  import { tmpdir } from "os";
6
6
  import { join } from "path";
7
7
  /**
8
- * Gets the first available non-monitor, non-bluetooth audio input source
9
- * Works with PulseAudio and PipeWire on Linux
8
+ * Records audio from the microphone with automatic silence detection.
9
+ * Recording stops after the specified silence duration.
10
+ * Uses sox on both Linux and macOS for silence detection.
11
+ *
12
+ * @param maxDurationSeconds - Maximum recording time (safety timeout)
13
+ * @param silenceDuration - Seconds of silence before stopping (default 7)
10
14
  */
11
- async function getDefaultInputDevice() {
12
- return new Promise((resolve) => {
13
- const pactl = spawn("pactl", ["list", "sources", "short"]);
14
- let output = "";
15
- pactl.stdout.on("data", (data) => {
16
- output += data.toString();
17
- });
18
- pactl.on("close", () => {
19
- const lines = output.trim().split("\n");
20
- for (const line of lines) {
21
- const parts = line.split("\t");
22
- if (parts.length >= 2) {
23
- const name = parts[1];
24
- // Skip monitor sources and bluetooth (prefer hardware input)
25
- if (!name.includes(".monitor") && !name.includes("bluez")) {
26
- resolve(name);
27
- return;
28
- }
29
- }
30
- }
31
- resolve(null);
32
- });
33
- pactl.on("error", () => resolve(null));
34
- });
15
+ async function recordAudio(maxDurationSeconds = 300, silenceDuration = 7) {
16
+ const tempFile = join(tmpdir(), `opencode-voice-${Date.now()}.wav`);
17
+ // Use sox with silence detection on all platforms
18
+ return recordWithSilenceDetection(tempFile, maxDurationSeconds, silenceDuration);
35
19
  }
36
20
  /**
37
- * Records audio from the microphone
38
- * - Linux: Uses parecord (PulseAudio/PipeWire) or arecord (ALSA)
39
- * - macOS: Uses sox (rec command)
21
+ * Records audio using sox with silence detection.
22
+ * Recording automatically stops after detecting silence.
23
+ *
24
+ * Sox silence syntax: silence [above_periods] [duration] [threshold] [below_periods] [duration] [threshold]
25
+ * - above_periods 1: need 1 period of audio above threshold to start
26
+ * - 0.1 3%: audio must be above 3% for 0.1s to count as speech start
27
+ * - below_periods 1: need 1 period below threshold to stop
28
+ * - silenceDuration 3%: stop after silenceDuration seconds below 3%
40
29
  */
41
- async function recordAudio(durationSeconds = 5) {
42
- const tempFile = join(tmpdir(), `opencode-voice-${Date.now()}.wav`);
43
- const platform = process.platform;
44
- if (platform === "darwin") {
45
- // macOS: use sox
46
- return recordWithSox(tempFile, durationSeconds);
47
- }
48
- else {
49
- // Linux: use parecord or arecord
50
- return recordWithPulseAudio(tempFile, durationSeconds);
51
- }
52
- }
53
- async function recordWithSox(tempFile, durationSeconds) {
30
+ async function recordWithSilenceDetection(tempFile, maxDurationSeconds, silenceDuration) {
54
31
  return new Promise((resolve, reject) => {
55
- const recorder = spawn("rec", [
32
+ // Use timeout to enforce max duration, sox for silence detection
33
+ const recorder = spawn("timeout", [
34
+ maxDurationSeconds.toString(),
35
+ "rec",
56
36
  "-q",
57
- "-r",
58
- "16000",
59
- "-c",
60
- "1",
61
- "-b",
62
- "16",
37
+ "-r", "16000",
38
+ "-c", "1",
39
+ "-b", "16",
63
40
  tempFile,
64
- "trim",
65
- "0",
66
- durationSeconds.toString(),
41
+ "silence",
42
+ "1", "0.1", "3%", // Start recording when speech detected (above 3% for 0.1s)
43
+ "1", `${silenceDuration}.0`, "3%", // Stop after silenceDuration seconds of silence (below 3%)
67
44
  ]);
68
45
  let errorOutput = "";
69
46
  recorder.stderr.on("data", (data) => {
70
47
  errorOutput += data.toString();
71
48
  });
72
49
  recorder.on("error", () => {
73
- reject(new Error("sox not found. Please install it:\n" + " - macOS: brew install sox"));
74
- });
75
- recorder.on("close", (code) => {
76
- if (code === 0) {
77
- resolve(tempFile);
78
- }
79
- else {
80
- reject(new Error(`Recording failed: ${errorOutput}`));
81
- }
82
- });
83
- });
84
- }
85
- async function recordWithPulseAudio(tempFile, durationSeconds) {
86
- const inputDevice = await getDefaultInputDevice();
87
- return new Promise((resolve, reject) => {
88
- const args = [(durationSeconds + 1).toString(), "parecord"];
89
- if (inputDevice) {
90
- args.push(`--device=${inputDevice}`);
91
- }
92
- args.push("--file-format=wav", tempFile);
93
- const recorder = spawn("timeout", args);
94
- let errorOutput = "";
95
- recorder.stderr.on("data", (data) => {
96
- errorOutput += data.toString();
97
- });
98
- recorder.on("error", () => {
99
- // Fallback to arecord
100
- const arecord = spawn("arecord", [
101
- "-q",
102
- "-f",
103
- "S16_LE",
104
- "-r",
105
- "16000",
106
- "-c",
107
- "1",
108
- "-d",
109
- durationSeconds.toString(),
110
- tempFile,
111
- ]);
112
- arecord.on("error", () => {
113
- reject(new Error("No audio recorder found. Please install:\n" +
114
- " - Ubuntu/Debian: sudo apt install pulseaudio-utils\n" +
115
- " - Fedora: sudo dnf install pulseaudio-utils\n" +
116
- " - Arch: sudo pacman -S pulseaudio-utils"));
117
- });
118
- arecord.on("close", (code) => {
119
- if (code === 0) {
120
- resolve(tempFile);
121
- }
122
- else {
123
- reject(new Error(`arecord failed with code ${code}`));
124
- }
125
- });
50
+ reject(new Error("sox not found. Please install it:\n" +
51
+ " - macOS: brew install sox\n" +
52
+ " - Ubuntu/Debian: sudo apt install sox\n" +
53
+ " - Fedora: sudo dnf install sox\n" +
54
+ " - Arch: sudo pacman -S sox"));
126
55
  });
127
56
  recorder.on("close", (code) => {
128
- // timeout returns 124 when it kills the process, which is expected
57
+ // code 0 = normal exit, 124 = timeout killed it (max duration reached)
129
58
  if (code === 0 || code === 124) {
130
59
  resolve(tempFile);
131
60
  }
@@ -171,7 +100,7 @@ async function transcribeAudio(audioFilePath, apiKey, language) {
171
100
  * ```
172
101
  */
173
102
  export const VoicePlugin = (options = {}) => async (ctx) => {
174
- const { apiKey = process.env.OPENAI_API_KEY, language, defaultDuration = 5, maxDuration = 60, } = options;
103
+ const { apiKey = process.env.OPENAI_API_KEY, language, silenceDuration = 7, maxDuration = 300, } = options;
175
104
  if (!apiKey) {
176
105
  console.warn("[Voice Plugin] Warning: OPENAI_API_KEY not set. Voice transcription will fail.");
177
106
  }
@@ -180,21 +109,15 @@ export const VoicePlugin = (options = {}) => async (ctx) => {
180
109
  voice: tool({
181
110
  description: "Records audio from the user's microphone and transcribes it using OpenAI Whisper. " +
182
111
  "Use this tool when the user wants to provide input via voice or speech. " +
183
- `The tool will record for the specified duration (default ${defaultDuration} seconds) and return the transcribed text.`,
184
- args: {
185
- duration: tool.schema
186
- .number()
187
- .optional()
188
- .describe(`Recording duration in seconds. Default is ${defaultDuration} seconds. Max is ${maxDuration} seconds.`),
189
- },
190
- async execute(args) {
112
+ "Recording automatically stops after detecting silence, so the user can speak naturally without specifying a duration.",
113
+ args: {},
114
+ async execute() {
191
115
  if (!apiKey) {
192
116
  return "Error: OPENAI_API_KEY environment variable is not set. Please set it to use voice transcription.";
193
117
  }
194
- const duration = Math.min(args.duration || defaultDuration, maxDuration);
195
118
  let audioFile = null;
196
119
  try {
197
- audioFile = await recordAudio(duration);
120
+ audioFile = await recordAudio(maxDuration, silenceDuration);
198
121
  const transcription = await transcribeAudio(audioFile, apiKey, language);
199
122
  if (!transcription || transcription.trim() === "") {
200
123
  return "No speech detected. Please try again and speak clearly into your microphone.";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "speech-opencode",
3
- "version": "1.0.0",
3
+ "version": "1.1.1",
4
4
  "description": "Voice input plugin for OpenCode using OpenAI Whisper",
5
5
  "keywords": [
6
6
  "opencode",