npm - speech-opencode - Versions diffs - 1.0.0 → 1.1.1 - Mend

speech-opencode 1.0.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/README.md CHANGED Viewed

@@ -2,7 +2,7 @@
 Voice input plugin for [OpenCode](https://opencode.ai) using OpenAI Whisper.
-Record audio from your microphone and transcribe it to text using OpenAI's Whisper API.
+Record audio from your microphone and transcribe it to text using OpenAI's Whisper API. **Recording automatically stops when you stop talking** - no need to specify a duration!
 ## Installation
@@ -26,33 +26,32 @@ export OPENAI_API_KEY=your-api-key
 ### Audio Recording Tools
-**Linux (PulseAudio/PipeWire):**
+**sox** is required for audio recording with silence detection:
 ```bash
+# macOS
+brew install sox
 # Ubuntu/Debian
-sudo apt install pulseaudio-utils
+sudo apt install sox
 # Fedora
-sudo dnf install pulseaudio-utils
+sudo dnf install sox
 # Arch
-sudo pacman -S pulseaudio-utils
-```
-**macOS:**
-```bash
-brew install sox
+sudo pacman -S sox
 ```
 ## Usage
-Once installed, OpenCode will have access to a `voice` tool. You can ask OpenCode to use it:
+Once installed, OpenCode will have access to a `voice` tool. Just ask OpenCode:
 - "Listen to my voice"
 - "Record what I say"
 - "Use voice input"
-- "Transcribe my speech for 10 seconds"
+- "voice"
-The tool accepts an optional `duration` parameter (default: 5 seconds, max: 60 seconds).
+**Recording automatically stops after 7 seconds of silence**, so just speak naturally and pause when you're done.
 ## Configuration
@@ -66,11 +65,11 @@ export default VoicePlugin({
   // Optional: specify language (auto-detects if not set)
   language: "en",
-  // Optional: default recording duration in seconds
-  defaultDuration: 5,
+  // Optional: seconds of silence before stopping (default 7)
+  silenceDuration: 7,
-  // Optional: maximum recording duration in seconds
-  maxDuration: 60,
+  // Optional: maximum recording time as safety timeout (default 300 = 5 min)
+  maxDuration: 300,
   // Optional: override API key (defaults to OPENAI_API_KEY env var)
   apiKey: process.env.MY_OPENAI_KEY,
@@ -92,9 +91,10 @@ Leave `language` unset for automatic detection.
 ## How It Works
-1. Records audio from your default microphone using system tools
-2. Sends the audio to OpenAI's Whisper API for transcription
-3. Returns the transcribed text to OpenCode
+1. Starts recording from your microphone when you begin speaking
+2. Automatically stops after detecting 7 seconds of silence
+3. Sends the audio to OpenAI's Whisper API for transcription
+4. Returns the transcribed text to OpenCode
 ## Troubleshooting
@@ -103,8 +103,12 @@ Leave `language` unset for automatic detection.
 - Verify the correct input device is selected in your system settings
 - On Linux, use `pavucontrol` to check input sources
+### Recording doesn't stop
+- Make sure you pause speaking for at least 7 seconds
+- Check that background noise isn't being detected as speech
 ### Recording fails
-- Ensure you have the required audio tools installed
+- Ensure sox is installed: `which rec`
 - Check that your microphone permissions are granted
 ## License

package/dist/index.d.ts CHANGED Viewed

@@ -4,9 +4,9 @@ export interface VoicePluginOptions {
     apiKey?: string;
     /** Language code for transcription (e.g., "en", "es", "fr"). Auto-detects if not specified */
     language?: string;
-    /** Default recording duration in seconds */
-    defaultDuration?: number;
-    /** Maximum allowed recording duration in seconds */
+    /** Seconds of silence before stopping recording (default 7) */
+    silenceDuration?: number;
+    /** Maximum recording duration in seconds as a safety timeout (default 300 = 5 minutes) */
     maxDuration?: number;
 }
 /**

package/dist/index.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,MAAM,EAAQ,MAAM,qBAAqB,CAAA;~~AA6LvD~~,MAAM,WAAW,kBAAkB;IACjC,yDAAyD;IACzD,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,8FAA8F;IAC9F,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB~~,4CAA4C~~;~~IAC5C~~,eAAe,CAAC,EAAE,MAAM,CAAA;IACxB,~~oDAAoD~~;~~IACpD~~,WAAW,CAAC,EAAE,MAAM,CAAA;CACrB;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,eAAO,MAAM,WAAW,GACrB,UAAS,kBAAuB,KAAG,~~MAoEnC~~,CAAA;;AAGH,wBAA4B"}
1	+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,MAAM,EAAQ,MAAM,qBAAqB,CAAA;AAwGvD,MAAM,WAAW,kBAAkB;IACjC,yDAAyD;IACzD,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,8FAA8F;IAC9F,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,+DAA+D;IAC/D,eAAe,CAAC,EAAE,MAAM,CAAA;IACxB,0FAA0F;IAC1F,WAAW,CAAC,EAAE,MAAM,CAAA;CACrB;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,eAAO,MAAM,WAAW,GACrB,UAAS,kBAAuB,KAAG,MA4DnC,CAAA;;AAGH,wBAA4B"}

package/dist/index.js CHANGED Viewed

@@ -5,127 +5,56 @@ import { unlinkSync, readFileSync } from "fs";
 import { tmpdir } from "os";
 import { join } from "path";
 /**
- * Gets the first available non-monitor, non-bluetooth audio input source
- * Works with PulseAudio and PipeWire on Linux
+ * Records audio from the microphone with automatic silence detection.
+ * Recording stops after the specified silence duration.
+ * Uses sox on both Linux and macOS for silence detection.
+ *
+ * @param maxDurationSeconds - Maximum recording time (safety timeout)
+ * @param silenceDuration - Seconds of silence before stopping (default 7)
  */
-async function getDefaultInputDevice() {
-    return new Promise((resolve) => {
-        const pactl = spawn("pactl", ["list", "sources", "short"]);
-        let output = "";
-        pactl.stdout.on("data", (data) => {
-            output += data.toString();
-        });
-        pactl.on("close", () => {
-            const lines = output.trim().split("\n");
-            for (const line of lines) {
-                const parts = line.split("\t");
-                if (parts.length >= 2) {
-                    const name = parts[1];
-                    // Skip monitor sources and bluetooth (prefer hardware input)
-                    if (!name.includes(".monitor") && !name.includes("bluez")) {
-                        resolve(name);
-                        return;
-                    }
-                }
-            }
-            resolve(null);
-        });
-        pactl.on("error", () => resolve(null));
-    });
+async function recordAudio(maxDurationSeconds = 300, silenceDuration = 7) {
+    const tempFile = join(tmpdir(), `opencode-voice-${Date.now()}.wav`);
+    // Use sox with silence detection on all platforms
+    return recordWithSilenceDetection(tempFile, maxDurationSeconds, silenceDuration);
 }
 /**
- * Records audio from the microphone
- * - Linux: Uses parecord (PulseAudio/PipeWire) or arecord (ALSA)
- * - macOS: Uses sox (rec command)
+ * Records audio using sox with silence detection.
+ * Recording automatically stops after detecting silence.
+ *
+ * Sox silence syntax: silence [above_periods] [duration] [threshold] [below_periods] [duration] [threshold]
+ * - above_periods 1: need 1 period of audio above threshold to start
+ * - 0.1 3%: audio must be above 3% for 0.1s to count as speech start
+ * - below_periods 1: need 1 period below threshold to stop
+ * - silenceDuration 3%: stop after silenceDuration seconds below 3%
  */
-async function recordAudio(durationSeconds = 5) {
-    const tempFile = join(tmpdir(), `opencode-voice-${Date.now()}.wav`);
-    const platform = process.platform;
-    if (platform === "darwin") {
-        // macOS: use sox
-        return recordWithSox(tempFile, durationSeconds);
-    }
-    else {
-        // Linux: use parecord or arecord
-        return recordWithPulseAudio(tempFile, durationSeconds);
-    }
-}
-async function recordWithSox(tempFile, durationSeconds) {
+async function recordWithSilenceDetection(tempFile, maxDurationSeconds, silenceDuration) {
     return new Promise((resolve, reject) => {
-        const recorder = spawn("rec", [
+        // Use timeout to enforce max duration, sox for silence detection
+        const recorder = spawn("timeout", [
+            maxDurationSeconds.toString(),
+            "rec",
             "-q",
-            "-r",
-            "16000",
-            "-c",
-            "1",
-            "-b",
-            "16",
+            "-r", "16000",
+            "-c", "1",
+            "-b", "16",
             tempFile,
-            "trim",
-            "0",
-            durationSeconds.toString(),
+            "silence",
+            "1", "0.1", "3%", // Start recording when speech detected (above 3% for 0.1s)
+            "1", `${silenceDuration}.0`, "3%", // Stop after silenceDuration seconds of silence (below 3%)
         ]);
         let errorOutput = "";
         recorder.stderr.on("data", (data) => {
             errorOutput += data.toString();
         });
         recorder.on("error", () => {
-            reject(new Error("sox not found. Please install it:\n" + "  - macOS: brew install sox"));
-        });
-        recorder.on("close", (code) => {
-            if (code === 0) {
-                resolve(tempFile);
-            }
-            else {
-                reject(new Error(`Recording failed: ${errorOutput}`));
-            }
-        });
-    });
-}
-async function recordWithPulseAudio(tempFile, durationSeconds) {
-    const inputDevice = await getDefaultInputDevice();
-    return new Promise((resolve, reject) => {
-        const args = [(durationSeconds + 1).toString(), "parecord"];
-        if (inputDevice) {
-            args.push(`--device=${inputDevice}`);
-        }
-        args.push("--file-format=wav", tempFile);
-        const recorder = spawn("timeout", args);
-        let errorOutput = "";
-        recorder.stderr.on("data", (data) => {
-            errorOutput += data.toString();
-        });
-        recorder.on("error", () => {
-            // Fallback to arecord
-            const arecord = spawn("arecord", [
-                "-q",
-                "-f",
-                "S16_LE",
-                "-r",
-                "16000",
-                "-c",
-                "1",
-                "-d",
-                durationSeconds.toString(),
-                tempFile,
-            ]);
-            arecord.on("error", () => {
-                reject(new Error("No audio recorder found. Please install:\n" +
-                    "  - Ubuntu/Debian: sudo apt install pulseaudio-utils\n" +
-                    "  - Fedora: sudo dnf install pulseaudio-utils\n" +
-                    "  - Arch: sudo pacman -S pulseaudio-utils"));
-            });
-            arecord.on("close", (code) => {
-                if (code === 0) {
-                    resolve(tempFile);
-                }
-                else {
-                    reject(new Error(`arecord failed with code ${code}`));
-                }
-            });
+            reject(new Error("sox not found. Please install it:\n" +
+                "  - macOS: brew install sox\n" +
+                "  - Ubuntu/Debian: sudo apt install sox\n" +
+                "  - Fedora: sudo dnf install sox\n" +
+                "  - Arch: sudo pacman -S sox"));
         });
         recorder.on("close", (code) => {
-            // timeout returns 124 when it kills the process, which is expected
+            // code 0 = normal exit, 124 = timeout killed it (max duration reached)
             if (code === 0 || code === 124) {
                 resolve(tempFile);
             }
@@ -171,7 +100,7 @@ async function transcribeAudio(audioFilePath, apiKey, language) {
  * ```
  */
 export const VoicePlugin = (options = {}) => async (ctx) => {
-    const { apiKey = process.env.OPENAI_API_KEY, language, defaultDuration = 5, maxDuration = 60, } = options;
+    const { apiKey = process.env.OPENAI_API_KEY, language, silenceDuration = 7, maxDuration = 300, } = options;
     if (!apiKey) {
         console.warn("[Voice Plugin] Warning: OPENAI_API_KEY not set. Voice transcription will fail.");
     }
@@ -180,21 +109,15 @@ export const VoicePlugin = (options = {}) => async (ctx) => {
             voice: tool({
                 description: "Records audio from the user's microphone and transcribes it using OpenAI Whisper. " +
                     "Use this tool when the user wants to provide input via voice or speech. " +
-                    `The tool will record for the specified duration (default ${defaultDuration} seconds) and return the transcribed text.`,
-                args: {
-                    duration: tool.schema
-                        .number()
-                        .optional()
-                        .describe(`Recording duration in seconds. Default is ${defaultDuration} seconds. Max is ${maxDuration} seconds.`),
-                },
-                async execute(args) {
+                    "Recording automatically stops after detecting silence, so the user can speak naturally without specifying a duration.",
+                args: {},
+                async execute() {
                     if (!apiKey) {
                         return "Error: OPENAI_API_KEY environment variable is not set. Please set it to use voice transcription.";
                     }
-                    const duration = Math.min(args.duration || defaultDuration, maxDuration);
                     let audioFile = null;
                     try {
-                        audioFile = await recordAudio(duration);
+                        audioFile = await recordAudio(maxDuration, silenceDuration);
                         const transcription = await transcribeAudio(audioFile, apiKey, language);
                         if (!transcription || transcription.trim() === "") {
                             return "No speech detected. Please try again and speak clearly into your microphone.";

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "speech-opencode",
-  "version": "1.0.0",
+  "version": "1.1.1",
   "description": "Voice input plugin for OpenCode using OpenAI Whisper",
   "keywords": [
     "opencode",