@agentclaws/openclaw-whisper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,71 @@
1
+ # @agentclaws/openclaw-whisper
2
+
3
+ Voice message transcription for [OpenClaw](https://openclaw.com). Send a voice note on Telegram, WhatsApp, or Discord — your agent reads it as text.
4
+
5
+ Fixes [#14374](https://github.com/openclaw/openclaw/issues/14374).
6
+
7
+ ## Quick Start
8
+ ```bash
9
+ openclaw plugins install @agentclaws/openclaw-whisper
10
+ ```
11
+
12
+ Works immediately with local Whisper. No API key needed.
13
+
14
+ ### Requirements (local mode)
15
+ ```bash
16
+ pip install openai-whisper
17
+ apt install ffmpeg
18
+ ```
19
+
20
+ ## Providers
21
+
22
+ | Provider | Cost | Speed | Setup |
23
+ |----------|------|-------|-------|
24
+ | **local** (default) | Free | ~10s/msg | `pip install openai-whisper` |
25
+ | **groq** | Free tier | ~1s/msg | Add `apiKey` to config |
26
+ | **openai** | $0.006/min | ~2s/msg | Add `apiKey` to config |
27
+
28
+ Local mode is the default. For faster transcription, add a Groq or OpenAI key:
29
+ ```json
30
+ {
31
+ "plugins": {
32
+ "openclaw-whisper": {
33
+ "enabled": true,
34
+ "provider": "groq",
35
+ "apiKey": "gsk_your_key_here"
36
+ }
37
+ }
38
+ }
39
+ ```
40
+
41
+ ## How It Works
42
+
43
+ 1. Voice note arrives via Telegram/WhatsApp/Discord
44
+ 2. OpenClaw saves audio to `~/.openclaw/media/inbound/`
45
+ 3. Plugin detects new file, transcribes it
46
+ 4. Text injected into agent context as `[Voice] your message here`
47
+ 5. Agent responds naturally
48
+
49
+ Supports `.ogg` `.opus` `.mp3` `.wav` `.m4a` `.webm` `.flac` up to 25MB.
50
+
51
+ ## Config
52
+
53
+ | Option | Default | Description |
54
+ |--------|---------|-------------|
55
+ | `provider` | `"local"` | `"local"`, `"groq"`, or `"openai"` |
56
+ | `apiKey` | — | API key for groq/openai (not needed for local) |
57
+ | `model` | auto | local: `base`. groq: `whisper-large-v3-turbo`. openai: `whisper-1` |
58
+ | `language` | `"en"` | ISO 639-1 code |
59
+ | `autoTranscribe` | `true` | Watch for new voice messages |
60
+ | `pollSec` | `3` | Check interval (seconds) |
61
+ | `watchDir` | auto | Override media directory path |
62
+
63
+ ## Gateway API
64
+ ```javascript
65
+ await gateway.request("whisper.transcribe", { file: "/path/to/audio.ogg" });
66
+ await gateway.request("whisper.status");
67
+ ```
68
+
69
+ ## License
70
+
71
+ MIT
package/index.ts ADDED
@@ -0,0 +1,106 @@
1
+ import { existsSync, readFileSync, writeFileSync } from "node:fs";
2
+ import { join } from "node:path";
3
+ import type { OpenClawPluginApi, GatewayRequestHandlerOptions } from "openclaw/plugin-sdk";
4
+ import { transcribeAudio, type WhisperProvider } from "./src/transcribe.js";
5
+ import { createWatcher } from "./src/watcher.js";
6
+
7
+ type WhisperConfig = {
8
+ enabled: boolean; provider: WhisperProvider; apiKey: string; model: string;
9
+ language: string; watchDir: string; pollSec: number; autoTranscribe: boolean;
10
+ };
11
+
12
+ function parseConfig(raw: unknown): WhisperConfig {
13
+ const o = raw && typeof raw === "object" && !Array.isArray(raw) ? (raw as Record<string, unknown>) : {};
14
+ const provider = String(o.provider || process.env.WHISPER_PROVIDER || "local");
15
+ const validProvider: WhisperProvider = provider === "openai" ? "openai" : provider === "groq" ? "groq" : "local";
16
+ const defaultModel = validProvider === "openai" ? "whisper-1" : validProvider === "groq" ? "whisper-large-v3-turbo" : "base";
17
+ return {
18
+ enabled: o.enabled !== false,
19
+ provider: validProvider,
20
+ apiKey: String(o.apiKey || process.env.WHISPER_API_KEY || ""),
21
+ model: String(o.model || process.env.WHISPER_MODEL || defaultModel),
22
+ language: String(o.language || process.env.WHISPER_LANG || "en"),
23
+ watchDir: String(o.watchDir || ""),
24
+ pollSec: typeof o.pollSec === "number" && o.pollSec >= 1 ? o.pollSec : 3,
25
+ autoTranscribe: o.autoTranscribe !== false,
26
+ };
27
+ }
28
+
29
+ const pendingTranscriptions: string[] = [];
30
+
31
+ const whisperPlugin = {
32
+ id: "openclaw-whisper",
33
+ name: "Whisper Transcribe",
34
+ description: "Automatic voice message transcription via local Whisper, Groq, or OpenAI",
35
+
36
+ register(api: OpenClawPluginApi) {
37
+ const config = parseConfig(api.pluginConfig);
38
+ if (!config.enabled) { api.logger.info("[whisper] Disabled"); return; }
39
+ if (!config.apiKey && config.provider !== "local") {
40
+ const url = config.provider === "openai" ? "platform.openai.com" : "console.groq.com";
41
+ api.logger.warn("[whisper] No API key. Get one free at " + url + ' and add "apiKey" to config.');
42
+ }
43
+ const watchDir = config.watchDir || findMediaDir();
44
+
45
+ api.registerGatewayMethod("whisper.transcribe", async ({ params, respond }: GatewayRequestHandlerOptions) => {
46
+ const filePath = typeof params?.file === "string" ? params.file.trim() : "";
47
+ if (!filePath) { respond(false, { error: "file path required" }); return; }
48
+ const result = await transcribeAudio(filePath, config);
49
+ if ("text" in result) respond(true, { text: result.text, file: filePath, provider: config.provider });
50
+ else respond(false, { error: result.error });
51
+ });
52
+
53
+ api.registerGatewayMethod("whisper.status", async ({ respond }: GatewayRequestHandlerOptions) => {
54
+ respond(true, { enabled: config.enabled, provider: config.provider, model: config.model,
55
+ language: config.language, hasApiKey: !!config.apiKey, watchDir: watchDir || "(not found)",
56
+ autoTranscribe: config.autoTranscribe, pending: pendingTranscriptions.length });
57
+ });
58
+
59
+ api.on("before_agent_start", () => {
60
+ if (pendingTranscriptions.length === 0) {
61
+ return { prependContext: "Voice transcription is active. Messages prefixed with [Voice] are transcribed from audio." };
62
+ }
63
+ const batch = pendingTranscriptions.splice(0);
64
+ const voiceBlock = batch.map((t) => "[Voice] " + t).join("\n");
65
+ return {
66
+ prependContext: "Voice transcription is active. The following voice messages were just transcribed:\n" + voiceBlock,
67
+ };
68
+ });
69
+
70
+ if (config.autoTranscribe && (config.apiKey || config.provider === "local") && watchDir) {
71
+ api.registerService({
72
+ id: "whisper-watcher",
73
+ label: "Whisper Voice Transcription",
74
+ start: async () => {
75
+ api.logger.info("[whisper] " + config.provider + " | " + config.model + " | watching " + watchDir);
76
+ const watcher = createWatcher({
77
+ watchDir, config, pollMs: config.pollSec * 1000, logger: api.logger,
78
+ onTranscription(text, filename) {
79
+ const preview = text.length > 80 ? text.slice(0, 77) + "..." : text;
80
+ api.logger.info("[whisper] " + filename + " -> " + preview);
81
+ pendingTranscriptions.push(text);
82
+ },
83
+ onError(error, filename) { api.logger.warn("[whisper] " + filename + ": " + error); },
84
+ });
85
+ watcher.start();
86
+ return () => watcher.stop();
87
+ },
88
+ });
89
+ } else if (config.autoTranscribe) {
90
+ if (!config.apiKey && config.provider !== "local") api.logger.warn("[whisper] Auto-transcribe disabled — no API key.");
91
+ if (!watchDir) api.logger.warn("[whisper] Auto-transcribe disabled — media/inbound not found. Set watchDir.");
92
+ }
93
+ },
94
+ };
95
+
96
+ function findMediaDir(): string {
97
+ const home = process.env.HOME || process.env.USERPROFILE || "";
98
+ const candidates = [
99
+ join(home, ".openclaw", "media", "inbound"),
100
+ join(process.env.XDG_DATA_HOME || join(home, ".local", "share"), "openclaw", "media", "inbound"),
101
+ ];
102
+ for (const dir of candidates) { if (existsSync(dir)) return dir; }
103
+ return "";
104
+ }
105
+
106
+ export default whisperPlugin;
@@ -0,0 +1,32 @@
1
+ {
2
+ "id": "openclaw-whisper",
3
+ "name": "Whisper Transcribe",
4
+ "version": "0.1.0",
5
+ "description": "Automatic voice message transcription via local Whisper, Groq, or OpenAI",
6
+ "uiHints": {
7
+ "provider": { "label": "Provider", "help": "local (default, free), groq (free, fast), or openai" },
8
+ "apiKey": { "label": "API Key", "sensitive": true },
9
+ "model": { "label": "Model", "advanced": true },
10
+ "language": { "label": "Language" },
11
+ "sessionId": { "label": "Session ID", "advanced": true },
12
+ "deliverChannel": { "label": "Deliver Channel", "advanced": true },
13
+ "pollSec": { "label": "Poll Interval (sec)", "advanced": true },
14
+ "watchDir": { "label": "Watch Directory", "advanced": true }
15
+ },
16
+ "configSchema": {
17
+ "type": "object",
18
+ "additionalProperties": false,
19
+ "properties": {
20
+ "enabled": { "type": "boolean" },
21
+ "provider": { "type": "string", "enum": ["local", "groq", "openai"] },
22
+ "apiKey": { "type": "string" },
23
+ "model": { "type": "string" },
24
+ "language": { "type": "string" },
25
+ "autoTranscribe": { "type": "boolean" },
26
+ "pollSec": { "type": "integer", "minimum": 1 },
27
+ "sessionId": { "type": "string" },
28
+ "deliverChannel": { "type": "string" },
29
+ "watchDir": { "type": "string" }
30
+ }
31
+ }
32
+ }
package/package.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "name": "@agentclaws/openclaw-whisper",
3
+ "version": "0.1.0",
4
+ "description": "Voice message transcription for OpenClaw via local Whisper, Groq, or OpenAI Whisper API",
5
+ "type": "module",
6
+ "license": "MIT",
7
+ "dependencies": {},
8
+ "devDependencies": { "openclaw": "workspace:*" },
9
+ "openclaw": { "extensions": ["./index.ts"] }
10
+ }
@@ -0,0 +1,107 @@
1
import { existsSync, readFileSync, statSync, mkdirSync, unlinkSync } from "node:fs";
import { basename, extname, join } from "node:path";
import { execSync, execFileSync } from "node:child_process";
import { tmpdir } from "node:os";
5
+
6
+ export type WhisperProvider = "local" | "groq" | "openai";
7
+
8
+ export type TranscribeConfig = {
9
+ provider: WhisperProvider;
10
+ apiKey: string;
11
+ model: string;
12
+ language: string;
13
+ };
14
+
15
+ const MIME_TYPES: Record<string, string> = {
16
+ ".ogg": "audio/ogg", ".oga": "audio/ogg", ".opus": "audio/ogg",
17
+ ".mp3": "audio/mpeg", ".wav": "audio/wav", ".m4a": "audio/mp4",
18
+ ".webm": "audio/webm", ".flac": "audio/flac",
19
+ };
20
+
21
+ const MAX_FILE_SIZE = 25 * 1024 * 1024;
22
+ export const AUDIO_EXTENSIONS = new Set(Object.keys(MIME_TYPES));
23
+
24
+ export async function transcribeAudio(
25
+ filePath: string, config: TranscribeConfig
26
+ ): Promise<{ text: string } | { error: string }> {
27
+ if (!existsSync(filePath)) return { error: "File not found: " + filePath };
28
+ const size = statSync(filePath).size;
29
+ if (size < 100) return { error: "File too small — likely empty or corrupt" };
30
+ if (size > MAX_FILE_SIZE) return { error: "File too large (" + Math.round(size / 1024 / 1024) + "MB). Max 25MB" };
31
+
32
+ if (config.provider === "local") return transcribeLocal(filePath, config);
33
+ return transcribeApi(filePath, config);
34
+ }
35
+
36
+ function transcribeLocal(
37
+ filePath: string, config: TranscribeConfig
38
+ ): { text: string } | { error: string } {
39
+ const outDir = join(tmpdir(), "agentclaws-whisper");
40
+ mkdirSync(outDir, { recursive: true });
41
+ try {
42
+ execSync(
43
+ 'whisper "' + filePath + '" --model ' + config.model + ' --language ' + config.language + ' --output_format txt --output_dir "' + outDir + '"',
44
+ { timeout: 180000, stdio: "pipe" }
45
+ );
46
+ } catch {
47
+ return { error: "Local whisper failed. Install it: pip install openai-whisper && apt install ffmpeg" };
48
+ }
49
+ const stem = basename(filePath).replace(/\.[^.]+$/, "");
50
+ const txtPath = join(outDir, stem + ".txt");
51
+ if (existsSync(txtPath)) {
52
+ const text = readFileSync(txtPath, "utf-8").trim();
53
+ try { unlinkSync(txtPath); } catch {}
54
+ return text ? { text } : { error: "Empty transcription — audio may be silent" };
55
+ }
56
+ return { error: "Whisper produced no output file" };
57
+ }
58
+
59
+ async function transcribeApi(
60
+ filePath: string, config: TranscribeConfig
61
+ ): Promise<{ text: string } | { error: string }> {
62
+ if (!config.apiKey) {
63
+ return { error: "API key required for " + config.provider + ". Set apiKey in plugin config." };
64
+ }
65
+ const url = config.provider === "openai"
66
+ ? "https://api.openai.com/v1/audio/transcriptions"
67
+ : "https://api.groq.com/openai/v1/audio/transcriptions";
68
+
69
+ const ext = extname(filePath).toLowerCase();
70
+ const mimeType = MIME_TYPES[ext] || "audio/ogg";
71
+ const audioData = readFileSync(filePath);
72
+ const filename = basename(filePath);
73
+ const boundary = "----ACWhisper" + Date.now();
74
+
75
+ const parts: Buffer[] = [];
76
+ const enc = (s: string) => Buffer.from(s, "utf-8");
77
+ parts.push(enc("--" + boundary + "\r\nContent-Disposition: form-data; name=\"file\"; filename=\"" + filename + "\"\r\nContent-Type: " + mimeType + "\r\n\r\n"));
78
+ parts.push(audioData);
79
+ parts.push(enc("\r\n"));
80
+ parts.push(enc("--" + boundary + "\r\nContent-Disposition: form-data; name=\"model\"\r\n\r\n" + config.model + "\r\n"));
81
+ if (config.language) {
82
+ parts.push(enc("--" + boundary + "\r\nContent-Disposition: form-data; name=\"language\"\r\n\r\n" + config.language + "\r\n"));
83
+ }
84
+ parts.push(enc("--" + boundary + "\r\nContent-Disposition: form-data; name=\"response_format\"\r\n\r\njson\r\n"));
85
+ parts.push(enc("--" + boundary + "--\r\n"));
86
+
87
+ try {
88
+ const res = await fetch(url, {
89
+ method: "POST",
90
+ headers: { Authorization: "Bearer " + config.apiKey, "Content-Type": "multipart/form-data; boundary=" + boundary },
91
+ body: Buffer.concat(parts),
92
+ signal: AbortSignal.timeout(60000),
93
+ });
94
+ if (!res.ok) {
95
+ const errBody = await res.text().catch(() => "");
96
+ if (res.status === 401) return { error: "Invalid API key for " + config.provider };
97
+ if (res.status === 429) return { error: config.provider + " rate limit — try again shortly" };
98
+ return { error: config.provider + " error " + res.status + ": " + errBody.slice(0, 200) };
99
+ }
100
+ const json = (await res.json()) as Record<string, unknown>;
101
+ const text = typeof json.text === "string" ? json.text.trim() : "";
102
+ return text ? { text } : { error: "Empty transcription — audio may be silent" };
103
+ } catch (err) {
104
+ if (err instanceof Error && err.name === "TimeoutError") return { error: "Timed out (60s)" };
105
+ return { error: "Network error: " + (err instanceof Error ? err.message : String(err)) };
106
+ }
107
+ }
package/src/watcher.ts ADDED
@@ -0,0 +1,68 @@
1
+ import { readdirSync, existsSync, statSync, writeFileSync, readFileSync, appendFileSync } from "node:fs";
2
+ import { join, extname } from "node:path";
3
+ import { transcribeAudio, AUDIO_EXTENSIONS, type TranscribeConfig } from "./transcribe.js";
4
+
5
+ const DONE_FILE = ".whisper_done";
6
+ const MAX_DONE_ENTRIES = 5000;
7
+ type Logger = { info(msg: string): void; warn(msg: string): void };
8
+ type WatcherOpts = {
9
+ watchDir: string; config: TranscribeConfig; pollMs: number; logger: Logger;
10
+ onTranscription: (text: string, filename: string) => void;
11
+ onError: (error: string, filename: string) => void;
12
+ };
13
+
14
+ export function createWatcher(opts: WatcherOpts) {
15
+ let timer: ReturnType<typeof setInterval> | null = null;
16
+ let processing = false;
17
+ let done: Set<string>;
18
+
19
+ function loadDone(): Set<string> {
20
+ const p = join(opts.watchDir, DONE_FILE);
21
+ if (!existsSync(p)) return new Set();
22
+ const lines = readFileSync(p, "utf-8").trim().split("\n").filter(Boolean);
23
+ if (lines.length > MAX_DONE_ENTRIES) {
24
+ const trimmed = lines.slice(-MAX_DONE_ENTRIES);
25
+ writeFileSync(p, trimmed.join("\n") + "\n");
26
+ return new Set(trimmed);
27
+ }
28
+ return new Set(lines);
29
+ }
30
+
31
+ function markDone(name: string) {
32
+ done.add(name);
33
+ appendFileSync(join(opts.watchDir, DONE_FILE), name + "\n");
34
+ }
35
+
36
+ function isStable(filePath: string): boolean {
37
+ try {
38
+ const stat = statSync(filePath);
39
+ return stat.size > 0 && Date.now() - stat.mtimeMs > 2000;
40
+ } catch { return false; }
41
+ }
42
+
43
+ async function poll() {
44
+ if (processing || !existsSync(opts.watchDir)) return;
45
+ processing = true;
46
+ try {
47
+ const files = readdirSync(opts.watchDir)
48
+ .filter((f) => AUDIO_EXTENSIONS.has(extname(f).toLowerCase()) && !done.has(f))
49
+ .sort();
50
+ for (const file of files) {
51
+ const fullPath = join(opts.watchDir, file);
52
+ if (!isStable(fullPath)) continue;
53
+ const size = statSync(fullPath).size;
54
+ opts.logger.info(`[whisper] New: ${file} (${Math.round(size / 1024)}KB)`);
55
+ const result = await transcribeAudio(fullPath, opts.config);
56
+ markDone(file);
57
+ if ("text" in result) { opts.onTranscription(result.text, file); }
58
+ else { opts.onError(result.error, file); }
59
+ }
60
+ } catch (err) { opts.logger.warn(`[whisper] Poll error: ${err}`); }
61
+ finally { processing = false; }
62
+ }
63
+
64
+ return {
65
+ start() { done = loadDone(); poll(); timer = setInterval(poll, opts.pollMs); },
66
+ stop() { if (timer) { clearInterval(timer); timer = null; } },
67
+ };
68
+ }