@agentclaws/openclaw-whisper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,71 @@
1
+ # @agentclaws/openclaw-whisper
2
+
3
+ Voice message transcription for [OpenClaw](https://openclaw.com). Send a voice note on Telegram, WhatsApp, or Discord — your agent reads it as text.
4
+
5
+ Fixes [#14374](https://github.com/openclaw/openclaw/issues/14374).
6
+
7
+ ## Quick Start
8
+ ```bash
9
+ openclaw plugins install @agentclaws/openclaw-whisper
10
+ ```
11
+
12
+ Works immediately with local Whisper. No API key needed.
13
+
14
+ ### Requirements (local mode)
15
+ ```bash
16
+ pip install openai-whisper
17
+ apt install ffmpeg
18
+ ```
19
+
20
+ ## Providers
21
+
22
+ | Provider | Cost | Speed | Setup |
23
+ |----------|------|-------|-------|
24
+ | **local** (default) | Free | ~10s/msg | `pip install openai-whisper` |
25
+ | **groq** | Free tier | ~1s/msg | Add `apiKey` to config |
26
+ | **openai** | $0.006/min | ~2s/msg | Add `apiKey` to config |
27
+
28
+ Local mode is the default. For faster transcription, add a Groq or OpenAI key:
29
+ ```json
30
+ {
31
+ "plugins": {
32
+ "openclaw-whisper": {
33
+ "enabled": true,
34
+ "provider": "groq",
35
+ "apiKey": "gsk_your_key_here"
36
+ }
37
+ }
38
+ }
39
+ ```
40
+
41
+ ## How It Works
42
+
43
+ 1. Voice note arrives via Telegram/WhatsApp/Discord
44
+ 2. OpenClaw saves audio to `~/.openclaw/media/inbound/`
45
+ 3. Plugin detects new file, transcribes it
46
+ 4. Text injected into agent context as `[Voice] your message here`
47
+ 5. Agent responds naturally
48
+
49
+ Supports `.ogg` `.opus` `.mp3` `.wav` `.m4a` `.webm` `.flac` up to 25MB.
50
+
51
+ ## Config
52
+
53
+ | Option | Default | Description |
54
+ |--------|---------|-------------|
55
+ | `provider` | `"local"` | `"local"`, `"groq"`, or `"openai"` |
56
+ | `apiKey` | — | API key for groq/openai (not needed for local) |
57
+ | `model` | auto | local: `base`. groq: `whisper-large-v3-turbo`. openai: `whisper-1` |
58
+ | `language` | `"en"` | ISO 639-1 code |
59
+ | `autoTranscribe` | `true` | Watch for new voice messages |
60
+ | `pollSec` | `3` | Check interval (seconds) |
61
+ | `watchDir` | auto | Override media directory path |
62
+
63
+ ## Gateway API
64
+ ```javascript
65
+ await gateway.request("whisper.transcribe", { file: "/path/to/audio.ogg" });
66
+ await gateway.request("whisper.status");
67
+ ```
68
+
69
+ ## License
70
+
71
+ MIT
package/index.ts ADDED
@@ -0,0 +1,106 @@
1
+ import { existsSync, readFileSync, writeFileSync } from "node:fs";
2
+ import { join } from "node:path";
3
+ import type { OpenClawPluginApi, GatewayRequestHandlerOptions } from "openclaw/plugin-sdk";
4
+ import { transcribeAudio, type WhisperProvider } from "./src/transcribe.js";
5
+ import { createWatcher } from "./src/watcher.js";
6
+
7
+ type WhisperConfig = {
8
+ enabled: boolean; provider: WhisperProvider; apiKey: string; model: string;
9
+ language: string; watchDir: string; pollSec: number; autoTranscribe: boolean;
10
+ };
11
+
12
+ function parseConfig(raw: unknown): WhisperConfig {
13
+ const o = raw && typeof raw === "object" && !Array.isArray(raw) ? (raw as Record<string, unknown>) : {};
14
+ const provider = String(o.provider || process.env.WHISPER_PROVIDER || "local");
15
+ const validProvider: WhisperProvider = provider === "openai" ? "openai" : provider === "groq" ? "groq" : "local";
16
+ const defaultModel = validProvider === "openai" ? "whisper-1" : validProvider === "groq" ? "whisper-large-v3-turbo" : "base";
17
+ return {
18
+ enabled: o.enabled !== false,
19
+ provider: validProvider,
20
+ apiKey: String(o.apiKey || process.env.WHISPER_API_KEY || ""),
21
+ model: String(o.model || process.env.WHISPER_MODEL || defaultModel),
22
+ language: String(o.language || process.env.WHISPER_LANG || "en"),
23
+ watchDir: String(o.watchDir || ""),
24
+ pollSec: typeof o.pollSec === "number" && o.pollSec >= 1 ? o.pollSec : 3,
25
+ autoTranscribe: o.autoTranscribe !== false,
26
+ };
27
+ }
28
+
29
+ const pendingTranscriptions: string[] = [];
30
+
31
+ const whisperPlugin = {
32
+ id: "openclaw-whisper",
33
+ name: "Whisper Transcribe",
34
+ description: "Automatic voice message transcription via local Whisper, Groq, or OpenAI",
35
+
36
+ register(api: OpenClawPluginApi) {
37
+ const config = parseConfig(api.pluginConfig);
38
+ if (!config.enabled) { api.logger.info("[whisper] Disabled"); return; }
39
+ if (!config.apiKey && config.provider !== "local") {
40
+ const url = config.provider === "openai" ? "platform.openai.com" : "console.groq.com";
41
+ api.logger.warn("[whisper] No API key. Get one free at " + url + ' and add "apiKey" to config.');
42
+ }
43
+ const watchDir = config.watchDir || findMediaDir();
44
+
45
+ api.registerGatewayMethod("whisper.transcribe", async ({ params, respond }: GatewayRequestHandlerOptions) => {
46
+ const filePath = typeof params?.file === "string" ? params.file.trim() : "";
47
+ if (!filePath) { respond(false, { error: "file path required" }); return; }
48
+ const result = await transcribeAudio(filePath, config);
49
+ if ("text" in result) respond(true, { text: result.text, file: filePath, provider: config.provider });
50
+ else respond(false, { error: result.error });
51
+ });
52
+
53
+ api.registerGatewayMethod("whisper.status", async ({ respond }: GatewayRequestHandlerOptions) => {
54
+ respond(true, { enabled: config.enabled, provider: config.provider, model: config.model,
55
+ language: config.language, hasApiKey: !!config.apiKey, watchDir: watchDir || "(not found)",
56
+ autoTranscribe: config.autoTranscribe, pending: pendingTranscriptions.length });
57
+ });
58
+
59
+ api.on("before_agent_start", () => {
60
+ if (pendingTranscriptions.length === 0) {
61
+ return { prependContext: "Voice transcription is active. Messages prefixed with [Voice] are transcribed from audio." };
62
+ }
63
+ const batch = pendingTranscriptions.splice(0);
64
+ const voiceBlock = batch.map((t) => "[Voice] " + t).join("\n");
65
+ return {
66
+ prependContext: "Voice transcription is active. The following voice messages were just transcribed:\n" + voiceBlock,
67
+ };
68
+ });
69
+
70
+ if (config.autoTranscribe && (config.apiKey || config.provider === "local") && watchDir) {
71
+ api.registerService({
72
+ id: "whisper-watcher",
73
+ label: "Whisper Voice Transcription",
74
+ start: async () => {
75
+ api.logger.info("[whisper] " + config.provider + " | " + config.model + " | watching " + watchDir);
76
+ const watcher = createWatcher({
77
+ watchDir, config, pollMs: config.pollSec * 1000, logger: api.logger,
78
+ onTranscription(text, filename) {
79
+ const preview = text.length > 80 ? text.slice(0, 77) + "..." : text;
80
+ api.logger.info("[whisper] " + filename + " -> " + preview);
81
+ pendingTranscriptions.push(text);
82
+ },
83
+ onError(error, filename) { api.logger.warn("[whisper] " + filename + ": " + error); },
84
+ });
85
+ watcher.start();
86
+ return () => watcher.stop();
87
+ },
88
+ });
89
+ } else if (config.autoTranscribe) {
90
+ if (!config.apiKey && config.provider !== "local") api.logger.warn("[whisper] Auto-transcribe disabled — no API key.");
91
+ if (!watchDir) api.logger.warn("[whisper] Auto-transcribe disabled — media/inbound not found. Set watchDir.");
92
+ }
93
+ },
94
+ };
95
+
96
+ function findMediaDir(): string {
97
+ const home = process.env.HOME || process.env.USERPROFILE || "";
98
+ const candidates = [
99
+ join(home, ".openclaw", "media", "inbound"),
100
+ join(process.env.XDG_DATA_HOME || join(home, ".local", "share"), "openclaw", "media", "inbound"),
101
+ ];
102
+ for (const dir of candidates) { if (existsSync(dir)) return dir; }
103
+ return "";
104
+ }
105
+
106
+ export default whisperPlugin;
@@ -0,0 +1,32 @@
1
+ {
2
+ "id": "openclaw-whisper",
3
+ "name": "Whisper Transcribe",
4
+ "version": "0.1.0",
5
+ "description": "Automatic voice message transcription via local Whisper, Groq, or OpenAI",
6
+ "uiHints": {
7
+ "provider": { "label": "Provider", "help": "local (default, free), groq (free, fast), or openai" },
8
+ "apiKey": { "label": "API Key", "sensitive": true },
9
+ "model": { "label": "Model", "advanced": true },
10
+ "language": { "label": "Language" },
11
+ "sessionId": { "label": "Session ID", "advanced": true },
12
+ "deliverChannel": { "label": "Deliver Channel", "advanced": true },
13
+ "pollSec": { "label": "Poll Interval (sec)", "advanced": true },
14
+ "watchDir": { "label": "Watch Directory", "advanced": true }
15
+ },
16
+ "configSchema": {
17
+ "type": "object",
18
+ "additionalProperties": false,
19
+ "properties": {
20
+ "enabled": { "type": "boolean" },
21
+ "provider": { "type": "string", "enum": ["local", "groq", "openai"] },
22
+ "apiKey": { "type": "string" },
23
+ "model": { "type": "string" },
24
+ "language": { "type": "string" },
25
+ "autoTranscribe": { "type": "boolean" },
26
+ "pollSec": { "type": "integer", "minimum": 1 },
27
+ "sessionId": { "type": "string" },
28
+ "deliverChannel": { "type": "string" },
29
+ "watchDir": { "type": "string" }
30
+ }
31
+ }
32
+ }
package/package.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "name": "@agentclaws/openclaw-whisper",
3
+ "version": "0.1.0",
4
+ "description": "Voice message transcription for OpenClaw via local Whisper, Groq, or OpenAI Whisper API",
5
+ "type": "module",
6
+ "license": "MIT",
7
+ "dependencies": {},
8
+ "devDependencies": { "openclaw": "workspace:*" },
9
+ "openclaw": { "extensions": ["./index.ts"] }
10
+ }
@@ -0,0 +1,107 @@
1
import { existsSync, readFileSync, statSync, mkdirSync, unlinkSync } from "node:fs";
import { basename, extname, join } from "node:path";
import { execSync, execFileSync } from "node:child_process";
import { tmpdir } from "node:os";
5
+
6
+ export type WhisperProvider = "local" | "groq" | "openai";
7
+
8
+ export type TranscribeConfig = {
9
+ provider: WhisperProvider;
10
+ apiKey: string;
11
+ model: string;
12
+ language: string;
13
+ };
14
+
15
+ const MIME_TYPES: Record<string, string> = {
16
+ ".ogg": "audio/ogg", ".oga": "audio/ogg", ".opus": "audio/ogg",
17
+ ".mp3": "audio/mpeg", ".wav": "audio/wav", ".m4a": "audio/mp4",
18
+ ".webm": "audio/webm", ".flac": "audio/flac",
19
+ };
20
+
21
+ const MAX_FILE_SIZE = 25 * 1024 * 1024;
22
+ export const AUDIO_EXTENSIONS = new Set(Object.keys(MIME_TYPES));
23
+
24
+ export async function transcribeAudio(
25
+ filePath: string, config: TranscribeConfig
26
+ ): Promise<{ text: string } | { error: string }> {
27
+ if (!existsSync(filePath)) return { error: "File not found: " + filePath };
28
+ const size = statSync(filePath).size;
29
+ if (size < 100) return { error: "File too small — likely empty or corrupt" };
30
+ if (size > MAX_FILE_SIZE) return { error: "File too large (" + Math.round(size / 1024 / 1024) + "MB). Max 25MB" };
31
+
32
+ if (config.provider === "local") return transcribeLocal(filePath, config);
33
+ return transcribeApi(filePath, config);
34
+ }
35
+
36
+ function transcribeLocal(
37
+ filePath: string, config: TranscribeConfig
38
+ ): { text: string } | { error: string } {
39
+ const outDir = join(tmpdir(), "agentclaws-whisper");
40
+ mkdirSync(outDir, { recursive: true });
41
+ try {
42
+ execSync(
43
+ 'whisper "' + filePath + '" --model ' + config.model + ' --language ' + config.language + ' --output_format txt --output_dir "' + outDir + '"',
44
+ { timeout: 180000, stdio: "pipe" }
45
+ );
46
+ } catch {
47
+ return { error: "Local whisper failed. Install it: pip install openai-whisper && apt install ffmpeg" };
48
+ }
49
+ const stem = basename(filePath).replace(/\.[^.]+$/, "");
50
+ const txtPath = join(outDir, stem + ".txt");
51
+ if (existsSync(txtPath)) {
52
+ const text = readFileSync(txtPath, "utf-8").trim();
53
+ try { unlinkSync(txtPath); } catch {}
54
+ return text ? { text } : { error: "Empty transcription — audio may be silent" };
55
+ }
56
+ return { error: "Whisper produced no output file" };
57
+ }
58
+
59
+ async function transcribeApi(
60
+ filePath: string, config: TranscribeConfig
61
+ ): Promise<{ text: string } | { error: string }> {
62
+ if (!config.apiKey) {
63
+ return { error: "API key required for " + config.provider + ". Set apiKey in plugin config." };
64
+ }
65
+ const url = config.provider === "openai"
66
+ ? "https://api.openai.com/v1/audio/transcriptions"
67
+ : "https://api.groq.com/openai/v1/audio/transcriptions";
68
+
69
+ const ext = extname(filePath).toLowerCase();
70
+ const mimeType = MIME_TYPES[ext] || "audio/ogg";
71
+ const audioData = readFileSync(filePath);
72
+ const filename = basename(filePath);
73
+ const boundary = "----ACWhisper" + Date.now();
74
+
75
+ const parts: Buffer[] = [];
76
+ const enc = (s: string) => Buffer.from(s, "utf-8");
77
+ parts.push(enc("--" + boundary + "\r\nContent-Disposition: form-data; name=\"file\"; filename=\"" + filename + "\"\r\nContent-Type: " + mimeType + "\r\n\r\n"));
78
+ parts.push(audioData);
79
+ parts.push(enc("\r\n"));
80
+ parts.push(enc("--" + boundary + "\r\nContent-Disposition: form-data; name=\"model\"\r\n\r\n" + config.model + "\r\n"));
81
+ if (config.language) {
82
+ parts.push(enc("--" + boundary + "\r\nContent-Disposition: form-data; name=\"language\"\r\n\r\n" + config.language + "\r\n"));
83
+ }
84
+ parts.push(enc("--" + boundary + "\r\nContent-Disposition: form-data; name=\"response_format\"\r\n\r\njson\r\n"));
85
+ parts.push(enc("--" + boundary + "--\r\n"));
86
+
87
+ try {
88
+ const res = await fetch(url, {
89
+ method: "POST",
90
+ headers: { Authorization: "Bearer " + config.apiKey, "Content-Type": "multipart/form-data; boundary=" + boundary },
91
+ body: Buffer.concat(parts),
92
+ signal: AbortSignal.timeout(60000),
93
+ });
94
+ if (!res.ok) {
95
+ const errBody = await res.text().catch(() => "");
96
+ if (res.status === 401) return { error: "Invalid API key for " + config.provider };
97
+ if (res.status === 429) return { error: config.provider + " rate limit — try again shortly" };
98
+ return { error: config.provider + " error " + res.status + ": " + errBody.slice(0, 200) };
99
+ }
100
+ const json = (await res.json()) as Record<string, unknown>;
101
+ const text = typeof json.text === "string" ? json.text.trim() : "";
102
+ return text ? { text } : { error: "Empty transcription — audio may be silent" };
103
+ } catch (err) {
104
+ if (err instanceof Error && err.name === "TimeoutError") return { error: "Timed out (60s)" };
105
+ return { error: "Network error: " + (err instanceof Error ? err.message : String(err)) };
106
+ }
107
+ }
package/src/watcher.ts ADDED
@@ -0,0 +1,68 @@
1
+ import { readdirSync, existsSync, statSync, writeFileSync, readFileSync, appendFileSync } from "node:fs";
2
+ import { join, extname } from "node:path";
3
+ import { transcribeAudio, AUDIO_EXTENSIONS, type TranscribeConfig } from "./transcribe.js";
4
+
5
+ const DONE_FILE = ".whisper_done";
6
+ const MAX_DONE_ENTRIES = 5000;
7
+ type Logger = { info(msg: string): void; warn(msg: string): void };
8
+ type WatcherOpts = {
9
+ watchDir: string; config: TranscribeConfig; pollMs: number; logger: Logger;
10
+ onTranscription: (text: string, filename: string) => void;
11
+ onError: (error: string, filename: string) => void;
12
+ };
13
+
14
+ export function createWatcher(opts: WatcherOpts) {
15
+ let timer: ReturnType<typeof setInterval> | null = null;
16
+ let processing = false;
17
+ let done: Set<string>;
18
+
19
+ function loadDone(): Set<string> {
20
+ const p = join(opts.watchDir, DONE_FILE);
21
+ if (!existsSync(p)) return new Set();
22
+ const lines = readFileSync(p, "utf-8").trim().split("\n").filter(Boolean);
23
+ if (lines.length > MAX_DONE_ENTRIES) {
24
+ const trimmed = lines.slice(-MAX_DONE_ENTRIES);
25
+ writeFileSync(p, trimmed.join("\n") + "\n");
26
+ return new Set(trimmed);
27
+ }
28
+ return new Set(lines);
29
+ }
30
+
31
+ function markDone(name: string) {
32
+ done.add(name);
33
+ appendFileSync(join(opts.watchDir, DONE_FILE), name + "\n");
34
+ }
35
+
36
+ function isStable(filePath: string): boolean {
37
+ try {
38
+ const stat = statSync(filePath);
39
+ return stat.size > 0 && Date.now() - stat.mtimeMs > 2000;
40
+ } catch { return false; }
41
+ }
42
+
43
+ async function poll() {
44
+ if (processing || !existsSync(opts.watchDir)) return;
45
+ processing = true;
46
+ try {
47
+ const files = readdirSync(opts.watchDir)
48
+ .filter((f) => AUDIO_EXTENSIONS.has(extname(f).toLowerCase()) && !done.has(f))
49
+ .sort();
50
+ for (const file of files) {
51
+ const fullPath = join(opts.watchDir, file);
52
+ if (!isStable(fullPath)) continue;
53
+ const size = statSync(fullPath).size;
54
+ opts.logger.info(`[whisper] New: ${file} (${Math.round(size / 1024)}KB)`);
55
+ const result = await transcribeAudio(fullPath, opts.config);
56
+ markDone(file);
57
+ if ("text" in result) { opts.onTranscription(result.text, file); }
58
+ else { opts.onError(result.error, file); }
59
+ }
60
+ } catch (err) { opts.logger.warn(`[whisper] Poll error: ${err}`); }
61
+ finally { processing = false; }
62
+ }
63
+
64
+ return {
65
+ start() { done = loadDone(); poll(); timer = setInterval(poll, opts.pollMs); },
66
+ stop() { if (timer) { clearInterval(timer); timer = null; } },
67
+ };
68
+ }