@keychat-io/keychat 0.1.25 → 0.1.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -39
- package/package.json +1 -1
- package/src/bridge-client.ts +7 -2
- package/src/channel.ts +58 -6
- package/src/media.ts +16 -0
- package/src/stt.ts +159 -0
- package/src/tts.ts +103 -0
package/README.md
CHANGED
|
@@ -6,8 +6,6 @@ E2E encrypted AI agent communication via Keychat protocol.
|
|
|
6
6
|
|
|
7
7
|
This plugin gives your OpenClaw agent a **sovereign identity** — a self-generated Public Key ID (Nostr keypair) — and enables **end-to-end encrypted communication** using the Signal Protocol over Nostr relays.
|
|
8
8
|
|
|
9
|
-
Your agent becomes a full Keychat citizen: it can receive friend requests, establish Signal Protocol sessions, and exchange messages with Keychat app users. All messages are encrypted with forward and backward secrecy — not even relay operators can read them.
|
|
10
|
-
|
|
11
9
|
## Install
|
|
12
10
|
|
|
13
11
|
```bash
|
|
@@ -17,42 +15,22 @@ openclaw gateway restart
|
|
|
17
15
|
|
|
18
16
|
That's it. The plugin automatically downloads the bridge binary and initializes the config on first load.
|
|
19
17
|
|
|
20
|
-
Supported platforms: macOS (ARM/x64), Linux (x64/ARM64).
|
|
21
|
-
|
|
22
|
-
### Security Warnings
|
|
23
|
-
|
|
24
|
-
During installation, OpenClaw will show the following warning:
|
|
25
|
-
|
|
26
|
-
```
|
|
27
|
-
WARNING: Plugin "keychat" contains dangerous code patterns:
|
|
28
|
-
Shell command execution detected (child_process) (src/bridge-client.ts)
|
|
29
|
-
Shell command execution detected (child_process) (src/keychain.ts)
|
|
30
|
-
```
|
|
31
|
-
|
|
32
|
-
**Both are expected and required.** Here's why:
|
|
33
|
-
|
|
34
|
-
| Warning | File | Why it's needed |
|
|
35
|
-
|---------|------|----------------|
|
|
36
|
-
| Shell command execution (child_process) | `bridge-client.ts` | Spawns the Rust sidecar process for Signal Protocol & MLS encryption. Keychat's cryptography runs in Rust and must be bridged to Node.js. |
|
|
37
|
-
| Shell command execution (child_process) | `keychain.ts` | Stores the agent's identity mnemonic in the OS keychain (macOS Keychain / Linux libsecret) instead of plain-text files. |
|
|
38
|
-
|
|
39
|
-
These warnings cannot be removed without sacrificing core functionality or security. The plugin does **not** execute arbitrary commands — it only spawns the bundled bridge binary and accesses the system keychain.
|
|
40
|
-
|
|
41
18
|
Alternatively, install via shell script:
|
|
42
19
|
|
|
43
20
|
```bash
|
|
44
21
|
curl -fsSL https://raw.githubusercontent.com/keychat-io/keychat-openclaw/main/scripts/install.sh | bash
|
|
45
22
|
```
|
|
46
23
|
|
|
24
|
+
Supported platforms: macOS (ARM/x64), Linux (x64/ARM64).
|
|
25
|
+
|
|
47
26
|
### Security Warnings
|
|
48
27
|
|
|
49
|
-
During installation, OpenClaw's security scanner may show
|
|
28
|
+
During installation, OpenClaw's security scanner may show two warnings. Both are expected:
|
|
50
29
|
|
|
51
|
-
| Warning | Reason
|
|
52
|
-
| ------------------------------------------ |
|
|
53
|
-
| Shell command execution (bridge-client.ts) | Spawns a Rust sidecar for Signal Protocol and MLS encryption.
|
|
54
|
-
| Shell command execution (keychain.ts) | Stores identity mnemonics in the OS keychain (macOS Keychain / Linux libsecret).
|
|
55
|
-
| Shell command execution (notify.ts) | Notifies the agent on startup so it can send the Keychat ID and QR code to the user. |
|
|
30
|
+
| Warning | Reason |
|
|
31
|
+
| ------------------------------------------ | -------------------------------------------------------------------------------- |
|
|
32
|
+
| Shell command execution (bridge-client.ts) | Spawns a Rust sidecar for Signal Protocol and MLS encryption. |
|
|
33
|
+
| Shell command execution (keychain.ts) | Stores identity mnemonics in the OS keychain (macOS Keychain / Linux libsecret). |
|
|
56
34
|
|
|
57
35
|
Source code is fully open: [github.com/keychat-io/keychat-openclaw](https://github.com/keychat-io/keychat-openclaw)
|
|
58
36
|
|
|
@@ -61,7 +39,7 @@ Source code is fully open: [github.com/keychat-io/keychat-openclaw](https://gith
|
|
|
61
39
|
Tell your agent "upgrade keychat" in any chat, or manually:
|
|
62
40
|
|
|
63
41
|
```bash
|
|
64
|
-
openclaw plugins
|
|
42
|
+
openclaw plugins update keychat
|
|
65
43
|
openclaw gateway restart
|
|
66
44
|
```
|
|
67
45
|
|
|
@@ -77,15 +55,7 @@ After `openclaw gateway restart`, the agent will send you its **Keychat ID**, **
|
|
|
77
55
|
|
|
78
56
|
Open the [Keychat app](https://keychat.io) → tap the link, paste the npub, or scan the QR code to add as contact. If `dmPolicy` is `open` (default after auto-init), the agent accepts immediately.
|
|
79
57
|
|
|
80
|
-
**Can't find the public key?**
|
|
81
|
-
|
|
82
|
-
```bash
|
|
83
|
-
# View the agent's npub in config
|
|
84
|
-
cat ~/.openclaw/openclaw.json | grep npub
|
|
85
|
-
|
|
86
|
-
# Or watch the gateway logs for the Keychat ID
|
|
87
|
-
openclaw logs --follow
|
|
88
|
-
```
|
|
58
|
+
**Can't find the public key?** Just ask your agent in chat: "What's your Keychat ID?"
|
|
89
59
|
|
|
90
60
|
## Configuration
|
|
91
61
|
|
package/package.json
CHANGED
package/src/bridge-client.ts
CHANGED
|
@@ -306,8 +306,13 @@ export class KeychatBridgeClient {
|
|
|
306
306
|
this.pending.set(id, { resolve, reject });
|
|
307
307
|
|
|
308
308
|
const request = JSON.stringify({ id, method, params: params ?? {} });
|
|
309
|
-
|
|
310
|
-
|
|
309
|
+
try {
|
|
310
|
+
this.process.stdin.write(request + '\n');
|
|
311
|
+
} catch (error) {
|
|
312
|
+
this.pending.delete(id);
|
|
313
|
+
reject(new Error(`Bridge write failed: ${error}`));
|
|
314
|
+
return;
|
|
315
|
+
}
|
|
311
316
|
// Timeout after 30 seconds
|
|
312
317
|
setTimeout(() => {
|
|
313
318
|
if (this.pending.has(id)) {
|
package/src/channel.ts
CHANGED
|
@@ -17,6 +17,28 @@ import {
|
|
|
17
17
|
formatPairingApproveHint,
|
|
18
18
|
type ChannelPlugin,
|
|
19
19
|
} from "openclaw/plugin-sdk";
|
|
20
|
+
|
|
21
|
+
/**
 * Remove the reasoning preamble that OpenClaw core prepends to a reply when
 * reasoning display is enabled. Keychat has no collapsible UI for it, so the
 * preamble is dropped to keep delivered messages clean.
 *
 * Two layouts are recognised, both only at the very start of the text:
 *   1. A `Reasoning:` line followed by one or more italic lines (`_..._`).
 *   2. One or more leading italic lines without the `Reasoning:` label.
 *
 * A line counts as italic only when its closing underscore ends the line
 * (trailing spaces/tabs allowed). An answer that merely begins with an
 * emphasised word (`_Note_: ...`) or an identifier such as `__init__` is
 * therefore left untouched.
 *
 * @param text - Outgoing message text.
 * @returns The text without the leading reasoning block, trimmed. May be an
 *          empty string when the whole message was a reasoning block.
 */
function stripReasoningPrefix(text: string): string {
  // Each italic line must be terminated by newline(s) or the end of input,
  // which is what keeps partially-italic first lines from being eaten.
  const labelledBlock = /^Reasoning:\n(?:_[^\n]*_[ \t]*(?:\n+|$))+/;
  const bareItalicBlock = /^(?:_[^\n]*_[ \t]*(?:\n+|$))+/;

  return text
    .replace(labelledBlock, "")
    .replace(bareItalicBlock, "")
    .trim();
}
|
|
20
42
|
import { KeychatConfigSchema } from "./config-schema.js";
|
|
21
43
|
import { getKeychatRuntime } from "./runtime.js";
|
|
22
44
|
import {
|
|
@@ -35,6 +57,7 @@ import {
|
|
|
35
57
|
} from "./bridge-client.js";
|
|
36
58
|
import { storeMnemonic, retrieveMnemonic } from "./keychain.js";
|
|
37
59
|
import { parseMediaUrl, downloadAndDecrypt, encryptAndUpload } from "./media.js";
|
|
60
|
+
import { transcribe, type SttConfig } from "./stt.js";
|
|
38
61
|
import { join } from "node:path";
|
|
39
62
|
import { existsSync, mkdirSync } from "node:fs";
|
|
40
63
|
import { signalDbPath, qrCodePath, WORKSPACE_KEYCHAT_DIR } from "./paths.js";
|
|
@@ -435,7 +458,7 @@ export const keychatPlugin: ChannelPlugin<ResolvedKeychatAccount> = {
|
|
|
435
458
|
channel: "keychat",
|
|
436
459
|
accountId: aid,
|
|
437
460
|
});
|
|
438
|
-
const message = core.channel.text.convertMarkdownTables(text ?? "", tableMode);
|
|
461
|
+
const message = stripReasoningPrefix(core.channel.text.convertMarkdownTables(text ?? "", tableMode));
|
|
439
462
|
const normalizedTo = normalizePubkey(to);
|
|
440
463
|
|
|
441
464
|
// Handle /reset signal command — reset Signal session and re-send hello
|
|
@@ -1437,7 +1460,19 @@ async function handleMlsGroupMessage(
|
|
|
1437
1460
|
try {
|
|
1438
1461
|
mlsMediaPath = await downloadAndDecrypt(mlsMediaInfo);
|
|
1439
1462
|
ctx.log?.info(`[${accountId}] MLS group media downloaded: ${mlsMediaInfo.kctype} → ${mlsMediaPath}`);
|
|
1440
|
-
|
|
1463
|
+
if (mlsMediaInfo.isVoiceNote) {
|
|
1464
|
+
try {
|
|
1465
|
+
const sttConfig: SttConfig = { provider: "whisper-cpp", language: "auto" };
|
|
1466
|
+
const transcription = await transcribe(mlsMediaPath!, sttConfig);
|
|
1467
|
+
ctx.log?.info(`[${accountId}] MLS voice note transcribed: ${transcription.slice(0, 80)}...`);
|
|
1468
|
+
mlsDisplayText = `[voice message, ${mlsMediaInfo.duration || '?'}s] ${transcription}`;
|
|
1469
|
+
} catch (sttErr) {
|
|
1470
|
+
ctx.log?.error(`[${accountId}] MLS voice note STT failed: ${sttErr}`);
|
|
1471
|
+
mlsDisplayText = `[voice message — transcription failed, audio saved to ${mlsMediaPath}]`;
|
|
1472
|
+
}
|
|
1473
|
+
} else {
|
|
1474
|
+
mlsDisplayText = `[${mlsMediaInfo.kctype}: ${mlsMediaInfo.sourceName || mlsMediaInfo.suffix}] (saved to ${mlsMediaPath})`;
|
|
1475
|
+
}
|
|
1441
1476
|
} catch (err) {
|
|
1442
1477
|
ctx.log?.error(`[${accountId}] MLS group media download failed: ${err}`);
|
|
1443
1478
|
mlsDisplayText = `[${mlsMediaInfo.kctype} message — download failed]`;
|
|
@@ -1690,7 +1725,7 @@ async function dispatchMlsGroupToAgent(
|
|
|
1690
1725
|
...prefixOptions,
|
|
1691
1726
|
deliver: async (payload: { text?: string }) => {
|
|
1692
1727
|
if (!payload.text) return;
|
|
1693
|
-
const message = core.channel.text.convertMarkdownTables(payload.text, tableMode);
|
|
1728
|
+
const message = stripReasoningPrefix(core.channel.text.convertMarkdownTables(payload.text, tableMode));
|
|
1694
1729
|
deliverBuffer.push(message);
|
|
1695
1730
|
if (deliverTimer) clearTimeout(deliverTimer);
|
|
1696
1731
|
deliverTimer = setTimeout(() => { flushDeliverBuffer(); }, DELIVER_DEBOUNCE_MS);
|
|
@@ -2137,7 +2172,24 @@ async function handleEncryptedDM(
|
|
|
2137
2172
|
const localPath = await downloadAndDecrypt(mediaInfo);
|
|
2138
2173
|
mediaPath = localPath;
|
|
2139
2174
|
ctx.log?.info(`[${accountId}] Downloaded ${mediaInfo.kctype}: ${localPath}`);
|
|
2140
|
-
|
|
2175
|
+
|
|
2176
|
+
// Voice note: transcribe to text via STT
|
|
2177
|
+
if (mediaInfo.isVoiceNote) {
|
|
2178
|
+
try {
|
|
2179
|
+
const sttConfig: SttConfig = {
|
|
2180
|
+
provider: "whisper-cpp",
|
|
2181
|
+
language: "auto",
|
|
2182
|
+
};
|
|
2183
|
+
const transcription = await transcribe(localPath, sttConfig);
|
|
2184
|
+
ctx.log?.info(`[${accountId}] Voice note transcribed (${mediaInfo.duration || '?'}s): ${transcription.slice(0, 80)}...`);
|
|
2185
|
+
displayText = `[voice message, ${mediaInfo.duration || '?'}s] ${transcription}`;
|
|
2186
|
+
} catch (sttErr) {
|
|
2187
|
+
ctx.log?.error(`[${accountId}] Voice note STT failed: ${sttErr}`);
|
|
2188
|
+
displayText = `[voice message, ${mediaInfo.duration || '?'}s — transcription failed, audio saved to ${localPath}]`;
|
|
2189
|
+
}
|
|
2190
|
+
} else {
|
|
2191
|
+
displayText = `[${mediaInfo.kctype}: ${mediaInfo.sourceName || mediaInfo.suffix}] (saved to ${localPath})`;
|
|
2192
|
+
}
|
|
2141
2193
|
} catch (err) {
|
|
2142
2194
|
ctx.log?.error(`[${accountId}] Failed to download media: ${err}`);
|
|
2143
2195
|
displayText = `[${mediaInfo.kctype} message — download failed]`;
|
|
@@ -2255,7 +2307,7 @@ async function dispatchToAgent(
|
|
|
2255
2307
|
...prefixOptions,
|
|
2256
2308
|
deliver: async (payload: { text?: string }) => {
|
|
2257
2309
|
if (!payload.text) return;
|
|
2258
|
-
const message = core.channel.text.convertMarkdownTables(payload.text, tableMode);
|
|
2310
|
+
const message = stripReasoningPrefix(core.channel.text.convertMarkdownTables(payload.text, tableMode));
|
|
2259
2311
|
deliverBuffer.push(message);
|
|
2260
2312
|
// Reset debounce timer — wait for more chunks before sending
|
|
2261
2313
|
if (deliverTimer) clearTimeout(deliverTimer);
|
|
@@ -2378,7 +2430,7 @@ async function dispatchGroupToAgent(
|
|
|
2378
2430
|
...prefixOptions,
|
|
2379
2431
|
deliver: async (payload: { text?: string }) => {
|
|
2380
2432
|
if (!payload.text) return;
|
|
2381
|
-
const message = core.channel.text.convertMarkdownTables(payload.text, tableMode);
|
|
2433
|
+
const message = stripReasoningPrefix(core.channel.text.convertMarkdownTables(payload.text, tableMode));
|
|
2382
2434
|
deliverBuffer.push(message);
|
|
2383
2435
|
if (deliverTimer) clearTimeout(deliverTimer);
|
|
2384
2436
|
deliverTimer = setTimeout(() => { flushDeliverBuffer(); }, DELIVER_DEBOUNCE_MS);
|
package/src/media.ts
CHANGED
|
@@ -12,6 +12,9 @@ export interface KeychatMediaInfo {
|
|
|
12
12
|
size: number;
|
|
13
13
|
hash?: string;
|
|
14
14
|
sourceName?: string;
|
|
15
|
+
isVoiceNote?: boolean;
|
|
16
|
+
duration?: number; // seconds
|
|
17
|
+
waveform?: string; // base64 5-bit packed
|
|
15
18
|
}
|
|
16
19
|
|
|
17
20
|
export interface MediaUploadResult {
|
|
@@ -30,8 +33,11 @@ function resolveKctype(filePath: string, mimeType?: string): string {
|
|
|
30
33
|
const imageExts = [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".tiff", ".svg"];
|
|
31
34
|
const videoExts = [".mp4", ".avi", ".mov", ".mkv", ".webm", ".flv", ".wmv", ".m4v"];
|
|
32
35
|
|
|
36
|
+
const audioExts = [".ogg", ".opus", ".aac", ".m4a", ".mp3", ".wav"];
|
|
37
|
+
|
|
33
38
|
if (mimeType?.startsWith("image/") || imageExts.includes(ext)) return "image";
|
|
34
39
|
if (mimeType?.startsWith("video/") || videoExts.includes(ext)) return "video";
|
|
40
|
+
if (mimeType?.startsWith("audio/") || audioExts.includes(ext)) return "voiceNote";
|
|
35
41
|
return "file";
|
|
36
42
|
}
|
|
37
43
|
|
|
@@ -123,6 +129,7 @@ export async function encryptAndUpload(
|
|
|
123
129
|
signEvent: (content: string, tags: string[][]) => Promise<string>,
|
|
124
130
|
server?: string,
|
|
125
131
|
mimeType?: string,
|
|
132
|
+
voiceNote?: { duration?: number; waveform?: string },
|
|
126
133
|
): Promise<MediaUploadResult> {
|
|
127
134
|
const { encrypted, key, iv, hash, suffix, sourceName } = await encryptFile(filePath);
|
|
128
135
|
const url = await uploadToBlossom(encrypted, hash, signEvent, server);
|
|
@@ -139,6 +146,12 @@ export async function encryptAndUpload(
|
|
|
139
146
|
mediaUrl.searchParams.set("hash", hash);
|
|
140
147
|
mediaUrl.searchParams.set("sourceName", sourceName);
|
|
141
148
|
|
|
149
|
+
if (voiceNote || kctype === "voiceNote") {
|
|
150
|
+
mediaUrl.searchParams.set("isVoiceNote", "1");
|
|
151
|
+
if (voiceNote?.duration) mediaUrl.searchParams.set("duration", voiceNote.duration.toString());
|
|
152
|
+
if (voiceNote?.waveform) mediaUrl.searchParams.set("waveform", voiceNote.waveform);
|
|
153
|
+
}
|
|
154
|
+
|
|
142
155
|
return { mediaUrl: mediaUrl.toString(), kctype };
|
|
143
156
|
}
|
|
144
157
|
|
|
@@ -166,6 +179,9 @@ export function parseMediaUrl(content: string): KeychatMediaInfo | null {
|
|
|
166
179
|
size: parseInt(uri.searchParams.get("size") || "0", 10),
|
|
167
180
|
hash: uri.searchParams.get("hash") || undefined,
|
|
168
181
|
sourceName: uri.searchParams.get("sourceName") || undefined,
|
|
182
|
+
isVoiceNote: kctype === "voiceNote" || uri.searchParams.get("isVoiceNote") === "1",
|
|
183
|
+
duration: parseInt(uri.searchParams.get("duration") || "0", 10) || undefined,
|
|
184
|
+
waveform: uri.searchParams.get("waveform") || undefined,
|
|
169
185
|
};
|
|
170
186
|
}
|
|
171
187
|
|
package/src/stt.ts
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
import { execFile } from "node:child_process";
|
|
2
|
+
import { promisify } from "node:util";
|
|
3
|
+
import { existsSync } from "node:fs";
|
|
4
|
+
import { join } from "node:path";
|
|
5
|
+
|
|
6
|
+
const execFileAsync = promisify(execFile);
|
|
7
|
+
|
|
8
|
+
/** Options accepted by the speech-to-text entry points in this module. */
export interface SttConfig {
  /** Which backend performs the transcription. */
  provider: "whisper-cpp" | "openai";
  /** Explicit whisper-cpp executable; when absent or missing on disk the binary is auto-detected. */
  whisperPath?: string;
  /** Explicit whisper model file; when absent or missing on disk a `ggml-<size>.bin` is searched for. */
  modelPath?: string;
  /** Size component of the `ggml-<size>.bin` file name to search for (e.g. tiny, base, small, medium). */
  modelSize?: string;
  /** API key, required when `provider` is "openai". */
  openaiApiKey?: string;
  /** Spoken-language hint such as "zh" or "en"; "auto" asks for detection. */
  language?: string;
}
|
|
21
|
+
|
|
22
|
+
const DEFAULT_MODEL_SIZE = "small";
|
|
23
|
+
|
|
24
|
+
/**
 * Locate a whisper.cpp command-line binary.
 *
 * Resolution order:
 *   1. `configPath`, when it is set and exists on disk.
 *   2. For each known executable name: a fixed list of common install
 *      directories, then `which <name>` (i.e. whatever is on PATH).
 *
 * `whisper-cpp` is tried first so existing setups resolve exactly as before;
 * `whisper-cli` (the name used by newer whisper.cpp releases) is the fallback.
 *
 * @param configPath - Optional explicit path to the executable.
 * @returns Path of the first executable found.
 * @throws Error when no binary can be located.
 */
async function findWhisperBinary(configPath?: string): Promise<string> {
  if (configPath && existsSync(configPath)) return configPath;

  const binaryNames = ["whisper-cpp", "whisper-cli"];
  const installDirs = ["/opt/homebrew/bin", "/usr/local/bin", "/usr/bin"];

  for (const name of binaryNames) {
    for (const dir of installDirs) {
      const candidate = `${dir}/${name}`;
      if (existsSync(candidate)) return candidate;
    }

    try {
      const { stdout } = await execFileAsync("which", [name]);
      const resolved = stdout.trim();
      if (resolved && existsSync(resolved)) return resolved;
    } catch {
      // Best effort: `which` exits non-zero when the name is not on PATH.
    }
  }

  throw new Error("whisper-cpp not found. Install with: brew install whisper-cpp");
}
|
|
48
|
+
|
|
49
|
+
/**
 * Locate a whisper model file on disk. Nothing is downloaded; when no model
 * is found the thrown error explains how to fetch one manually.
 *
 * Resolution order:
 *   1. `configPath`, when it is set and exists on disk.
 *   2. `ggml-<size>.bin` in a fixed list of per-user and system directories.
 *
 * Per-user locations are only considered when `HOME` is set, so that an
 * empty `HOME` never turns them into paths relative to the current working
 * directory.
 *
 * @param configPath - Optional explicit path to a model file.
 * @param modelSize - Size component of the model file name; defaults to DEFAULT_MODEL_SIZE.
 * @returns Path of the first model file found.
 * @throws Error with download instructions when no model file exists.
 */
async function findModel(configPath?: string, modelSize?: string): Promise<string> {
  if (configPath && existsSync(configPath)) return configPath;

  const size = modelSize || DEFAULT_MODEL_SIZE;
  const fileName = `ggml-${size}.bin`;
  const home = process.env.HOME;

  // Same search order as before; home-based entries drop out when HOME is unset.
  const candidates = [
    ...(home ? [join(home, ".cache/whisper", fileName)] : []),
    `/opt/homebrew/share/whisper-cpp/models/${fileName}`,
    `/usr/local/share/whisper-cpp/models/${fileName}`,
    ...(home ? [join(home, "whisper-models", fileName)] : []),
  ];

  const found = candidates.find((candidate) => existsSync(candidate));
  if (found) return found;

  throw new Error(
    `Whisper model ggml-${size}.bin not found. Download it:\n` +
    ` mkdir -p ~/.cache/whisper && cd ~/.cache/whisper\n` +
    ` curl -LO https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-${size}.bin`
  );
}
|
|
73
|
+
|
|
74
|
+
/**
 * Transcribe an audio file to text with a local whisper.cpp binary.
 *
 * The binary and model are resolved through `findWhisperBinary` and
 * `findModel`. The process is given 60 seconds and a 10 MiB output buffer.
 *
 * @param audioPath - Path of the audio file to transcribe.
 * @param config - STT options; `whisperPath`, `modelPath`, `modelSize` and
 *                 `language` are honoured here.
 * @returns The trimmed transcript, or a fixed placeholder string when the
 *          binary produced no output.
 * @throws Error when the binary/model cannot be found or the process fails.
 */
export async function transcribeLocal(
  audioPath: string,
  config: SttConfig = { provider: "whisper-cpp" },
): Promise<string> {
  const binary = await findWhisperBinary(config.whisperPath);
  const model = await findModel(config.modelPath, config.modelSize);

  // `--print-special` is intentionally absent: it is a valueless switch that
  // turns special tokens ON, and a following "false" would be parsed as an
  // extra input file rather than as its value.
  const args = [
    "-m", model,
    "-f", audioPath,
    "--no-timestamps",
    "-t", "4", // threads
  ];

  // Forward the hint verbatim, including "auto": without `-l` whisper.cpp
  // uses its built-in default language instead of detecting it.
  if (config.language) {
    args.push("-l", config.language);
  }

  try {
    const { stdout, stderr } = await execFileAsync(binary, args, {
      timeout: 60000, // 60s timeout
      maxBuffer: 10 * 1024 * 1024,
    });

    // The transcript is read from stdout.
    const text = stdout.trim();
    if (!text) {
      console.warn(`[stt] whisper-cpp produced no output. stderr: ${stderr}`);
      return "[voice message - transcription empty]";
    }
    return text;
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.error(`[stt] whisper-cpp failed: ${reason}`);
    throw new Error(`Speech-to-text failed: ${reason}`);
  }
}
|
|
114
|
+
|
|
115
|
+
/**
 * Transcribe an audio file with the OpenAI Whisper API (`whisper-1`).
 *
 * The upload is labelled with the file's real extension and a matching MIME
 * type when the extension is one of the formats listed below; anything else
 * is sent as `voice.ogg` / `audio/ogg`, as before.
 *
 * @param audioPath - Path of the audio file to upload.
 * @param apiKey - OpenAI API key sent as a bearer token.
 * @param language - Optional language hint; omitted from the request when
 *                   unset or "auto".
 * @returns The transcript, or a fixed placeholder string when the API
 *          returned an empty text.
 * @throws Error when the file cannot be read or the API responds with a
 *         non-2xx status.
 */
export async function transcribeOpenAI(
  audioPath: string,
  apiKey: string,
  language?: string,
): Promise<string> {
  const { readFile } = await import("node:fs/promises");
  const { extname } = await import("node:path");

  // Extension -> MIME type for containers the upload may be labelled as.
  const mimeByExtension: Record<string, string> = {
    ".flac": "audio/flac",
    ".m4a": "audio/mp4",
    ".mp3": "audio/mpeg",
    ".mp4": "audio/mp4",
    ".mpeg": "audio/mpeg",
    ".mpga": "audio/mpeg",
    ".ogg": "audio/ogg",
    ".wav": "audio/wav",
    ".webm": "audio/webm",
  };
  const extension = extname(audioPath).toLowerCase();
  const knownMime = mimeByExtension[extension];
  const uploadName = knownMime ? `voice${extension}` : "voice.ogg";

  const audioData = await readFile(audioPath);
  const blob = new Blob([audioData], { type: knownMime ?? "audio/ogg" });

  const form = new FormData();
  form.append("file", blob, uploadName);
  form.append("model", "whisper-1");
  if (language && language !== "auto") form.append("language", language);

  const response = await fetch("https://api.openai.com/v1/audio/transcriptions", {
    method: "POST",
    headers: { Authorization: `Bearer ${apiKey}` },
    body: form,
  });

  if (!response.ok) {
    const body = await response.text().catch(() => "");
    throw new Error(`OpenAI Whisper API failed (${response.status}): ${body}`);
  }

  const result = await response.json() as { text: string };
  return result.text || "[voice message - transcription empty]";
}
|
|
146
|
+
|
|
147
|
+
/**
 * Speech-to-text entry point: routes the request to the backend named by
 * `config.provider`.
 *
 * @param audioPath - Path of the audio file to transcribe.
 * @param config - STT options; defaults to the local whisper-cpp provider.
 * @returns The transcript produced by the selected backend.
 * @throws Error when the "openai" provider is selected without an API key,
 *         or when the selected backend fails.
 */
export async function transcribe(
  audioPath: string,
  config: SttConfig = { provider: "whisper-cpp" },
): Promise<string> {
  // Anything other than "openai" is handled by the local backend.
  if (config.provider !== "openai") {
    return transcribeLocal(audioPath, config);
  }

  const { openaiApiKey, language } = config;
  if (!openaiApiKey) {
    throw new Error("OpenAI API key required for openai STT provider");
  }
  return transcribeOpenAI(audioPath, openaiApiKey, language);
}
|
package/src/tts.ts
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
import { execFile } from "node:child_process";
|
|
2
|
+
import { promisify } from "node:util";
|
|
3
|
+
import { existsSync } from "node:fs";
|
|
4
|
+
import { tmpdir } from "node:os";
|
|
5
|
+
import { join } from "node:path";
|
|
6
|
+
|
|
7
|
+
const execFileAsync = promisify(execFile);
|
|
8
|
+
|
|
9
|
+
/** Options accepted by the text-to-speech entry point in this module. */
export interface TtsConfig {
  /** Which backend generates the audio. */
  provider: "say" | "piper" | "openai";
  /** Voice name handed to the macOS `say` command (e.g. "Tingting" for Chinese, "Samantha" for English). */
  voice?: string;
  /** Location of the piper executable. */
  piperPath?: string;
  /** Location of the piper voice model. */
  piperModel?: string;
  /** API key, required when `provider` is "openai". */
  openaiApiKey?: string;
  /** Voice name sent to the OpenAI TTS API; "alloy" is used when unset. */
  openaiVoice?: string;
}
|
|
22
|
+
|
|
23
|
+
/**
 * Generate speech from text with the macOS `say` command.
 *
 * `say` writes an AIFF file into the OS temp directory; the file is then
 * converted to mono 48 kHz Opus-in-OGG with ffmpeg. When the conversion
 * succeeds the intermediate AIFF is deleted and the OGG path is returned.
 * When ffmpeg is missing or fails, any partial OGG is removed and the AIFF
 * path is returned instead.
 *
 * Output names combine a timestamp with a random UUID so that concurrent
 * calls never write to the same file.
 *
 * @param text - Text to speak.
 * @param config - TTS options; only `voice` is used here.
 * @returns Path of the generated audio file (OGG preferred, AIFF fallback).
 * @throws Error when `say` itself fails or times out (30 s).
 */
async function ttsSay(text: string, config: TtsConfig): Promise<string> {
  const { randomUUID } = await import("node:crypto");
  const { unlink } = await import("node:fs/promises");

  const basePath = join(tmpdir(), `tts_${Date.now()}_${randomUUID()}`);
  const aiffPath = `${basePath}.aiff`;
  const oggPath = `${basePath}.ogg`;

  const sayArgs = ["-o", aiffPath];
  if (config.voice) {
    sayArgs.push("-v", config.voice);
  }
  // NOTE(review): `text` is passed as a plain argument; text that begins with
  // "-" may be read by `say` as an option — confirm and guard if `text` can
  // come from untrusted input.
  sayArgs.push(text);

  await execFileAsync("say", sayArgs, { timeout: 30000 });

  try {
    await execFileAsync("ffmpeg", [
      "-i", aiffPath,
      "-c:a", "libopus",
      "-b:a", "24k",
      "-ar", "48000",
      "-ac", "1",
      "-y", oggPath,
    ], { timeout: 30000 });
  } catch {
    // ffmpeg unavailable or failed: drop any partial output, fall back to AIFF.
    await unlink(oggPath).catch(() => undefined);
    return aiffPath;
  }

  // Conversion succeeded; the AIFF is no longer needed.
  await unlink(aiffPath).catch(() => undefined);
  return oggPath;
}
|
|
55
|
+
|
|
56
|
+
/**
 * Generate speech with the OpenAI TTS API (`tts-1`, Opus output).
 *
 * The returned audio is written to the OS temp directory. The file name
 * combines a timestamp with a random UUID so that concurrent calls never
 * overwrite each other's output.
 *
 * @param text - Text to speak.
 * @param config - TTS options; `openaiApiKey` is required, `openaiVoice`
 *                 defaults to "alloy".
 * @returns Path of the written `.ogg` file.
 * @throws Error when the API key is missing or the API responds with a
 *         non-2xx status.
 */
async function ttsOpenAI(text: string, config: TtsConfig): Promise<string> {
  if (!config.openaiApiKey) throw new Error("OpenAI API key required");

  const response = await fetch("https://api.openai.com/v1/audio/speech", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${config.openaiApiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      model: "tts-1",
      input: text,
      voice: config.openaiVoice || "alloy",
      response_format: "opus",
    }),
  });

  if (!response.ok) {
    const body = await response.text().catch(() => "");
    throw new Error(`OpenAI TTS failed (${response.status}): ${body}`);
  }

  const { writeFile } = await import("node:fs/promises");
  const { randomUUID } = await import("node:crypto");

  const audioData = Buffer.from(await response.arrayBuffer());
  const outPath = join(tmpdir(), `tts_${Date.now()}_${randomUUID()}.ogg`);
  await writeFile(outPath, audioData);
  return outPath;
}
|
|
87
|
+
|
|
88
|
+
/**
 * Text-to-speech entry point: turns `text` into an audio file using the
 * backend named by `config.provider`.
 *
 * @param text - Text to speak.
 * @param config - TTS options; defaults to the macOS `say` provider.
 * @returns Path of the generated audio file (OGG preferred).
 * @throws Error for the "piper" provider (not implemented) and for any
 *         provider name this function does not know.
 */
export async function synthesize(text: string, config: TtsConfig = { provider: "say" }): Promise<string> {
  const { provider } = config;

  if (provider === "say") {
    return ttsSay(text, config);
  }
  if (provider === "openai") {
    return ttsOpenAI(text, config);
  }
  if (provider === "piper") {
    throw new Error("Piper TTS not yet implemented");
  }
  throw new Error(`Unknown TTS provider: ${provider}`);
}
|