@keychat-io/keychat 0.1.26 → 0.1.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -39
- package/package.json +1 -1
- package/src/bridge-client.ts +7 -2
- package/src/channel.ts +48 -8
- package/src/media.ts +16 -0
- package/src/stt.ts +159 -0
- package/src/tts.ts +103 -0
package/README.md
CHANGED
````diff
@@ -6,8 +6,6 @@ E2E encrypted AI agent communication via Keychat protocol.
 
 This plugin gives your OpenClaw agent a **sovereign identity** — a self-generated Public Key ID (Nostr keypair) — and enables **end-to-end encrypted communication** using the Signal Protocol over Nostr relays.
 
-Your agent becomes a full Keychat citizen: it can receive friend requests, establish Signal Protocol sessions, and exchange messages with Keychat app users. All messages are encrypted with forward and backward secrecy — not even relay operators can read them.
-
 ## Install
 
 ```bash
@@ -17,42 +15,22 @@ openclaw gateway restart
 
 That's it. The plugin automatically downloads the bridge binary and initializes the config on first load.
 
-Supported platforms: macOS (ARM/x64), Linux (x64/ARM64).
-
-### Security Warnings
-
-During installation, OpenClaw will show the following warning:
-
-```
-WARNING: Plugin "keychat" contains dangerous code patterns:
-Shell command execution detected (child_process) (src/bridge-client.ts)
-Shell command execution detected (child_process) (src/keychain.ts)
-```
-
-**Both are expected and required.** Here's why:
-
-| Warning | File | Why it's needed |
-|---------|------|----------------|
-| Shell command execution (child_process) | `bridge-client.ts` | Spawns the Rust sidecar process for Signal Protocol & MLS encryption. Keychat's cryptography runs in Rust and must be bridged to Node.js. |
-| Shell command execution (child_process) | `keychain.ts` | Stores the agent's identity mnemonic in the OS keychain (macOS Keychain / Linux libsecret) instead of plain-text files. |
-
-These warnings cannot be removed without sacrificing core functionality or security. The plugin does **not** execute arbitrary commands — it only spawns the bundled bridge binary and accesses the system keychain.
-
 Alternatively, install via shell script:
 
 ```bash
 curl -fsSL https://raw.githubusercontent.com/keychat-io/keychat-openclaw/main/scripts/install.sh | bash
 ```
 
+Supported platforms: macOS (ARM/x64), Linux (x64/ARM64).
+
 ### Security Warnings
 
-During installation, OpenClaw's security scanner may show
+During installation, OpenClaw's security scanner may show two warnings. Both are expected:
 
-| Warning | Reason
-| ------------------------------------------ |
-| Shell command execution (bridge-client.ts) | Spawns a Rust sidecar for Signal Protocol and MLS encryption.
-| Shell command execution (keychain.ts) | Stores identity mnemonics in the OS keychain (macOS Keychain / Linux libsecret).
-| Shell command execution (notify.ts) | Notifies the agent on startup so it can send the Keychat ID and QR code to the user. |
+| Warning | Reason |
+| ------------------------------------------ | -------------------------------------------------------------------------------- |
+| Shell command execution (bridge-client.ts) | Spawns a Rust sidecar for Signal Protocol and MLS encryption. |
+| Shell command execution (keychain.ts) | Stores identity mnemonics in the OS keychain (macOS Keychain / Linux libsecret). |
 
 Source code is fully open: [github.com/keychat-io/keychat-openclaw](https://github.com/keychat-io/keychat-openclaw)
 
@@ -61,7 +39,7 @@ Source code is fully open: [github.com/keychat-io/keychat-openclaw](https://gith
 Tell your agent "upgrade keychat" in any chat, or manually:
 
 ```bash
-openclaw plugins
+openclaw plugins update keychat
 openclaw gateway restart
 ```
 
@@ -77,15 +55,7 @@ After `openclaw gateway restart`, the agent will send you its **Keychat ID**, **
 
 Open the [Keychat app](https://keychat.io) → tap the link, paste the npub, or scan the QR code to add as contact. If `dmPolicy` is `open` (default after auto-init), the agent accepts immediately.
 
-**Can't find the public key?**
-
-```bash
-# View the agent's npub in config
-cat ~/.openclaw/openclaw.json | grep npub
-
-# Or watch the gateway logs for the Keychat ID
-openclaw logs --follow
-```
+**Can't find the public key?** Just ask your agent in chat: "What's your Keychat ID?"
 
 ## Configuration
 
````
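The README's defense of the `child_process` warnings is easy to ground in code. A minimal sketch of the only spawn pattern it describes, under stated assumptions: the `bin/keychat-bridge` path and plugin layout are invented, since this diff never shows them.

```ts
import { spawn } from "node:child_process";
import { dirname, join } from "node:path";
import { fileURLToPath } from "node:url";

// Hypothetical sketch: the plugin spawns its own bundled bridge binary
// from a fixed, plugin-relative path. No user input reaches the command
// line, which is the README's point about the scanner warnings.
const pluginDir = dirname(fileURLToPath(import.meta.url));
const bridgeBin = join(pluginDir, "bin", "keychat-bridge"); // path invented
const bridge = spawn(bridgeBin, [], { stdio: ["pipe", "pipe", "inherit"] });

bridge.stdout.on("data", (chunk) => {
  // bridge-client.ts talks to this process over stdio (see its diff below)
  process.stdout.write(chunk);
});
```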
package/package.json
CHANGED
package/src/bridge-client.ts
CHANGED
```diff
@@ -306,8 +306,13 @@ export class KeychatBridgeClient {
     this.pending.set(id, { resolve, reject });
 
     const request = JSON.stringify({ id, method, params: params ?? {} });
-
-
+    try {
+      this.process.stdin.write(request + '\n');
+    } catch (error) {
+      this.pending.delete(id);
+      reject(new Error(`Bridge write failed: ${error}`));
+      return;
+    }
     // Timeout after 30 seconds
     setTimeout(() => {
       if (this.pending.has(id)) {
```
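For context, the guarded write slots into a pending-map JSON-RPC pattern that the surrounding context lines hint at. A self-contained sketch of that lifecycle, assuming a line-delimited protocol; everything except `pending`, the request shape, and the 30-second timeout is invented:

```ts
import type { ChildProcessWithoutNullStreams } from "node:child_process";

type Pending = { resolve: (v: unknown) => void; reject: (e: Error) => void };

// Hypothetical minimal client illustrating the fix: register the pending
// entry first, then write. If the write throws (e.g. the sidecar died and
// stdin is closed), clean up immediately instead of leaving the caller to
// wait out the 30-second timeout.
class MiniBridgeClient {
  private pending = new Map<number, Pending>();
  private nextId = 0;

  constructor(private process: ChildProcessWithoutNullStreams) {}

  request(method: string, params?: unknown): Promise<unknown> {
    return new Promise((resolve, reject) => {
      const id = ++this.nextId;
      this.pending.set(id, { resolve, reject });

      const request = JSON.stringify({ id, method, params: params ?? {} });
      try {
        this.process.stdin.write(request + "\n");
      } catch (error) {
        this.pending.delete(id); // don't leak the entry
        reject(new Error(`Bridge write failed: ${error}`));
        return;
      }

      // Fallback: reject if no response arrives within 30 seconds
      setTimeout(() => {
        if (this.pending.has(id)) {
          this.pending.delete(id);
          reject(new Error(`Bridge request timed out: ${method}`));
        }
      }, 30_000);
    });
  }
}
```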
package/src/channel.ts
CHANGED
```diff
@@ -24,10 +24,20 @@ import {
  * so we silently drop it to keep messages clean.
  */
 function stripReasoningPrefix(text: string): string {
-  //
-  //
-
-
+  // Strip reasoning in multiple formats:
+  // 1. "Reasoning:\n_line1_\n_line2_\n\nActual answer..."
+  // 2. Leading italic blocks: "_thinking text_\n_more thinking_\n\nActual answer..."
+  // 3. "**Heading**\n_thinking_\n\nActual answer..."
+  let result = text;
+
+  // Format 1: Explicit "Reasoning:" prefix
+  result = result.replace(/^Reasoning:\n(?:_[^\n]*_\n?)+\n*/s, "");
+
+  // Format 2: Leading italic lines (markdown _text_) at the start
+  // Keep stripping italic lines until we hit a non-italic line
+  result = result.replace(/^(?:_[^\n]*_\n*)+\n*/s, "");
+
+  return result.trim();
 }
 import { KeychatConfigSchema } from "./config-schema.js";
 import { getKeychatRuntime } from "./runtime.js";
@@ -47,6 +57,7 @@ import {
 } from "./bridge-client.js";
 import { storeMnemonic, retrieveMnemonic } from "./keychain.js";
 import { parseMediaUrl, downloadAndDecrypt, encryptAndUpload } from "./media.js";
+import { transcribe, type SttConfig } from "./stt.js";
 import { join } from "node:path";
 import { existsSync, mkdirSync } from "node:fs";
 import { signalDbPath, qrCodePath, WORKSPACE_KEYCHAT_DIR } from "./paths.js";
@@ -295,7 +306,7 @@ const mlsInitialized = new Set<string>();
  * Normalize a pubkey: strip nostr: prefix, handle npub/hex.
  */
 function normalizePubkey(input: string): string {
-  const trimmed = input.replace(/^nostr:/i, "").trim();
+  const trimmed = input.replace(/^nostr:/i, "").replace(/^keychat:/i, "").trim();
   // If it's hex, lowercase it
   if (/^[0-9a-fA-F]{64}$/.test(trimmed)) {
     return trimmed.toLowerCase();
@@ -1449,7 +1460,19 @@ async function handleMlsGroupMessage(
     try {
       mlsMediaPath = await downloadAndDecrypt(mlsMediaInfo);
       ctx.log?.info(`[${accountId}] MLS group media downloaded: ${mlsMediaInfo.kctype} → ${mlsMediaPath}`);
-
+      if (mlsMediaInfo.isVoiceNote) {
+        try {
+          const sttConfig: SttConfig = { provider: "whisper-cpp", language: "auto" };
+          const transcription = await transcribe(mlsMediaPath!, sttConfig);
+          ctx.log?.info(`[${accountId}] MLS voice note transcribed: ${transcription.slice(0, 80)}...`);
+          mlsDisplayText = `[voice message, ${mlsMediaInfo.duration || '?'}s] ${transcription}`;
+        } catch (sttErr) {
+          ctx.log?.error(`[${accountId}] MLS voice note STT failed: ${sttErr}`);
+          mlsDisplayText = `[voice message — transcription failed, audio saved to ${mlsMediaPath}]`;
+        }
+      } else {
+        mlsDisplayText = `[${mlsMediaInfo.kctype}: ${mlsMediaInfo.sourceName || mlsMediaInfo.suffix}] (saved to ${mlsMediaPath})`;
+      }
     } catch (err) {
       ctx.log?.error(`[${accountId}] MLS group media download failed: ${err}`);
       mlsDisplayText = `[${mlsMediaInfo.kctype} message — download failed]`;
@@ -2149,7 +2172,24 @@ async function handleEncryptedDM(
       const localPath = await downloadAndDecrypt(mediaInfo);
       mediaPath = localPath;
       ctx.log?.info(`[${accountId}] Downloaded ${mediaInfo.kctype}: ${localPath}`);
-
+
+      // Voice note: transcribe to text via STT
+      if (mediaInfo.isVoiceNote) {
+        try {
+          const sttConfig: SttConfig = {
+            provider: "whisper-cpp",
+            language: "auto",
+          };
+          const transcription = await transcribe(localPath, sttConfig);
+          ctx.log?.info(`[${accountId}] Voice note transcribed (${mediaInfo.duration || '?'}s): ${transcription.slice(0, 80)}...`);
+          displayText = `[voice message, ${mediaInfo.duration || '?'}s] ${transcription}`;
+        } catch (sttErr) {
+          ctx.log?.error(`[${accountId}] Voice note STT failed: ${sttErr}`);
+          displayText = `[voice message, ${mediaInfo.duration || '?'}s — transcription failed, audio saved to ${localPath}]`;
+        }
+      } else {
+        displayText = `[${mediaInfo.kctype}: ${mediaInfo.sourceName || mediaInfo.suffix}] (saved to ${localPath})`;
+      }
     } catch (err) {
       ctx.log?.error(`[${accountId}] Failed to download media: ${err}`);
       displayText = `[${mediaInfo.kctype} message — download failed]`;
@@ -2221,7 +2261,7 @@ async function dispatchToAgent(
     Surface: "keychat" as const,
     MessageSid: eventId,
     OriginatingChannel: "keychat" as const,
-    OriginatingTo: `keychat:${
+    OriginatingTo: `keychat:${peerNostrPubkey}`,
     ...(mediaPath ? { MediaPath: mediaPath } : {}),
   });
 
```
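To make the two changed helpers concrete, here is a hypothetical usage sketch. The sample strings and the `hex` key are invented; the two regexes and the `keychat:` prefix handling are copied from the diff above:

```ts
// Copied from the diff: strip "Reasoning:" blocks and leading italic lines.
function stripReasoningPrefix(text: string): string {
  let result = text;
  result = result.replace(/^Reasoning:\n(?:_[^\n]*_\n?)+\n*/s, "");
  result = result.replace(/^(?:_[^\n]*_\n*)+\n*/s, "");
  return result.trim();
}

// Leading italic block is dropped: → "Hello!"
console.log(stripReasoningPrefix("_Let me think..._\n\nHello!"));
// Explicit "Reasoning:" prefix is dropped: → "Answer"
console.log(stripReasoningPrefix("Reasoning:\n_step one_\n_step two_\n\nAnswer"));

// normalizePubkey now also accepts keychat: URIs, so all three inputs
// below normalize to the same 64-char lowercase hex key (key invented):
const hex = "A".repeat(64);
for (const input of [hex, `nostr:${hex}`, `keychat:${hex}`]) {
  const trimmed = input.replace(/^nostr:/i, "").replace(/^keychat:/i, "").trim();
  console.log(/^[0-9a-fA-F]{64}$/.test(trimmed) ? trimmed.toLowerCase() : trimmed);
}
```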
package/src/media.ts
CHANGED
```diff
@@ -12,6 +12,9 @@ export interface KeychatMediaInfo {
   size: number;
   hash?: string;
   sourceName?: string;
+  isVoiceNote?: boolean;
+  duration?: number; // seconds
+  waveform?: string; // base64 5-bit packed
 }
 
 export interface MediaUploadResult {
@@ -30,8 +33,11 @@ function resolveKctype(filePath: string, mimeType?: string): string {
   const imageExts = [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".tiff", ".svg"];
   const videoExts = [".mp4", ".avi", ".mov", ".mkv", ".webm", ".flv", ".wmv", ".m4v"];
 
+  const audioExts = [".ogg", ".opus", ".aac", ".m4a", ".mp3", ".wav"];
+
   if (mimeType?.startsWith("image/") || imageExts.includes(ext)) return "image";
   if (mimeType?.startsWith("video/") || videoExts.includes(ext)) return "video";
+  if (mimeType?.startsWith("audio/") || audioExts.includes(ext)) return "voiceNote";
   return "file";
 }
 
@@ -123,6 +129,7 @@ export async function encryptAndUpload(
   signEvent: (content: string, tags: string[][]) => Promise<string>,
   server?: string,
   mimeType?: string,
+  voiceNote?: { duration?: number; waveform?: string },
 ): Promise<MediaUploadResult> {
   const { encrypted, key, iv, hash, suffix, sourceName } = await encryptFile(filePath);
   const url = await uploadToBlossom(encrypted, hash, signEvent, server);
@@ -139,6 +146,12 @@ export async function encryptAndUpload(
   mediaUrl.searchParams.set("hash", hash);
   mediaUrl.searchParams.set("sourceName", sourceName);
 
+  if (voiceNote || kctype === "voiceNote") {
+    mediaUrl.searchParams.set("isVoiceNote", "1");
+    if (voiceNote?.duration) mediaUrl.searchParams.set("duration", voiceNote.duration.toString());
+    if (voiceNote?.waveform) mediaUrl.searchParams.set("waveform", voiceNote.waveform);
+  }
+
   return { mediaUrl: mediaUrl.toString(), kctype };
 }
 
@@ -166,6 +179,9 @@ export function parseMediaUrl(content: string): KeychatMediaInfo | null {
     size: parseInt(uri.searchParams.get("size") || "0", 10),
     hash: uri.searchParams.get("hash") || undefined,
     sourceName: uri.searchParams.get("sourceName") || undefined,
+    isVoiceNote: kctype === "voiceNote" || uri.searchParams.get("isVoiceNote") === "1",
+    duration: parseInt(uri.searchParams.get("duration") || "0", 10) || undefined,
+    waveform: uri.searchParams.get("waveform") || undefined,
+  };
 }
 
```
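A quick sketch of what the voice-note round trip looks like at the URL level. The host and query values below are made up (including carrying `kctype` as a query param, which this diff does not show); the `isVoiceNote`/`duration`/`waveform` names and the parsing rules are the ones added here:

```ts
// Hypothetical media URL as produced by encryptAndUpload for a voice note.
const sample =
  "https://blossom.example.com/abc123?kctype=voiceNote&size=48211" +
  "&sourceName=voice.ogg&isVoiceNote=1&duration=12";

const uri = new URL(sample);
const kctype = uri.searchParams.get("kctype") ?? "file";

const info = {
  kctype,
  size: parseInt(uri.searchParams.get("size") || "0", 10),
  sourceName: uri.searchParams.get("sourceName") || undefined,
  // New in 0.1.28: either the explicit flag or kctype === "voiceNote"
  // marks the media as a voice note for the STT path in channel.ts.
  isVoiceNote: kctype === "voiceNote" || uri.searchParams.get("isVoiceNote") === "1",
  duration: parseInt(uri.searchParams.get("duration") || "0", 10) || undefined,
  waveform: uri.searchParams.get("waveform") || undefined,
};

console.log(info.isVoiceNote, info.duration); // true 12
```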
package/src/stt.ts
ADDED
```ts
import { execFile } from "node:child_process";
import { promisify } from "node:util";
import { existsSync } from "node:fs";
import { join } from "node:path";

const execFileAsync = promisify(execFile);

export interface SttConfig {
  provider: "whisper-cpp" | "openai";
  /** Path to whisper-cpp binary (default: auto-detect via which) */
  whisperPath?: string;
  /** Path to whisper model file */
  modelPath?: string;
  /** Model size for auto-download: tiny, base, small, medium */
  modelSize?: string;
  /** OpenAI API key (for openai provider) */
  openaiApiKey?: string;
  /** Language hint (e.g. "zh", "en", "auto") */
  language?: string;
}

const DEFAULT_MODEL_SIZE = "small";

/** Find whisper-cpp binary */
async function findWhisperBinary(configPath?: string): Promise<string> {
  if (configPath && existsSync(configPath)) return configPath;

  // Try common locations
  const candidates = [
    "/opt/homebrew/bin/whisper-cpp",
    "/usr/local/bin/whisper-cpp",
    "/usr/bin/whisper-cpp",
  ];

  for (const c of candidates) {
    if (existsSync(c)) return c;
  }

  // Try which
  try {
    const { stdout } = await execFileAsync("which", ["whisper-cpp"]);
    const path = stdout.trim();
    if (path && existsSync(path)) return path;
  } catch {}

  throw new Error("whisper-cpp not found. Install with: brew install whisper-cpp");
}

/** Find or download whisper model */
async function findModel(configPath?: string, modelSize?: string): Promise<string> {
  if (configPath && existsSync(configPath)) return configPath;

  const size = modelSize || DEFAULT_MODEL_SIZE;

  // Check common model locations
  const candidates = [
    join(process.env.HOME || "", `.cache/whisper/ggml-${size}.bin`),
    `/opt/homebrew/share/whisper-cpp/models/ggml-${size}.bin`,
    `/usr/local/share/whisper-cpp/models/ggml-${size}.bin`,
    join(process.env.HOME || "", `whisper-models/ggml-${size}.bin`),
  ];

  for (const c of candidates) {
    if (existsSync(c)) return c;
  }

  throw new Error(
    `Whisper model ggml-${size}.bin not found. Download it:\n` +
    `  mkdir -p ~/.cache/whisper && cd ~/.cache/whisper\n` +
    `  curl -LO https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-${size}.bin`
  );
}

/**
 * Transcribe an audio file to text using whisper-cpp.
 */
export async function transcribeLocal(
  audioPath: string,
  config: SttConfig = { provider: "whisper-cpp" },
): Promise<string> {
  const binary = await findWhisperBinary(config.whisperPath);
  const model = await findModel(config.modelPath, config.modelSize);

  const args = [
    "-m", model,
    "-f", audioPath,
    "--no-timestamps",
    "--print-special", "false",
    "-t", "4", // threads
  ];

  if (config.language && config.language !== "auto") {
    args.push("-l", config.language);
  }

  try {
    const { stdout, stderr } = await execFileAsync(binary, args, {
      timeout: 60000, // 60s timeout
      maxBuffer: 10 * 1024 * 1024,
    });

    // whisper-cpp outputs text to stdout, strip whitespace
    const text = stdout.trim();
    if (!text) {
      console.warn(`[stt] whisper-cpp produced no output. stderr: ${stderr}`);
      return "[voice message - transcription empty]";
    }
    return text;
  } catch (err: any) {
    console.error(`[stt] whisper-cpp failed: ${err.message}`);
    throw new Error(`Speech-to-text failed: ${err.message}`);
  }
}

/**
 * Transcribe using OpenAI Whisper API (fallback).
 */
export async function transcribeOpenAI(
  audioPath: string,
  apiKey: string,
  language?: string,
): Promise<string> {
  const { readFile } = await import("node:fs/promises");
  const audioData = await readFile(audioPath);
  const blob = new Blob([audioData], { type: "audio/ogg" });

  const form = new FormData();
  form.append("file", blob, "voice.ogg");
  form.append("model", "whisper-1");
  if (language && language !== "auto") form.append("language", language);

  const response = await fetch("https://api.openai.com/v1/audio/transcriptions", {
    method: "POST",
    headers: { Authorization: `Bearer ${apiKey}` },
    body: form,
  });

  if (!response.ok) {
    const body = await response.text().catch(() => "");
    throw new Error(`OpenAI Whisper API failed (${response.status}): ${body}`);
  }

  const result = await response.json() as { text: string };
  return result.text || "[voice message - transcription empty]";
}

/**
 * Main transcribe function — picks provider from config.
 */
export async function transcribe(
  audioPath: string,
  config: SttConfig = { provider: "whisper-cpp" },
): Promise<string> {
  if (config.provider === "openai") {
    if (!config.openaiApiKey) throw new Error("OpenAI API key required for openai STT provider");
    return transcribeOpenAI(audioPath, config.openaiApiKey, config.language);
  }
  return transcribeLocal(audioPath, config);
}
```
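How channel.ts calls this module is visible in its diff above. As a standalone usage sketch, with invented file paths and the OpenAI variant assuming an `OPENAI_API_KEY` environment variable:

```ts
import { transcribe, type SttConfig } from "./stt.js";

// Local whisper-cpp. "auto" means no -l flag is passed; on failure
// transcribe() throws, which channel.ts catches and turns into a
// "[voice message — transcription failed, ...]" placeholder.
const local: SttConfig = { provider: "whisper-cpp", language: "auto" };
console.log(await transcribe("/tmp/voice_note.ogg", local));

// OpenAI Whisper API variant (requires openaiApiKey):
const remote: SttConfig = {
  provider: "openai",
  openaiApiKey: process.env.OPENAI_API_KEY!,
  language: "zh",
};
console.log(await transcribe("/tmp/voice_note.ogg", remote));
```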
package/src/tts.ts
ADDED
```ts
import { execFile } from "node:child_process";
import { promisify } from "node:util";
import { existsSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";

const execFileAsync = promisify(execFile);

export interface TtsConfig {
  provider: "say" | "piper" | "openai";
  /** macOS 'say' voice name (e.g. "Tingting" for Chinese, "Samantha" for English) */
  voice?: string;
  /** Path to piper binary */
  piperPath?: string;
  /** Path to piper voice model */
  piperModel?: string;
  /** OpenAI API key */
  openaiApiKey?: string;
  /** OpenAI TTS voice */
  openaiVoice?: string;
}

/**
 * Generate speech from text using macOS 'say' command.
 * Outputs AIFF, then converts to OGG via ffmpeg if available.
 */
async function ttsSay(text: string, config: TtsConfig): Promise<string> {
  const outPath = join(tmpdir(), `tts_${Date.now()}.aiff`);
  const args = ["-o", outPath];

  if (config.voice) {
    args.push("-v", config.voice);
  }
  args.push(text);

  await execFileAsync("say", args, { timeout: 30000 });

  // Try to convert to OGG with ffmpeg
  const oggPath = outPath.replace(".aiff", ".ogg");
  try {
    await execFileAsync("ffmpeg", [
      "-i", outPath,
      "-c:a", "libopus",
      "-b:a", "24k",
      "-ar", "48000",
      "-ac", "1",
      "-y", oggPath,
    ], { timeout: 30000 });
    return oggPath;
  } catch {
    // ffmpeg not available, return AIFF
    return outPath;
  }
}

/**
 * Generate speech using OpenAI TTS API.
 */
async function ttsOpenAI(text: string, config: TtsConfig): Promise<string> {
  if (!config.openaiApiKey) throw new Error("OpenAI API key required");

  const response = await fetch("https://api.openai.com/v1/audio/speech", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${config.openaiApiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      model: "tts-1",
      input: text,
      voice: config.openaiVoice || "alloy",
      response_format: "opus",
    }),
  });

  if (!response.ok) {
    const body = await response.text().catch(() => "");
    throw new Error(`OpenAI TTS failed (${response.status}): ${body}`);
  }

  const { writeFile } = await import("node:fs/promises");
  const audioData = Buffer.from(await response.arrayBuffer());
  const outPath = join(tmpdir(), `tts_${Date.now()}.ogg`);
  await writeFile(outPath, audioData);
  return outPath;
}

/**
 * Main TTS function — generate audio file from text.
 * Returns path to audio file (OGG preferred).
 */
export async function synthesize(text: string, config: TtsConfig = { provider: "say" }): Promise<string> {
  switch (config.provider) {
    case "say":
      return ttsSay(text, config);
    case "openai":
      return ttsOpenAI(text, config);
    case "piper":
      throw new Error("Piper TTS not yet implemented");
    default:
      throw new Error(`Unknown TTS provider: ${config.provider}`);
  }
}
```
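Nothing in this diff wires tts.ts into channel.ts, but it pairs naturally with the new `voiceNote` parameter on `encryptAndUpload`. A sketch of that plausible outbound path; the wiring and the duration value are assumptions, while both signatures come from this release:

```ts
import { synthesize } from "./tts.js";
import { encryptAndUpload } from "./media.js";

// signEvent is whatever Nostr signer the channel already uses; stubbed here.
declare const signEvent: (content: string, tags: string[][]) => Promise<string>;

// 1. Render the reply as speech (macOS `say`, converted to OGG/Opus when
//    ffmpeg is present).
const audioPath = await synthesize("Hi! Message received.", { provider: "say" });

// 2. Encrypt and upload it, tagging the URL as a voice note. The duration
//    value is illustrative; nothing in this diff computes it from the file.
const { mediaUrl, kctype } = await encryptAndUpload(
  audioPath,
  signEvent,
  undefined,       // default Blossom server
  "audio/ogg",
  { duration: 3 }, // sets isVoiceNote=1 and duration in the media URL
);

console.log(kctype, mediaUrl); // "voiceNote", keychat media URL
```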