@cheeko-ai/esp32-voice 2026.2.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/NPM_PUBLISH_READINESS.md +299 -0
- package/README.md +226 -0
- package/TODO.md +418 -0
- package/index.ts +128 -0
- package/openclaw.plugin.json +9 -0
- package/package.json +62 -0
- package/src/accounts.ts +110 -0
- package/src/channel.ts +270 -0
- package/src/config-schema.ts +37 -0
- package/src/device/device-otp.ts +173 -0
- package/src/http-handler.ts +154 -0
- package/src/monitor.ts +124 -0
- package/src/onboarding.ts +575 -0
- package/src/runtime.ts +14 -0
- package/src/stt/deepgram.ts +215 -0
- package/src/stt/stt-provider.ts +107 -0
- package/src/stt/stt-registry.ts +71 -0
- package/src/tts/elevenlabs.ts +215 -0
- package/src/tts/tts-provider.ts +111 -0
- package/src/tts/tts-registry.ts +71 -0
- package/src/types.ts +136 -0
- package/src/voice/voice-endpoint.ts +296 -0
- package/src/voice/voice-session.ts +1041 -0
|
@@ -0,0 +1,1041 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Per-client voice session orchestrator.
|
|
3
|
+
*
|
|
4
|
+
* Ported from cheekoclaw_bridge/voice_session.py
|
|
5
|
+
*
|
|
6
|
+
* Wires together:
|
|
7
|
+
* Client WebSocket ↔ Opus codec ↔ STT Provider ↔ OpenClaw Agent ↔ TTS Provider
|
|
8
|
+
*
|
|
9
|
+
* State machine: IDLE → LISTENING → PROCESSING_STT → QUERYING_LLM → STREAMING_TTS → IDLE
|
|
10
|
+
*
|
|
11
|
+
* Each session manages its own STT/TTS provider instances, OpenClaw connection,
|
|
12
|
+
* and Opus encoding/decoding state.
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
import type WebSocket from "ws";
|
|
16
|
+
import { sttRegistry } from "../stt/stt-registry.js";
|
|
17
|
+
import { ttsRegistry } from "../tts/tts-registry.js";
|
|
18
|
+
import type { SttProvider } from "../stt/stt-provider.js";
|
|
19
|
+
import type { TtsProvider } from "../tts/tts-provider.js";
|
|
20
|
+
import { deviceOtpManager } from "../device/device-otp.js";
|
|
21
|
+
|
|
22
|
+
// ── Opus Encoder (lazy-loaded) ────────────────────────────────
|
|
23
|
+
// opusscript is a pure JS/WASM Opus encoder — no native binary needed.
|
|
24
|
+
// It converts PCM audio from TTS into Opus frames that the ESP32 can decode.
|
|
25
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
26
|
+
let opusEncoderInstance: any = null;
|
|
27
|
+
|
|
28
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
29
|
+
async function getOpusEncoder(): Promise<any> {
|
|
30
|
+
if (opusEncoderInstance) return opusEncoderInstance;
|
|
31
|
+
|
|
32
|
+
try {
|
|
33
|
+
// opusscript is pure JS/WASM — works without native binaries
|
|
34
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
35
|
+
const OpusScript = (await import("opusscript")) as any;
|
|
36
|
+
const Ctor = OpusScript.default ?? OpusScript;
|
|
37
|
+
// Application.AUDIO = 2048 (best for voice/music), VOIP = 2049
|
|
38
|
+
opusEncoderInstance = new Ctor(OUTPUT_SAMPLE_RATE, 1, Ctor.Application.VOIP);
|
|
39
|
+
// Set 32kbps bitrate — matches cheekoclaw_bridge (OPUS_SET_BITRATE_REQUEST = 4002)
|
|
40
|
+
try { opusEncoderInstance.encoderCTL(4002, 32000); } catch { /* best effort */ }
|
|
41
|
+
console.log(`[opus] Encoder initialized via opusscript: ${OUTPUT_SAMPLE_RATE}Hz mono 32kbps VOIP`);
|
|
42
|
+
return opusEncoderInstance;
|
|
43
|
+
} catch (err) {
|
|
44
|
+
console.error("[opus] Failed to load opusscript:", err);
|
|
45
|
+
throw new Error("Opus encoder not available. Install opusscript.");
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
// Same sentence boundary regex as the gateway (cheeko-chat.ts):
// splits after '.', '!' or '?' followed by whitespace; the lookbehind keeps
// the punctuation attached to the preceding sentence.
const SENTENCE_BOUNDARY_RE = /(?<=[.!?])\s+/;

// Silence pause between sentences in milliseconds
const SENTENCE_PAUSE_MS = 300;

// Opus frame parameters — matches cheekoclaw_bridge/audio_codec.py exactly.
// 24kHz output, 60ms frames, 1440 samples/frame, 2880 bytes PCM/frame.
const OUTPUT_SAMPLE_RATE = 24000;
const OUTPUT_FRAME_MS = 60;
const OUTPUT_SAMPLES_PER_FRAME = (OUTPUT_SAMPLE_RATE * OUTPUT_FRAME_MS) / 1000; // 1440
const OUTPUT_FRAME_BYTES = OUTPUT_SAMPLES_PER_FRAME * 2; // 2880 bytes (16-bit PCM)
|
|
61
|
+
|
|
62
|
+
/**
 * Lifecycle states of a voice session.
 *
 * Normal flow: idle → listening → processing_stt → querying_llm →
 * streaming_tts → idle. Abort from any state returns to idle.
 */
export type VoiceSessionState =
  | "idle"
  | "listening"
  | "processing_stt"
  | "querying_llm"
  | "streaming_tts";
|
|
68
|
+
|
|
69
|
+
/**
 * Per-session configuration, resolved from the client "hello" message with
 * environment-variable fallbacks (see handleHello / handleAudio auto-hello).
 */
interface SessionConfig {
  /** OpenClaw Gateway WebSocket URL. */
  openclawUrl: string;
  /** OpenClaw Gateway auth token. */
  openclawToken: string;
  /** STT provider ID. */
  sttProvider: string;
  /** STT API key. */
  sttApiKey: string;
  /** STT model. */
  sttModel?: string;
  /** TTS provider ID. */
  ttsProvider: string;
  /** TTS API key. */
  ttsApiKey: string;
  /** TTS voice ID. */
  ttsVoiceId?: string;
  /** TTS model ID. */
  ttsModel?: string;
  /** Language code. */
  language: string;
}
|
|
91
|
+
|
|
92
|
+
/**
|
|
93
|
+
* Represents a single voice session with an ESP32 or voice client.
|
|
94
|
+
*
|
|
95
|
+
* The session is created when a client connects to the `/voice/stream`
|
|
96
|
+
* WebSocket endpoint and destroyed when the connection closes.
|
|
97
|
+
*/
|
|
98
|
+
export class VoiceSession {
|
|
99
|
+
  // Unique identifier for this session, used in logs and protocol messages.
  readonly sessionId: string;

  // Client-facing WebSocket (ESP32 firmware or a generic voice client).
  private ws: WebSocket;
  // Current pipeline state; see VoiceSessionState.
  private state: VoiceSessionState = "idle";
  // Resolved configuration; null until a hello (or binary auto-hello) arrives.
  private cfg: SessionConfig | null = null;
  // True once the client is detected as ESP32/XiaoZhi firmware.
  private isEsp32 = false;
  // Device identifier reported in the hello message.
  private deviceId = "unknown";

  // STT/TTS provider instances (created per utterance)
  private stt: SttProvider | null = null;
  private tts: TtsProvider | null = null;

  // OpenClaw: dispatched via runtime.channel.reply.dispatchReplyFromConfig (in-process, no WS needed)
  private openclawConnected = false;
  private openclawWs: WebSocket | null = null; // kept for cleanup compat

  // Processing task abort support
  private processingAbortController: AbortController | null = null;

  /**
   * @param ws - Already-accepted client WebSocket.
   * @param sessionId - Identifier for this session.
   */
  constructor(ws: WebSocket, sessionId: string) {
    this.ws = ws;
    this.sessionId = sessionId;
  }
|
|
122
|
+
|
|
123
|
+
/**
|
|
124
|
+
* Handle an incoming message (binary audio or JSON control message).
|
|
125
|
+
*/
|
|
126
|
+
async handleMessage(data: Buffer | string): Promise<void> {
|
|
127
|
+
if (Buffer.isBuffer(data)) {
|
|
128
|
+
// XiaoZhi firmware sends JSON control messages as binary WebSocket frames.
|
|
129
|
+
// Try to detect JSON by checking if the buffer starts with '{'.
|
|
130
|
+
if (data.length > 0 && data[0] === 0x7b) { // 0x7b = '{'
|
|
131
|
+
try {
|
|
132
|
+
const text = data.toString("utf8");
|
|
133
|
+
JSON.parse(text); // validate it's real JSON
|
|
134
|
+
await this.handleJson(text);
|
|
135
|
+
return;
|
|
136
|
+
} catch {
|
|
137
|
+
// Not JSON — fall through to audio handling
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
await this.handleAudio(data);
|
|
141
|
+
} else {
|
|
142
|
+
await this.handleJson(data);
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
/**
|
|
147
|
+
* Clean up all resources when the session ends.
|
|
148
|
+
*/
|
|
149
|
+
async cleanup(): Promise<void> {
|
|
150
|
+
if (this.stt) {
|
|
151
|
+
await this.stt.close();
|
|
152
|
+
this.stt = null;
|
|
153
|
+
}
|
|
154
|
+
if (this.tts) {
|
|
155
|
+
await this.tts.close();
|
|
156
|
+
this.tts = null;
|
|
157
|
+
}
|
|
158
|
+
if (this.openclawWs) {
|
|
159
|
+
try {
|
|
160
|
+
this.openclawWs.close();
|
|
161
|
+
} catch {
|
|
162
|
+
// Ignore
|
|
163
|
+
}
|
|
164
|
+
this.openclawWs = null;
|
|
165
|
+
this.openclawConnected = false;
|
|
166
|
+
}
|
|
167
|
+
this.log("info", "Session cleaned up");
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
// ── Private Handlers ──────────────────────────────────────────
|
|
171
|
+
|
|
172
|
+
  /**
   * Handle a binary frame from the client (normally one Opus audio frame).
   *
   * If no hello has been received yet, the session auto-initializes from
   * environment variables and sends the server hello itself — XiaoZhi
   * firmware sends raw Opus binary as its first message with no JSON hello.
   * Frames are forwarded to the STT provider only while in "listening" state.
   */
  private async handleAudio(opusFrame: Buffer): Promise<void> {
    // ── Auto-hello for firmware that sends binary before/without a JSON hello ──
    // XiaoZhi firmware sends raw Opus binary as first message, no JSON hello.
    // We auto-initialize from env vars and send the server hello immediately.
    if (!this.cfg) {
      this.isEsp32 = true;
      this.log("info", "First binary frame before hello — auto-initializing (XiaoZhi firmware)");

      const gatewayUrl = process.env.OPENCLAW_GATEWAY_URL ?? "ws://127.0.0.1:18789";
      const gatewayToken = process.env.OPENCLAW_GATEWAY_TOKEN ?? "";
      // Build the whole config from environment variables; empty API keys are
      // tolerated here and reported when listening starts.
      this.cfg = {
        openclawUrl: gatewayUrl,
        openclawToken: gatewayToken,
        sttProvider: "deepgram",
        sttApiKey: process.env.DEEPGRAM_API_KEY ?? "",
        sttModel: process.env.DEEPGRAM_MODEL,
        ttsProvider: "elevenlabs",
        ttsApiKey: process.env.ELEVENLABS_API_KEY ?? "",
        ttsVoiceId: process.env.ELEVENLABS_VOICE_ID,
        ttsModel: process.env.ELEVENLABS_MODEL_ID,
        language: "en",
      };

      // Send server hello — required before firmware starts sending audio
      await this.sendJson({
        type: "hello",
        transport: "websocket",
        session_id: this.sessionId,
        audio_params: {
          format: "opus",
          sample_rate: OUTPUT_SAMPLE_RATE,
          channels: 1,
          frame_duration: OUTPUT_FRAME_MS,
        },
      });
      this.log("info", `Auto-hello sent. STT key: ${this.cfg.sttApiKey ? "✓" : "MISSING"}, TTS key: ${this.cfg.ttsApiKey ? "✓" : "MISSING"}`);

      // Connect to Gateway in background
      // NOTE(review): gatewayUrl always carries the localhost fallback via ??,
      // so this check can never be false — confirm whether an unset env var
      // was meant to disable the background connect instead.
      if (gatewayUrl) {
        this.connectToOpenClaw().catch((err) => {
          this.log("error", `Background Gateway connect failed: ${err}`);
        });
      }
    }

    if (this.state === "idle") {
      // First audio frame while idle → auto-start listening
      await this.startListening();
    }

    // Drop the frame unless we are actively listening with a live STT stream
    // (e.g. startListening bailed out because a key was missing).
    if (this.state !== "listening" || !this.stt) {
      return;
    }

    try {
      await this.stt.sendAudio(opusFrame);
    } catch (err) {
      // Log and keep the session alive — a single dropped frame is recoverable.
      this.log("error", `Audio send error: ${err}`);
    }
  }
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
private async handleJson(text: string): Promise<void> {
|
|
235
|
+
let msg: Record<string, unknown>;
|
|
236
|
+
try {
|
|
237
|
+
msg = JSON.parse(text);
|
|
238
|
+
} catch {
|
|
239
|
+
return;
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
const msgType = msg.type as string;
|
|
243
|
+
this.log("debug", `Received: ${msgType}`);
|
|
244
|
+
|
|
245
|
+
switch (msgType) {
|
|
246
|
+
case "hello":
|
|
247
|
+
await this.handleHello(msg);
|
|
248
|
+
break;
|
|
249
|
+
case "listen":
|
|
250
|
+
await this.handleListen(msg);
|
|
251
|
+
break;
|
|
252
|
+
case "speech_end":
|
|
253
|
+
if (this.state === "listening") {
|
|
254
|
+
await this.processUtterance();
|
|
255
|
+
}
|
|
256
|
+
break;
|
|
257
|
+
case "abort":
|
|
258
|
+
this.log("info", `Abort: ${(msg.reason as string) ?? "unknown"}`);
|
|
259
|
+
await this.handleAbort();
|
|
260
|
+
break;
|
|
261
|
+
default:
|
|
262
|
+
break;
|
|
263
|
+
}
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
  /**
   * Handle the client "hello" handshake.
   *
   * Order matters throughout this method: the hello response must be sent
   * before the (slow) Gateway connection is attempted, because the ESP32
   * firmware times out quickly waiting for the server hello.
   */
  private async handleHello(msg: Record<string, unknown>): Promise<void> {
    // Detect ESP32 client — firmware hellos carry transport/audio_params/version.
    this.isEsp32 = Boolean(msg.transport || msg.audio_params || typeof msg.version === "number");
    this.deviceId = (msg.deviceId as string) ?? "unknown";
    this.log("info", `Hello received — full message: ${JSON.stringify(msg).slice(0, 500)}`);
    this.log("info", `Hello from ${this.isEsp32 ? "ESP32" : "voice_client"} device: ${this.deviceId}`);

    // ── OTP pairing (optional — we never block the connection on failure) ──
    const otp = msg.otp as string | undefined;
    if (otp) {
      const result = deviceOtpManager.verifyOtp(otp, this.deviceId);
      if (result) {
        this.log("info", `Device "${this.deviceId}" paired via OTP`);
        await this.sendJson({
          type: "paired",
          deviceId: this.deviceId,
          deviceToken: result.deviceToken,
        });
      } else {
        // ⚠️ Don't return — just warn and continue.
        // Returning here would block the hello response and cause
        // "Failed to receive server hello" on the firmware side.
        this.log("warn", `OTP "${otp}" invalid or expired — allowing connection anyway (dev mode)`);
      }
    }

    // ── Extract per-session config from hello ──
    const oc = msg.openclaw as Record<string, string> | undefined;
    const sttConfig = msg.stt as Record<string, string> | undefined;
    const ttsConfig = msg.tts as Record<string, string> | undefined;

    // Resolve OpenClaw Gateway URL — fall back to localhost if not in hello
    const resolvedOpenclawUrl =
      oc?.url?.trim() ||
      process.env.OPENCLAW_GATEWAY_URL ||
      "ws://127.0.0.1:18789";
    const resolvedOpenclawToken =
      oc?.token?.trim() ||
      process.env.OPENCLAW_GATEWAY_TOKEN ||
      "";

    if (!oc?.url) {
      this.log("info", `No openclaw URL in hello, falling back to ${resolvedOpenclawUrl}`);
    }

    // Hello values win; env vars fill the gaps.
    this.cfg = {
      openclawUrl: resolvedOpenclawUrl,
      openclawToken: resolvedOpenclawToken,
      sttProvider: sttConfig?.provider ?? "deepgram",
      sttApiKey: sttConfig?.apiKey ?? process.env.DEEPGRAM_API_KEY ?? "",
      sttModel: sttConfig?.model ?? process.env.DEEPGRAM_MODEL,
      ttsProvider: ttsConfig?.provider ?? "elevenlabs",
      ttsApiKey: ttsConfig?.apiKey ?? process.env.ELEVENLABS_API_KEY ?? "",
      ttsVoiceId: ttsConfig?.voiceId ?? process.env.ELEVENLABS_VOICE_ID,
      ttsModel: ttsConfig?.model ?? process.env.ELEVENLABS_MODEL_ID,
      language: (msg.language as string) ?? "en",
    };

    // ── Send hello response FIRST ──────────────────────────────────────────
    // The ESP32 firmware has a short timeout (a few seconds) for the server
    // hello. We must reply immediately — BEFORE connecting to the Gateway,
    // which can take time and would cause "Failed to receive server hello".
    if (this.isEsp32) {
      await this.sendJson({
        type: "hello",
        transport: "websocket",
        session_id: this.sessionId,
        audio_params: {
          format: "opus",
          sample_rate: OUTPUT_SAMPLE_RATE,
          channels: 1,
          frame_duration: OUTPUT_FRAME_MS,
        },
      });
    } else {
      // Non-ESP32 clients use a simpler camelCase hello payload.
      await this.sendJson({
        type: "hello",
        sessionId: this.sessionId,
      });
    }
    this.log("info", "Hello response sent — now connecting to OpenClaw Gateway in background");

    // ── Connect to OpenClaw Gateway in background (non-blocking) ──────────
    // Do NOT await this — the firmware is already past the hello handshake
    // and ready for audio. Gateway connection failure is handled gracefully
    // inside processUtterance().
    if (this.cfg.openclawUrl) {
      this.connectToOpenClaw().catch((err) => {
        this.log("error", `Background Gateway connect failed: ${err}`);
      });
    }
  }
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
private async handleListen(msg: Record<string, unknown>): Promise<void> {
|
|
361
|
+
const listenState = msg.state as string;
|
|
362
|
+
|
|
363
|
+
if (listenState === "start") {
|
|
364
|
+
this.log("info", "Listen start");
|
|
365
|
+
// If busy, abort first (same as gateway)
|
|
366
|
+
if (this.state !== "idle") {
|
|
367
|
+
this.log("info", `Aborting ${this.state} for new listen`);
|
|
368
|
+
await this.handleAbort();
|
|
369
|
+
}
|
|
370
|
+
await this.startListening();
|
|
371
|
+
} else if (listenState === "stop") {
|
|
372
|
+
this.log("info", "Listen stop");
|
|
373
|
+
if (this.state === "listening") {
|
|
374
|
+
await this.processUtterance();
|
|
375
|
+
}
|
|
376
|
+
}
|
|
377
|
+
}
|
|
378
|
+
|
|
379
|
+
private async handleAbort(): Promise<void> {
|
|
380
|
+
// Cancel processing
|
|
381
|
+
if (this.processingAbortController) {
|
|
382
|
+
this.processingAbortController.abort();
|
|
383
|
+
this.processingAbortController = null;
|
|
384
|
+
}
|
|
385
|
+
|
|
386
|
+
// Close STT
|
|
387
|
+
if (this.stt) {
|
|
388
|
+
await this.stt.close();
|
|
389
|
+
this.stt = null;
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
// Close TTS
|
|
393
|
+
if (this.tts) {
|
|
394
|
+
await this.tts.close();
|
|
395
|
+
this.tts = null;
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
// Signal stop to client
|
|
399
|
+
if (this.isEsp32) {
|
|
400
|
+
await this.sendJson({ type: "tts", state: "stop" });
|
|
401
|
+
} else {
|
|
402
|
+
await this.sendJson({ type: "audio_end" });
|
|
403
|
+
}
|
|
404
|
+
|
|
405
|
+
this.setState("idle");
|
|
406
|
+
this.log("info", "Abort complete, back to idle");
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
// ── Voice Pipeline ────────────────────────────────────────────
|
|
410
|
+
|
|
411
|
+
  /**
   * Transition idle → listening: validate the STT configuration, create a
   * fresh STT provider instance, wire its transcript/VAD callbacks, and
   * open the STT stream. No-op unless the session is idle and configured.
   */
  private async startListening(): Promise<void> {
    if (this.state !== "idle" || !this.cfg) return;

    // Validate STT provider
    if (!this.cfg.sttApiKey) {
      await this.sendJson({
        type: "error",
        message: `STT API key not configured (provider: ${this.cfg.sttProvider})`,
      });
      return;
    }

    if (!sttRegistry.has(this.cfg.sttProvider)) {
      await this.sendJson({
        type: "error",
        message: `STT provider "${this.cfg.sttProvider}" not available`,
      });
      return;
    }

    this.setState("listening");

    // Create STT provider instance
    this.stt = sttRegistry.create(this.cfg.sttProvider, {
      apiKey: this.cfg.sttApiKey,
      model: this.cfg.sttModel,
      language: this.cfg.language,
    });

    // Set up transcript callback — forwards partial and final text to the client.
    this.stt.onTranscript = async (text: string, isFinal: boolean) => {
      this.log("debug", `STT [${isFinal ? "FINAL" : "partial"}]: ${text}`);
      await this.sendJson({
        type: "transcript",
        text,
        partial: !isFinal,
      });
    };

    // Set up VAD end-of-speech callback (fired by Deepgram speech_final)
    // This triggers processUtterance without needing a speech_end JSON message
    if (this.stt.onSpeechEnd !== undefined) {
      this.stt.onSpeechEnd = () => {
        // Guard: only fire while still listening (a speech_end JSON message
        // may already have advanced the state).
        if (this.state === "listening") {
          this.log("info", "VAD speech_final → triggering processUtterance");
          this.processUtterance().catch((err) => this.log("error", `processUtterance error: ${err}`));
        }
      };
    }

    await this.stt.connect();
  }
|
|
463
|
+
|
|
464
|
+
  /**
   * Run the full pipeline for one captured utterance:
   * finalize STT → query the OpenClaw LLM → stream the reply as TTS audio.
   *
   * Safe against double invocation (both a speech_end message and the VAD
   * callback can trigger it). Always returns the session to idle via the
   * finally block, including on error or abort.
   */
  private async processUtterance(): Promise<void> {
    // Guard against double-invocation (speech_end + onSpeechEnd VAD can both fire)
    if (this.state !== "listening") {
      this.log("debug", `processUtterance skipped — state is ${this.state}`);
      return;
    }
    this.setState("processing_stt");
    this.processingAbortController = new AbortController();

    try {
      // 1. Finalize STT to get final transcript
      let transcript = "";
      if (this.stt) {
        transcript = await this.stt.finalize();
        await this.stt.close();
        this.stt = null;
      }

      if (!transcript.trim()) {
        this.log("info", "Empty transcript, skipping");
        this.setState("idle");
        return;
      }

      this.log("info", `Transcript: ${transcript}`);

      // Send final transcript to client (ESP32 uses the "stt" message shape)
      if (this.isEsp32) {
        await this.sendJson({ type: "stt", text: transcript });
      } else {
        await this.sendJson({ type: "transcript", text: transcript, partial: false });
      }

      // 2. Query OpenClaw LLM
      // If the Gateway connection is still in progress (non-blocking connect),
      // wait up to 5 seconds for it to complete before failing.
      if (!this.openclawConnected) {
        this.log("info", "Waiting for OpenClaw connection...");
        const waitMs = 5000;
        const pollMs = 100;
        let waited = 0;
        while (!this.openclawConnected && waited < waitMs) {
          await new Promise((r) => setTimeout(r, pollMs));
          waited += pollMs;
        }
      }
      if (!this.openclawConnected) {
        await this.sendJson({
          type: "error",
          message: "No OpenClaw connection (missing credentials or gateway unavailable)",
        });
        this.setState("idle");
        return;
      }

      this.setState("querying_llm");
      await this.sendJson({ type: "status", stage: "thinking" });
      this.log("info", `Querying OpenClaw: ${transcript.slice(0, 80)}`);

      // LLM failures degrade to a spoken apology rather than killing the turn.
      let responseText: string;
      try {
        responseText = await this.sendToOpenClaw(transcript);
      } catch (err) {
        this.log("error", `OpenClaw error: ${err}`);
        responseText = "Sorry, I encountered an error processing your request.";
      }

      if (!responseText.trim()) {
        responseText = "I didn't get a response.";
      }

      this.log("info", `OpenClaw response (${responseText.length} chars): ${responseText.slice(0, 120)}`);

      // 3. Stream TTS audio back — sentence by sentence
      await this.streamTtsResponse(responseText);
    } catch (err) {
      if ((err as Error).name === "AbortError") {
        this.log("info", "Processing cancelled (abort)");
        return;
      }
      this.log("error", `Process error: ${err}`);
      await this.sendJson({ type: "error", message: String(err) });
    } finally {
      // Always clear the abort handle and return to idle, even on error/abort.
      this.processingAbortController = null;
      this.setState("idle");
    }
  }
|
|
551
|
+
|
|
552
|
+
private async streamTtsResponse(responseText: string): Promise<void> {
|
|
553
|
+
if (!this.cfg) return;
|
|
554
|
+
|
|
555
|
+
this.setState("streaming_tts");
|
|
556
|
+
|
|
557
|
+
// Split into sentences (same regex as gateway)
|
|
558
|
+
let sentences = responseText.split(SENTENCE_BOUNDARY_RE).filter((s) => s.trim());
|
|
559
|
+
if (sentences.length === 0) sentences = [responseText];
|
|
560
|
+
this.log("info", `TTS: ${sentences.length} sentence(s) to speak`);
|
|
561
|
+
|
|
562
|
+
// Signal TTS start
|
|
563
|
+
if (this.isEsp32) {
|
|
564
|
+
await this.sendJson({ type: "tts", state: "start" });
|
|
565
|
+
} else {
|
|
566
|
+
await this.sendJson({ type: "response_text", text: responseText, partial: false });
|
|
567
|
+
await this.sendJson({ type: "status", stage: "speaking" });
|
|
568
|
+
}
|
|
569
|
+
|
|
570
|
+
// Validate TTS provider
|
|
571
|
+
if (!this.cfg.ttsApiKey) {
|
|
572
|
+
await this.sendJson({
|
|
573
|
+
type: "error",
|
|
574
|
+
message: `TTS API key not configured (provider: ${this.cfg.ttsProvider})`,
|
|
575
|
+
});
|
|
576
|
+
return;
|
|
577
|
+
}
|
|
578
|
+
|
|
579
|
+
for (let i = 0; i < sentences.length; i++) {
|
|
580
|
+
const sentence = sentences[i];
|
|
581
|
+
this.log("info", `TTS sentence ${i + 1}/${sentences.length}: ${sentence.slice(0, 80)}`);
|
|
582
|
+
|
|
583
|
+
// Per-sentence signal — matches cheekoclaw_bridge protocol
|
|
584
|
+
if (this.isEsp32) {
|
|
585
|
+
await this.sendJson({ type: "tts", state: "sentence_start", text: sentence });
|
|
586
|
+
}
|
|
587
|
+
|
|
588
|
+
// Synthesize and stream — awaited so sentences play sequentially
|
|
589
|
+
await this.synthesizeAndStream(sentence);
|
|
590
|
+
|
|
591
|
+
// Insert silence pause between sentences (not after last)
|
|
592
|
+
if (i < sentences.length - 1) {
|
|
593
|
+
this.log("debug", `Inserting ${SENTENCE_PAUSE_MS}ms silence`);
|
|
594
|
+
await this.sendSilence(SENTENCE_PAUSE_MS);
|
|
595
|
+
}
|
|
596
|
+
}
|
|
597
|
+
|
|
598
|
+
// Signal TTS complete
|
|
599
|
+
if (this.isEsp32) {
|
|
600
|
+
await this.sendJson({ type: "tts", state: "stop" });
|
|
601
|
+
} else {
|
|
602
|
+
await this.sendJson({ type: "audio_end" });
|
|
603
|
+
}
|
|
604
|
+
this.log("debug", "Audio stream complete");
|
|
605
|
+
}
|
|
606
|
+
|
|
607
|
+
  /**
   * Synthesize one sentence via the TTS provider and stream the audio to the
   * client as fixed-size binary frames (Opus for ESP32, raw PCM otherwise),
   * paced at real-time playback rate (one frame every OUTPUT_FRAME_MS).
   *
   * Returns only after the final pacing sleep has elapsed, so the caller can
   * start the next sentence without overrunning the client's playback buffer.
   */
  private async synthesizeAndStream(text: string): Promise<void> {
    if (!this.cfg) return;

    const pcmBuffer: Buffer[] = [];
    let frameCount = 0;
    // Track next frame deadline for accurate real-time pacing.
    // 0 = not started yet; anchored to the moment the FIRST frame is sent
    // so subsequent frames are spaced exactly OUTPUT_FRAME_MS apart.
    let nextFrameAt = 0;

    // drainPromise: tracks the last pacing sleep so we can await it after flush().
    // This ensures synthesizeAndStream() doesn't return until ALL frames have
    // actually been sent AND their pacing delay has elapsed — i.e. the ESP32
    // has had enough time to play every frame before we start the next sentence.
    let drainPromise: Promise<void> = Promise.resolve();

    // Get Opus encoder for ESP32 clients (they expect Opus-encoded binary frames)
    let encoder: Awaited<ReturnType<typeof getOpusEncoder>> | null = null;
    if (this.isEsp32) {
      try {
        encoder = await getOpusEncoder();
        this.log("debug", "Using Opus encoding for ESP32 output");
      } catch (err) {
        this.log("error", `Opus encoder unavailable, sending raw PCM: ${err}`);
      }
    }

    // Create TTS provider instance for this sentence
    this.tts = ttsRegistry.create(this.cfg.ttsProvider, {
      apiKey: this.cfg.ttsApiKey,
      voiceId: this.cfg.ttsVoiceId,
      model: this.cfg.ttsModel,
      language: this.cfg.language,
    });

    // Collect PCM audio, Opus-encode if ESP32, send as binary frames
    this.tts.onAudio = async (pcmChunk: Buffer) => {
      // Buffer PCM and send in fixed-size frames (paced at real-time rate)
      pcmBuffer.push(pcmChunk);
      const totalPcm = Buffer.concat(pcmBuffer);
      pcmBuffer.length = 0;

      let offset = 0;
      while (offset + OUTPUT_FRAME_BYTES <= totalPcm.length) {
        const pcmFrame = totalPcm.subarray(offset, offset + OUTPUT_FRAME_BYTES);
        offset += OUTPUT_FRAME_BYTES;

        // Encode PCM → Opus for ESP32, or send raw PCM for other clients
        let frameToSend: Buffer;
        if (encoder) {
          try {
            frameToSend = Buffer.from(encoder.encode(pcmFrame, OUTPUT_SAMPLES_PER_FRAME));
          } catch (err) {
            this.log("error", `Opus encode error: ${err}`);
            frameToSend = pcmFrame; // Fallback to raw PCM
          }
        } else {
          frameToSend = pcmFrame;
        }

        await this.sendBinary(frameToSend);

        // Anchor pacing to first frame send time (matches cheekoclaw_bridge last_send_time=0)
        if (nextFrameAt === 0) nextFrameAt = Date.now();
        frameCount++;

        // Accurate real-time pacing: sleep only the remaining time until next frame deadline.
        // We chain a new drainPromise so the caller can await the very last sleep.
        nextFrameAt += OUTPUT_FRAME_MS;
        const sleepMs = nextFrameAt - Date.now();
        if (sleepMs > 0) {
          drainPromise = new Promise<void>((resolve) => { setTimeout(resolve, sleepMs); });
          await drainPromise;
        } else {
          // No sleep needed but still mark drain as resolved
          drainPromise = Promise.resolve();
        }
      }

      // Keep remainder in buffer
      if (offset < totalPcm.length) {
        pcmBuffer.push(totalPcm.subarray(offset));
      }
    };

    try {
      await this.tts.connect();
      await this.tts.synthesize(text);
      await this.tts.flush();

      // Flush remaining PCM buffer (pad with silence to full frame size)
      if (pcmBuffer.length > 0) {
        const remaining = Buffer.concat(pcmBuffer);
        if (remaining.length > 0) {
          // Zero-filled tail → trailing silence in the padded frame.
          const padded = Buffer.alloc(OUTPUT_FRAME_BYTES);
          remaining.copy(padded);

          let frameToSend: Buffer;
          if (encoder) {
            try {
              frameToSend = Buffer.from(encoder.encode(padded, OUTPUT_SAMPLES_PER_FRAME));
            } catch {
              frameToSend = padded;
            }
          } else {
            frameToSend = padded;
          }

          await this.sendBinary(frameToSend);
          frameCount++;

          // Pace the final padded frame too
          nextFrameAt += OUTPUT_FRAME_MS;
          const sleepMs = nextFrameAt - Date.now();
          if (sleepMs > 0) {
            drainPromise = new Promise<void>((resolve) => { setTimeout(resolve, sleepMs); });
          }
        }
      }

      // Wait for the very last pacing sleep to complete before returning.
      // This is what ensures sentences play one-after-another on the ESP32.
      await drainPromise;

      this.log("debug", `TTS sent ${frameCount} ${encoder ? "Opus" : "PCM"} frames (paced at ${OUTPUT_FRAME_MS}ms)`);
    } finally {
      await this.tts.close();
      this.tts = null;
    }
  }
|
|
737
|
+
|
|
738
|
+
private async sendSilence(durationMs: number): Promise<void> {
|
|
739
|
+
const totalSamples = (OUTPUT_SAMPLE_RATE * durationMs) / 1000;
|
|
740
|
+
const silenceBytes = totalSamples * 2; // 16-bit
|
|
741
|
+
const silence = Buffer.alloc(silenceBytes);
|
|
742
|
+
|
|
743
|
+
// Get Opus encoder for ESP32 silence frames
|
|
744
|
+
let encoder: Awaited<ReturnType<typeof getOpusEncoder>> | null = null;
|
|
745
|
+
if (this.isEsp32) {
|
|
746
|
+
try {
|
|
747
|
+
encoder = await getOpusEncoder();
|
|
748
|
+
} catch {
|
|
749
|
+
// Fall back to raw PCM
|
|
750
|
+
}
|
|
751
|
+
}
|
|
752
|
+
|
|
753
|
+
let offset = 0;
|
|
754
|
+
while (offset + OUTPUT_FRAME_BYTES <= silence.length) {
|
|
755
|
+
const pcmFrame = silence.subarray(offset, offset + OUTPUT_FRAME_BYTES);
|
|
756
|
+
|
|
757
|
+
let frameToSend: Buffer;
|
|
758
|
+
if (encoder) {
|
|
759
|
+
try {
|
|
760
|
+
frameToSend = Buffer.from(encoder.encode(pcmFrame, OUTPUT_SAMPLES_PER_FRAME));
|
|
761
|
+
} catch {
|
|
762
|
+
frameToSend = pcmFrame;
|
|
763
|
+
}
|
|
764
|
+
} else {
|
|
765
|
+
frameToSend = pcmFrame;
|
|
766
|
+
}
|
|
767
|
+
|
|
768
|
+
await this.sendBinary(frameToSend);
|
|
769
|
+
offset += OUTPUT_FRAME_BYTES;
|
|
770
|
+
await new Promise((resolve) => setTimeout(resolve, OUTPUT_FRAME_MS));
|
|
771
|
+
}
|
|
772
|
+
}
|
|
773
|
+
|
|
774
|
+
// ── OpenClaw Communication ────────────────────────────────────
|
|
775
|
+
// Connects using the device identity Ed25519 key stored in
|
|
776
|
+
// ~/.openclaw/identity/device.json — implemented entirely in the
|
|
777
|
+
// extension, no core code changes needed.
|
|
778
|
+
|
|
779
|
+
/**
 * Establishes the authenticated WebSocket session to the OpenClaw gateway.
 *
 * Handshake order, as implemented here:
 *   1. The gateway's first frame must be a `connect.challenge` event,
 *      optionally carrying a nonce.
 *   2. We reply with a `connect` request containing an optional shared
 *      token and — when a local device identity file exists — an Ed25519
 *      signature over the handshake fields ("v2" payload when a nonce was
 *      issued, legacy "v1" otherwise).
 *   3. The gateway answers with a `res` frame; `ok: true` flips
 *      `this.openclawConnected`.
 *
 * Always resolves (never rejects): callers must check
 * `this.openclawConnected` for the outcome. Returns immediately when no
 * gateway URL is configured.
 *
 * NOTE(review): if the gateway accepts the socket but never sends a first
 * frame, this promise never settles — confirm a connect timeout exists
 * upstream.
 */
private async connectToOpenClaw(): Promise<void> {
  if (!this.cfg?.openclawUrl) return;

  // Dynamic imports keep these Node-only dependencies off the module's
  // top-level load path.
  const { WebSocket: WS } = await import("ws");
  const nodeCrypto = await import("node:crypto");
  const fs = await import("node:fs");
  const path = await import("node:path");
  const os = await import("node:os");

  // ── Load device identity ──────────────────────────────────
  // Expected file shape: { version: 1, deviceId, publicKeyPem, privateKeyPem }.
  const stateDir = process.env.OPENCLAW_STATE_DIR ?? path.default.join(os.default.homedir(), ".openclaw");
  const identityPath = path.default.join(stateDir, "identity", "device.json");
  let deviceIdentity: { deviceId: string; publicKeyPem: string; privateKeyPem: string } | null = null;

  try {
    if (fs.default.existsSync(identityPath)) {
      const raw = JSON.parse(fs.default.readFileSync(identityPath, "utf8"));
      if (raw?.version === 1 && raw.deviceId && raw.publicKeyPem && raw.privateKeyPem) {
        deviceIdentity = { deviceId: raw.deviceId, publicKeyPem: raw.publicKeyPem, privateKeyPem: raw.privateKeyPem };
        this.log("info", `Loaded device identity: ${deviceIdentity.deviceId.slice(0, 16)}...`);
      }
    }
  } catch (err) {
    // A missing or corrupt identity file is non-fatal: we can still attempt
    // a token-only connection below.
    this.log("warn", `Could not load device identity: ${err}`);
  }

  const token = this.cfg.openclawToken || process.env.OPENCLAW_GATEWAY_TOKEN;

  return new Promise<void>((resolve) => {
    this.openclawWs = new WS(this.cfg!.openclawUrl);

    this.openclawWs.on("open", () => {
      this.log("info", `Connected to OpenClaw at ${this.cfg!.openclawUrl}`);
    });

    // First message should be connect.challenge
    this.openclawWs!.once("message", (data: Buffer) => {
      try {
        const frame = JSON.parse(data.toString());
        if (frame.type !== "event" || frame.event !== "connect.challenge") {
          // Protocol mismatch — give up on the handshake but settle the promise.
          this.log("warn", `Unexpected first frame: ${JSON.stringify(frame).slice(0, 100)}`);
          resolve();
          return;
        }

        const nonce = frame.payload?.nonce as string | undefined;
        const role = "operator";
        const scopes = ["operator.read", "operator.write"];
        const clientId = "cli";
        const clientMode = "cli";

        // ── Build device signature ───────────────────────────
        let device: Record<string, unknown> | undefined;
        if (deviceIdentity) {
          const signedAtMs = Date.now();
          // "v2" binds the server-issued nonce into the signed payload
          // (replay protection); "v1" is the nonce-less legacy form.
          const payloadParts = nonce
            ? ["v2", deviceIdentity.deviceId, clientId, clientMode, role, scopes.join(","), String(signedAtMs), token ?? "", nonce]
            : ["v1", deviceIdentity.deviceId, clientId, clientMode, role, scopes.join(","), String(signedAtMs), token ?? ""];
          const payload = payloadParts.join("|");

          // For Ed25519 keys, Node's crypto.sign takes null as the algorithm.
          const privateKey = nodeCrypto.default.createPrivateKey(deviceIdentity.privateKeyPem);
          const signature = nodeCrypto.default.sign(null, Buffer.from(payload), privateKey).toString("base64url");

          // Extract raw 32-byte Ed25519 public key from SPKI DER
          const pubKey = nodeCrypto.default.createPublicKey(deviceIdentity.publicKeyPem);
          const spki = pubKey.export({ type: "spki", format: "der" }) as Buffer;
          // Fixed 12-byte SPKI header for Ed25519 (OID 1.3.101.112, RFC 8410).
          const ED25519_PREFIX = Buffer.from("302a300506032b6570032100", "hex");
          // If the DER layout is not the expected prefix+32 bytes, fall back
          // to sending the full SPKI blob rather than failing.
          const rawPub = (spki.length === ED25519_PREFIX.length + 32 && spki.subarray(0, ED25519_PREFIX.length).equals(ED25519_PREFIX))
            ? spki.subarray(ED25519_PREFIX.length)
            : spki;

          device = {
            id: deviceIdentity.deviceId,
            publicKey: rawPub.toString("base64url"),
            signature,
            signedAt: signedAtMs,
            ...(nonce ? { nonce } : {}),
          };
        }

        const connectRequest = {
          type: "req",
          id: nodeCrypto.default.randomUUID(),
          method: "connect",
          params: {
            minProtocol: 3,
            maxProtocol: 3,
            client: { id: clientId, version: "1.0.0", platform: "node", mode: clientMode, displayName: `ESP32 Voice [${this.deviceId}]` },
            role,
            scopes,
            caps: [],
            ...(token ? { auth: { token } } : {}),
            ...(device ? { device } : {}),
          },
        };

        this.openclawWs!.send(JSON.stringify(connectRequest));

        // Wait for the connect response
        this.openclawWs!.once("message", (resp: Buffer) => {
          try {
            const response = JSON.parse(resp.toString());
            if (response.type === "res" && response.ok) {
              this.log("info", "OpenClaw handshake complete");
              this.openclawConnected = true;
            } else {
              this.log("error", `OpenClaw handshake failed: ${response.error?.message ?? JSON.stringify(response)}`);
            }
          } catch {
            this.log("error", "OpenClaw handshake parse error");
          }
          // Settle regardless of success — callers inspect openclawConnected.
          resolve();
        });
      } catch {
        // Unparseable first frame: settle so the session can continue degraded.
        resolve();
      }
    });

    this.openclawWs.on("error", (err: Error) => {
      this.log("error", `OpenClaw connection error: ${err.message}`);
      this.openclawConnected = false;
      // Settles the promise when the socket fails before any handshake frame.
      resolve();
    });

    this.openclawWs.on("close", () => {
      this.log("info", "OpenClaw connection closed");
      this.openclawConnected = false;
    });
  });
}
|
|
909
|
+
|
|
910
|
+
// Returns true if a gateway response is a heartbeat ack (HEARTBEAT_OK),
|
|
911
|
+
// which should be ignored — the session is still waiting for the real reply.
|
|
912
|
+
private isHeartbeatResponse(text: string): boolean {
|
|
913
|
+
const lower = (text ?? "").trim().toLowerCase();
|
|
914
|
+
if (!lower) return false;
|
|
915
|
+
if (!lower.startsWith("heartbeat_ok")) return false;
|
|
916
|
+
// Allow "HEARTBEAT_OK" alone or followed by punctuation/spaces — not a word char
|
|
917
|
+
const suffix = lower.slice("heartbeat_ok".length);
|
|
918
|
+
return suffix.length === 0 || !/[a-z0-9_]/.test(suffix[0]);
|
|
919
|
+
}
|
|
920
|
+
|
|
921
|
+
private async sendToOpenClaw(text: string): Promise<string> {
|
|
922
|
+
if (!this.openclawWs || !this.openclawConnected) {
|
|
923
|
+
throw new Error("Not connected to OpenClaw");
|
|
924
|
+
}
|
|
925
|
+
|
|
926
|
+
const chatRequest = {
|
|
927
|
+
type: "req",
|
|
928
|
+
id: crypto.randomUUID(),
|
|
929
|
+
method: "chat.send",
|
|
930
|
+
params: {
|
|
931
|
+
sessionKey: "agent:main:main",
|
|
932
|
+
message: text,
|
|
933
|
+
idempotencyKey: crypto.randomUUID(),
|
|
934
|
+
},
|
|
935
|
+
};
|
|
936
|
+
|
|
937
|
+
this.openclawWs.send(JSON.stringify(chatRequest));
|
|
938
|
+
|
|
939
|
+
return new Promise<string>((resolve) => {
|
|
940
|
+
let responseContent = "";
|
|
941
|
+
const timeout = setTimeout(() => {
|
|
942
|
+
resolve(responseContent || "Request timed out.");
|
|
943
|
+
}, 120000);
|
|
944
|
+
|
|
945
|
+
const messageHandler = (data: Buffer) => {
|
|
946
|
+
try {
|
|
947
|
+
const event = JSON.parse(data.toString());
|
|
948
|
+
if (event.type === "event") {
|
|
949
|
+
if (event.event === "agent" && event.payload?.stream === "assistant" && event.payload?.data?.text) {
|
|
950
|
+
const candidate = event.payload.data.text as string;
|
|
951
|
+
// Skip heartbeat ack responses — they are internal gateway noise
|
|
952
|
+
if (!this.isHeartbeatResponse(candidate)) {
|
|
953
|
+
responseContent = candidate;
|
|
954
|
+
}
|
|
955
|
+
} else if (event.event === "chat") {
|
|
956
|
+
const payload = event.payload ?? {};
|
|
957
|
+
const state = payload.state;
|
|
958
|
+
const messageObj = payload.message;
|
|
959
|
+
let candidate = "";
|
|
960
|
+
if (typeof messageObj?.content === "string") {
|
|
961
|
+
candidate = messageObj.content;
|
|
962
|
+
} else if (Array.isArray(messageObj?.content)) {
|
|
963
|
+
const textBlocks = (messageObj.content as Array<{ type: string; text?: string }>)
|
|
964
|
+
.filter(b => b.type === "text").map(b => b.text ?? "");
|
|
965
|
+
if (textBlocks.length > 0) candidate = textBlocks.join("");
|
|
966
|
+
}
|
|
967
|
+
// Skip heartbeat ack responses — keep waiting for real content
|
|
968
|
+
if (candidate && !this.isHeartbeatResponse(candidate)) {
|
|
969
|
+
responseContent = candidate;
|
|
970
|
+
}
|
|
971
|
+
if (state === "final" || state === "done" || state === "complete") {
|
|
972
|
+
// If the final response is a heartbeat ack, keep waiting
|
|
973
|
+
if (this.isHeartbeatResponse(responseContent)) {
|
|
974
|
+
responseContent = "";
|
|
975
|
+
return;
|
|
976
|
+
}
|
|
977
|
+
const hasPendingTools = Array.isArray(messageObj?.content) &&
|
|
978
|
+
(messageObj.content as Array<{ type: string }>).some(b => b.type === "tool_use");
|
|
979
|
+
if (!hasPendingTools) {
|
|
980
|
+
clearTimeout(timeout);
|
|
981
|
+
this.openclawWs?.off("message", messageHandler);
|
|
982
|
+
resolve(responseContent || "No response received");
|
|
983
|
+
}
|
|
984
|
+
} else if (state === "aborted" || state === "error") {
|
|
985
|
+
clearTimeout(timeout);
|
|
986
|
+
this.openclawWs?.off("message", messageHandler);
|
|
987
|
+
resolve(responseContent || `Request ${state}`);
|
|
988
|
+
}
|
|
989
|
+
}
|
|
990
|
+
}
|
|
991
|
+
} catch { /* ignore parse errors */ }
|
|
992
|
+
};
|
|
993
|
+
|
|
994
|
+
this.openclawWs!.on("message", messageHandler);
|
|
995
|
+
});
|
|
996
|
+
}
|
|
997
|
+
|
|
998
|
+
// ── Utilities ─────────────────────────────────────────────────
|
|
999
|
+
|
|
1000
|
+
private setState(newState: VoiceSessionState): void {
|
|
1001
|
+
const old = this.state;
|
|
1002
|
+
this.state = newState;
|
|
1003
|
+
this.log("debug", `State: ${old} → ${newState}`);
|
|
1004
|
+
}
|
|
1005
|
+
|
|
1006
|
+
private async sendJson(obj: Record<string, unknown>): Promise<void> {
|
|
1007
|
+
try {
|
|
1008
|
+
this.ws.send(JSON.stringify(obj));
|
|
1009
|
+
} catch (err) {
|
|
1010
|
+
this.log("error", `Send error (${obj.type}): ${err}`);
|
|
1011
|
+
}
|
|
1012
|
+
}
|
|
1013
|
+
|
|
1014
|
+
private async sendBinary(data: Buffer): Promise<void> {
|
|
1015
|
+
try {
|
|
1016
|
+
this.ws.send(data);
|
|
1017
|
+
} catch (err) {
|
|
1018
|
+
this.log("error", `Binary send error: ${err}`);
|
|
1019
|
+
}
|
|
1020
|
+
}
|
|
1021
|
+
|
|
1022
|
+
private log(level: string, msg: string): void {
|
|
1023
|
+
const prefix = `[${this.sessionId.slice(0, 8)}]`;
|
|
1024
|
+
switch (level) {
|
|
1025
|
+
case "error":
|
|
1026
|
+
console.error(`${prefix} ${msg}`);
|
|
1027
|
+
break;
|
|
1028
|
+
case "warn":
|
|
1029
|
+
console.warn(`${prefix} ${msg}`);
|
|
1030
|
+
break;
|
|
1031
|
+
case "debug":
|
|
1032
|
+
// Only log debug in development
|
|
1033
|
+
if (process.env.NODE_ENV !== "production") {
|
|
1034
|
+
console.log(`${prefix} [debug] ${msg}`);
|
|
1035
|
+
}
|
|
1036
|
+
break;
|
|
1037
|
+
default:
|
|
1038
|
+
console.log(`${prefix} ${msg}`);
|
|
1039
|
+
}
|
|
1040
|
+
}
|
|
1041
|
+
}
|