@cheeko-ai/esp32-voice 2026.2.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1041 @@
1
+ /**
2
+ * Per-client voice session orchestrator.
3
+ *
4
+ * Ported from cheekoclaw_bridge/voice_session.py
5
+ *
6
+ * Wires together:
7
+ * Client WebSocket ↔ Opus codec ↔ STT Provider ↔ OpenClaw Agent ↔ TTS Provider
8
+ *
9
+ * State machine: IDLE → LISTENING → PROCESSING_STT → QUERYING_LLM → STREAMING_TTS → IDLE
10
+ *
11
+ * Each session manages its own STT/TTS provider instances, OpenClaw connection,
12
+ * and Opus encoding/decoding state.
13
+ */
14
+
15
+ import type WebSocket from "ws";
16
+ import { sttRegistry } from "../stt/stt-registry.js";
17
+ import { ttsRegistry } from "../tts/tts-registry.js";
18
+ import type { SttProvider } from "../stt/stt-provider.js";
19
+ import type { TtsProvider } from "../tts/tts-provider.js";
20
+ import { deviceOtpManager } from "../device/device-otp.js";
21
+
22
+ // ── Opus Encoder (lazy-loaded) ────────────────────────────────
23
+ // opusscript is a pure JS/WASM Opus encoder — no native binary needed.
24
+ // It converts PCM audio from TTS into Opus frames that the ESP32 can decode.
25
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
26
+ let opusEncoderInstance: any = null;
27
+
28
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
29
+ async function getOpusEncoder(): Promise<any> {
30
+ if (opusEncoderInstance) return opusEncoderInstance;
31
+
32
+ try {
33
+ // opusscript is pure JS/WASM — works without native binaries
34
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
35
+ const OpusScript = (await import("opusscript")) as any;
36
+ const Ctor = OpusScript.default ?? OpusScript;
37
+ // Application.AUDIO = 2048 (best for voice/music), VOIP = 2049
38
+ opusEncoderInstance = new Ctor(OUTPUT_SAMPLE_RATE, 1, Ctor.Application.VOIP);
39
+ // Set 32kbps bitrate — matches cheekoclaw_bridge (OPUS_SET_BITRATE_REQUEST = 4002)
40
+ try { opusEncoderInstance.encoderCTL(4002, 32000); } catch { /* best effort */ }
41
+ console.log(`[opus] Encoder initialized via opusscript: ${OUTPUT_SAMPLE_RATE}Hz mono 32kbps VOIP`);
42
+ return opusEncoderInstance;
43
+ } catch (err) {
44
+ console.error("[opus] Failed to load opusscript:", err);
45
+ throw new Error("Opus encoder not available. Install opusscript.");
46
+ }
47
+ }
48
+
49
// Same sentence boundary regex as the gateway (cheeko-chat.ts).
// Splits after '.', '!' or '?' followed by whitespace (lookbehind, ES2018+).
const SENTENCE_BOUNDARY_RE = /(?<=[.!?])\s+/;

// Silence pause between sentences in milliseconds
const SENTENCE_PAUSE_MS = 300;

// Opus frame parameters — matches cheekoclaw_bridge/audio_codec.py exactly.
// 24kHz output, 60ms frames, 1440 samples/frame, 2880 bytes PCM/frame.
const OUTPUT_SAMPLE_RATE = 24000;
const OUTPUT_FRAME_MS = 60;
const OUTPUT_SAMPLES_PER_FRAME = (OUTPUT_SAMPLE_RATE * OUTPUT_FRAME_MS) / 1000; // 1440
const OUTPUT_FRAME_BYTES = OUTPUT_SAMPLES_PER_FRAME * 2; // 2880 bytes (16-bit PCM)

// Session state machine (see the file header for the transition order):
// idle → listening → processing_stt → querying_llm → streaming_tts → idle.
export type VoiceSessionState =
  | "idle"
  | "listening"
  | "processing_stt"
  | "querying_llm"
  | "streaming_tts";
68
+
69
/**
 * Per-session configuration, assembled from the client hello message
 * with environment-variable fallbacks (see handleHello/handleAudio).
 */
interface SessionConfig {
  /** OpenClaw Gateway WebSocket URL. */
  openclawUrl: string;
  /** OpenClaw Gateway auth token. */
  openclawToken: string;
  /** STT provider ID. */
  sttProvider: string;
  /** STT API key. */
  sttApiKey: string;
  /** STT model (optional — provider default when omitted). */
  sttModel?: string;
  /** TTS provider ID. */
  ttsProvider: string;
  /** TTS API key. */
  ttsApiKey: string;
  /** TTS voice ID (optional — provider default when omitted). */
  ttsVoiceId?: string;
  /** TTS model ID (optional — provider default when omitted). */
  ttsModel?: string;
  /** Language code. */
  language: string;
}
91
+
92
+ /**
93
+ * Represents a single voice session with an ESP32 or voice client.
94
+ *
95
+ * The session is created when a client connects to the `/voice/stream`
96
+ * WebSocket endpoint and destroyed when the connection closes.
97
+ */
98
+ export class VoiceSession {
99
  /** Unique identifier for this session (supplied by the server at accept time). */
  readonly sessionId: string;

  // Client-facing WebSocket (ESP32 or other voice client).
  private ws: WebSocket;
  // Current position in the voice pipeline state machine.
  private state: VoiceSessionState = "idle";
  // Per-session config; null until a hello (or binary auto-hello) initializes it.
  private cfg: SessionConfig | null = null;
  // True when the client is detected as ESP32 firmware (affects message framing).
  private isEsp32 = false;
  // Device identifier from the hello message, if any.
  private deviceId = "unknown";

  // STT/TTS provider instances (created per utterance)
  private stt: SttProvider | null = null;
  private tts: TtsProvider | null = null;

  // OpenClaw: dispatched via runtime.channel.reply.dispatchReplyFromConfig (in-process, no WS needed)
  private openclawConnected = false;
  private openclawWs: WebSocket | null = null; // kept for cleanup compat

  // Processing task abort support
  private processingAbortController: AbortController | null = null;

  /**
   * @param ws client WebSocket accepted on the `/voice/stream` endpoint.
   * @param sessionId server-assigned unique session identifier.
   */
  constructor(ws: WebSocket, sessionId: string) {
    this.ws = ws;
    this.sessionId = sessionId;
  }
122
+
123
+ /**
124
+ * Handle an incoming message (binary audio or JSON control message).
125
+ */
126
+ async handleMessage(data: Buffer | string): Promise<void> {
127
+ if (Buffer.isBuffer(data)) {
128
+ // XiaoZhi firmware sends JSON control messages as binary WebSocket frames.
129
+ // Try to detect JSON by checking if the buffer starts with '{'.
130
+ if (data.length > 0 && data[0] === 0x7b) { // 0x7b = '{'
131
+ try {
132
+ const text = data.toString("utf8");
133
+ JSON.parse(text); // validate it's real JSON
134
+ await this.handleJson(text);
135
+ return;
136
+ } catch {
137
+ // Not JSON — fall through to audio handling
138
+ }
139
+ }
140
+ await this.handleAudio(data);
141
+ } else {
142
+ await this.handleJson(data);
143
+ }
144
+ }
145
+
146
+ /**
147
+ * Clean up all resources when the session ends.
148
+ */
149
+ async cleanup(): Promise<void> {
150
+ if (this.stt) {
151
+ await this.stt.close();
152
+ this.stt = null;
153
+ }
154
+ if (this.tts) {
155
+ await this.tts.close();
156
+ this.tts = null;
157
+ }
158
+ if (this.openclawWs) {
159
+ try {
160
+ this.openclawWs.close();
161
+ } catch {
162
+ // Ignore
163
+ }
164
+ this.openclawWs = null;
165
+ this.openclawConnected = false;
166
+ }
167
+ this.log("info", "Session cleaned up");
168
+ }
169
+
170
  // ── Private Handlers ──────────────────────────────────────────

  /**
   * Handle one binary audio frame from the client.
   *
   * Side effects:
   *  - Before any hello: auto-initializes config from env vars, sends the
   *    server hello, and starts a background Gateway connection (XiaoZhi
   *    firmware sends raw Opus with no JSON hello first).
   *  - While idle: transitions to listening (creates an STT provider).
   *  - While listening: forwards the frame to the STT provider.
   *  - In any other state: the frame is dropped.
   */
  private async handleAudio(opusFrame: Buffer): Promise<void> {
    // ── Auto-hello for firmware that sends binary before/without a JSON hello ──
    // XiaoZhi firmware sends raw Opus binary as first message, no JSON hello.
    // We auto-initialize from env vars and send the server hello immediately.
    if (!this.cfg) {
      this.isEsp32 = true;
      this.log("info", "First binary frame before hello — auto-initializing (XiaoZhi firmware)");

      const gatewayUrl = process.env.OPENCLAW_GATEWAY_URL ?? "ws://127.0.0.1:18789";
      const gatewayToken = process.env.OPENCLAW_GATEWAY_TOKEN ?? "";
      // Same defaults as handleHello(): Deepgram STT + ElevenLabs TTS.
      this.cfg = {
        openclawUrl: gatewayUrl,
        openclawToken: gatewayToken,
        sttProvider: "deepgram",
        sttApiKey: process.env.DEEPGRAM_API_KEY ?? "",
        sttModel: process.env.DEEPGRAM_MODEL,
        ttsProvider: "elevenlabs",
        ttsApiKey: process.env.ELEVENLABS_API_KEY ?? "",
        ttsVoiceId: process.env.ELEVENLABS_VOICE_ID,
        ttsModel: process.env.ELEVENLABS_MODEL_ID,
        language: "en",
      };

      // Send server hello — required before firmware starts sending audio
      await this.sendJson({
        type: "hello",
        transport: "websocket",
        session_id: this.sessionId,
        audio_params: {
          format: "opus",
          sample_rate: OUTPUT_SAMPLE_RATE,
          channels: 1,
          frame_duration: OUTPUT_FRAME_MS,
        },
      });
      this.log("info", `Auto-hello sent. STT key: ${this.cfg.sttApiKey ? "✓" : "MISSING"}, TTS key: ${this.cfg.ttsApiKey ? "✓" : "MISSING"}`);

      // Connect to Gateway in background.
      // NOTE(review): gatewayUrl always has a non-empty fallback above, so
      // this condition is currently always true — kept for symmetry with
      // the handleHello() path.
      if (gatewayUrl) {
        this.connectToOpenClaw().catch((err) => {
          this.log("error", `Background Gateway connect failed: ${err}`);
        });
      }
    }

    if (this.state === "idle") {
      // First audio frame while idle → auto-start listening
      await this.startListening();
    }

    // Drop frames unless actively listening with a live STT provider
    // (e.g. while the LLM/TTS pipeline is busy).
    if (this.state !== "listening" || !this.stt) {
      return;
    }

    try {
      await this.stt.sendAudio(opusFrame);
    } catch (err) {
      // Log and keep the session alive — a single bad frame shouldn't kill it.
      this.log("error", `Audio send error: ${err}`);
    }
  }
232
+
233
+
234
+ private async handleJson(text: string): Promise<void> {
235
+ let msg: Record<string, unknown>;
236
+ try {
237
+ msg = JSON.parse(text);
238
+ } catch {
239
+ return;
240
+ }
241
+
242
+ const msgType = msg.type as string;
243
+ this.log("debug", `Received: ${msgType}`);
244
+
245
+ switch (msgType) {
246
+ case "hello":
247
+ await this.handleHello(msg);
248
+ break;
249
+ case "listen":
250
+ await this.handleListen(msg);
251
+ break;
252
+ case "speech_end":
253
+ if (this.state === "listening") {
254
+ await this.processUtterance();
255
+ }
256
+ break;
257
+ case "abort":
258
+ this.log("info", `Abort: ${(msg.reason as string) ?? "unknown"}`);
259
+ await this.handleAbort();
260
+ break;
261
+ default:
262
+ break;
263
+ }
264
+ }
265
+
266
  /**
   * Handle the client hello: detect the client type, optionally verify an
   * OTP pairing code, build the per-session config (hello fields with env
   * fallbacks), reply with the server hello IMMEDIATELY, and then connect
   * to the OpenClaw Gateway in the background.
   *
   * Ordering is protocol-critical: the ESP32 firmware times out within a
   * few seconds waiting for the server hello, so nothing slow may run
   * before the hello response is sent.
   */
  private async handleHello(msg: Record<string, unknown>): Promise<void> {
    // Detect ESP32 client — firmware hellos carry transport/audio_params/version.
    this.isEsp32 = Boolean(msg.transport || msg.audio_params || typeof msg.version === "number");
    this.deviceId = (msg.deviceId as string) ?? "unknown";
    this.log("info", `Hello received — full message: ${JSON.stringify(msg).slice(0, 500)}`);
    this.log("info", `Hello from ${this.isEsp32 ? "ESP32" : "voice_client"} device: ${this.deviceId}`);

    // ── OTP pairing (optional — we never block the connection on failure) ──
    const otp = msg.otp as string | undefined;
    if (otp) {
      const result = deviceOtpManager.verifyOtp(otp, this.deviceId);
      if (result) {
        this.log("info", `Device "${this.deviceId}" paired via OTP`);
        await this.sendJson({
          type: "paired",
          deviceId: this.deviceId,
          deviceToken: result.deviceToken,
        });
      } else {
        // ⚠️ Don't return — just warn and continue.
        // Returning here would block the hello response and cause
        // "Failed to receive server hello" on the firmware side.
        this.log("warn", `OTP "${otp}" invalid or expired — allowing connection anyway (dev mode)`);
      }
    }

    // ── Extract per-session config from hello ──
    const oc = msg.openclaw as Record<string, string> | undefined;
    const sttConfig = msg.stt as Record<string, string> | undefined;
    const ttsConfig = msg.tts as Record<string, string> | undefined;

    // Resolve OpenClaw Gateway URL — fall back to localhost if not in hello
    const resolvedOpenclawUrl =
      oc?.url?.trim() ||
      process.env.OPENCLAW_GATEWAY_URL ||
      "ws://127.0.0.1:18789";
    const resolvedOpenclawToken =
      oc?.token?.trim() ||
      process.env.OPENCLAW_GATEWAY_TOKEN ||
      "";

    if (!oc?.url) {
      this.log("info", `No openclaw URL in hello, falling back to ${resolvedOpenclawUrl}`);
    }

    // Hello-supplied values win; environment variables fill the gaps.
    this.cfg = {
      openclawUrl: resolvedOpenclawUrl,
      openclawToken: resolvedOpenclawToken,
      sttProvider: sttConfig?.provider ?? "deepgram",
      sttApiKey: sttConfig?.apiKey ?? process.env.DEEPGRAM_API_KEY ?? "",
      sttModel: sttConfig?.model ?? process.env.DEEPGRAM_MODEL,
      ttsProvider: ttsConfig?.provider ?? "elevenlabs",
      ttsApiKey: ttsConfig?.apiKey ?? process.env.ELEVENLABS_API_KEY ?? "",
      ttsVoiceId: ttsConfig?.voiceId ?? process.env.ELEVENLABS_VOICE_ID,
      ttsModel: ttsConfig?.model ?? process.env.ELEVENLABS_MODEL_ID,
      language: (msg.language as string) ?? "en",
    };

    // ── Send hello response FIRST ──────────────────────────────────────────
    // The ESP32 firmware has a short timeout (a few seconds) for the server
    // hello. We must reply immediately — BEFORE connecting to the Gateway,
    // which can take time and would cause "Failed to receive server hello".
    if (this.isEsp32) {
      await this.sendJson({
        type: "hello",
        transport: "websocket",
        session_id: this.sessionId,
        audio_params: {
          format: "opus",
          sample_rate: OUTPUT_SAMPLE_RATE,
          channels: 1,
          frame_duration: OUTPUT_FRAME_MS,
        },
      });
    } else {
      // Non-ESP32 clients use camelCase keys and need no audio params.
      await this.sendJson({
        type: "hello",
        sessionId: this.sessionId,
      });
    }
    this.log("info", "Hello response sent — now connecting to OpenClaw Gateway in background");

    // ── Connect to OpenClaw Gateway in background (non-blocking) ──────────
    // Do NOT await this — the firmware is already past the hello handshake
    // and ready for audio. Gateway connection failure is handled gracefully
    // inside processUtterance().
    if (this.cfg.openclawUrl) {
      this.connectToOpenClaw().catch((err) => {
        this.log("error", `Background Gateway connect failed: ${err}`);
      });
    }
  }
358
+
359
+
360
+ private async handleListen(msg: Record<string, unknown>): Promise<void> {
361
+ const listenState = msg.state as string;
362
+
363
+ if (listenState === "start") {
364
+ this.log("info", "Listen start");
365
+ // If busy, abort first (same as gateway)
366
+ if (this.state !== "idle") {
367
+ this.log("info", `Aborting ${this.state} for new listen`);
368
+ await this.handleAbort();
369
+ }
370
+ await this.startListening();
371
+ } else if (listenState === "stop") {
372
+ this.log("info", "Listen stop");
373
+ if (this.state === "listening") {
374
+ await this.processUtterance();
375
+ }
376
+ }
377
+ }
378
+
379
+ private async handleAbort(): Promise<void> {
380
+ // Cancel processing
381
+ if (this.processingAbortController) {
382
+ this.processingAbortController.abort();
383
+ this.processingAbortController = null;
384
+ }
385
+
386
+ // Close STT
387
+ if (this.stt) {
388
+ await this.stt.close();
389
+ this.stt = null;
390
+ }
391
+
392
+ // Close TTS
393
+ if (this.tts) {
394
+ await this.tts.close();
395
+ this.tts = null;
396
+ }
397
+
398
+ // Signal stop to client
399
+ if (this.isEsp32) {
400
+ await this.sendJson({ type: "tts", state: "stop" });
401
+ } else {
402
+ await this.sendJson({ type: "audio_end" });
403
+ }
404
+
405
+ this.setState("idle");
406
+ this.log("info", "Abort complete, back to idle");
407
+ }
408
+
409
+ // ── Voice Pipeline ────────────────────────────────────────────
410
+
411
+ private async startListening(): Promise<void> {
412
+ if (this.state !== "idle" || !this.cfg) return;
413
+
414
+ // Validate STT provider
415
+ if (!this.cfg.sttApiKey) {
416
+ await this.sendJson({
417
+ type: "error",
418
+ message: `STT API key not configured (provider: ${this.cfg.sttProvider})`,
419
+ });
420
+ return;
421
+ }
422
+
423
+ if (!sttRegistry.has(this.cfg.sttProvider)) {
424
+ await this.sendJson({
425
+ type: "error",
426
+ message: `STT provider "${this.cfg.sttProvider}" not available`,
427
+ });
428
+ return;
429
+ }
430
+
431
+ this.setState("listening");
432
+
433
+ // Create STT provider instance
434
+ this.stt = sttRegistry.create(this.cfg.sttProvider, {
435
+ apiKey: this.cfg.sttApiKey,
436
+ model: this.cfg.sttModel,
437
+ language: this.cfg.language,
438
+ });
439
+
440
+ // Set up transcript callback
441
+ this.stt.onTranscript = async (text: string, isFinal: boolean) => {
442
+ this.log("debug", `STT [${isFinal ? "FINAL" : "partial"}]: ${text}`);
443
+ await this.sendJson({
444
+ type: "transcript",
445
+ text,
446
+ partial: !isFinal,
447
+ });
448
+ };
449
+
450
+ // Set up VAD end-of-speech callback (fired by Deepgram speech_final)
451
+ // This triggers processUtterance without needing a speech_end JSON message
452
+ if (this.stt.onSpeechEnd !== undefined) {
453
+ this.stt.onSpeechEnd = () => {
454
+ if (this.state === "listening") {
455
+ this.log("info", "VAD speech_final → triggering processUtterance");
456
+ this.processUtterance().catch((err) => this.log("error", `processUtterance error: ${err}`));
457
+ }
458
+ };
459
+ }
460
+
461
+ await this.stt.connect();
462
+ }
463
+
464
  /**
   * Run one full utterance through the pipeline:
   *   finalize STT → send transcript → query OpenClaw → stream TTS → idle.
   *
   * Guarded so the duplicate triggers (client `speech_end` message and the
   * provider's VAD onSpeechEnd callback) cannot both run it. Always returns
   * the session to idle via the `finally` block.
   */
  private async processUtterance(): Promise<void> {
    // Guard against double-invocation (speech_end + onSpeechEnd VAD can both fire)
    if (this.state !== "listening") {
      this.log("debug", `processUtterance skipped — state is ${this.state}`);
      return;
    }
    this.setState("processing_stt");
    // NOTE(review): this controller is aborted by handleAbort(), but its
    // signal is never passed to STT/LLM/TTS calls here — the AbortError
    // branch below only fires if a downstream call throws one itself.
    // Confirm whether the signal should be threaded through.
    this.processingAbortController = new AbortController();

    try {
      // 1. Finalize STT to get final transcript
      let transcript = "";
      if (this.stt) {
        transcript = await this.stt.finalize();
        await this.stt.close();
        this.stt = null;
      }

      // Nothing recognized — return to idle without bothering the LLM.
      if (!transcript.trim()) {
        this.log("info", "Empty transcript, skipping");
        this.setState("idle");
        return;
      }

      this.log("info", `Transcript: ${transcript}`);

      // Send final transcript to client (ESP32 uses a different message shape).
      if (this.isEsp32) {
        await this.sendJson({ type: "stt", text: transcript });
      } else {
        await this.sendJson({ type: "transcript", text: transcript, partial: false });
      }

      // 2. Query OpenClaw LLM
      // If the Gateway connection is still in progress (non-blocking connect),
      // wait up to 5 seconds for it to complete before failing.
      if (!this.openclawConnected) {
        this.log("info", "Waiting for OpenClaw connection...");
        const waitMs = 5000;
        const pollMs = 100;
        let waited = 0;
        // Simple poll loop — the background connect flips openclawConnected.
        while (!this.openclawConnected && waited < waitMs) {
          await new Promise((r) => setTimeout(r, pollMs));
          waited += pollMs;
        }
      }
      if (!this.openclawConnected) {
        await this.sendJson({
          type: "error",
          message: "No OpenClaw connection (missing credentials or gateway unavailable)",
        });
        this.setState("idle");
        return;
      }

      this.setState("querying_llm");
      await this.sendJson({ type: "status", stage: "thinking" });
      this.log("info", `Querying OpenClaw: ${transcript.slice(0, 80)}`);

      // LLM failure is non-fatal: speak an apology instead of dropping the turn.
      let responseText: string;
      try {
        responseText = await this.sendToOpenClaw(transcript);
      } catch (err) {
        this.log("error", `OpenClaw error: ${err}`);
        responseText = "Sorry, I encountered an error processing your request.";
      }

      if (!responseText.trim()) {
        responseText = "I didn't get a response.";
      }

      this.log("info", `OpenClaw response (${responseText.length} chars): ${responseText.slice(0, 120)}`);

      // 3. Stream TTS audio back — sentence by sentence
      await this.streamTtsResponse(responseText);
    } catch (err) {
      if ((err as Error).name === "AbortError") {
        this.log("info", "Processing cancelled (abort)");
        return;
      }
      this.log("error", `Process error: ${err}`);
      await this.sendJson({ type: "error", message: String(err) });
    } finally {
      // Whatever happened, the session must end up idle and unblocked.
      this.processingAbortController = null;
      this.setState("idle");
    }
  }
551
+
552
+ private async streamTtsResponse(responseText: string): Promise<void> {
553
+ if (!this.cfg) return;
554
+
555
+ this.setState("streaming_tts");
556
+
557
+ // Split into sentences (same regex as gateway)
558
+ let sentences = responseText.split(SENTENCE_BOUNDARY_RE).filter((s) => s.trim());
559
+ if (sentences.length === 0) sentences = [responseText];
560
+ this.log("info", `TTS: ${sentences.length} sentence(s) to speak`);
561
+
562
+ // Signal TTS start
563
+ if (this.isEsp32) {
564
+ await this.sendJson({ type: "tts", state: "start" });
565
+ } else {
566
+ await this.sendJson({ type: "response_text", text: responseText, partial: false });
567
+ await this.sendJson({ type: "status", stage: "speaking" });
568
+ }
569
+
570
+ // Validate TTS provider
571
+ if (!this.cfg.ttsApiKey) {
572
+ await this.sendJson({
573
+ type: "error",
574
+ message: `TTS API key not configured (provider: ${this.cfg.ttsProvider})`,
575
+ });
576
+ return;
577
+ }
578
+
579
+ for (let i = 0; i < sentences.length; i++) {
580
+ const sentence = sentences[i];
581
+ this.log("info", `TTS sentence ${i + 1}/${sentences.length}: ${sentence.slice(0, 80)}`);
582
+
583
+ // Per-sentence signal — matches cheekoclaw_bridge protocol
584
+ if (this.isEsp32) {
585
+ await this.sendJson({ type: "tts", state: "sentence_start", text: sentence });
586
+ }
587
+
588
+ // Synthesize and stream — awaited so sentences play sequentially
589
+ await this.synthesizeAndStream(sentence);
590
+
591
+ // Insert silence pause between sentences (not after last)
592
+ if (i < sentences.length - 1) {
593
+ this.log("debug", `Inserting ${SENTENCE_PAUSE_MS}ms silence`);
594
+ await this.sendSilence(SENTENCE_PAUSE_MS);
595
+ }
596
+ }
597
+
598
+ // Signal TTS complete
599
+ if (this.isEsp32) {
600
+ await this.sendJson({ type: "tts", state: "stop" });
601
+ } else {
602
+ await this.sendJson({ type: "audio_end" });
603
+ }
604
+ this.log("debug", "Audio stream complete");
605
+ }
606
+
607
  /**
   * Synthesize one sentence with the configured TTS provider and stream it
   * to the client as fixed-size Opus (ESP32) or raw PCM frames, paced at
   * real-time so the device's playback buffer is never flooded.
   *
   * Returns only after every frame has been sent AND its pacing delay has
   * elapsed, so consecutive sentences play back-to-back without overlap.
   */
  private async synthesizeAndStream(text: string): Promise<void> {
    if (!this.cfg) return;

    // Accumulates PCM that hasn't yet filled a complete frame.
    const pcmBuffer: Buffer[] = [];
    let frameCount = 0;
    // Track next frame deadline for accurate real-time pacing.
    // 0 = not started yet; anchored to the moment the FIRST frame is sent
    // so subsequent frames are spaced exactly OUTPUT_FRAME_MS apart.
    let nextFrameAt = 0;

    // drainPromise: tracks the last pacing sleep so we can await it after flush().
    // This ensures synthesizeAndStream() doesn't return until ALL frames have
    // actually been sent AND their pacing delay has elapsed — i.e. the ESP32
    // has had enough time to play every frame before we start the next sentence.
    // NOTE(review): each sleep is also awaited inline inside onAudio below, so
    // the final `await drainPromise` mainly covers the flush-path padded frame.
    let drainPromise: Promise<void> = Promise.resolve();

    // Get Opus encoder for ESP32 clients (they expect Opus-encoded binary frames)
    let encoder: Awaited<ReturnType<typeof getOpusEncoder>> | null = null;
    if (this.isEsp32) {
      try {
        encoder = await getOpusEncoder();
        this.log("debug", "Using Opus encoding for ESP32 output");
      } catch (err) {
        this.log("error", `Opus encoder unavailable, sending raw PCM: ${err}`);
      }
    }

    // Create TTS provider instance for this sentence
    this.tts = ttsRegistry.create(this.cfg.ttsProvider, {
      apiKey: this.cfg.ttsApiKey,
      voiceId: this.cfg.ttsVoiceId,
      model: this.cfg.ttsModel,
      language: this.cfg.language,
    });

    // Collect PCM audio, Opus-encode if ESP32, send as binary frames
    this.tts.onAudio = async (pcmChunk: Buffer) => {
      // Buffer PCM and send in fixed-size frames (paced at real-time rate)
      pcmBuffer.push(pcmChunk);
      const totalPcm = Buffer.concat(pcmBuffer);
      pcmBuffer.length = 0;

      let offset = 0;
      // Emit every complete OUTPUT_FRAME_BYTES-sized frame in the buffer.
      while (offset + OUTPUT_FRAME_BYTES <= totalPcm.length) {
        const pcmFrame = totalPcm.subarray(offset, offset + OUTPUT_FRAME_BYTES);
        offset += OUTPUT_FRAME_BYTES;

        // Encode PCM → Opus for ESP32, or send raw PCM for other clients
        let frameToSend: Buffer;
        if (encoder) {
          try {
            frameToSend = Buffer.from(encoder.encode(pcmFrame, OUTPUT_SAMPLES_PER_FRAME));
          } catch (err) {
            this.log("error", `Opus encode error: ${err}`);
            frameToSend = pcmFrame; // Fallback to raw PCM
          }
        } else {
          frameToSend = pcmFrame;
        }

        await this.sendBinary(frameToSend);

        // Anchor pacing to first frame send time (matches cheekoclaw_bridge last_send_time=0)
        if (nextFrameAt === 0) nextFrameAt = Date.now();
        frameCount++;

        // Accurate real-time pacing: sleep only the remaining time until next frame deadline.
        // We chain a new drainPromise so the caller can await the very last sleep.
        nextFrameAt += OUTPUT_FRAME_MS;
        const sleepMs = nextFrameAt - Date.now();
        if (sleepMs > 0) {
          drainPromise = new Promise<void>((resolve) => { setTimeout(resolve, sleepMs); });
          await drainPromise;
        } else {
          // No sleep needed but still mark drain as resolved
          drainPromise = Promise.resolve();
        }
      }

      // Keep remainder in buffer
      if (offset < totalPcm.length) {
        pcmBuffer.push(totalPcm.subarray(offset));
      }
    };

    try {
      await this.tts.connect();
      await this.tts.synthesize(text);
      await this.tts.flush();

      // Flush remaining PCM buffer (pad with silence to full frame size)
      if (pcmBuffer.length > 0) {
        const remaining = Buffer.concat(pcmBuffer);
        if (remaining.length > 0) {
          // Buffer.alloc zero-fills, so the tail beyond `remaining` is silence.
          const padded = Buffer.alloc(OUTPUT_FRAME_BYTES);
          remaining.copy(padded);

          let frameToSend: Buffer;
          if (encoder) {
            try {
              frameToSend = Buffer.from(encoder.encode(padded, OUTPUT_SAMPLES_PER_FRAME));
            } catch {
              frameToSend = padded;
            }
          } else {
            frameToSend = padded;
          }

          await this.sendBinary(frameToSend);
          frameCount++;

          // Pace the final padded frame too
          nextFrameAt += OUTPUT_FRAME_MS;
          const sleepMs = nextFrameAt - Date.now();
          if (sleepMs > 0) {
            drainPromise = new Promise<void>((resolve) => { setTimeout(resolve, sleepMs); });
          }
        }
      }

      // Wait for the very last pacing sleep to complete before returning.
      // This is what ensures sentences play one-after-another on the ESP32.
      await drainPromise;

      this.log("debug", `TTS sent ${frameCount} ${encoder ? "Opus" : "PCM"} frames (paced at ${OUTPUT_FRAME_MS}ms)`);
    } finally {
      // Always release the provider, even on synth/send failure.
      await this.tts.close();
      this.tts = null;
    }
  }
737
+
738
+ private async sendSilence(durationMs: number): Promise<void> {
739
+ const totalSamples = (OUTPUT_SAMPLE_RATE * durationMs) / 1000;
740
+ const silenceBytes = totalSamples * 2; // 16-bit
741
+ const silence = Buffer.alloc(silenceBytes);
742
+
743
+ // Get Opus encoder for ESP32 silence frames
744
+ let encoder: Awaited<ReturnType<typeof getOpusEncoder>> | null = null;
745
+ if (this.isEsp32) {
746
+ try {
747
+ encoder = await getOpusEncoder();
748
+ } catch {
749
+ // Fall back to raw PCM
750
+ }
751
+ }
752
+
753
+ let offset = 0;
754
+ while (offset + OUTPUT_FRAME_BYTES <= silence.length) {
755
+ const pcmFrame = silence.subarray(offset, offset + OUTPUT_FRAME_BYTES);
756
+
757
+ let frameToSend: Buffer;
758
+ if (encoder) {
759
+ try {
760
+ frameToSend = Buffer.from(encoder.encode(pcmFrame, OUTPUT_SAMPLES_PER_FRAME));
761
+ } catch {
762
+ frameToSend = pcmFrame;
763
+ }
764
+ } else {
765
+ frameToSend = pcmFrame;
766
+ }
767
+
768
+ await this.sendBinary(frameToSend);
769
+ offset += OUTPUT_FRAME_BYTES;
770
+ await new Promise((resolve) => setTimeout(resolve, OUTPUT_FRAME_MS));
771
+ }
772
+ }
773
+
774
  // ── OpenClaw Communication ────────────────────────────────────
  // Connects using the device identity Ed25519 key stored in
  // ~/.openclaw/identity/device.json — implemented entirely in the
  // extension, no core code changes needed.

  /**
   * Open a WebSocket to the OpenClaw Gateway and perform the connect
   * handshake:
   *   1. Gateway sends a `connect.challenge` event (with a nonce).
   *   2. We reply with a `connect` request, optionally signed with the
   *      local device's Ed25519 identity key (v2 payload when a nonce is
   *      present, v1 otherwise).
   *   3. On an ok response, `openclawConnected` flips true.
   *
   * The returned promise resolves (never rejects) once the handshake
   * finishes or errors; callers poll `openclawConnected` for the outcome.
   *
   * NOTE(review): if the gateway never sends a first message, neither the
   * "message" nor "error" handler fires and this promise never settles —
   * harmless today because callers don't await it, but worth confirming.
   */
  private async connectToOpenClaw(): Promise<void> {
    if (!this.cfg?.openclawUrl) return;

    const { WebSocket: WS } = await import("ws");
    const nodeCrypto = await import("node:crypto");
    const fs = await import("node:fs");
    const path = await import("node:path");
    const os = await import("node:os");

    // ── Load device identity ──────────────────────────────────
    const stateDir = process.env.OPENCLAW_STATE_DIR ?? path.default.join(os.default.homedir(), ".openclaw");
    const identityPath = path.default.join(stateDir, "identity", "device.json");
    let deviceIdentity: { deviceId: string; publicKeyPem: string; privateKeyPem: string } | null = null;

    try {
      if (fs.default.existsSync(identityPath)) {
        const raw = JSON.parse(fs.default.readFileSync(identityPath, "utf8"));
        // Only accept the known v1 schema with all three fields present.
        if (raw?.version === 1 && raw.deviceId && raw.publicKeyPem && raw.privateKeyPem) {
          deviceIdentity = { deviceId: raw.deviceId, publicKeyPem: raw.publicKeyPem, privateKeyPem: raw.privateKeyPem };
          this.log("info", `Loaded device identity: ${deviceIdentity.deviceId.slice(0, 16)}...`);
        }
      }
    } catch (err) {
      // Identity is optional — fall back to token-only auth.
      this.log("warn", `Could not load device identity: ${err}`);
    }

    const token = this.cfg.openclawToken || process.env.OPENCLAW_GATEWAY_TOKEN;

    return new Promise<void>((resolve) => {
      this.openclawWs = new WS(this.cfg!.openclawUrl);

      this.openclawWs.on("open", () => {
        this.log("info", `Connected to OpenClaw at ${this.cfg!.openclawUrl}`);
      });

      // First message should be connect.challenge
      this.openclawWs!.once("message", (data: Buffer) => {
        try {
          const frame = JSON.parse(data.toString());
          if (frame.type !== "event" || frame.event !== "connect.challenge") {
            // Unexpected protocol — give up on the handshake but don't crash.
            this.log("warn", `Unexpected first frame: ${JSON.stringify(frame).slice(0, 100)}`);
            resolve();
            return;
          }

          const nonce = frame.payload?.nonce as string | undefined;
          const role = "operator";
          const scopes = ["operator.read", "operator.write"];
          const clientId = "cli";
          const clientMode = "cli";

          // ── Build device signature ───────────────────────────
          let device: Record<string, unknown> | undefined;
          if (deviceIdentity) {
            const signedAtMs = Date.now();
            // v2 payload includes the challenge nonce; v1 is the legacy form.
            const payloadParts = nonce
              ? ["v2", deviceIdentity.deviceId, clientId, clientMode, role, scopes.join(","), String(signedAtMs), token ?? "", nonce]
              : ["v1", deviceIdentity.deviceId, clientId, clientMode, role, scopes.join(","), String(signedAtMs), token ?? ""];
            const payload = payloadParts.join("|");

            // Ed25519: algorithm arg is null, the key type selects the scheme.
            const privateKey = nodeCrypto.default.createPrivateKey(deviceIdentity.privateKeyPem);
            const signature = nodeCrypto.default.sign(null, Buffer.from(payload), privateKey).toString("base64url");

            // Extract raw 32-byte Ed25519 public key from SPKI DER
            const pubKey = nodeCrypto.default.createPublicKey(deviceIdentity.publicKeyPem);
            const spki = pubKey.export({ type: "spki", format: "der" }) as Buffer;
            const ED25519_PREFIX = Buffer.from("302a300506032b6570032100", "hex");
            // Strip the fixed SPKI header when it matches; otherwise send as-is.
            const rawPub = (spki.length === ED25519_PREFIX.length + 32 && spki.subarray(0, ED25519_PREFIX.length).equals(ED25519_PREFIX))
              ? spki.subarray(ED25519_PREFIX.length)
              : spki;

            device = {
              id: deviceIdentity.deviceId,
              publicKey: rawPub.toString("base64url"),
              signature,
              signedAt: signedAtMs,
              ...(nonce ? { nonce } : {}),
            };
          }

          const connectRequest = {
            type: "req",
            id: nodeCrypto.default.randomUUID(),
            method: "connect",
            params: {
              minProtocol: 3,
              maxProtocol: 3,
              client: { id: clientId, version: "1.0.0", platform: "node", mode: clientMode, displayName: `ESP32 Voice [${this.deviceId}]` },
              role,
              scopes,
              caps: [],
              ...(token ? { auth: { token } } : {}),
              ...(device ? { device } : {}),
            },
          };

          this.openclawWs!.send(JSON.stringify(connectRequest));

          // Wait for the connect response
          this.openclawWs!.once("message", (resp: Buffer) => {
            try {
              const response = JSON.parse(resp.toString());
              if (response.type === "res" && response.ok) {
                this.log("info", "OpenClaw handshake complete");
                this.openclawConnected = true;
              } else {
                this.log("error", `OpenClaw handshake failed: ${response.error?.message ?? JSON.stringify(response)}`);
              }
            } catch {
              this.log("error", "OpenClaw handshake parse error");
            }
            resolve();
          });
        } catch {
          // Malformed challenge frame — resolve so callers stop waiting.
          resolve();
        }
      });

      this.openclawWs.on("error", (err: Error) => {
        this.log("error", `OpenClaw connection error: ${err.message}`);
        this.openclawConnected = false;
        resolve();
      });

      this.openclawWs.on("close", () => {
        this.log("info", "OpenClaw connection closed");
        this.openclawConnected = false;
      });
    });
  }
909
+
910
+ // Returns true if a gateway response is a heartbeat ack (HEARTBEAT_OK),
911
+ // which should be ignored — the session is still waiting for the real reply.
912
+ private isHeartbeatResponse(text: string): boolean {
913
+ const lower = (text ?? "").trim().toLowerCase();
914
+ if (!lower) return false;
915
+ if (!lower.startsWith("heartbeat_ok")) return false;
916
+ // Allow "HEARTBEAT_OK" alone or followed by punctuation/spaces — not a word char
917
+ const suffix = lower.slice("heartbeat_ok".length);
918
+ return suffix.length === 0 || !/[a-z0-9_]/.test(suffix[0]);
919
+ }
920
+
921
+ private async sendToOpenClaw(text: string): Promise<string> {
922
+ if (!this.openclawWs || !this.openclawConnected) {
923
+ throw new Error("Not connected to OpenClaw");
924
+ }
925
+
926
+ const chatRequest = {
927
+ type: "req",
928
+ id: crypto.randomUUID(),
929
+ method: "chat.send",
930
+ params: {
931
+ sessionKey: "agent:main:main",
932
+ message: text,
933
+ idempotencyKey: crypto.randomUUID(),
934
+ },
935
+ };
936
+
937
+ this.openclawWs.send(JSON.stringify(chatRequest));
938
+
939
+ return new Promise<string>((resolve) => {
940
+ let responseContent = "";
941
+ const timeout = setTimeout(() => {
942
+ resolve(responseContent || "Request timed out.");
943
+ }, 120000);
944
+
945
+ const messageHandler = (data: Buffer) => {
946
+ try {
947
+ const event = JSON.parse(data.toString());
948
+ if (event.type === "event") {
949
+ if (event.event === "agent" && event.payload?.stream === "assistant" && event.payload?.data?.text) {
950
+ const candidate = event.payload.data.text as string;
951
+ // Skip heartbeat ack responses — they are internal gateway noise
952
+ if (!this.isHeartbeatResponse(candidate)) {
953
+ responseContent = candidate;
954
+ }
955
+ } else if (event.event === "chat") {
956
+ const payload = event.payload ?? {};
957
+ const state = payload.state;
958
+ const messageObj = payload.message;
959
+ let candidate = "";
960
+ if (typeof messageObj?.content === "string") {
961
+ candidate = messageObj.content;
962
+ } else if (Array.isArray(messageObj?.content)) {
963
+ const textBlocks = (messageObj.content as Array<{ type: string; text?: string }>)
964
+ .filter(b => b.type === "text").map(b => b.text ?? "");
965
+ if (textBlocks.length > 0) candidate = textBlocks.join("");
966
+ }
967
+ // Skip heartbeat ack responses — keep waiting for real content
968
+ if (candidate && !this.isHeartbeatResponse(candidate)) {
969
+ responseContent = candidate;
970
+ }
971
+ if (state === "final" || state === "done" || state === "complete") {
972
+ // If the final response is a heartbeat ack, keep waiting
973
+ if (this.isHeartbeatResponse(responseContent)) {
974
+ responseContent = "";
975
+ return;
976
+ }
977
+ const hasPendingTools = Array.isArray(messageObj?.content) &&
978
+ (messageObj.content as Array<{ type: string }>).some(b => b.type === "tool_use");
979
+ if (!hasPendingTools) {
980
+ clearTimeout(timeout);
981
+ this.openclawWs?.off("message", messageHandler);
982
+ resolve(responseContent || "No response received");
983
+ }
984
+ } else if (state === "aborted" || state === "error") {
985
+ clearTimeout(timeout);
986
+ this.openclawWs?.off("message", messageHandler);
987
+ resolve(responseContent || `Request ${state}`);
988
+ }
989
+ }
990
+ }
991
+ } catch { /* ignore parse errors */ }
992
+ };
993
+
994
+ this.openclawWs!.on("message", messageHandler);
995
+ });
996
+ }
997
+
998
+ // ── Utilities ─────────────────────────────────────────────────
999
+
1000
+ private setState(newState: VoiceSessionState): void {
1001
+ const old = this.state;
1002
+ this.state = newState;
1003
+ this.log("debug", `State: ${old} → ${newState}`);
1004
+ }
1005
+
1006
+ private async sendJson(obj: Record<string, unknown>): Promise<void> {
1007
+ try {
1008
+ this.ws.send(JSON.stringify(obj));
1009
+ } catch (err) {
1010
+ this.log("error", `Send error (${obj.type}): ${err}`);
1011
+ }
1012
+ }
1013
+
1014
+ private async sendBinary(data: Buffer): Promise<void> {
1015
+ try {
1016
+ this.ws.send(data);
1017
+ } catch (err) {
1018
+ this.log("error", `Binary send error: ${err}`);
1019
+ }
1020
+ }
1021
+
1022
+ private log(level: string, msg: string): void {
1023
+ const prefix = `[${this.sessionId.slice(0, 8)}]`;
1024
+ switch (level) {
1025
+ case "error":
1026
+ console.error(`${prefix} ${msg}`);
1027
+ break;
1028
+ case "warn":
1029
+ console.warn(`${prefix} ${msg}`);
1030
+ break;
1031
+ case "debug":
1032
+ // Only log debug in development
1033
+ if (process.env.NODE_ENV !== "production") {
1034
+ console.log(`${prefix} [debug] ${msg}`);
1035
+ }
1036
+ break;
1037
+ default:
1038
+ console.log(`${prefix} ${msg}`);
1039
+ }
1040
+ }
1041
+ }