@oh-my-pi/pi-coding-agent 6.8.5 → 6.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,976 +0,0 @@
1
- import { logger, ptree } from "@oh-my-pi/pi-utils";
2
- import {
3
- RealtimeAgent,
4
- RealtimeSession,
5
- type RealtimeSessionConfig,
6
- type TransportEvent,
7
- type TransportLayerAudio,
8
- tool,
9
- } from "@openai/agents/realtime";
10
- import type { ReadableStreamDefaultReader as WebReadableStreamDefaultReader } from "stream/web";
11
- import { z } from "zod";
12
- import type { ModelRegistry } from "./model-registry";
13
-
14
/** Realtime model id; overridable through the OMP_VOICE_REALTIME_MODEL environment variable. */
const DEFAULT_REALTIME_MODEL = process.env.OMP_VOICE_REALTIME_MODEL ?? "gpt-realtime";

/** Voice name for spoken output; overridable through the OMP_VOICE_REALTIME_VOICE environment variable. */
const DEFAULT_REALTIME_VOICE = process.env.OMP_VOICE_REALTIME_VOICE ?? "marin";

/** PCM sample rate (Hz) used for both the capture and playback tool commands and the session audio format. */
const DEFAULT_SAMPLE_RATE = 24000;

/** Number of audio channels requested from the capture and playback tools. */
const DEFAULT_CHANNELS = 1;

/** Bits per sample passed to the sox-family tools (`-b`). */
const DEFAULT_BITS = 16;

/** Minimum spacing (ms) between two agent interrupts triggered by "speech started" events. */
const INTERRUPT_DEBOUNCE_MS = 200;

/** Maximum number of characters of agent output forwarded to the realtime session. */
const MAX_RESULT_CHARS = 6000;

/** Maximum number of characters of a progress update forwarded to the realtime session. */
const MAX_PROGRESS_CHARS = 1400;

/** Playback counts as "active" if an audio chunk was written within this many milliseconds. */
const PLAYBACK_ACTIVE_WINDOW_MS = 350;

// Echo cancellation: a mic chunk is only dropped while playback is active and the
// mic level is below the last playback level divided by this ratio.
const ECHO_SUPPRESSION_RATIO = 2.5;

// Absolute mic RMS floor used by the echo-suppression check: only chunks quieter
// than this are candidates for being dropped while playback is active.
const MIC_NOISE_FLOOR = 0.005;

/** Minimum spacing (ms) between two playback-failure warnings surfaced to the user. */
const PLAYBACK_ERROR_COOLDOWN_MS = 2000;
28
-
29
/**
 * System prompt for the realtime voice agent. The sentences are kept as
 * separate array entries for readability and joined with single spaces.
 */
const SUPERVISOR_INSTRUCTIONS: string = [
  // Role
  "You are the realtime voice supervisor for a terminal coding agent.",
  "Manage conversation flow, turn-taking, and what gets spoken aloud.",
  // Handling user speech
  "For user speech: if unclear, ask exactly one short question.",
  "If clear, call send_to_agent with a concise instruction for the coding agent.",
  "If the user is greeting/smalltalk or gives no actionable request, respond briefly and do not call send_to_agent.",
  "Keep spoken responses to 1-2 short sentences (<=40 words).",
  // Handling injected system updates
  "You will receive system updates prefixed with SYSTEM_EVENT, PROGRESS_UPDATE, or AGENT_OUTPUT.",
  "For AGENT_OUTPUT, always respond with a brief spoken summary and any single question needed.",
  "For PROGRESS_UPDATE, speak a short update only if it helps the user stay oriented.",
  "Do not call send_to_agent for system updates.",
  // Cancellation
  "If the user asks to stop or cancel work, call interrupt_agent.",
].join(" ");
42
-
43
/** Hooks through which the voice supervisor talks to its host application. */
type VoiceSupervisorCallbacks = {
  /** Receives a whitespace-normalized instruction destined for the coding agent. */
  onSendToAgent: (text: string) => void | Promise<void>;
  /** Asks the host to interrupt the coding agent; `reason` is only set for explicit tool calls. */
  onInterruptAgent: (reason?: string) => void | Promise<void>;
  /** Receives the current status line; called with `undefined` when the supervisor stops. */
  onStatus: (status?: string) => void;
  /** Receives errors that the supervisor cannot handle itself. */
  onError: (error: Error) => void;
  /** Optional sink for non-fatal warnings such as playback failures. */
  onWarning?: (message: string) => void;
};
50
-
51
/**
 * Collapses every run of whitespace into a single space and strips leading
 * and trailing whitespace.
 */
function normalizeText(text: string): string {
  const words = text.trim().split(/\s+/);
  return words.join(" ");
}
54
-
55
/**
 * Shortens `text` to at most `maxChars` UTF-16 code units and appends "..."
 * when anything was cut off. Text that already fits is returned unchanged.
 *
 * The cut never lands between the two halves of a surrogate pair, so the
 * result contains no lone surrogate (which would be malformed once the text
 * is serialized). A negative or non-finite limit is treated as 0.
 *
 * @param text - Text to shorten.
 * @param maxChars - Maximum number of code units kept before the ellipsis.
 * @returns The original text, or its truncated prefix followed by "...".
 */
function truncateText(text: string, maxChars: number): string {
  if (text.length <= maxChars) return text;

  let end = Number.isFinite(maxChars) ? Math.max(0, Math.trunc(maxChars)) : 0;
  if (end > 0 && end < text.length) {
    const last = text.charCodeAt(end - 1);
    const next = text.charCodeAt(end);
    const splitsPair = last >= 0xd800 && last <= 0xdbff && next >= 0xdc00 && next <= 0xdfff;
    // Drop the dangling high surrogate rather than emit half a character.
    if (splitsPair) end -= 1;
  }
  return `${text.slice(0, end)}...`;
}
59
-
60
/**
 * Returns an ArrayBuffer holding exactly the bytes visible through `chunk`.
 *
 * - If the view spans its whole backing ArrayBuffer, that buffer is returned as-is (no copy).
 * - If the view is a window into a larger ArrayBuffer, the window is sliced out.
 * - If the backing store is not an ArrayBuffer, the bytes are copied into a fresh one.
 */
function toArrayBuffer(chunk: Uint8Array): ArrayBuffer {
  const { buffer: backing, byteOffset, byteLength } = chunk;

  if (!(backing instanceof ArrayBuffer)) {
    const detached = new Uint8Array(byteLength);
    detached.set(chunk);
    return detached.buffer;
  }

  const spansWholeBuffer = byteOffset === 0 && byteLength === backing.byteLength;
  return spansWholeBuffer ? backing : backing.slice(byteOffset, byteOffset + byteLength);
}
72
-
73
/**
 * Produces a human-readable message for an arbitrary thrown or emitted value.
 *
 * Resolution order: `Error.message`, a plain string, an object's string
 * `message`, then its nested `error` (followed repeatedly), then the JSON
 * form of the object, and finally `String(value)`.
 *
 * The nested `error` chain is walked iteratively with a visited set, so a
 * self-referential wrapper (`obj.error === obj`) terminates instead of
 * overflowing the stack.
 *
 * @param error - Any value caught from a `catch` clause or an error event.
 * @returns A best-effort description of the value.
 */
function describeError(error: unknown): string {
  const visited = new Set<object>();
  let current: unknown = error;

  for (;;) {
    if (current instanceof Error) return current.message;
    if (typeof current === "string") return current;
    if (!current || typeof current !== "object") return String(current);

    visited.add(current);

    const maybeMessage = (current as { message?: unknown }).message;
    if (typeof maybeMessage === "string") return maybeMessage;

    const nested = (current as { error?: unknown }).error;
    const alreadySeen = typeof nested === "object" && nested !== null && visited.has(nested);
    if (nested && !alreadySeen) {
      current = nested;
      continue;
    }

    try {
      // JSON.stringify throws on circular structures and may yield undefined.
      return JSON.stringify(current) ?? String(current);
    } catch {
      return String(current);
    }
  }
}
89
-
90
/** Result of probing the system for command-line audio tools. */
type AudioToolStatus = {
  /** Microphone capture: whether a tool was found, its executable, and the full argv. */
  capture: {
    available: boolean;
    tool?: string;
    command?: string[];
  };
  /** Speaker playback: whether a tool was found, its executable, and the full argv. */
  playback: {
    available: boolean;
    tool?: string;
    command?: string[];
  };
};
94
-
95
/**
 * Probes for a capture and a playback tool by building the commands that
 * would be spawned, and reports which of the two could be resolved.
 *
 * @param sampleRate - Sample rate forwarded to the command builders.
 * @param channels - Channel count forwarded to the command builders.
 */
function checkAudioTools(sampleRate: number, channels: number): AudioToolStatus {
  const captureArgv = buildCaptureCommand(sampleRate, channels)?.command;
  const playbackArgv = buildPlaybackCommand(sampleRate, channels) ?? undefined;

  return {
    capture: {
      available: captureArgv !== undefined,
      tool: captureArgv?.[0],
      command: captureArgv,
    },
    playback: {
      available: playbackArgv !== undefined,
      tool: playbackArgv?.[0],
      command: playbackArgv,
    },
  };
}
112
-
113
/**
 * Builds the multi-line help text shown when no audio tool could be found.
 * The install hints depend on `process.platform`; platforms other than
 * linux, darwin and win32 get the header line only.
 */
function getMissingToolsMessage(): string {
  const header = "Voice mode requires audio tools. Install one of the following:";

  let hints: string[];
  switch (process.platform) {
    case "linux":
      hints = [
        "",
        " For capture (microphone):",
        " • sox (recommended): sudo dnf install sox",
        " • pulseaudio-utils: sudo dnf install pulseaudio-utils",
        " • alsa-utils: sudo dnf install alsa-utils",
        " • ffmpeg: sudo dnf install ffmpeg",
        "",
        " For playback (speaker):",
        " • sox (recommended): sudo dnf install sox",
        " • ffmpeg: sudo dnf install ffmpeg",
        "",
        " Set OMP_VOICE_CAPTURE_DEVICE to override the default capture device.",
        " (Applies to all tools; for sox, this sets AUDIODEV internally.)",
      ];
      break;
    case "darwin":
      hints = ["", " • sox (recommended): brew install sox", " • ffmpeg: brew install ffmpeg"];
      break;
    case "win32":
      hints = ["", " • sox: choco install sox", " • ffmpeg: choco install ffmpeg"];
      break;
    default:
      hints = [];
  }

  return [header, ...hints].join("\n");
}
143
-
144
/** Argv for a microphone capture process plus optional extra environment variables. */
type CaptureCommand = { command: string[]; env?: Record<string, string> };

/** Builds the ffmpeg argv that reads `device` via `inputFormat` and writes raw s16le PCM to stdout. */
function buildFfmpegCaptureCommand(
  ffmpegPath: string,
  inputFormat: string,
  device: string,
  sampleRate: number,
  channels: number,
): CaptureCommand {
  return {
    command: [
      ffmpegPath,
      "-hide_banner",
      "-loglevel",
      "error",
      "-f",
      inputFormat,
      "-i",
      device,
      "-ac",
      String(channels),
      "-ar",
      String(sampleRate),
      "-f",
      "s16le",
      "-",
    ],
  };
}

/**
 * Picks the first available microphone capture tool and returns the command
 * that streams raw signed 16-bit little-endian PCM to stdout.
 *
 * Lookup order: sox, rec, parecord (Linux only), arecord, ffmpeg (with a
 * platform-specific input format). OMP_VOICE_CAPTURE_DEVICE overrides the
 * capture device for whichever tool is chosen.
 *
 * @param sampleRate - Sample rate in Hz requested from the tool.
 * @param channels - Number of channels requested from the tool.
 * @returns The command (and extra env, if any), or null when no tool is found
 *   or ffmpeg is the only tool on an unsupported platform.
 */
function buildCaptureCommand(sampleRate: number, channels: number): CaptureCommand | null {
  const platform = process.platform;
  // Allow user to override capture device via environment
  const captureDevice = process.env.OMP_VOICE_CAPTURE_DEVICE;

  // Prefer sox/rec as they work well across platforms
  const soxPath = Bun.which("sox");
  const recorderPath = soxPath ?? Bun.which("rec");
  if (recorderPath) {
    const command = [
      recorderPath,
      "-q",
      // `rec` already records from the default device, so an explicit `-d`
      // is only needed (and only correct) when invoking `sox` itself; passing
      // it to `rec` would name the device as a second input.
      ...(soxPath ? ["-d"] : []),
      "-t",
      "raw",
      "-r",
      String(sampleRate),
      "-e",
      "signed-integer",
      "-b",
      String(DEFAULT_BITS),
      "-c",
      String(channels),
      "-",
    ];
    // sox uses AUDIODEV env var to override the default device
    const env = captureDevice ? { AUDIODEV: captureDevice } : undefined;
    return { command, env };
  }

  // On Linux, try PulseAudio first (parecord)
  if (platform === "linux") {
    const parecordPath = Bun.which("parecord");
    if (parecordPath) {
      const command = [parecordPath, "--raw", "--format=s16le", `--rate=${sampleRate}`, `--channels=${channels}`];
      if (captureDevice) {
        command.push(`--device=${captureDevice}`);
      }
      return { command };
    }
  }

  // ALSA arecord as the next fallback (looked up on every platform, as before)
  const arecordPath = Bun.which("arecord");
  if (arecordPath) {
    return {
      command: [
        arecordPath,
        "-q",
        "-D",
        captureDevice ?? "default",
        "-f",
        "S16_LE",
        "-r",
        String(sampleRate),
        "-c",
        String(channels),
        "-t",
        "raw",
      ],
    };
  }

  // ffmpeg fallback with platform-specific input
  const ffmpegPath = Bun.which("ffmpeg");
  if (!ffmpegPath) return null;

  if (platform === "darwin") {
    return buildFfmpegCaptureCommand(ffmpegPath, "avfoundation", captureDevice ?? ":0", sampleRate, channels);
  }
  if (platform === "linux") {
    // Use the PulseAudio input when a Pulse-compatible server looks present, else ALSA
    const hasPulse = Bun.which("pulseaudio") || Bun.which("pipewire-pulse") || process.env.PULSE_SERVER;
    const inputFormat = hasPulse ? "pulse" : "alsa";
    return buildFfmpegCaptureCommand(ffmpegPath, inputFormat, captureDevice ?? "default", sampleRate, channels);
  }
  if (platform === "win32") {
    return buildFfmpegCaptureCommand(ffmpegPath, "dshow", captureDevice ?? "audio=default", sampleRate, channels);
  }

  return null;
}
285
-
286
/**
 * Picks a playback tool and returns the argv that plays raw signed 16-bit
 * little-endian PCM read from stdin.
 *
 * Default order is play, sox, ffplay. OMP_VOICE_PLAYBACK ("play", "sox" or
 * "ffplay", case-insensitive) moves the named tool to the front; if that tool
 * is not installed the remaining tools are still tried, so a stale preference
 * no longer disables playback while another tool is available.
 *
 * @param sampleRate - Sample rate in Hz of the PCM stream.
 * @param channels - Number of channels in the PCM stream.
 * @returns The argv of the first available tool, or null when none is installed.
 */
function buildPlaybackCommand(sampleRate: number, channels: number): string[] | null {
  const preferred = process.env.OMP_VOICE_PLAYBACK?.toLowerCase();

  // Arguments shared by `play` and `sox` describing the raw PCM arriving on stdin.
  const rawPcmFromStdin = [
    "-q",
    "-t",
    "raw",
    "-r",
    String(sampleRate),
    "-e",
    "signed-integer",
    "-b",
    String(DEFAULT_BITS),
    "-c",
    String(channels),
    "-",
  ];

  const candidates = {
    play: (): string[] | null => {
      const playPath = Bun.which("play");
      return playPath ? [playPath, ...rawPcmFromStdin] : null;
    },
    sox: (): string[] | null => {
      const soxPath = Bun.which("sox");
      // Unlike `play`, plain `sox` needs the output device named explicitly.
      return soxPath ? [soxPath, ...rawPcmFromStdin, "-d"] : null;
    },
    ffplay: (): string[] | null => {
      const ffplayPath = Bun.which("ffplay");
      return ffplayPath
        ? [
            ffplayPath,
            "-nodisp",
            "-autoexit",
            "-hide_banner",
            "-loglevel",
            "error",
            "-fflags",
            "nobuffer",
            "-flags",
            "low_delay",
            "-f",
            "s16le",
            "-ar",
            String(sampleRate),
            "-ac",
            String(channels),
            "-",
          ]
        : null;
    },
  };

  type ToolName = keyof typeof candidates;
  let order: readonly ToolName[];
  if (preferred === "ffplay") {
    order = ["ffplay", "play", "sox"];
  } else if (preferred === "sox") {
    order = ["sox", "play", "ffplay"];
  } else {
    // Covers both the explicit "play" preference and the default.
    order = ["play", "sox", "ffplay"];
  }

  for (const toolName of order) {
    const command = candidates[toolName]();
    if (command) return command;
  }
  return null;
}
357
-
358
/**
 * Computes the root-mean-square level of a buffer of signed 16-bit
 * little-endian samples, normalized so full scale is 1.0.
 * A trailing odd byte is ignored; a buffer with no complete sample yields 0.
 */
function rms16le(buffer: Uint8Array): number {
  const sampleCount = Math.floor(buffer.byteLength / 2);
  if (sampleCount === 0) return 0;

  const samples = new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength);
  let sumOfSquares = 0;
  for (let index = 0; index < sampleCount; index++) {
    const normalized = samples.getInt16(index * 2, true) / 32768;
    sumOfSquares += normalized * normalized;
  }
  return Math.sqrt(sumOfSquares / sampleCount);
}
371
-
372
/**
 * Connects a local microphone and speaker to a realtime voice session that
 * supervises a coding agent.
 *
 * Microphone audio is read from an external capture process and streamed to
 * the session; audio produced by the session is piped into an external
 * playback process. The realtime agent reaches the host through two tools
 * (`send_to_agent`, `interrupt_agent`) that are forwarded to the callbacks.
 */
export class VoiceSupervisor {
  /** Status line shown while waiting for user speech. */
  private static readonly LISTENING_STATUS = "Listening... (auto-send on silence, Ctrl+Y to stop)";
  /** Status line shown while the transport is not connected. */
  private static readonly RECONNECTING_STATUS = "Reconnecting realtime voice...";
  /** Substring of the transport error that is treated as a transient disconnect. */
  private static readonly NOT_CONNECTED_MARKER = "WebSocket is not connected";

  private session: RealtimeSession | undefined = undefined;
  private captureProcess: ptree.ChildProcess | undefined = undefined;
  private captureReader: WebReadableStreamDefaultReader<Uint8Array> | undefined = undefined;
  private playbackProcess: ptree.ChildProcess | undefined = undefined;
  private playbackWriter:
    | {
        write: (chunk: Uint8Array) => Promise<void>;
        close: () => Promise<void>;
      }
    | undefined = undefined;
  /** True between a successful `start()` entry and `stop()`. */
  private active = false;
  private connected = false;
  private sessionReady = false;
  /** Timestamp (ms) of the last speech-triggered interrupt, used for debouncing. */
  private lastInterruptAt = 0;
  /** Arrival timestamp (ms) and RMS level of the last audio chunk written to playback. */
  private lastPlaybackAt = 0;
  private lastPlaybackRms = 0;
  private lastPlaybackErrorAt = 0;
  // Fallback transcript handling: track user speech when no tool call is made
  private pendingTranscript = "";
  private pendingResponseHasToolCall = false;
  private pendingResponseHasAudioOutput = false;

  constructor(
    private registry: ModelRegistry,
    private callbacks: VoiceSupervisorCallbacks,
  ) {}

  /**
   * Checks whether both a capture and a playback tool can be resolved.
   *
   * @returns `available: true` with the probed tools when both exist; otherwise
   *   `available: false` with an `error` message naming what is missing
   *   (including install hints) and the probed tools.
   */
  static checkAvailability(): { available: boolean; error?: string; tools?: AudioToolStatus } {
    const status = checkAudioTools(DEFAULT_SAMPLE_RATE, DEFAULT_CHANNELS);
    if (status.capture.available && status.playback.available) {
      return { available: true, tools: status };
    }

    const missing: string[] = [];
    if (!status.capture.available) missing.push("capture");
    if (!status.playback.available) missing.push("playback");

    return {
      available: false,
      error: `Missing audio ${missing.join(" and ")} tools.\n\n${getMissingToolsMessage()}`,
      tools: status,
    };
  }

  /** Whether voice mode is currently running. */
  get isActive(): boolean {
    return this.active;
  }

  /**
   * Connects the realtime session and starts capture and playback.
   * Does nothing when already active.
   *
   * @throws Error when no OpenAI API key is available, or when connecting or
   *   starting the audio tools fails (the supervisor is stopped first).
   */
  async start(): Promise<void> {
    if (this.active) return;

    const apiKey = await this.registry.getApiKeyForProvider("openai");
    if (!apiKey) {
      throw new Error("OpenAI API key not found (set OPENAI_API_KEY or login).");
    }

    this.active = true;
    this.lastInterruptAt = 0;
    this.sessionReady = false;
    this.lastPlaybackErrorAt = 0;
    this.pendingTranscript = "";
    this.pendingResponseHasToolCall = false;
    this.pendingResponseHasAudioOutput = false;
    this.callbacks.onStatus("Connecting realtime voice...");

    try {
      const agent = this.createSupervisorAgent();
      const session = new RealtimeSession(agent, {
        transport: "websocket",
        model: DEFAULT_REALTIME_MODEL,
        config: this.buildSessionConfig(),
      });

      this.session = session;
      this.bindSessionEvents(session);
      await session.connect({ apiKey });
      this.connected = session.transport.status === "connected";
      this.sessionReady = this.connected;
      if (!this.connected) {
        await this.waitForConnection(session, 5000);
      }
      await this.waitForSessionReady(session, 5000);
      await this.startCapture();
      await this.ensurePlayback();
      this.callbacks.onStatus(VoiceSupervisor.LISTENING_STATUS);
    } catch (error) {
      try {
        await this.stop();
      } catch (stopError) {
        // Keep the original failure as the one reported to the caller.
        logger.debug("voice-supervisor: cleanup after failed start also failed", {
          error: describeError(stopError),
        });
      }
      throw new Error(describeError(error));
    }
  }

  /**
   * Stops capture and playback, closes the session and clears the status.
   * Does nothing when not active. The session is closed even if tearing down
   * the audio processes throws.
   */
  async stop(): Promise<void> {
    if (!this.active) return;
    this.active = false;
    this.connected = false;
    this.sessionReady = false;
    try {
      await this.stopCapture();
      await this.resetPlayback();
    } finally {
      const session = this.session;
      this.session = undefined;
      if (session) {
        session.close();
      }
      this.callbacks.onStatus(undefined);
    }
  }

  /** Forwards a progress update from the coding agent to the realtime session. */
  notifyProgress(text: string): void {
    this.sendSystemMessage("PROGRESS_UPDATE", text, MAX_PROGRESS_CHARS);
  }

  /** Forwards the coding agent's output to the realtime session. */
  notifyResult(text: string): void {
    this.sendSystemMessage("AGENT_OUTPUT", text, MAX_RESULT_CHARS);
  }

  /**
   * Injects `"<prefix>: <text>"` as a system message and requests a response.
   * Silently skipped while inactive, disconnected, or when the normalized
   * text is empty. Transient "not connected" send failures are ignored.
   */
  private sendSystemMessage(prefix: string, text: string, maxChars: number): void {
    const session = this.session;
    if (!session || !this.active) return;
    if (!this.connected || !this.sessionReady || session.transport.status !== "connected") return;
    const trimmed = normalizeText(text);
    if (!trimmed) return;
    const payload = `${prefix}: ${truncateText(trimmed, maxChars)}`;
    try {
      session.transport.sendEvent({
        type: "conversation.item.create",
        item: {
          type: "message",
          role: "system",
          content: [{ type: "input_text", text: payload }],
        },
      });
      session.transport.sendEvent({ type: "response.create" });
    } catch (error) {
      const message = describeError(error);
      if (message.includes(VoiceSupervisor.NOT_CONNECTED_MARKER)) return;
      this.callbacks.onError(error instanceof Error ? error : new Error(message));
    }
  }

  /** Builds the realtime agent together with its two host-facing tools. */
  private createSupervisorAgent(): RealtimeAgent {
    const sendToAgentTool = tool({
      name: "send_to_agent",
      description: "Send a concise instruction to the coding agent.",
      parameters: z.object({
        text: z.string().min(1),
      }),
      execute: async ({ text }) => {
        const cleaned = normalizeText(text);
        if (cleaned) {
          await this.callbacks.onSendToAgent(cleaned);
        }
        return "sent";
      },
    });

    const interruptAgentTool = tool({
      name: "interrupt_agent",
      description: "Interrupt the coding agent immediately.",
      parameters: z.object({
        reason: z.string().optional(),
      }),
      execute: async ({ reason }) => {
        await this.callbacks.onInterruptAgent(reason);
        return "interrupted";
      },
    });

    return new RealtimeAgent({
      name: "Voice Supervisor",
      instructions: SUPERVISOR_INSTRUCTIONS,
      tools: [sendToAgentTool, interruptAgentTool],
      voice: DEFAULT_REALTIME_VOICE,
    });
  }

  /** Session configuration: audio-only output, PCM in/out, semantic VAD turn detection. */
  private buildSessionConfig(): Partial<RealtimeSessionConfig> {
    return {
      outputModalities: ["audio"],
      audio: {
        input: {
          format: { type: "audio/pcm", rate: DEFAULT_SAMPLE_RATE },
          noiseReduction: { type: "near_field" },
          turnDetection: {
            type: "semantic_vad",
            createResponse: true,
            interruptResponse: true,
          },
        },
        output: {
          format: { type: "audio/pcm", rate: DEFAULT_SAMPLE_RATE },
          ...(DEFAULT_REALTIME_VOICE ? { voice: DEFAULT_REALTIME_VOICE } : {}),
        },
      },
    };
  }

  /** Subscribes to the session and transport events that drive status, playback and fallback logic. */
  private bindSessionEvents(session: RealtimeSession): void {
    session.transport.on("connection_change", (status) => {
      this.connected = status === "connected";
      this.sessionReady = this.connected;
      if (!this.active) return;
      this.callbacks.onStatus(
        this.connected ? VoiceSupervisor.LISTENING_STATUS : VoiceSupervisor.RECONNECTING_STATUS,
      );
    });

    session.on("audio", (event: TransportLayerAudio) => {
      void this.handleAudio(event);
    });

    session.on("audio_start", () => {
      if (!this.active) return;
      this.pendingResponseHasAudioOutput = true;
      this.callbacks.onStatus("Speaking...");
    });

    session.on("audio_stopped", () => {
      if (!this.active) return;
      this.callbacks.onStatus(VoiceSupervisor.LISTENING_STATUS);
    });

    session.on("audio_interrupted", () => {
      // Drop whatever is still queued in the player so the assistant goes quiet immediately.
      void this.resetPlayback();
      if (!this.active) return;
      this.callbacks.onStatus(VoiceSupervisor.LISTENING_STATUS);
    });

    session.on("transport_event", (event: TransportEvent) => {
      this.handleTransportEvent(event);
    });

    session.on("error", (error) => {
      const message = describeError(error);
      logger.debug("voice-supervisor: realtime error", { error: message });
      if (message.includes(VoiceSupervisor.NOT_CONNECTED_MARKER)) {
        if (this.active) {
          this.callbacks.onStatus(VoiceSupervisor.RECONNECTING_STATUS);
        }
        return;
      }
      this.callbacks.onError(new Error(message));
    });
  }

  /**
   * Tracks raw transport events to (a) interrupt the agent when the user
   * starts speaking and (b) fall back to sending the user's transcript to the
   * agent when a response finished with neither a tool call nor audio output.
   */
  private handleTransportEvent(event: TransportEvent): void {
    if (!this.active) return;

    switch (event.type) {
      // Session ready
      case "session.created":
        this.sessionReady = true;
        return;

      // User speech started - interrupt agent and reset tracking
      case "input_audio_buffer.speech_started": {
        const now = Date.now();
        if (now - this.lastInterruptAt < INTERRUPT_DEBOUNCE_MS) return;
        this.lastInterruptAt = now;
        this.pendingTranscript = "";
        this.pendingResponseHasToolCall = false;
        this.pendingResponseHasAudioOutput = false;
        this.dispatchCallback(() => this.callbacks.onInterruptAgent());
        return;
      }

      // User speech transcript completed - store for fallback
      case "conversation.item.input_audio_transcription.completed": {
        const transcript = (event as { transcript?: string }).transcript;
        if (transcript && typeof transcript === "string") {
          this.pendingTranscript = normalizeText(transcript);
          logger.debug("voice-supervisor: transcript captured", { transcript: this.pendingTranscript });
        }
        return;
      }

      // Response started - begin tracking
      case "response.created":
        this.pendingResponseHasToolCall = false;
        this.pendingResponseHasAudioOutput = false;
        return;

      // Tool call detected - mark so we know not to use fallback.
      // Several event types are checked for robustness against API changes.
      case "function_call":
      case "response.function_call_arguments.done":
      case "response.function_call_arguments.delta":
        this.pendingResponseHasToolCall = true;
        return;

      // An added output item only counts when it is a function call.
      case "response.output_item.added": {
        const item = (event as { item?: { type?: string } }).item;
        if (item?.type === "function_call") {
          this.pendingResponseHasToolCall = true;
        }
        return;
      }

      // Audio output detected - mark so we don't fallback
      case "response.output_audio.delta":
      case "response.output_audio.done":
      case "response.output_audio_transcript.delta":
      case "response.output_audio_transcript.done":
      case "response.content_part.added":
      case "response.content_part.done":
        this.pendingResponseHasAudioOutput = true;
        return;

      // Response completed - check if we need fallback
      case "response.done": {
        // Only fall back if there is a transcript AND no tool call AND no audio output.
        // This prevents duplicate responses when the realtime assistant already spoke.
        if (!this.pendingTranscript || this.pendingResponseHasToolCall || this.pendingResponseHasAudioOutput) {
          return;
        }
        logger.debug("voice-supervisor: using fallback transcript path", {
          transcript: this.pendingTranscript,
        });
        const transcript = this.pendingTranscript;
        this.pendingTranscript = "";
        // Queue the fallback asynchronously to avoid blocking
        setImmediate(() => {
          if (this.active) {
            this.dispatchCallback(() => this.callbacks.onSendToAgent(transcript));
          }
        });
        return;
      }

      default:
        return;
    }
  }

  /**
   * Invokes a host callback without awaiting it. A synchronous throw or an
   * asynchronous rejection is routed to `onError` instead of surfacing as an
   * unhandled rejection.
   */
  private dispatchCallback(invoke: () => Promise<void> | void): void {
    const report = (error: unknown): void => {
      this.callbacks.onError(error instanceof Error ? error : new Error(describeError(error)));
    };
    try {
      void Promise.resolve(invoke()).catch(report);
    } catch (error) {
      report(error);
    }
  }

  /** Writes one chunk of session audio to the playback process, starting it if needed. */
  private async handleAudio(event: TransportLayerAudio): Promise<void> {
    if (!this.active) return;
    const now = Date.now();
    try {
      await this.ensurePlayback();
    } catch (error) {
      this.callbacks.onError(new Error(describeError(error)));
      return;
    }
    const writer = this.playbackWriter;
    if (!writer) return;
    const chunk = new Uint8Array(event.data);
    try {
      await writer.write(chunk);
      this.lastPlaybackAt = now;
      this.lastPlaybackRms = rms16le(chunk);
    } catch (error) {
      logger.debug("voice-supervisor: playback write failed", {
        error: describeError(error),
      });
      // Only tear down the player this write belonged to; a newer one may already be running.
      if (this.playbackWriter === writer) {
        void this.resetPlayback();
      }
    }
  }

  /**
   * Spawns the capture tool and starts streaming its stdout to the session in the background.
   *
   * @throws Error when no capture tool is available.
   */
  private async startCapture(): Promise<void> {
    const captureResult = buildCaptureCommand(DEFAULT_SAMPLE_RATE, DEFAULT_CHANNELS);
    if (!captureResult) {
      throw new Error(`No audio capture tool found.\n\n${getMissingToolsMessage()}`);
    }

    const { command, env: captureEnv } = captureResult;
    logger.debug("voice-supervisor: starting mic capture", { command, env: captureEnv });
    const proc = ptree.cspawn(command, {
      env: captureEnv ? { ...process.env, ...captureEnv } : undefined,
    });
    this.captureProcess = proc;
    const reader = proc.stdout.getReader();
    this.captureReader = reader;

    void this.pumpCapture(reader).catch((error) => {
      if (!this.active) return;
      logger.debug("voice-supervisor: capture loop error", {
        error: describeError(error),
      });
      this.callbacks.onError(new Error(describeError(error)));
    });
  }

  /**
   * Reads microphone chunks until the stream ends or the supervisor stops,
   * forwarding each chunk to the session unless it is dropped as echo.
   * Reports an error if the stream ends while still active.
   */
  private async pumpCapture(reader: WebReadableStreamDefaultReader<Uint8Array>): Promise<void> {
    while (this.active) {
      const { value, done } = await reader.read();
      if (done || !this.active) break;

      const session = this.session;
      if (!value || !session) continue;
      if (!this.connected || !this.sessionReady || session.transport.status !== "connected") {
        continue;
      }

      const micRms = rms16le(value);
      const playbackActive = Date.now() - this.lastPlaybackAt < PLAYBACK_ACTIVE_WINDOW_MS;

      // Echo suppression: only skip while playback is active AND the mic is both below the
      // noise floor and much quieter than the playback. The user can still talk over the assistant.
      const likelyEcho =
        playbackActive && micRms < MIC_NOISE_FLOOR && micRms < this.lastPlaybackRms / ECHO_SUPPRESSION_RATIO;
      if (likelyEcho) continue;

      // Send all remaining audio to the realtime API - semantic_vad handles turn detection
      const buffer = toArrayBuffer(value);
      if (buffer.byteLength === 0) continue;
      try {
        session.sendAudio(buffer);
      } catch (error) {
        const message = describeError(error);
        logger.debug("voice-supervisor: sendAudio failed", { error: message });
        if (message.includes(VoiceSupervisor.NOT_CONNECTED_MARKER)) {
          continue;
        }
        this.callbacks.onError(error instanceof Error ? error : new Error(message));
        return;
      }
    }
    if (this.active) {
      this.callbacks.onError(new Error("Voice capture stopped unexpectedly."));
    }
  }

  /** Cancels the capture reader and terminates the capture process. Never throws. */
  private async stopCapture(): Promise<void> {
    // Detach first so nothing else can pick up a process that is being torn down.
    const reader = this.captureReader;
    const proc = this.captureProcess;
    this.captureReader = undefined;
    this.captureProcess = undefined;

    if (reader) {
      try {
        await reader.cancel();
      } catch {
        // ignore
      }
    }
    if (proc) {
      try {
        proc.kill("SIGINT");
      } catch {
        // ignore
      }
      try {
        await proc.exited;
      } catch {
        // ignore: the process is gone either way (assumes `exited` may reject)
      }
    }
  }

  /**
   * Ensures a playback process with a usable stdin writer is running.
   *
   * @throws Error when no playback tool is available or its stdin cannot be
   *   written to; in the latter case the just-spawned process is killed.
   */
  private async ensurePlayback(): Promise<void> {
    if (this.playbackProcess && this.playbackWriter) return;
    const command = buildPlaybackCommand(DEFAULT_SAMPLE_RATE, DEFAULT_CHANNELS);
    if (!command) {
      throw new Error(`No audio playback tool found.\n\n${getMissingToolsMessage()}`);
    }

    logger.debug("voice-supervisor: starting audio playback", { command });
    const proc = ptree.cspawn(command, {
      stdin: "pipe",
    });
    const startedAt = Date.now();

    let writer: { write: (chunk: Uint8Array) => Promise<void>; close: () => Promise<void> };
    try {
      writer = this.createPlaybackWriter(proc.stdin);
    } catch (error) {
      // Do not leave an orphaned player behind when its stdin is unusable.
      try {
        proc.kill();
      } catch {
        // ignore
      }
      throw error;
    }

    this.playbackProcess = proc;
    this.playbackWriter = writer;

    proc.exited
      .then(() => {
        const code = proc.exitCode;
        if (this.playbackProcess === proc) {
          this.playbackProcess = undefined;
          this.playbackWriter = undefined;
        }
        const trimmed = proc.peekStderr().trim();
        if (trimmed) {
          logger.debug("voice-supervisor: playback stderr", { stderr: trimmed });
        }
        const elapsed = Date.now() - startedAt;
        // A non-zero exit shortly after launch is reported as a playback failure.
        if (code !== 0 && elapsed < 2000 && this.active && code !== null) {
          this.maybeWarnPlaybackFailure(trimmed || `exit code ${code}`);
        }
      })
      .catch(() => {
        // ignore
      });
  }

  /**
   * Adapts the playback process's stdin to a uniform write/close pair.
   * Accepts either a stream exposing `getWriter()` or a sink exposing
   * `write()` with `end()` or `close()`.
   *
   * @throws Error when stdin is missing or exposes neither interface.
   */
  private createPlaybackWriter(stdin: unknown): {
    write: (chunk: Uint8Array) => Promise<void>;
    close: () => Promise<void>;
  } {
    if (!stdin) {
      throw new Error("Audio playback stdin unavailable.");
    }
    const shape = stdin as { getWriter?: unknown; write?: unknown };

    if (typeof shape.getWriter === "function") {
      const streamWriter = (stdin as WritableStream<Uint8Array>).getWriter();
      return {
        write: async (chunk) => {
          await streamWriter.write(chunk);
        },
        close: async () => {
          await streamWriter.close();
        },
      };
    }

    if (typeof shape.write === "function") {
      const sink = stdin as {
        write: (chunk: Uint8Array) => undefined | number | Promise<undefined | number>;
        end?: () => undefined | number | Promise<undefined | number>;
        close?: () => undefined | number | Promise<undefined | number>;
      };
      return {
        write: async (chunk) => {
          await sink.write(chunk);
        },
        close: async () => {
          if (sink.end) {
            await sink.end();
          } else if (sink.close) {
            await sink.close();
          }
        },
      };
    }

    throw new Error("Audio playback stdin is not writable.");
  }

  /** Closes the playback writer and kills the playback process. Never throws. */
  private async resetPlayback(): Promise<void> {
    // Detach first: audio arriving during teardown then starts a fresh player instead of
    // writing into a closing one, and this teardown cannot clobber that fresh player.
    const writer = this.playbackWriter;
    const proc = this.playbackProcess;
    this.playbackWriter = undefined;
    this.playbackProcess = undefined;

    if (writer) {
      try {
        await writer.close();
      } catch {
        // ignore
      }
    }
    if (proc) {
      try {
        proc.kill();
      } catch {
        // ignore
      }
      try {
        await proc.exited;
      } catch {
        // ignore: the process is gone either way (assumes `exited` may reject)
      }
    }
  }

  /** Emits a playback-failure warning, at most once per cooldown window. */
  private maybeWarnPlaybackFailure(message: string): void {
    if (!this.callbacks.onWarning) return;
    const now = Date.now();
    if (now - this.lastPlaybackErrorAt < PLAYBACK_ERROR_COOLDOWN_MS) return;
    this.lastPlaybackErrorAt = now;
    this.callbacks.onWarning(`Audio playback failed: ${message}`);
  }

  /**
   * Resolves once the transport reports "connected".
   *
   * @throws Error when that does not happen within `timeoutMs`.
   */
  private async waitForConnection(session: RealtimeSession, timeoutMs: number): Promise<void> {
    if (session.transport.status === "connected") {
      this.connected = true;
      return;
    }
    await new Promise<void>((resolve, reject) => {
      const onChange = (status: string) => {
        if (status !== "connected") return;
        this.connected = true;
        cleanup();
        resolve();
      };

      const timeout = setTimeout(() => {
        cleanup();
        reject(new Error("Realtime voice connection timed out."));
      }, timeoutMs);

      const cleanup = () => {
        clearTimeout(timeout);
        session.transport.off("connection_change", onChange);
      };

      session.transport.on("connection_change", onChange);
    });
  }

  /**
   * Resolves once the session is marked ready (immediately if it already is),
   * otherwise when a "session.created" transport event arrives.
   *
   * @throws Error when that does not happen within `timeoutMs`.
   */
  private async waitForSessionReady(session: RealtimeSession, timeoutMs: number): Promise<void> {
    if (this.sessionReady) return;
    await new Promise<void>((resolve, reject) => {
      let settled = false;

      const onEvent = (event: TransportEvent) => {
        if (settled || event.type !== "session.created") return;
        this.sessionReady = true;
        settled = true;
        cleanup();
        resolve();
      };

      const timeout = setTimeout(() => {
        if (settled) return;
        settled = true;
        cleanup();
        reject(new Error("Realtime voice session not ready."));
      }, timeoutMs);

      const cleanup = () => {
        clearTimeout(timeout);
        session.off("transport_event", onEvent);
      };

      session.on("transport_event", onEvent);
    });
  }
}