@oh-my-pi/pi-coding-agent 6.8.5 → 6.9.69
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +51 -0
- package/package.json +6 -6
- package/src/cli/stats-cli.ts +191 -0
- package/src/core/agent-session.ts +103 -1
- package/src/core/extensions/index.ts +2 -0
- package/src/core/extensions/runner.ts +31 -0
- package/src/core/extensions/types.ts +24 -0
- package/src/core/messages.ts +48 -0
- package/src/core/sdk.ts +0 -2
- package/src/core/session-manager.ts +10 -1
- package/src/core/settings-manager.ts +0 -105
- package/src/core/tools/bash.ts +5 -7
- package/src/core/tools/index.ts +1 -5
- package/src/core/tools/patch/applicator.ts +115 -17
- package/src/core/tools/patch/index.ts +1 -1
- package/src/core/tools/patch/normalize.ts +185 -10
- package/src/core/tools/python.ts +444 -86
- package/src/core/tools/task/executor.ts +2 -6
- package/src/core/tools/task/index.ts +30 -12
- package/src/core/tools/task/render.ts +163 -30
- package/src/core/tools/task/template.ts +37 -0
- package/src/core/tools/task/types.ts +6 -2
- package/src/core/tools/task/worker.ts +1 -1
- package/src/index.ts +2 -2
- package/src/main.ts +12 -0
- package/src/modes/interactive/components/python-execution.ts +180 -0
- package/src/modes/interactive/components/settings-defs.ts +0 -70
- package/src/modes/interactive/components/settings-selector.ts +0 -1
- package/src/modes/interactive/components/welcome.ts +1 -0
- package/src/modes/interactive/controllers/command-controller.ts +46 -0
- package/src/modes/interactive/controllers/event-controller.ts +0 -11
- package/src/modes/interactive/controllers/input-controller.ts +28 -1
- package/src/modes/interactive/controllers/selector-controller.ts +0 -9
- package/src/modes/interactive/interactive-mode.ts +10 -58
- package/src/modes/interactive/theme/dark.json +2 -9
- package/src/modes/interactive/theme/defaults/alabaster.json +2 -8
- package/src/modes/interactive/theme/defaults/amethyst.json +2 -9
- package/src/modes/interactive/theme/defaults/anthracite.json +2 -9
- package/src/modes/interactive/theme/defaults/basalt.json +89 -88
- package/src/modes/interactive/theme/defaults/birch.json +2 -8
- package/src/modes/interactive/theme/defaults/dark-abyss.json +2 -8
- package/src/modes/interactive/theme/defaults/dark-arctic.json +2 -9
- package/src/modes/interactive/theme/defaults/dark-aurora.json +3 -2
- package/src/modes/interactive/theme/defaults/dark-catppuccin.json +2 -1
- package/src/modes/interactive/theme/defaults/dark-cavern.json +2 -8
- package/src/modes/interactive/theme/defaults/dark-copper.json +3 -2
- package/src/modes/interactive/theme/defaults/dark-cosmos.json +2 -8
- package/src/modes/interactive/theme/defaults/dark-cyberpunk.json +2 -9
- package/src/modes/interactive/theme/defaults/dark-dracula.json +2 -9
- package/src/modes/interactive/theme/defaults/dark-eclipse.json +2 -8
- package/src/modes/interactive/theme/defaults/dark-ember.json +3 -2
- package/src/modes/interactive/theme/defaults/dark-equinox.json +2 -8
- package/src/modes/interactive/theme/defaults/dark-forest.json +2 -9
- package/src/modes/interactive/theme/defaults/dark-github.json +2 -9
- package/src/modes/interactive/theme/defaults/dark-gruvbox.json +2 -9
- package/src/modes/interactive/theme/defaults/dark-lavender.json +3 -2
- package/src/modes/interactive/theme/defaults/dark-lunar.json +2 -8
- package/src/modes/interactive/theme/defaults/dark-midnight.json +3 -2
- package/src/modes/interactive/theme/defaults/dark-monochrome.json +2 -9
- package/src/modes/interactive/theme/defaults/dark-monokai.json +2 -9
- package/src/modes/interactive/theme/defaults/dark-nebula.json +2 -8
- package/src/modes/interactive/theme/defaults/dark-nord.json +2 -9
- package/src/modes/interactive/theme/defaults/dark-ocean.json +2 -9
- package/src/modes/interactive/theme/defaults/dark-one.json +2 -9
- package/src/modes/interactive/theme/defaults/dark-rainforest.json +2 -8
- package/src/modes/interactive/theme/defaults/dark-reef.json +2 -8
- package/src/modes/interactive/theme/defaults/dark-retro.json +2 -9
- package/src/modes/interactive/theme/defaults/dark-rose-pine.json +2 -1
- package/src/modes/interactive/theme/defaults/dark-sakura.json +3 -2
- package/src/modes/interactive/theme/defaults/dark-slate.json +3 -2
- package/src/modes/interactive/theme/defaults/dark-solarized.json +2 -1
- package/src/modes/interactive/theme/defaults/dark-solstice.json +2 -8
- package/src/modes/interactive/theme/defaults/dark-starfall.json +2 -8
- package/src/modes/interactive/theme/defaults/dark-sunset.json +2 -9
- package/src/modes/interactive/theme/defaults/dark-swamp.json +2 -8
- package/src/modes/interactive/theme/defaults/dark-synthwave.json +2 -1
- package/src/modes/interactive/theme/defaults/dark-taiga.json +2 -8
- package/src/modes/interactive/theme/defaults/dark-terminal.json +3 -2
- package/src/modes/interactive/theme/defaults/dark-tokyo-night.json +2 -9
- package/src/modes/interactive/theme/defaults/dark-tundra.json +2 -8
- package/src/modes/interactive/theme/defaults/dark-twilight.json +2 -8
- package/src/modes/interactive/theme/defaults/dark-volcanic.json +2 -8
- package/src/modes/interactive/theme/defaults/graphite.json +2 -9
- package/src/modes/interactive/theme/defaults/light-arctic.json +2 -1
- package/src/modes/interactive/theme/defaults/light-aurora-day.json +2 -8
- package/src/modes/interactive/theme/defaults/light-canyon.json +2 -8
- package/src/modes/interactive/theme/defaults/light-catppuccin.json +2 -1
- package/src/modes/interactive/theme/defaults/light-cirrus.json +2 -8
- package/src/modes/interactive/theme/defaults/light-coral.json +3 -2
- package/src/modes/interactive/theme/defaults/light-cyberpunk.json +2 -9
- package/src/modes/interactive/theme/defaults/light-dawn.json +2 -8
- package/src/modes/interactive/theme/defaults/light-dunes.json +2 -8
- package/src/modes/interactive/theme/defaults/light-eucalyptus.json +3 -2
- package/src/modes/interactive/theme/defaults/light-forest.json +2 -9
- package/src/modes/interactive/theme/defaults/light-frost.json +3 -2
- package/src/modes/interactive/theme/defaults/light-github.json +2 -1
- package/src/modes/interactive/theme/defaults/light-glacier.json +2 -8
- package/src/modes/interactive/theme/defaults/light-gruvbox.json +2 -9
- package/src/modes/interactive/theme/defaults/light-haze.json +2 -8
- package/src/modes/interactive/theme/defaults/light-honeycomb.json +3 -2
- package/src/modes/interactive/theme/defaults/light-lagoon.json +2 -8
- package/src/modes/interactive/theme/defaults/light-lavender.json +3 -2
- package/src/modes/interactive/theme/defaults/light-meadow.json +2 -8
- package/src/modes/interactive/theme/defaults/light-mint.json +3 -2
- package/src/modes/interactive/theme/defaults/light-monochrome.json +2 -1
- package/src/modes/interactive/theme/defaults/light-ocean.json +2 -9
- package/src/modes/interactive/theme/defaults/light-one.json +2 -8
- package/src/modes/interactive/theme/defaults/light-opal.json +2 -8
- package/src/modes/interactive/theme/defaults/light-orchard.json +2 -8
- package/src/modes/interactive/theme/defaults/light-paper.json +3 -2
- package/src/modes/interactive/theme/defaults/light-prism.json +2 -8
- package/src/modes/interactive/theme/defaults/light-retro.json +2 -9
- package/src/modes/interactive/theme/defaults/light-sand.json +3 -2
- package/src/modes/interactive/theme/defaults/light-savanna.json +2 -8
- package/src/modes/interactive/theme/defaults/light-solarized.json +2 -1
- package/src/modes/interactive/theme/defaults/light-soleil.json +2 -8
- package/src/modes/interactive/theme/defaults/light-sunset.json +2 -9
- package/src/modes/interactive/theme/defaults/light-synthwave.json +2 -9
- package/src/modes/interactive/theme/defaults/light-tokyo-night.json +2 -9
- package/src/modes/interactive/theme/defaults/light-wetland.json +2 -8
- package/src/modes/interactive/theme/defaults/light-zenith.json +2 -8
- package/src/modes/interactive/theme/defaults/limestone.json +2 -8
- package/src/modes/interactive/theme/defaults/mahogany.json +2 -9
- package/src/modes/interactive/theme/defaults/marble.json +2 -8
- package/src/modes/interactive/theme/defaults/obsidian.json +89 -88
- package/src/modes/interactive/theme/defaults/onyx.json +89 -88
- package/src/modes/interactive/theme/defaults/pearl.json +2 -8
- package/src/modes/interactive/theme/defaults/porcelain.json +89 -88
- package/src/modes/interactive/theme/defaults/quartz.json +2 -8
- package/src/modes/interactive/theme/defaults/sandstone.json +2 -8
- package/src/modes/interactive/theme/defaults/titanium.json +88 -87
- package/src/modes/interactive/theme/light.json +2 -8
- package/src/modes/interactive/theme/theme-schema.json +5 -0
- package/src/modes/interactive/theme/theme.ts +7 -0
- package/src/modes/interactive/types.ts +5 -15
- package/src/modes/interactive/utils/ui-helpers.ts +20 -0
- package/src/prompts/system/system-prompt.md +8 -0
- package/src/prompts/tools/python.md +40 -2
- package/src/prompts/tools/task.md +8 -13
- package/src/core/custom-commands/bundled/wt/index.ts +0 -435
- package/src/core/tools/git.ts +0 -213
- package/src/core/voice-controller.ts +0 -135
- package/src/core/voice-supervisor.ts +0 -976
- package/src/core/voice.ts +0 -314
- package/src/lib/worktree/collapse.ts +0 -180
- package/src/lib/worktree/constants.ts +0 -14
- package/src/lib/worktree/errors.ts +0 -23
- package/src/lib/worktree/git.ts +0 -60
- package/src/lib/worktree/index.ts +0 -15
- package/src/lib/worktree/operations.ts +0 -216
- package/src/lib/worktree/session.ts +0 -114
- package/src/lib/worktree/stats.ts +0 -67
- package/src/modes/interactive/utils/voice-manager.ts +0 -96
- package/src/prompts/tools/git.md +0 -9
- package/src/prompts/voice-summary.md +0 -12
|
@@ -1,976 +0,0 @@
|
|
|
1
|
-
import { logger, ptree } from "@oh-my-pi/pi-utils";
|
|
2
|
-
import {
|
|
3
|
-
RealtimeAgent,
|
|
4
|
-
RealtimeSession,
|
|
5
|
-
type RealtimeSessionConfig,
|
|
6
|
-
type TransportEvent,
|
|
7
|
-
type TransportLayerAudio,
|
|
8
|
-
tool,
|
|
9
|
-
} from "@openai/agents/realtime";
|
|
10
|
-
import type { ReadableStreamDefaultReader as WebReadableStreamDefaultReader } from "stream/web";
|
|
11
|
-
import { z } from "zod";
|
|
12
|
-
import type { ModelRegistry } from "./model-registry";
|
|
13
|
-
|
|
14
|
-
/** Realtime model id; the OMP_VOICE_REALTIME_MODEL environment variable takes precedence. */
const DEFAULT_REALTIME_MODEL = process.env.OMP_VOICE_REALTIME_MODEL ?? "gpt-realtime";
/** Voice name given to the realtime agent; OMP_VOICE_REALTIME_VOICE takes precedence. */
const DEFAULT_REALTIME_VOICE = process.env.OMP_VOICE_REALTIME_VOICE ?? "marin";

// PCM layout shared by the capture/playback commands and the realtime session config.
const DEFAULT_SAMPLE_RATE = 24000;
const DEFAULT_CHANNELS = 1;
const DEFAULT_BITS = 16;

/** Minimum gap between two handled "speech started" events. */
const INTERRUPT_DEBOUNCE_MS = 200;
/** Character budget for AGENT_OUTPUT messages forwarded to the realtime session. */
const MAX_RESULT_CHARS = 6000;
/** Character budget for PROGRESS_UPDATE messages forwarded to the realtime session. */
const MAX_PROGRESS_CHARS = 1400;
// NOTE(review): presumably how long after the last playback chunk playback still counts as active — confirm at the use site.
const PLAYBACK_ACTIVE_WINDOW_MS = 350;
// Echo cancellation: only suppress mic when playback is active and mic is much quieter
const ECHO_SUPPRESSION_RATIO = 2.5;
// Minimum RMS to ever send (absolute noise floor)
const MIC_NOISE_FLOOR = 0.005;
// NOTE(review): presumably throttles repeated playback error reports — confirm at the use site.
const PLAYBACK_ERROR_COOLDOWN_MS = 2000;

/** System prompt for the realtime supervisor agent; the sentences are joined with single spaces. */
const SUPERVISOR_INSTRUCTIONS = [
	"You are the realtime voice supervisor for a terminal coding agent.",
	"Manage conversation flow, turn-taking, and what gets spoken aloud.",
	"For user speech: if unclear, ask exactly one short question.",
	"If clear, call send_to_agent with a concise instruction for the coding agent.",
	"If the user is greeting/smalltalk or gives no actionable request, respond briefly and do not call send_to_agent.",
	"Keep spoken responses to 1-2 short sentences (<=40 words).",
	"You will receive system updates prefixed with SYSTEM_EVENT, PROGRESS_UPDATE, or AGENT_OUTPUT.",
	"For AGENT_OUTPUT, always respond with a brief spoken summary and any single question needed.",
	"For PROGRESS_UPDATE, speak a short update only if it helps the user stay oriented.",
	"Do not call send_to_agent for system updates.",
	"If the user asks to stop or cancel work, call interrupt_agent.",
].join(" ");
|
|
42
|
-
|
|
43
|
-
/** Hooks through which the voice supervisor reports back to its host. */
type VoiceSupervisorCallbacks = {
	/** Receives the whitespace-normalized instruction produced by the `send_to_agent` tool. */
	onSendToAgent: (text: string) => Promise<void> | void;
	/** Invoked by the `interrupt_agent` tool with the optional reason it was given. */
	onInterruptAgent: (reason?: string) => Promise<void> | void;
	/** Status line update; called with `undefined` when the supervisor stops. */
	onStatus: (status?: string) => void;
	/** Reports an error raised by the realtime session or while sending to it. */
	onError: (error: Error) => void;
	/** Optional sink for warnings. */
	onWarning?: (message: string) => void;
};
|
|
50
|
-
|
|
51
|
-
/**
 * Collapses every run of whitespace to a single space and removes leading
 * and trailing whitespace.
 */
function normalizeText(text: string): string {
	const words = text.split(/\s+/).filter((word) => word.length > 0);
	return words.join(" ");
}
|
|
54
|
-
|
|
55
|
-
/**
 * Shortens `text` to at most `maxChars` UTF-16 code units and appends "..."
 * when something was cut off. Text that already fits is returned unchanged.
 *
 * @param text - The text to shorten.
 * @param maxChars - Maximum number of code units to keep; negative values are treated as 0.
 * @returns The original text, or the kept prefix followed by "...".
 */
function truncateText(text: string, maxChars: number): string {
	if (text.length <= maxChars) return text;

	// A negative limit would make slice() count from the end of the string
	// and keep text instead of dropping it.
	let end = Math.max(0, Math.floor(maxChars));

	// Never cut between the two halves of a surrogate pair: that would leave
	// a lone high surrogate (an invalid character) right before the ellipsis.
	if (end > 0 && end < text.length) {
		const before = text.charCodeAt(end - 1);
		const after = text.charCodeAt(end);
		const splitsPair = before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
		if (splitsPair) end -= 1;
	}

	return `${text.slice(0, end)}...`;
}
|
|
59
|
-
|
|
60
|
-
/**
 * Returns an `ArrayBuffer` holding exactly the bytes viewed by `chunk`.
 *
 * The backing buffer is handed back as-is when the view spans all of it;
 * a partial view is sliced, and a view over a non-`ArrayBuffer` backing
 * store is copied into a fresh buffer.
 */
function toArrayBuffer(chunk: Uint8Array): ArrayBuffer {
	const { buffer: backing, byteOffset, byteLength } = chunk;

	if (!(backing instanceof ArrayBuffer)) {
		const detached = new Uint8Array(byteLength);
		detached.set(chunk);
		return detached.buffer;
	}

	const spansWholeBuffer = byteOffset === 0 && byteLength === backing.byteLength;
	return spansWholeBuffer ? backing : backing.slice(byteOffset, byteOffset + byteLength);
}
|
|
72
|
-
|
|
73
|
-
/**
 * Produces a human-readable message for an arbitrary thrown or emitted value.
 *
 * Resolution order: an `Error`'s message, a plain string, an object's string
 * `message` property, then a truthy nested `error` property (followed
 * repeatedly), and finally a JSON / `String()` rendering of the value.
 *
 * @param error - Any value, typically from a `catch` clause or an error event.
 * @returns A string description; always a string, even for unserializable values.
 */
function describeError(error: unknown): string {
	// Objects already walked; stops a self-referential `.error` chain from looping forever.
	const visited = new Set<object>();
	let current: unknown = error;

	while (true) {
		if (current instanceof Error) return current.message;
		if (typeof current === "string") return current;
		if (!current || typeof current !== "object") return String(current);

		const maybeMessage = (current as { message?: unknown }).message;
		if (typeof maybeMessage === "string") return maybeMessage;

		visited.add(current);
		const nested = (current as { error?: unknown }).error;
		const alreadySeen = typeof nested === "object" && nested !== null && visited.has(nested);
		if (nested && !alreadySeen) {
			current = nested;
			continue;
		}

		try {
			// JSON.stringify yields undefined for some inputs (e.g. toJSON returning
			// undefined) and throws on circular structures; fall back to String() in both cases.
			return JSON.stringify(current) ?? String(current);
		} catch {
			return String(current);
		}
	}
}
|
|
89
|
-
|
|
90
|
-
/** Result of probing for the external capture and playback programs. */
type AudioToolStatus = {
	/** Microphone side: whether a command could be built, its executable, and the full argv. */
	capture: {
		available: boolean;
		tool?: string;
		command?: string[];
	};
	/** Speaker side: whether a command could be built, its executable, and the full argv. */
	playback: {
		available: boolean;
		tool?: string;
		command?: string[];
	};
};
|
|
94
|
-
|
|
95
|
-
/**
 * Probes for capture and playback commands at the given PCM settings and
 * summarizes what was found.
 *
 * @param sampleRate - Sample rate passed on to both command builders.
 * @param channels - Channel count passed on to both command builders.
 * @returns Availability, executable and argv for each direction.
 */
function checkAudioTools(sampleRate: number, channels: number): AudioToolStatus {
	const captureSpec = buildCaptureCommand(sampleRate, channels);
	const playbackArgv = buildPlaybackCommand(sampleRate, channels);

	const capture = {
		available: captureSpec !== null,
		tool: captureSpec?.command[0],
		command: captureSpec?.command,
	};
	const playback = {
		available: playbackArgv !== null,
		tool: playbackArgv?.[0],
		command: playbackArgv ?? undefined,
	};

	return { capture, playback };
}
|
|
112
|
-
|
|
113
|
-
/**
 * Builds the multi-line install hint shown when audio tools are missing.
 * The hint list depends on `process.platform`; unrecognized platforms get
 * only the heading line.
 */
function getMissingToolsMessage(): string {
	const heading = "Voice mode requires audio tools. Install one of the following:";

	let hints: string[];
	switch (process.platform) {
		case "linux":
			hints = [
				"",
				" For capture (microphone):",
				" • sox (recommended): sudo dnf install sox",
				" • pulseaudio-utils: sudo dnf install pulseaudio-utils",
				" • alsa-utils: sudo dnf install alsa-utils",
				" • ffmpeg: sudo dnf install ffmpeg",
				"",
				" For playback (speaker):",
				" • sox (recommended): sudo dnf install sox",
				" • ffmpeg: sudo dnf install ffmpeg",
				"",
				" Set OMP_VOICE_CAPTURE_DEVICE to override the default capture device.",
				" (Applies to all tools; for sox, this sets AUDIODEV internally.)",
			];
			break;
		case "darwin":
			hints = ["", " • sox (recommended): brew install sox", " • ffmpeg: brew install ffmpeg"];
			break;
		case "win32":
			hints = ["", " • sox: choco install sox", " • ffmpeg: choco install ffmpeg"];
			break;
		default:
			hints = [];
	}

	return [heading, ...hints].join("\n");
}
|
|
143
|
-
|
|
144
|
-
/** A capture invocation: the argv to spawn plus optional extra environment variables. */
type CaptureCommand = {
	command: string[];
	env?: Record<string, string>;
};
|
|
145
|
-
|
|
146
|
-
/**
 * Picks a microphone capture program and builds the argv that makes it emit
 * raw signed 16-bit little-endian PCM on stdout.
 *
 * Candidates are tried in this order: sox/rec, parecord (Linux only),
 * arecord, then ffmpeg with a platform-specific input format. The
 * OMP_VOICE_CAPTURE_DEVICE environment variable overrides the input device
 * for whichever program is chosen.
 *
 * @param sampleRate - Sample rate requested from the capture program.
 * @param channels - Channel count requested from the capture program.
 * @returns The command (and, for sox, optional environment), or `null` when no program is found.
 */
function buildCaptureCommand(sampleRate: number, channels: number): CaptureCommand | null {
	const os = process.platform;
	// Allow user to override capture device via environment
	const deviceOverride = process.env.OMP_VOICE_CAPTURE_DEVICE;
	const rate = String(sampleRate);
	const channelCount = String(channels);

	// Prefer sox/rec as they work well across platforms
	const soxBinary = Bun.which("sox") ?? Bun.which("rec");
	if (soxBinary) {
		return {
			command: [
				soxBinary,
				"-q",
				"-d",
				"-t",
				"raw",
				"-r",
				rate,
				"-e",
				"signed-integer",
				"-b",
				String(DEFAULT_BITS),
				"-c",
				channelCount,
				"-",
			],
			// sox uses AUDIODEV env var to override the default device
			env: deviceOverride ? { AUDIODEV: deviceOverride } : undefined,
		};
	}

	// On Linux, try PulseAudio first (parecord)
	if (os === "linux") {
		const parecordBinary = Bun.which("parecord");
		if (parecordBinary) {
			return {
				command: [
					parecordBinary,
					"--raw",
					"--format=s16le",
					`--rate=${sampleRate}`,
					`--channels=${channels}`,
					...(deviceOverride ? [`--device=${deviceOverride}`] : []),
				],
			};
		}
	}

	// ALSA arecord as fallback (looked up regardless of platform)
	const arecordBinary = Bun.which("arecord");
	if (arecordBinary) {
		return {
			command: [
				arecordBinary,
				"-q",
				"-D",
				deviceOverride ?? "default",
				"-f",
				"S16_LE",
				"-r",
				rate,
				"-c",
				channelCount,
				"-t",
				"raw",
			],
		};
	}

	// ffmpeg fallback with platform-specific input
	const ffmpegBinary = Bun.which("ffmpeg");
	if (!ffmpegBinary) return null;

	// Shared ffmpeg argv; only the input format and device differ per platform.
	const viaFfmpeg = (inputFormat: string, inputDevice: string): CaptureCommand => ({
		command: [
			ffmpegBinary,
			"-hide_banner",
			"-loglevel",
			"error",
			"-f",
			inputFormat,
			"-i",
			inputDevice,
			"-ac",
			channelCount,
			"-ar",
			rate,
			"-f",
			"s16le",
			"-",
		],
	});

	if (os === "darwin") {
		return viaFfmpeg("avfoundation", deviceOverride ?? ":0");
	}
	if (os === "linux") {
		// Try PulseAudio format first, fall back to ALSA
		const pulseDetected = Bun.which("pulseaudio") || Bun.which("pipewire-pulse") || process.env.PULSE_SERVER;
		return viaFfmpeg(pulseDetected ? "pulse" : "alsa", deviceOverride ?? "default");
	}
	if (os === "win32") {
		return viaFfmpeg("dshow", deviceOverride ?? "audio=default");
	}

	return null;
}
|
|
285
|
-
|
|
286
|
-
/**
 * Picks a speaker playback program and builds the argv that makes it read
 * raw signed 16-bit PCM from stdin.
 *
 * OMP_VOICE_PLAYBACK (case-insensitive) selects a preference: "ffplay" uses
 * ffplay only, "play" prefers play then sox, "sox" prefers sox then play.
 * Without a recognized preference the order is play, sox, ffplay.
 *
 * @param sampleRate - Sample rate of the PCM stream that will be written.
 * @param channels - Channel count of the PCM stream that will be written.
 * @returns The argv to spawn, or `null` when no suitable program is found.
 */
function buildPlaybackCommand(sampleRate: number, channels: number): string[] | null {
	const choice = process.env.OMP_VOICE_PLAYBACK?.toLowerCase();
	const ffplayBinary = Bun.which("ffplay");
	const playBinary = Bun.which("play");
	const soxBinary = Bun.which("sox");

	const rate = String(sampleRate);
	const channelCount = String(channels);

	// Arguments shared by play and sox: describe the raw PCM arriving on stdin.
	const rawStdinInput = [
		"-q",
		"-t",
		"raw",
		"-r",
		rate,
		"-e",
		"signed-integer",
		"-b",
		String(DEFAULT_BITS),
		"-c",
		channelCount,
		"-",
	];

	const viaPlay = playBinary ? [playBinary, ...rawStdinInput] : null;
	// sox additionally needs "-d" to send output to the default device.
	const viaSox = soxBinary ? [soxBinary, ...rawStdinInput, "-d"] : null;
	const viaFfplay = ffplayBinary
		? [
				ffplayBinary,
				"-nodisp",
				"-autoexit",
				"-hide_banner",
				"-loglevel",
				"error",
				"-fflags",
				"nobuffer",
				"-flags",
				"low_delay",
				"-f",
				"s16le",
				"-ar",
				rate,
				"-ac",
				channelCount,
				"-",
			]
		: null;

	switch (choice) {
		case "ffplay":
			return viaFfplay;
		case "play":
			return viaPlay ?? viaSox;
		case "sox":
			return viaSox ?? viaPlay;
		default:
			return viaPlay ?? viaSox ?? viaFfplay;
	}
}
|
|
357
|
-
|
|
358
|
-
/**
 * Root-mean-square level of a buffer of signed 16-bit little-endian samples,
 * with each sample scaled by 1/32768.
 *
 * A trailing odd byte is ignored; fewer than two bytes yields 0.
 */
function rms16le(buffer: Uint8Array): number {
	const sampleCount = Math.floor(buffer.byteLength / 2);
	if (sampleCount === 0) return 0;

	const pcm = new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength);
	let energy = 0;
	for (let index = 0; index < sampleCount; index++) {
		const normalized = pcm.getInt16(index * 2, true) / 32768;
		energy += normalized * normalized;
	}
	return Math.sqrt(energy / sampleCount);
}
|
|
371
|
-
|
|
372
|
-
export class VoiceSupervisor {
|
|
373
|
-
private session: RealtimeSession | undefined = undefined;
|
|
374
|
-
private captureProcess: ptree.ChildProcess | undefined = undefined;
|
|
375
|
-
private captureReader: WebReadableStreamDefaultReader<Uint8Array> | undefined = undefined;
|
|
376
|
-
private playbackProcess: ptree.ChildProcess | undefined = undefined;
|
|
377
|
-
private playbackWriter:
|
|
378
|
-
| {
|
|
379
|
-
write: (chunk: Uint8Array) => Promise<void>;
|
|
380
|
-
close: () => Promise<void>;
|
|
381
|
-
}
|
|
382
|
-
| undefined = undefined;
|
|
383
|
-
private active = false;
|
|
384
|
-
private connected = false;
|
|
385
|
-
private sessionReady = false;
|
|
386
|
-
private lastInterruptAt = 0;
|
|
387
|
-
private lastPlaybackAt = 0;
|
|
388
|
-
private lastPlaybackRms = 0;
|
|
389
|
-
private lastPlaybackErrorAt = 0;
|
|
390
|
-
// Fallback transcript handling: track user speech when no tool call is made
|
|
391
|
-
private pendingTranscript = "";
|
|
392
|
-
private pendingResponseHasToolCall = false;
|
|
393
|
-
private pendingResponseHasAudioOutput = false;
|
|
394
|
-
|
|
395
|
-
constructor(
|
|
396
|
-
private registry: ModelRegistry,
|
|
397
|
-
private callbacks: VoiceSupervisorCallbacks,
|
|
398
|
-
) {}
|
|
399
|
-
|
|
400
|
-
/**
 * Check whether the audio tools needed for voice mode are installed.
 *
 * @returns `{ available: true, tools }` when both a capture and a playback
 * command could be built; otherwise `{ available: false, error, tools }`
 * where `error` names the missing side(s) and includes install hints.
 */
static checkAvailability(): { available: boolean; error?: string; tools?: AudioToolStatus } {
	const tools = checkAudioTools(DEFAULT_SAMPLE_RATE, DEFAULT_CHANNELS);
	const absent = (["capture", "playback"] as const).filter((side) => !tools[side].available);

	if (absent.length === 0) {
		return { available: true, tools };
	}

	return {
		available: false,
		error: `Missing audio ${absent.join(" and ")} tools.\n\n${getMissingToolsMessage()}`,
		tools,
	};
}
|
|
420
|
-
|
|
421
|
-
/** Whether the supervisor is currently started (set by `start()`, cleared by `stop()`). */
get isActive(): boolean {
	const { active } = this;
	return active;
}
|
|
424
|
-
|
|
425
|
-
/**
 * Starts voice mode: resolves the OpenAI API key, opens the realtime
 * websocket session, waits for it to become ready, then starts microphone
 * capture and speaker playback. Does nothing when already active.
 *
 * @throws Error when no OpenAI API key is available, or — after `stop()` has
 * been awaited — with the description of whatever failed during startup.
 */
async start(): Promise<void> {
	if (this.active) return;

	const openAiKey = await this.registry.getApiKeyForProvider("openai");
	if (!openAiKey) {
		throw new Error("OpenAI API key not found (set OPENAI_API_KEY or login).");
	}

	// Reset per-run state before anything can emit events.
	this.active = true;
	this.lastInterruptAt = 0;
	this.sessionReady = false;
	this.lastPlaybackErrorAt = 0;
	this.pendingTranscript = "";
	this.pendingResponseHasToolCall = false;
	this.pendingResponseHasAudioOutput = false;
	this.callbacks.onStatus("Connecting realtime voice...");

	try {
		const supervisor = this.createSupervisorAgent();
		const realtime = new RealtimeSession(supervisor, {
			transport: "websocket",
			model: DEFAULT_REALTIME_MODEL,
			config: this.buildSessionConfig(),
		});

		this.session = realtime;
		this.bindSessionEvents(realtime);
		await realtime.connect({ apiKey: openAiKey });

		const connectedNow = realtime.transport.status === "connected";
		this.connected = connectedNow;
		this.sessionReady = connectedNow;
		if (!connectedNow) {
			await this.waitForConnection(realtime, 5000);
		}
		await this.waitForSessionReady(realtime, 5000);

		await this.startCapture();
		await this.ensurePlayback();
		this.callbacks.onStatus("Listening... (auto-send on silence, Ctrl+Y to stop)");
	} catch (failure) {
		// Tear down whatever was started, then surface a plain Error with a readable message.
		await this.stop();
		throw new Error(describeError(failure));
	}
}
|
|
467
|
-
|
|
468
|
-
/**
 * Stops voice mode: halts capture, resets playback, closes the realtime
 * session and clears the status line. Does nothing when not active.
 *
 * Each teardown step runs even if an earlier one fails, so a failure while
 * stopping capture or playback cannot leave the session open or a stale
 * status on screen. The failure is still propagated to the caller.
 */
async stop(): Promise<void> {
	if (!this.active) return;
	this.active = false;
	this.connected = false;
	this.sessionReady = false;

	try {
		try {
			await this.stopCapture();
		} finally {
			await this.resetPlayback();
		}
	} finally {
		try {
			if (this.session) {
				this.session.close();
			}
		} finally {
			// Drop the reference and clear the status even if close() throws.
			this.session = undefined;
			this.callbacks.onStatus(undefined);
		}
	}
}
|
|
481
|
-
|
|
482
|
-
/** Forwards a progress note to the realtime session as a PROGRESS_UPDATE, capped at MAX_PROGRESS_CHARS. */
notifyProgress(text: string): void {
	const prefix = "PROGRESS_UPDATE";
	this.sendSystemMessage(prefix, text, MAX_PROGRESS_CHARS);
}
|
|
485
|
-
|
|
486
|
-
/** Forwards agent output to the realtime session as an AGENT_OUTPUT, capped at MAX_RESULT_CHARS. */
notifyResult(text: string): void {
	const prefix = "AGENT_OUTPUT";
	this.sendSystemMessage(prefix, text, MAX_RESULT_CHARS);
}
|
|
489
|
-
|
|
490
|
-
/**
 * Injects a system-role text message into the realtime conversation and
 * requests a response to it.
 *
 * Silently does nothing when the supervisor is inactive, the session is not
 * connected and ready, or the normalized text is empty. Send failures are
 * reported through `onError`, except "WebSocket is not connected", which is
 * ignored.
 *
 * @param prefix - Tag placed before the text, followed by ": ".
 * @param text - Message body; whitespace is normalized before sending.
 * @param maxChars - Truncation limit applied to the normalized body.
 */
private sendSystemMessage(prefix: string, text: string, maxChars: number): void {
	const session = this.session;
	if (!session || !this.active) return;

	const ready = this.connected && this.sessionReady && session.transport.status === "connected";
	if (!ready) return;

	const body = normalizeText(text);
	if (!body) return;

	const payload = `${prefix}: ${truncateText(body, maxChars)}`;
	try {
		session.transport.sendEvent({
			type: "conversation.item.create",
			item: {
				type: "message",
				role: "system",
				content: [{ type: "input_text", text: payload }],
			},
		});
		session.transport.sendEvent({ type: "response.create" });
	} catch (failure) {
		const reason = describeError(failure);
		// A dropped socket is handled by the reconnect path; not worth reporting.
		if (reason.includes("WebSocket is not connected")) return;
		this.callbacks.onError(failure instanceof Error ? failure : new Error(reason));
	}
}
|
|
512
|
-
|
|
513
|
-
/**
 * Builds the realtime agent together with its two tools:
 * `send_to_agent` forwards a normalized instruction to `onSendToAgent`, and
 * `interrupt_agent` forwards an optional reason to `onInterruptAgent`.
 */
private createSupervisorAgent(): RealtimeAgent {
	const forwardInstruction = tool({
		name: "send_to_agent",
		description: "Send a concise instruction to the coding agent.",
		parameters: z.object({
			text: z.string().min(1),
		}),
		execute: async ({ text }) => {
			const instruction = normalizeText(text);
			// Whitespace-only input is acknowledged but not forwarded.
			if (instruction) {
				await this.callbacks.onSendToAgent(instruction);
			}
			return "sent";
		},
	});

	const requestInterrupt = tool({
		name: "interrupt_agent",
		description: "Interrupt the coding agent immediately.",
		parameters: z.object({
			reason: z.string().optional(),
		}),
		execute: async ({ reason }) => {
			await this.callbacks.onInterruptAgent(reason);
			return "interrupted";
		},
	});

	return new RealtimeAgent({
		name: "Voice Supervisor",
		instructions: SUPERVISOR_INSTRUCTIONS,
		tools: [forwardInstruction, requestInterrupt],
		voice: DEFAULT_REALTIME_VOICE,
	});
}
|
|
548
|
-
|
|
549
|
-
/**
 * Session configuration for the realtime connection: audio-only output, PCM
 * at DEFAULT_SAMPLE_RATE in both directions, near-field noise reduction, and
 * semantic VAD turn detection that creates and interrupts responses.
 */
private buildSessionConfig(): Partial<RealtimeSessionConfig> {
	const voiceOverride = DEFAULT_REALTIME_VOICE ? { voice: DEFAULT_REALTIME_VOICE } : {};

	return {
		outputModalities: ["audio"],
		audio: {
			input: {
				format: { type: "audio/pcm", rate: DEFAULT_SAMPLE_RATE },
				noiseReduction: { type: "near_field" },
				turnDetection: {
					type: "semantic_vad",
					createResponse: true,
					interruptResponse: true,
				},
			},
			output: {
				format: { type: "audio/pcm", rate: DEFAULT_SAMPLE_RATE },
				...voiceOverride,
			},
		},
	};
}
|
|
569
|
-
|
|
570
|
-
/**
 * Wires the session's events to supervisor state and status updates:
 * connection changes, audio chunks, audio start/stop/interrupt, raw
 * transport events, and errors.
 */
private bindSessionEvents(session: RealtimeSession): void {
	const listeningStatus = "Listening... (auto-send on silence, Ctrl+Y to stop)";
	const reconnectingStatus = "Reconnecting realtime voice...";

	session.transport.on("connection_change", (status) => {
		// Readiness tracks the connection state, even while inactive.
		const isConnected = status === "connected";
		this.connected = isConnected;
		this.sessionReady = isConnected;
		if (!this.active) return;
		this.callbacks.onStatus(isConnected ? listeningStatus : reconnectingStatus);
	});

	session.on("audio", (event: TransportLayerAudio) => {
		void this.handleAudio(event);
	});

	session.on("audio_start", () => {
		if (!this.active) return;
		this.pendingResponseHasAudioOutput = true;
		this.callbacks.onStatus("Speaking...");
	});

	session.on("audio_stopped", () => {
		if (!this.active) return;
		this.callbacks.onStatus(listeningStatus);
	});

	session.on("audio_interrupted", () => {
		// Playback is reset regardless of whether the supervisor is still active.
		void this.resetPlayback();
		if (!this.active) return;
		this.callbacks.onStatus(listeningStatus);
	});

	session.on("transport_event", (event: TransportEvent) => {
		this.handleTransportEvent(event);
	});

	session.on("error", (error) => {
		const reason = describeError(error);
		logger.debug("voice-supervisor: realtime error", { error: reason });

		if (!reason.includes("WebSocket is not connected")) {
			this.callbacks.onError(new Error(reason));
			return;
		}
		// A dropped socket is shown as a reconnect rather than reported as an error.
		if (this.active) {
			this.callbacks.onStatus(reconnectingStatus);
		}
	});
}
|
|
623
|
-
|
|
624
|
-
/**
 * Tracks the lifecycle of one user turn from raw transport events.
 *
 * The method records the user's transcript and whether the current response
 * produced a tool call or audio output. When a response finishes with a
 * transcript but neither of those, the transcript is forwarded to the agent
 * through `onSendToAgent` as a fallback. Nothing happens while inactive.
 *
 * @param event The raw transport event.
 */
private handleTransportEvent(event: TransportEvent): void {
	if (!this.active) return;

	switch (event.type) {
		// Session ready
		case "session.created":
			this.sessionReady = true;
			return;

		// User started speaking: interrupt the agent and reset per-turn tracking,
		// debounced so rapid speech_started events trigger only one interrupt.
		case "input_audio_buffer.speech_started": {
			const timestamp = Date.now();
			if (timestamp - this.lastInterruptAt < INTERRUPT_DEBOUNCE_MS) return;
			this.lastInterruptAt = timestamp;
			this.pendingTranscript = "";
			this.pendingResponseHasToolCall = false;
			this.pendingResponseHasAudioOutput = false;
			void this.callbacks.onInterruptAgent();
			return;
		}

		// User transcript finished: keep it for the fallback path.
		case "conversation.item.input_audio_transcription.completed": {
			const { transcript } = event as { transcript?: string };
			if (transcript && typeof transcript === "string") {
				this.pendingTranscript = normalizeText(transcript);
				logger.debug("voice-supervisor: transcript captured", { transcript: this.pendingTranscript });
			}
			return;
		}

		// A new response begins: start tracking from a clean slate.
		case "response.created":
			this.pendingResponseHasToolCall = false;
			this.pendingResponseHasAudioOutput = false;
			return;

		// Tool call detected. Several event types are checked for robustness
		// against API changes.
		case "function_call":
		case "response.function_call_arguments.done":
		case "response.function_call_arguments.delta":
			this.pendingResponseHasToolCall = true;
			return;

		// An added output item only counts when it is a function call.
		case "response.output_item.added": {
			const addedItem = (event as { item?: { type?: string } }).item;
			if (addedItem?.type === "function_call") {
				this.pendingResponseHasToolCall = true;
			}
			return;
		}

		// Audio (or its transcript / content parts) was produced: no fallback needed.
		case "response.output_audio.delta":
		case "response.output_audio.done":
		case "response.output_audio_transcript.delta":
		case "response.output_audio_transcript.done":
		case "response.content_part.added":
		case "response.content_part.done":
			this.pendingResponseHasAudioOutput = true;
			return;

		// Response finished: use the fallback only when there is a transcript and
		// the response produced neither a tool call nor audio, so the user does
		// not get a duplicate reply.
		case "response.done": {
			const fallbackText = this.pendingTranscript;
			if (!fallbackText || this.pendingResponseHasToolCall || this.pendingResponseHasAudioOutput) {
				return;
			}
			logger.debug("voice-supervisor: using fallback transcript path", {
				transcript: fallbackText,
			});
			this.pendingTranscript = "";
			// Deferred so the event handler itself never blocks on the agent.
			setImmediate(() => {
				if (this.active) {
					void this.callbacks.onSendToAgent(fallbackText);
				}
			});
			return;
		}

		default:
			return;
	}
}
|
|
715
|
-
|
|
716
|
-
/**
 * Plays one chunk of assistant audio and records playback statistics.
 *
 * Ensures a playback process exists, writes the chunk to it, then stores the
 * arrival time and RMS level of the chunk (read by the capture loop's echo
 * suppression). A failed write resets playback; a failure to start playback
 * is reported through `onError`.
 *
 * @param event Audio event whose `data` holds the PCM bytes.
 */
private async handleAudio(event: TransportLayerAudio): Promise<void> {
	if (!this.active) return;

	// Timestamp is taken on arrival, before any awaiting below.
	const receivedAt = Date.now();

	try {
		await this.ensurePlayback();
	} catch (startError) {
		this.callbacks.onError(new Error(describeError(startError)));
		return;
	}

	const writer = this.playbackWriter;
	if (!writer) return;

	try {
		await writer.write(new Uint8Array(event.data));
		this.lastPlaybackAt = receivedAt;
		this.lastPlaybackRms = rms16le(new Uint8Array(event.data));
	} catch (writeError) {
		logger.debug("voice-supervisor: playback write failed", {
			error: describeError(writeError),
		});
		void this.resetPlayback();
	}
}
|
|
737
|
-
|
|
738
|
-
/**
 * Spawns the microphone capture command and starts a background loop that
 * forwards captured audio to the realtime session.
 *
 * The loop runs while the supervisor is active. Chunks are dropped while the
 * session is missing, not connected or not ready, and when they look like
 * echo of our own playback. The method returns once the loop is started; it
 * does not wait for it to finish.
 *
 * @throws Error when no capture command can be built.
 */
private async startCapture(): Promise<void> {
	const capture = buildCaptureCommand(DEFAULT_SAMPLE_RATE, DEFAULT_CHANNELS);
	if (!capture) {
		throw new Error(`No audio capture tool found.\n\n${getMissingToolsMessage()}`);
	}

	const { command, env: captureEnv } = capture;
	logger.debug("voice-supervisor: starting mic capture", { command, env: captureEnv });

	// Extra variables from the capture command are layered over the current environment.
	const spawnEnv = captureEnv ? { ...process.env, ...captureEnv } : undefined;
	const proc = ptree.cspawn(command, { env: spawnEnv });
	this.captureProcess = proc;

	const reader = proc.stdout.getReader();
	this.captureReader = reader;

	const pump = async (): Promise<void> => {
		while (this.active) {
			const { value: chunk, done } = await reader.read();
			if (done || !this.active) break;

			const session = this.session;
			if (!chunk || !session) continue;

			const linkUp = this.connected && this.sessionReady && session.transport.status === "connected";
			if (!linkUp) continue;

			const micLevel = rms16le(chunk);
			const playbackRecent = Date.now() - this.lastPlaybackAt < PLAYBACK_ACTIVE_WINDOW_MS;

			// Echo suppression: skip only when playback is active AND the mic is very
			// quiet relative to playback. This avoids feedback loops while still
			// letting the user speak over the assistant.
			const likelyEcho =
				playbackRecent &&
				micLevel < MIC_NOISE_FLOOR &&
				micLevel < this.lastPlaybackRms / ECHO_SUPPRESSION_RATIO;
			if (likelyEcho) continue;

			// Everything else goes to the realtime API; semantic_vad handles turn detection.
			const payload = toArrayBuffer(chunk);
			if (payload.byteLength === 0) continue;

			try {
				session.sendAudio(payload);
			} catch (sendError) {
				const message = describeError(sendError);
				logger.debug("voice-supervisor: sendAudio failed", { error: message });
				// A dropped socket is transient: keep reading until the transport recovers.
				if (message.includes("WebSocket is not connected")) {
					continue;
				}
				this.callbacks.onError(sendError instanceof Error ? sendError : new Error(message));
				return;
			}
		}

		// Leaving the loop while still active means the capture stream ended on its own.
		if (this.active) {
			this.callbacks.onError(new Error("Voice capture stopped unexpectedly."));
		}
	};

	pump().catch((loopError) => {
		if (!this.active) return;
		logger.debug("voice-supervisor: capture loop error", {
			error: describeError(loopError),
		});
		this.callbacks.onError(new Error(describeError(loopError)));
	});
}
|
|
798
|
-
|
|
799
|
-
/**
 * Stops microphone capture: cancels the stdout reader, sends SIGINT to the
 * capture process and waits for it to exit.
 *
 * Teardown is best-effort. Failures while cancelling, killing or waiting are
 * swallowed so that the fields are always released. A field is only cleared
 * when it still refers to the object being stopped, so a handle installed
 * while this method was awaiting is left alone.
 */
private async stopCapture(): Promise<void> {
	const reader = this.captureReader;
	if (reader) {
		try {
			await reader.cancel();
		} catch {
			// ignore
		}
		if (this.captureReader === reader) {
			this.captureReader = undefined;
		}
	}

	const proc = this.captureProcess;
	if (proc) {
		try {
			proc.kill("SIGINT");
		} catch {
			// ignore
		}
		try {
			await proc.exited;
		} catch (error) {
			// NOTE(review): `exited` is treated as possibly rejecting, matching the
			// `.catch` on the playback process's `exited` elsewhere in this class.
			logger.debug("voice-supervisor: capture process exit failed", {
				error: describeError(error),
			});
		}
		if (this.captureProcess === proc) {
			this.captureProcess = undefined;
		}
	}
}
|
|
818
|
-
|
|
819
|
-
/**
 * Makes sure an audio playback process and a writer for its stdin exist.
 *
 * Does nothing when both are already present. Otherwise it spawns the
 * playback command, wraps its stdin (either a stream exposing `getWriter` or
 * a sink exposing `write` with optional `end` / `close`) in `playbackWriter`,
 * and watches for the process exiting so the handles can be released.
 *
 * If stdin turns out to be unusable, the freshly spawned process is killed
 * and forgotten before the error is thrown, so a later call does not spawn a
 * second player on top of an orphaned one.
 *
 * @throws Error when no playback command is available, or when the process's
 *   stdin is missing or not writable.
 */
private async ensurePlayback(): Promise<void> {
	if (this.playbackProcess && this.playbackWriter) return;
	const command = buildPlaybackCommand(DEFAULT_SAMPLE_RATE, DEFAULT_CHANNELS);
	if (!command) {
		throw new Error(`No audio playback tool found.\n\n${getMissingToolsMessage()}`);
	}

	logger.debug("voice-supervisor: starting audio playback", { command });
	const proc = ptree.cspawn(command, {
		stdin: "pipe",
	});
	const startedAt = Date.now();

	this.playbackProcess = proc;

	// Releases a process we cannot feed: drop the handle, kill it, and make
	// sure a rejecting `exited` does not surface as an unhandled rejection.
	const discardProcess = (): void => {
		if (this.playbackProcess === proc) {
			this.playbackProcess = undefined;
		}
		try {
			proc.kill();
		} catch {
			// ignore
		}
		proc.exited
			.then(() => {
				// nothing to do
			})
			.catch(() => {
				// ignore
			});
	};

	const stdin = proc.stdin;
	if (!stdin) {
		discardProcess();
		throw new Error("Audio playback stdin unavailable.");
	}
	if ("getWriter" in stdin && typeof stdin.getWriter === "function") {
		// Stream-style stdin: write through a stream writer.
		const writer = (stdin as unknown as WritableStream<Uint8Array>).getWriter();
		this.playbackWriter = {
			write: async (chunk) => {
				await writer.write(chunk);
			},
			close: async () => {
				await writer.close();
			},
		};
	} else if ("write" in stdin && typeof (stdin as { write?: unknown }).write === "function") {
		// Sink-style stdin: write directly; closing prefers `end` over `close`.
		const sink = stdin as unknown as {
			write: (chunk: Uint8Array) => undefined | number | Promise<undefined | number>;
			end?: () => undefined | number | Promise<undefined | number>;
			close?: () => undefined | number | Promise<undefined | number>;
		};
		this.playbackWriter = {
			write: async (chunk) => {
				await sink.write(chunk);
			},
			close: async () => {
				if (sink.end) {
					await sink.end();
				} else if (sink.close) {
					await sink.close();
				}
			},
		};
	} else {
		discardProcess();
		throw new Error("Audio playback stdin is not writable.");
	}

	proc.exited
		.then(() => {
			const code = proc.exitCode;
			// Only release the handles if they still belong to this process.
			if (this.playbackProcess === proc) {
				this.playbackProcess = undefined;
				this.playbackWriter = undefined;
			}
			const trimmed = proc.peekStderr().trim();
			if (trimmed) {
				logger.debug("voice-supervisor: playback stderr", { stderr: trimmed });
			}
			const elapsed = Date.now() - startedAt;
			// A non-zero exit within two seconds of starting is reported as a playback failure.
			if (code !== 0 && elapsed < 2000 && this.active && code !== null) {
				this.maybeWarnPlaybackFailure(trimmed || `exit code ${code}`);
			}
		})
		.catch(() => {
			// ignore
		});
}
|
|
889
|
-
|
|
890
|
-
/**
 * Tears down the current playback process and its writer.
 *
 * Closes the writer, kills the process and waits for it to exit; every step
 * is best-effort. The handles are cleared only when they still refer to the
 * objects being torn down, so a player created while this method was
 * awaiting is not dropped.
 */
private async resetPlayback(): Promise<void> {
	// Snapshot both handles: the fields may change while we await below.
	const writer = this.playbackWriter;
	const proc = this.playbackProcess;

	if (writer) {
		try {
			await writer.close();
		} catch {
			// ignore
		}
	}
	if (proc) {
		try {
			proc.kill();
		} catch {
			// ignore
		}
		try {
			await proc.exited;
		} catch {
			// NOTE(review): `exited` is treated as possibly rejecting, matching the
			// `.catch` attached to it in ensurePlayback. This method is invoked with
			// `void`, so a rejection here would otherwise go unhandled.
		}
	}

	if (this.playbackProcess === proc) {
		this.playbackProcess = undefined;
	}
	if (this.playbackWriter === writer) {
		this.playbackWriter = undefined;
	}
}
|
|
909
|
-
|
|
910
|
-
/**
 * Reports a playback failure through the optional `onWarning` callback,
 * at most once per PLAYBACK_ERROR_COOLDOWN_MS.
 *
 * @param message Failure detail appended to the warning text.
 */
private maybeWarnPlaybackFailure(message: string): void {
	const { callbacks } = this;
	if (!callbacks.onWarning) return;

	const timestamp = Date.now();
	const sinceLastWarning = timestamp - this.lastPlaybackErrorAt;
	if (sinceLastWarning < PLAYBACK_ERROR_COOLDOWN_MS) return;

	this.lastPlaybackErrorAt = timestamp;
	callbacks.onWarning(`Audio playback failed: ${message}`);
}
|
|
917
|
-
|
|
918
|
-
/**
 * Resolves once the session transport reports "connected".
 *
 * Returns immediately when the transport is already connected. Otherwise it
 * listens for `connection_change` and rejects if the status has not become
 * "connected" within the timeout. `connected` is set to true on success.
 *
 * @param session The realtime session whose transport is awaited.
 * @param timeoutMs Maximum time to wait, in milliseconds.
 * @throws Error "Realtime voice connection timed out." on timeout.
 */
private async waitForConnection(session: RealtimeSession, timeoutMs: number): Promise<void> {
	if (session.transport.status === "connected") {
		this.connected = true;
		return;
	}

	await new Promise<void>((resolve, reject) => {
		const handleStatus = (status: string): void => {
			if (status !== "connected") return;
			this.connected = true;
			detach();
			resolve();
		};

		const timer = setTimeout(() => {
			detach();
			reject(new Error("Realtime voice connection timed out."));
		}, timeoutMs);

		// Stops the timer and removes the listener; used on both outcomes.
		function detach(): void {
			clearTimeout(timer);
			session.transport.off("connection_change", handleStatus);
		}

		session.transport.on("connection_change", handleStatus);
	});
}
|
|
945
|
-
|
|
946
|
-
/**
 * Resolves once the session has signalled readiness.
 *
 * Returns immediately when `sessionReady` is already set. Otherwise it waits
 * for a `session.created` transport event, sets `sessionReady`, and resolves;
 * if the event does not arrive within the timeout the promise rejects.
 *
 * @param session The realtime session to listen on.
 * @param timeoutMs Maximum time to wait, in milliseconds.
 * @throws Error "Realtime voice session not ready." on timeout.
 */
private async waitForSessionReady(session: RealtimeSession, timeoutMs: number): Promise<void> {
	if (this.sessionReady) return;

	await new Promise<void>((resolve, reject) => {
		let settled = false;

		// Runs `outcome` at most once, after the timer and the listener are released.
		const settle = (outcome: () => void): void => {
			if (settled) return;
			settled = true;
			clearTimeout(timer);
			session.off("transport_event", listener);
			outcome();
		};

		const timer = setTimeout(() => {
			settle(() => reject(new Error("Realtime voice session not ready.")));
		}, timeoutMs);

		const listener = (event: TransportEvent): void => {
			if (settled || event.type !== "session.created") return;
			this.sessionReady = true;
			settle(() => resolve());
		};

		session.on("transport_event", listener);
	});
}
|
|
976
|
-
}
|