@open-gitagent/voice 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,460 @@
1
+ import WebSocket from "ws";
2
+ import { DEFAULT_VOICE_INSTRUCTIONS, } from "@open-gitagent/gitagent";
3
+ const dim = (s) => `\x1b[2m${s}\x1b[0m`;
4
+ export class OpenAIRealtimeAdapter {
5
+ ws = null;
6
+ config;
7
+ latestVideoFrame = null;
8
+ latestScreenFrame = null;
9
+ onMessage = null;
10
+ toolHandler = null;
11
+ interrupted = false;
12
+ // Session-refresh state
13
+ refreshTimer = null;
14
+ refreshing = false;
15
+ disposed = false;
16
+ // Refresh 5 minutes before OpenAI Realtime's 60-min hard cap
17
+ static REFRESH_AFTER_MS = 55 * 60 * 1000;
18
+ constructor(config) {
19
+ this.config = config;
20
+ }
21
+ async connect(opts) {
22
+ this.onMessage = opts.onMessage;
23
+ this.toolHandler = opts.toolHandler;
24
+ const model = this.config.model || "gpt-realtime-2025-08-28";
25
+ const url = `wss://api.openai.com/v1/realtime?model=${model}`;
26
+ // Try direct WebSocket with headers first (native Node.js / real server)
27
+ try {
28
+ await this.connectWs(url, {
29
+ headers: {
30
+ Authorization: `Bearer ${this.config.apiKey}`,
31
+ },
32
+ });
33
+ return;
34
+ }
35
+ catch (err) {
36
+ const msg = err?.message || "";
37
+ // Only retry with ephemeral token if auth failed (WebContainer drops headers)
38
+ if (!msg.includes("authentication") && !msg.includes("401")) {
39
+ throw err;
40
+ }
41
+ console.log(dim("[voice] Direct auth failed, requesting ephemeral token…"));
42
+ }
43
+ // Fallback: get an ephemeral session token via REST (fetch headers work everywhere)
44
+ const keyPreview = this.config.apiKey
45
+ ? `${this.config.apiKey.slice(0, 7)}...${this.config.apiKey.slice(-4)} (${this.config.apiKey.length} chars)`
46
+ : "(empty)";
47
+ console.log(dim(`[voice] API key: ${keyPreview}`));
48
+ const sessionResp = await fetch("https://api.openai.com/v1/realtime/sessions", {
49
+ method: "POST",
50
+ headers: {
51
+ "Authorization": `Bearer ${this.config.apiKey}`,
52
+ "Content-Type": "application/json",
53
+ },
54
+ body: JSON.stringify({ model }),
55
+ });
56
+ if (!sessionResp.ok) {
57
+ const body = await sessionResp.text();
58
+ throw new Error(`Failed to create realtime session: ${sessionResp.status} ${body}`);
59
+ }
60
+ const session = await sessionResp.json();
61
+ const ephemeralKey = session.client_secret?.value;
62
+ if (!ephemeralKey) {
63
+ throw new Error("No ephemeral key returned from realtime sessions endpoint");
64
+ }
65
+ await this.connectWs(url, {
66
+ headers: {
67
+ Authorization: `Bearer ${ephemeralKey}`,
68
+ },
69
+ });
70
+ }
71
+ connectWs(url, opts) {
72
+ return new Promise((resolve, reject) => {
73
+ const ws = new WebSocket(url, opts);
74
+ let settled = false;
75
+ ws.on("open", () => {
76
+ // Don't resolve yet — wait for first message to confirm auth succeeded.
77
+ // Send session.update so the server replies with session.created or error.
78
+ this.sendSessionUpdateOn(ws);
79
+ });
80
+ ws.on("error", (err) => {
81
+ if (!settled) {
82
+ settled = true;
83
+ ws.close();
84
+ reject(err);
85
+ }
86
+ else {
87
+ console.error(dim(`[voice] WebSocket error: ${err.message}`));
88
+ this.emit({ type: "error", message: err.message });
89
+ }
90
+ });
91
+ ws.on("close", () => {
92
+ if (!settled) {
93
+ settled = true;
94
+ reject(new Error("WebSocket closed before open — authentication likely failed"));
95
+ }
96
+ console.log(dim("[voice] WebSocket closed"));
97
+ });
98
+ ws.on("message", (data) => {
99
+ const event = JSON.parse(data.toString());
100
+ // Before we've confirmed auth, check for errors
101
+ if (!settled) {
102
+ if (event.type === "error") {
103
+ settled = true;
104
+ ws.close();
105
+ const errMsg = event.error?.message || "Unknown auth error";
106
+ reject(new Error(errMsg));
107
+ return;
108
+ }
109
+ // Any non-error message means auth succeeded
110
+ settled = true;
111
+ this.ws = ws;
112
+ resolve();
113
+ }
114
+ this.handleEvent(event);
115
+ });
116
+ });
117
+ }
118
+ /** Send session.update on a specific ws instance (before this.ws is set). */
119
+ sendSessionUpdateOn(ws) {
120
+ const instructions = this.config.instructions || DEFAULT_VOICE_INSTRUCTIONS;
121
+ const payload = {
122
+ type: "session.update",
123
+ session: {
124
+ type: "realtime",
125
+ output_modalities: ["audio"],
126
+ instructions,
127
+ audio: {
128
+ input: {
129
+ format: { type: "audio/pcm", rate: 24000 },
130
+ turn_detection: {
131
+ type: "server_vad",
132
+ threshold: 0.6,
133
+ prefix_padding_ms: 400,
134
+ silence_duration_ms: 800,
135
+ create_response: true,
136
+ },
137
+ transcription: { model: "whisper-1" },
138
+ },
139
+ output: {
140
+ format: { type: "audio/pcm", rate: 24000 },
141
+ voice: this.config.voice || "ash",
142
+ },
143
+ },
144
+ tool_choice: "auto",
145
+ tools: [
146
+ {
147
+ type: "function",
148
+ name: "run_agent",
149
+ description: "Your ONLY way to take action. This agent runs on the user's Mac with full shell access. It can: run ANY shell command, open apps (open -a Spotify), play music (osascript, afplay, open URLs), browse the web, read/write files, git operations, send emails, manage calendars, install packages, control system settings, and save memories. You MUST call this tool whenever the user asks you to DO anything — play music, open something, check something, build something, send something. NEVER describe an action without calling this tool. If the user asks and you just talk without calling this — you failed.",
150
+ parameters: {
151
+ type: "object",
152
+ properties: {
153
+ query: {
154
+ type: "string",
155
+ description: "What to do. Be specific. Include file paths for uploaded files. Examples: 'Play relaxing music on YouTube using: open https://youtube.com/...', 'Open Spotify and play chill playlist using osascript', 'Save to memory: user likes rock music'",
156
+ },
157
+ },
158
+ required: ["query"],
159
+ },
160
+ },
161
+ ],
162
+ },
163
+ };
164
+ if (ws.readyState === WebSocket.OPEN) {
165
+ ws.send(JSON.stringify(payload));
166
+ }
167
+ }
168
+ send(msg) {
169
+ switch (msg.type) {
170
+ case "audio":
171
+ this.sendRaw({
172
+ type: "input_audio_buffer.append",
173
+ audio: msg.audio,
174
+ });
175
+ break;
176
+ case "video_frame": {
177
+ // OpenAI doesn't support continuous video. Store latest frame and
178
+ // inject it as an image on the next user turn via conversation item.
179
+ const source = msg.source || "camera";
180
+ if (source === "screen") {
181
+ this.latestScreenFrame = { frame: msg.frame, mimeType: msg.mimeType };
182
+ }
183
+ else {
184
+ this.latestVideoFrame = { frame: msg.frame, mimeType: msg.mimeType };
185
+ }
186
+ break;
187
+ }
188
+ case "text": {
189
+ // Send text as a user conversation item, optionally with latest video frame
190
+ const content = [];
191
+ if (this.latestVideoFrame) {
192
+ content.push({
193
+ type: "input_image",
194
+ image_url: `data:${this.latestVideoFrame.mimeType};base64,${this.latestVideoFrame.frame}`,
195
+ });
196
+ this.latestVideoFrame = null;
197
+ }
198
+ content.push({ type: "input_text", text: msg.text });
199
+ this.sendRaw({
200
+ type: "conversation.item.create",
201
+ item: {
202
+ type: "message",
203
+ role: "user",
204
+ content,
205
+ },
206
+ });
207
+ this.sendRaw({ type: "response.create" });
208
+ break;
209
+ }
210
+ case "file": {
211
+ const content = [];
212
+ if (msg.mimeType.startsWith("image/")) {
213
+ content.push({
214
+ type: "input_image",
215
+ image_url: `data:${msg.mimeType};base64,${msg.data}`,
216
+ });
217
+ content.push({ type: "input_text", text: msg.text || `[User attached image: ${msg.name}]` });
218
+ }
219
+ else {
220
+ const decoded = Buffer.from(msg.data, "base64").toString("utf-8");
221
+ const label = msg.text ? `${msg.text}\n\n` : "";
222
+ content.push({ type: "input_text", text: `${label}[File: ${msg.name}]\n\`\`\`\n${decoded}\n\`\`\`` });
223
+ }
224
+ this.sendRaw({
225
+ type: "conversation.item.create",
226
+ item: { type: "message", role: "user", content },
227
+ });
228
+ this.sendRaw({ type: "response.create" });
229
+ break;
230
+ }
231
+ }
232
+ }
233
+ async disconnect() {
234
+ this.disposed = true;
235
+ if (this.refreshTimer) {
236
+ clearTimeout(this.refreshTimer);
237
+ this.refreshTimer = null;
238
+ }
239
+ if (this.ws) {
240
+ this.ws.close();
241
+ this.ws = null;
242
+ }
243
+ }
244
+ /**
245
+ * Tear down and reopen the Realtime WS before (or right after) OpenAI's
246
+ * 60-minute hard cap expires. Re-sends the stored session.update so the
247
+ * agent picks up where it left off without the user noticing.
248
+ */
249
+ async refreshSession(reason) {
250
+ if (this.refreshing || this.disposed)
251
+ return;
252
+ this.refreshing = true;
253
+ console.log(dim(`[voice] Refreshing Realtime session (${reason})`));
254
+ try {
255
+ // Close the old WS without disposing the adapter
256
+ if (this.refreshTimer) {
257
+ clearTimeout(this.refreshTimer);
258
+ this.refreshTimer = null;
259
+ }
260
+ if (this.ws) {
261
+ try {
262
+ this.ws.close();
263
+ }
264
+ catch { }
265
+ this.ws = null;
266
+ }
267
+ const model = this.config.model || "gpt-realtime-2025-08-28";
268
+ const url = `wss://api.openai.com/v1/realtime?model=${model}`;
269
+ try {
270
+ await this.connectWs(url, {
271
+ headers: {
272
+ Authorization: `Bearer ${this.config.apiKey}`,
273
+ },
274
+ });
275
+ }
276
+ catch (err) {
277
+ const msg = err?.message || "";
278
+ if (!msg.includes("authentication") && !msg.includes("401"))
279
+ throw err;
280
+ // Ephemeral token fallback (matches connect() path)
281
+ const sessionResp = await fetch("https://api.openai.com/v1/realtime/sessions", {
282
+ method: "POST",
283
+ headers: { Authorization: `Bearer ${this.config.apiKey}`, "Content-Type": "application/json" },
284
+ body: JSON.stringify({ model }),
285
+ });
286
+ if (!sessionResp.ok)
287
+ throw new Error(`refresh ephemeral token: ${sessionResp.status}`);
288
+ const session = (await sessionResp.json());
289
+ const ephemeralKey = session.client_secret?.value;
290
+ if (!ephemeralKey)
291
+ throw new Error("No ephemeral key on refresh");
292
+ await this.connectWs(url, {});
293
+ }
294
+ console.log(dim("[voice] Session refreshed"));
295
+ }
296
+ catch (err) {
297
+ console.error(dim(`[voice] Session refresh failed: ${err.message}`));
298
+ this.emit({ type: "error", message: `Voice session refresh failed: ${err.message}` });
299
+ }
300
+ finally {
301
+ this.refreshing = false;
302
+ }
303
+ }
304
+ emit(msg) {
305
+ this.onMessage?.(msg);
306
+ }
307
+ /**
308
+ * Inject the latest video frame as a conversation item so the model
309
+ * can see it when generating the next response (e.g. after a voice turn).
310
+ */
311
+ injectVideoFrame() {
312
+ // Prefer screen frame over camera — it provides more useful context
313
+ const isScreen = !!this.latestScreenFrame;
314
+ const frame = this.latestScreenFrame || this.latestVideoFrame;
315
+ if (!frame)
316
+ return;
317
+ // Clear both so we don't inject stale frames
318
+ this.latestScreenFrame = null;
319
+ this.latestVideoFrame = null;
320
+ console.log(dim(`[voice] Injecting ${isScreen ? "screen" : "camera"} frame into conversation`));
321
+ this.sendRaw({
322
+ type: "conversation.item.create",
323
+ item: {
324
+ type: "message",
325
+ role: "user",
326
+ content: [{
327
+ type: "input_image",
328
+ image_url: `data:${frame.mimeType};base64,${frame.frame}`,
329
+ }],
330
+ },
331
+ });
332
+ }
333
+ sendSessionUpdate() {
334
+ if (this.ws)
335
+ this.sendSessionUpdateOn(this.ws);
336
+ }
337
+ handleEvent(event) {
338
+ switch (event.type) {
339
+ case "session.created":
340
+ console.log(dim("[voice] Session created"));
341
+ if (this.refreshTimer)
342
+ clearTimeout(this.refreshTimer);
343
+ this.refreshTimer = setTimeout(() => {
344
+ this.refreshSession("proactive refresh before 60-min cap").catch(() => { });
345
+ }, OpenAIRealtimeAdapter.REFRESH_AFTER_MS);
346
+ break;
347
+ case "session.updated":
348
+ console.log(dim("[voice] Session configured"));
349
+ break;
350
+ case "input_audio_buffer.speech_started":
351
+ // VAD detected start of speech — inject video frame (what user is looking at)
352
+ // and cancel any in-progress response so the user can interrupt
353
+ this.interrupted = true;
354
+ this.injectVideoFrame();
355
+ this.sendRaw({ type: "response.cancel" });
356
+ this.emit({ type: "interrupt" });
357
+ break;
358
+ case "input_audio_buffer.speech_stopped":
359
+ break;
360
+ case "conversation.item.input_audio_transcription.completed":
361
+ if (event.transcript) {
362
+ console.log(dim(`[voice] User: ${event.transcript}`));
363
+ this.emit({ type: "transcript", role: "user", text: event.transcript });
364
+ }
365
+ break;
366
+ case "response.created":
367
+ // New response starting — accept audio again
368
+ this.interrupted = false;
369
+ break;
370
+ // GA event names are response.output_audio*; keep the beta aliases too.
371
+ case "response.audio.delta":
372
+ case "response.output_audio.delta":
373
+ if (event.delta && !this.interrupted) {
374
+ this.emit({ type: "audio_delta", audio: event.delta });
375
+ }
376
+ break;
377
+ case "response.audio_transcript.delta":
378
+ case "response.output_audio_transcript.delta":
379
+ this.emit({ type: "transcript", role: "assistant", text: event.delta || "", partial: true });
380
+ break;
381
+ case "response.audio_transcript.done":
382
+ case "response.output_audio_transcript.done":
383
+ if (event.transcript) {
384
+ this.emit({ type: "transcript", role: "assistant", text: event.transcript });
385
+ }
386
+ break;
387
+ case "response.function_call_arguments.done":
388
+ this.handleFunctionCall(event);
389
+ break;
390
+ case "error": {
391
+ const errMsg = event.error?.message || "Unknown OpenAI error";
392
+ const code = event.error?.code || "";
393
+ console.error(dim(`[voice] Error: ${JSON.stringify(event.error)}`));
394
+ // Don't surface cancellation errors — they happen when user interrupts with no active response
395
+ if (errMsg.toLowerCase().includes("cancellation failed"))
396
+ break;
397
+ // Session expired (60-min cap) — silently reconnect instead of surfacing
398
+ const lower = errMsg.toLowerCase();
399
+ if (lower.includes("maximum duration") ||
400
+ lower.includes("session_expired") ||
401
+ code === "session_expired") {
402
+ this.refreshSession("session expired").catch(() => { });
403
+ break;
404
+ }
405
+ this.emit({ type: "error", message: errMsg });
406
+ break;
407
+ }
408
+ }
409
+ }
410
+ async handleFunctionCall(event) {
411
+ const callId = event.call_id;
412
+ const name = event.name;
413
+ if (name !== "run_agent" || !this.toolHandler) {
414
+ console.error(dim(`[voice] Unknown function call: ${name}`));
415
+ return;
416
+ }
417
+ let args;
418
+ try {
419
+ args = JSON.parse(event.arguments);
420
+ }
421
+ catch {
422
+ console.error(dim("[voice] Failed to parse function arguments"));
423
+ return;
424
+ }
425
+ console.log(dim(`[voice] Agent query: ${args.query}`));
426
+ this.emit({ type: "agent_working", query: args.query });
427
+ try {
428
+ const result = await this.toolHandler(args.query);
429
+ console.log(dim(`[voice] Agent response: ${result.slice(0, 200)}${result.length > 200 ? "..." : ""}`));
430
+ this.sendRaw({
431
+ type: "conversation.item.create",
432
+ item: {
433
+ type: "function_call_output",
434
+ call_id: callId,
435
+ output: result,
436
+ },
437
+ });
438
+ this.sendRaw({ type: "response.create" });
439
+ this.emit({ type: "agent_done", result: result.slice(0, 500) });
440
+ }
441
+ catch (err) {
442
+ console.error(dim(`[voice] Agent error: ${err.message}`));
443
+ this.sendRaw({
444
+ type: "conversation.item.create",
445
+ item: {
446
+ type: "function_call_output",
447
+ call_id: callId,
448
+ output: `Error: ${err.message}`,
449
+ },
450
+ });
451
+ this.sendRaw({ type: "response.create" });
452
+ this.emit({ type: "error", message: err.message });
453
+ }
454
+ }
455
+ sendRaw(event) {
456
+ if (this.ws && this.ws.readyState === WebSocket.OPEN) {
457
+ this.ws.send(JSON.stringify(event));
458
+ }
459
+ }
460
+ }
@@ -0,0 +1,18 @@
1
+ import type { VoiceServerOptions } from "@open-gitagent/gitagent";
2
+ interface LogEntry { // Shape of the record returned by logToBuffer(); declared without `export`, so it is local to this module.
3
+ id: number; // NOTE(review): presumably a per-entry identifier — how it is generated is not visible in this declaration file
4
+ ts: string; // NOTE(review): presumably a timestamp; the exact string format is not visible here — confirm against the implementation
5
+ source: string; // same name and type as logToBuffer's `source` parameter
6
+ level: "info" | "warn" | "error"; // same literal union as logToBuffer's `level` parameter
7
+ message: string; // same name and type as logToBuffer's `message` parameter
8
+ }
9
+ export declare function logToBuffer(source: string, level: "info" | "warn" | "error", message: string): LogEntry; // Takes a source, a severity level and a message and returns a LogEntry. NOTE(review): the name suggests the entry is also stored in a buffer — not verifiable from this declaration.
10
+ export type FileKind = "html" | "image" | "pdf" | "video" | "audio" | "markdown" | "text" | "binary"; // Closed set of file categories; used as the type of FileTypeInfo.kind.
11
+ export interface FileTypeInfo { // Result type of fileTypeFor().
12
+ mime: string; // NOTE(review): presumably a MIME type string such as "text/html" — confirm against the implementation
13
+ kind: FileKind; // one of the FileKind categories
14
+ }
15
+ export declare function fileTypeFor(pathOrName: string): FileTypeInfo; // Returns the FileTypeInfo for a path or file name. NOTE(review): presumably decided from the file extension — the lookup rule is not visible here.
16
+ export declare const CLOUD_MODE: boolean; // Module-level boolean flag. NOTE(review): how it is derived (environment variable, build flag, …) is not visible in this declaration file — TODO confirm.
17
+ export declare function startVoiceServer(opts: VoiceServerOptions): Promise<() => Promise<void>>; // Takes VoiceServerOptions and resolves to a zero-argument async function. NOTE(review): presumably that function stops the server — confirm against the implementation.
18
+ export {};