bosun 0.36.2 → 0.36.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,816 @@
1
+ /**
2
+ * voice-relay.mjs — Multi-provider voice relay for real-time voice sessions.
3
+ *
4
+ * Supports:
5
+ * - OpenAI Realtime API (WebRTC) — direct API key
6
+ * - Azure OpenAI Realtime API (WebRTC) — API key + endpoint
7
+ * - Claude/Gemini provider mode (Tier 2 speech fallback + provider vision)
8
+ * - Tier 2 fallback (browser STT → executor → browser TTS)
9
+ *
10
+ * @module voice-relay
11
+ */
12
+
13
+ import { loadConfig } from "./config.mjs";
14
+ import { execPrimaryPrompt, getPrimaryAgentName } from "./primary-agent.mjs";
15
+
16
// ── Module-scope state ──────────────────────────────────────────────────────
// Most recently resolved voice config (null until the first load) and the
// timestamp (ms since epoch) at which it was built.
let _voiceConfig = null;
let _configLoadedAt = 0;

// The cached config is considered stale after this many milliseconds (30s).
const CONFIG_TTL_MS = 30 * 1000;

// OpenAI: realtime session endpoint, responses endpoint, and default models.
const OPENAI_REALTIME_URL = "https://api.openai.com/v1/realtime";
const OPENAI_RESPONSES_URL = "https://api.openai.com/v1/responses";
const OPENAI_REALTIME_MODEL = "gpt-4o-realtime-preview-2024-12-17";
const OPENAI_DEFAULT_VISION_MODEL = "gpt-4.1-mini";

// Azure OpenAI: api-version query value used for every Azure request.
const AZURE_API_VERSION = "2025-04-01-preview";

// Anthropic: messages endpoint, version header value, and default models.
const ANTHROPIC_MESSAGES_URL = "https://api.anthropic.com/v1/messages";
const ANTHROPIC_API_VERSION = "2023-06-01";
const CLAUDE_DEFAULT_MODEL = "claude-3-7-sonnet-latest";
const CLAUDE_DEFAULT_VISION_MODEL = "claude-3-7-sonnet-latest";

// Gemini: base URL for model endpoints and default models.
const GEMINI_GENERATE_CONTENT_URL = "https://generativelanguage.googleapis.com/v1beta/models";
const GEMINI_DEFAULT_MODEL = "gemini-2.5-pro";
const GEMINI_DEFAULT_VISION_MODEL = "gemini-2.5-flash";
35
+
36
// Executor identifiers accepted in a voice call context.
const VALID_EXECUTORS = new Set(
  ["codex-sdk", "copilot-sdk", "claude-sdk", "gemini-sdk", "opencode-sdk"],
);
43
+
44
// Delegation modes accepted in a voice call context.
const VALID_AGENT_MODES = new Set(
  ["ask", "agent", "plan", "code", "architect"],
);
51
+
52
/**
 * Mask substrings that look like credentials so the text can be placed in
 * error messages.
 *
 * Patterns handled:
 *  - `sk-` / `rk-` / `pk-` prefixed keys (the prefix is kept),
 *  - Google-style API keys beginning with `AIza`,
 *  - `key=` URL query parameters,
 *  - `Bearer <token>` values,
 *  - `api_key` / `access_token` / `client_secret` / `authorization`
 *    key-value pairs (JSON or `key=value` form).
 *
 * @param {*} value - Text to sanitize; falsy values are treated as "".
 * @returns {string} The text with matching secrets replaced by `***REDACTED***`.
 */
function redactSecretLikeText(value) {
  let sanitized = String(value || "");
  sanitized = sanitized.replace(/\b(sk|rk|pk)-[A-Za-z0-9_-]{10,}\b/g, "$1-***REDACTED***");
  // Google API keys carry no `sk-` style prefix, so they need their own pattern.
  sanitized = sanitized.replace(/\bAIza[A-Za-z0-9_-]{30,}/g, "***REDACTED***");
  // Keys passed as a `?key=` / `&key=` query parameter inside an echoed URL.
  sanitized = sanitized.replace(/([?&]key=)[^&#\s"']+/gi, "$1***REDACTED***");
  sanitized = sanitized.replace(/\bBearer\s+[A-Za-z0-9._~+/=-]{8,}\b/gi, "Bearer ***REDACTED***");
  sanitized = sanitized.replace(
    /("?(?:api[_-]?key|access[_-]?token|client[_-]?secret|authorization)"?\s*[:=]\s*"?)([^",\s}{\]]+)/gi,
    "$1***REDACTED***",
  );
  return sanitized;
}
62
+
63
/**
 * Read the body of a failed provider response and return it with
 * secret-looking substrings redacted.
 *
 * @param {{ text: () => Promise<string> }} response - Response whose body is read as text.
 * @param {string} [fallback="unknown"] - Used when the body cannot be read or is empty.
 * @returns {Promise<string>} Redacted error detail text.
 */
async function buildProviderErrorDetails(response, fallback = "unknown") {
  const bodyText = await response.text().then(
    (text) => text,
    () => fallback,
  );
  const detail = bodyText || fallback;
  return redactSecretLikeText(detail);
}
67
+
68
/**
 * Normalize a caller-supplied voice call context.
 *
 * Each field is stringified and trimmed. `executor` and `mode` are also
 * lower-cased and kept only if they appear in VALID_EXECUTORS /
 * VALID_AGENT_MODES. Empty or rejected values become null.
 *
 * @param {object} [context={}] - Raw context ({ sessionId, executor, mode, model }).
 * @returns {{ sessionId: ?string, executor: ?string, mode: ?string, model: ?string }}
 */
function sanitizeVoiceCallContext(context = {}) {
  const asTrimmedText = (field) => String(field || "").trim();

  const sessionId = asTrimmedText(context?.sessionId);
  const executor = asTrimmedText(context?.executor).toLowerCase();
  const mode = asTrimmedText(context?.mode).toLowerCase();
  const model = asTrimmedText(context?.model);

  const executorAllowed = VALID_EXECUTORS.has(executor);
  const modeAllowed = VALID_AGENT_MODES.has(mode);

  return {
    sessionId: sessionId || null,
    executor: executorAllowed ? executor : null,
    mode: modeAllowed ? mode : null,
    model: model || null,
  };
}
81
+
82
/**
 * Append call-specific context and delegation rules to the base instructions.
 *
 * When the sanitized call context has no sessionId, executor, mode, or model,
 * the base instructions are returned unchanged.
 *
 * @param {string} baseInstructions - Instructions configured for the voice agent.
 * @param {object} [callContext={}] - Raw call context; sanitized before use.
 * @returns {string} Instructions, possibly with the context suffix appended.
 */
function buildSessionScopedInstructions(baseInstructions, callContext = {}) {
  const { sessionId, executor, mode, model } = sanitizeVoiceCallContext(callContext);
  const hasContext = Boolean(sessionId || executor || mode || model);
  if (!hasContext) {
    return baseInstructions;
  }

  let executorLine = "Preferred executor for delegated work: use configured default.";
  if (executor) {
    executorLine = `Preferred executor for delegated work: ${executor}.`;
  }

  let modeLine = "Preferred delegation mode: use configured default.";
  if (mode) {
    modeLine = `Preferred delegation mode: ${mode}.`;
  }

  let modelLine = "Preferred model override: none.";
  if (model) {
    modelLine = `Preferred model override: ${model}.`;
  }

  // The leading empty entry puts a newline between the base text and the suffix.
  const suffixLines = [
    "",
    "## Bosun Voice Call Context",
    `Active chat session id: ${sessionId || "none"}.`,
    executorLine,
    modeLine,
    modelLine,
    "",
    "## Required Behavior",
    "- For every user turn in this call, invoke delegate_to_agent exactly once before any final spoken answer.",
    "- For coding, repo, task, debugging, automation, or workspace requests, call delegate_to_agent before finalizing your response.",
    "- Preserve user intent when delegating. Do not paraphrase away technical detail.",
    "- Keep responses concise after receiving delegate_to_agent output.",
  ];

  return `${baseInstructions}${suffixLines.join("\n")}`;
}
111
+
112
/**
 * Pick the `tool_choice` value for a realtime session.
 *
 * Forces `delegate_to_agent` when the call is bound to a session id and that
 * tool is present in the definitions; otherwise returns "auto".
 *
 * @param {Array<object>} toolDefinitions - Tool definitions offered to the session.
 * @param {object} [callContext={}] - Raw call context; sanitized before use.
 * @returns {"auto" | { type: "function", name: "delegate_to_agent" }}
 */
function resolveToolChoice(toolDefinitions, callContext = {}) {
  const { sessionId } = sanitizeVoiceCallContext(callContext);

  let delegateOffered = false;
  if (Array.isArray(toolDefinitions)) {
    delegateOffered = toolDefinitions.some(
      (definition) => definition?.name === "delegate_to_agent",
    );
  }

  if (!sessionId || !delegateOffered) {
    return "auto";
  }
  return { type: "function", name: "delegate_to_agent" };
}
124
+
125
/**
 * Pull the assistant text out of an OpenAI-style response payload.
 *
 * Lookup order:
 *  1. `payload.output_text` when it is a non-blank string;
 *  2. every string `text` part found in `payload.output[].content[]`,
 *     trimmed and joined with newlines (all parts are kept, matching the
 *     Claude and Gemini extractors, so multi-part responses are not cut
 *     off after the first part);
 *  3. the first non-blank `payload.choices[].message.content`.
 *
 * @param {*} payload - Parsed JSON response body.
 * @returns {string} Trimmed text, or "" when nothing usable is present.
 */
function extractModelResponseText(payload) {
  if (!payload || typeof payload !== "object") return "";
  if (typeof payload.output_text === "string" && payload.output_text.trim()) {
    return payload.output_text.trim();
  }

  const outputItems = Array.isArray(payload.output) ? payload.output : [];
  const outputText = outputItems
    .flatMap((item) => (Array.isArray(item?.content) ? item.content : []))
    .map((part) => (typeof part?.text === "string" ? part.text.trim() : ""))
    .filter(Boolean)
    .join("\n");
  if (outputText) return outputText;

  const choices = Array.isArray(payload.choices) ? payload.choices : [];
  for (const choice of choices) {
    const text = String(choice?.message?.content || "").trim();
    if (text) return text;
  }

  return "";
}
149
+
150
/**
 * Validate and split a base64 image data URL.
 *
 * Accepts JPEG, PNG, and WebP. The non-standard alias `image/jpg` is
 * normalized to `image/jpeg`, because `mimeType` is forwarded verbatim as the
 * media type to providers. The returned `dataUrl` is rebuilt from the
 * normalized parts so it always agrees with `mimeType`.
 *
 * @param {string} dataUrl - `data:image/<type>;base64,<payload>` string.
 * @returns {{ mimeType: string, base64Data: string, dataUrl: string }}
 * @throws {Error} When the input is not a supported base64 image data URL.
 */
function parseImageDataUrl(dataUrl) {
  const raw = String(dataUrl || "").trim();
  const match = raw.match(
    /^data:(image\/(?:jpeg|jpg|png|webp));base64,([A-Za-z0-9+/=]+)$/i,
  );
  if (!match) {
    throw new Error("Invalid frame format (expected data:image/*;base64,...)");
  }

  const declaredType = match[1].toLowerCase();
  const mimeType = declaredType === "image/jpg" ? "image/jpeg" : declaredType;
  const base64Data = match[2];

  return {
    mimeType,
    base64Data,
    dataUrl: `data:${mimeType};base64,${base64Data}`,
  };
}
164
+
165
/**
 * Collect the text from an Anthropic Messages response payload.
 *
 * Only content entries with `type === "text"` contribute; each is trimmed and
 * the non-empty ones are joined with newlines.
 *
 * @param {*} payload - Parsed JSON response body.
 * @returns {string} Joined text, or "" when there is none.
 */
function extractClaudeResponseText(payload) {
  if (typeof payload !== "object" || payload === null) {
    return "";
  }

  const blocks = Array.isArray(payload.content) ? payload.content : [];
  const pieces = [];
  for (const block of blocks) {
    if (block?.type !== "text") continue;
    const piece = String(block?.text || "").trim();
    if (piece) pieces.push(piece);
  }

  return pieces.join("\n").trim();
}
177
+
178
/**
 * Collect the text from a Gemini generateContent response payload.
 *
 * Candidates are scanned in order; for each, the trimmed `text` of its
 * `content.parts` is joined with newlines. The first candidate that yields
 * any text wins.
 *
 * @param {*} payload - Parsed JSON response body.
 * @returns {string} Joined text of the first non-empty candidate, or "".
 */
function extractGeminiResponseText(payload) {
  if (typeof payload !== "object" || payload === null) {
    return "";
  }

  const candidateList = Array.isArray(payload.candidates) ? payload.candidates : [];
  for (const candidate of candidateList) {
    const rawParts = candidate?.content?.parts;
    if (!Array.isArray(rawParts)) continue;

    const pieces = [];
    for (const part of rawParts) {
      const piece = String(part?.text || "").trim();
      if (piece) pieces.push(piece);
    }

    const joined = pieces.join("\n").trim();
    if (joined) return joined;
  }

  return "";
}
194
+
195
/**
 * Ask the OpenAI responses endpoint to summarize an image frame.
 *
 * @param {string} dataUrl - Image as a data URL, sent as `image_url`.
 * @param {string} model - Model name placed in the request body.
 * @param {string} prompt - Instruction text for the model.
 * @param {string} contextText - Extra context appended after the prompt.
 * @param {object} cfg - Voice config; `cfg.openaiKey` is used as the bearer token.
 * @returns {Promise<{ summary: string, provider: "openai", model: string }>}
 * @throws {Error} On a non-OK HTTP status or when no summary text is returned.
 */
async function analyzeVisionWithOpenAI(dataUrl, model, prompt, contextText, cfg) {
  const requestHeaders = {
    Authorization: `Bearer ${cfg.openaiKey}`,
    "Content-Type": "application/json",
  };
  const userContent = [
    { type: "input_text", text: `${prompt}\n\n${contextText}` },
    { type: "input_image", image_url: dataUrl, detail: "high" },
  ];
  const requestBody = {
    model,
    temperature: 0.2,
    max_output_tokens: 220,
    input: [{ role: "user", content: userContent }],
  };

  const response = await fetch(OPENAI_RESPONSES_URL, {
    method: "POST",
    headers: requestHeaders,
    body: JSON.stringify(requestBody),
  });

  if (!response.ok) {
    const details = await buildProviderErrorDetails(response, "unknown");
    throw new Error(`Vision request failed (${response.status}): ${details}`);
  }

  const summary = extractModelResponseText(await response.json());
  if (!summary) {
    throw new Error("Vision model returned an empty summary");
  }

  return { summary, provider: "openai", model };
}
239
+
240
/**
 * Ask an Azure OpenAI responses endpoint to summarize an image frame.
 *
 * @param {string} dataUrl - Image as a data URL, sent as `image_url`.
 * @param {string} model - Value placed in the request body's `model` field.
 * @param {string} prompt - Instruction text for the model.
 * @param {string} contextText - Extra context appended after the prompt.
 * @param {object} cfg - Voice config; uses `cfg.azureEndpoint` and `cfg.azureKey`.
 * @returns {Promise<{ summary: string, provider: "azure", model: string }>}
 * @throws {Error} On a non-OK HTTP status or when no summary text is returned.
 */
async function analyzeVisionWithAzure(dataUrl, model, prompt, contextText, cfg) {
  // Trailing slashes are stripped so the joined URL has exactly one separator.
  const baseUrl = cfg.azureEndpoint.replace(/\/+$/, "");
  const requestUrl = `${baseUrl}/openai/responses?api-version=${AZURE_API_VERSION}`;

  const requestHeaders = {
    "api-key": cfg.azureKey,
    "Content-Type": "application/json",
  };
  const userContent = [
    { type: "input_text", text: `${prompt}\n\n${contextText}` },
    { type: "input_image", image_url: dataUrl, detail: "high" },
  ];
  const requestBody = {
    model,
    temperature: 0.2,
    max_output_tokens: 220,
    input: [{ role: "user", content: userContent }],
  };

  const response = await fetch(requestUrl, {
    method: "POST",
    headers: requestHeaders,
    body: JSON.stringify(requestBody),
  });

  if (!response.ok) {
    const details = await buildProviderErrorDetails(response, "unknown");
    throw new Error(`Azure vision request failed (${response.status}): ${details}`);
  }

  const summary = extractModelResponseText(await response.json());
  if (!summary) {
    throw new Error("Azure vision model returned an empty summary");
  }

  return { summary, provider: "azure", model };
}
286
+
287
/**
 * Ask the Anthropic messages endpoint to summarize an image frame.
 *
 * @param {{ mimeType: string, base64Data: string }} frame - Parsed image frame.
 * @param {string} model - Model name placed in the request body.
 * @param {string} prompt - Instruction text for the model.
 * @param {string} contextText - Extra context appended after the prompt.
 * @param {object} cfg - Voice config; `cfg.claudeKey` is sent as `x-api-key`.
 * @returns {Promise<{ summary: string, provider: "claude", model: string }>}
 * @throws {Error} On a non-OK HTTP status or when no summary text is returned.
 */
async function analyzeVisionWithClaude(frame, model, prompt, contextText, cfg) {
  const requestHeaders = {
    "x-api-key": cfg.claudeKey,
    "anthropic-version": ANTHROPIC_API_VERSION,
    "Content-Type": "application/json",
  };
  const textBlock = { type: "text", text: `${prompt}\n\n${contextText}` };
  const imageBlock = {
    type: "image",
    source: {
      type: "base64",
      media_type: frame.mimeType,
      data: frame.base64Data,
    },
  };
  const requestBody = {
    model,
    temperature: 0.2,
    max_tokens: 260,
    messages: [{ role: "user", content: [textBlock, imageBlock] }],
  };

  const response = await fetch(ANTHROPIC_MESSAGES_URL, {
    method: "POST",
    headers: requestHeaders,
    body: JSON.stringify(requestBody),
  });

  if (!response.ok) {
    const details = await buildProviderErrorDetails(response, "unknown");
    throw new Error(`Claude vision request failed (${response.status}): ${details}`);
  }

  const summary = extractClaudeResponseText(await response.json());
  if (!summary) {
    throw new Error("Claude vision model returned an empty summary");
  }

  return { summary, provider: "claude", model };
}
332
+
333
/**
 * Ask the Gemini generateContent endpoint to summarize an image frame.
 *
 * The API key is sent in the `x-goog-api-key` request header instead of a
 * `?key=` query parameter, so the credential never becomes part of the URL.
 *
 * @param {{ mimeType: string, base64Data: string }} frame - Parsed image frame.
 * @param {string} model - Model name; URL-encoded into the endpoint path.
 * @param {string} prompt - Instruction text for the model.
 * @param {string} contextText - Extra context appended after the prompt.
 * @param {object} cfg - Voice config; `cfg.geminiKey` is the API key.
 * @returns {Promise<{ summary: string, provider: "gemini", model: string }>}
 * @throws {Error} On a non-OK HTTP status or when no summary text is returned.
 */
async function analyzeVisionWithGemini(frame, model, prompt, contextText, cfg) {
  const apiKey = String(cfg.geminiKey || "").trim();
  const endpoint =
    `${GEMINI_GENERATE_CONTENT_URL}/${encodeURIComponent(model)}:generateContent`;
  const response = await fetch(endpoint, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "x-goog-api-key": apiKey,
    },
    body: JSON.stringify({
      contents: [
        {
          role: "user",
          parts: [
            { text: `${prompt}\n\n${contextText}` },
            {
              inlineData: {
                mimeType: frame.mimeType,
                data: frame.base64Data,
              },
            },
          ],
        },
      ],
      generationConfig: {
        temperature: 0.2,
        maxOutputTokens: 220,
      },
    }),
  });
  if (!response.ok) {
    const errText = await buildProviderErrorDetails(response, "unknown");
    throw new Error(`Gemini vision request failed (${response.status}): ${errText}`);
  }
  const payload = await response.json();
  const summary = extractGeminiResponseText(payload);
  if (!summary) {
    throw new Error("Gemini vision model returned an empty summary");
  }
  return {
    summary,
    provider: "gemini",
    model,
  };
}
378
+
379
+ // ── Voice provider detection ────────────────────────────────────────────────
380
+
381
/**
 * Resolve the voice configuration from the bosun config plus environment
 * variables, and cache the frozen result for CONFIG_TTL_MS.
 *
 * For each setting the config file value wins, then the environment, then a
 * built-in default. A provider of "auto" is resolved from whichever
 * credentials are present, in the order azure, openai, claude, gemini,
 * and finally "fallback".
 *
 * @param {boolean} [forceReload=false] - Rebuild even when the cache is fresh.
 * @returns {object} Frozen { provider, model, openaiKey, azureKey, azureEndpoint,
 *   azureDeployment, claudeKey, geminiKey, voiceId, turnDetection, visionModel,
 *   instructions, fallbackMode, delegateExecutor, enabled }
 */
export function getVoiceConfig(forceReload = false) {
  const cacheAgeMs = Date.now() - _configLoadedAt;
  if (!forceReload && _voiceConfig && cacheAgeMs < CONFIG_TTL_MS) {
    return _voiceConfig;
  }

  const cfg = loadConfig();
  const voice = cfg.voice || {};
  const env = process.env;

  // Credentials and endpoints.
  const openaiKey =
    voice.openaiApiKey || env.OPENAI_REALTIME_API_KEY || env.OPENAI_API_KEY || "";
  const azureKey =
    voice.azureApiKey || env.AZURE_OPENAI_REALTIME_API_KEY || env.AZURE_OPENAI_API_KEY || "";
  const azureEndpoint =
    voice.azureEndpoint || env.AZURE_OPENAI_REALTIME_ENDPOINT || env.AZURE_OPENAI_ENDPOINT || "";
  const azureDeployment =
    voice.azureDeployment || env.AZURE_OPENAI_REALTIME_DEPLOYMENT || "gpt-4o-realtime-preview";
  const claudeKey = voice.claudeApiKey || env.ANTHROPIC_API_KEY || "";
  const geminiKey =
    voice.geminiApiKey || env.GEMINI_API_KEY || env.GOOGLE_API_KEY || "";

  // Provider: an explicit value is used as-is; "auto" is resolved from credentials.
  const requestedProvider = String(voice.provider || env.VOICE_PROVIDER || "auto")
    .trim()
    .toLowerCase();
  let provider = requestedProvider;
  if (requestedProvider === "auto") {
    if (azureKey && azureEndpoint) {
      provider = "azure";
    } else if (openaiKey) {
      provider = "openai";
    } else if (claudeKey) {
      provider = "claude";
    } else if (geminiKey) {
      provider = "gemini";
    } else {
      provider = "fallback";
    }
  }

  // Provider-specific model defaults; everything but claude/gemini uses the OpenAI ones.
  let defaultModel = OPENAI_REALTIME_MODEL;
  let defaultVisionModel = OPENAI_DEFAULT_VISION_MODEL;
  if (provider === "claude") {
    defaultModel = CLAUDE_DEFAULT_MODEL;
    defaultVisionModel = CLAUDE_DEFAULT_VISION_MODEL;
  } else if (provider === "gemini") {
    defaultModel = GEMINI_DEFAULT_MODEL;
    defaultVisionModel = GEMINI_DEFAULT_VISION_MODEL;
  }

  const model = voice.model || env.VOICE_MODEL || defaultModel;
  const voiceId = voice.voiceId || env.VOICE_ID || "alloy";
  const turnDetection = voice.turnDetection || env.VOICE_TURN_DETECTION || "server_vad";
  const visionModel = voice.visionModel || env.VOICE_VISION_MODEL || defaultVisionModel;
  const fallbackMode = voice.fallbackMode || env.VOICE_FALLBACK_MODE || "browser";
  const delegateExecutor =
    voice.delegateExecutor || env.VOICE_DELEGATE_EXECUTOR || cfg.primaryAgent || "codex-sdk";

  // Enabled: any non-null config value other than `false` enables voice;
  // otherwise only an explicit "off"-style VOICE_ENABLED value disables it.
  let enabled;
  if (voice.enabled != null) {
    enabled = voice.enabled !== false;
  } else {
    const envFlag = String(env.VOICE_ENABLED || "").trim().toLowerCase();
    enabled = !["0", "false", "no", "off"].includes(envFlag);
  }

  const instructions = voice.instructions || [
    "You are Bosun, a helpful voice assistant for the VirtEngine development platform.",
    "You help developers manage tasks, steer coding agents, monitor builds, and navigate the workspace.",
    "Be concise and conversational. When users ask about code or tasks, use the available tools.",
    "For complex operations like writing code or creating PRs, delegate to the appropriate agent.",
  ].join("\n");

  const resolved = {
    provider,
    model,
    openaiKey,
    azureKey,
    azureEndpoint,
    azureDeployment,
    claudeKey,
    geminiKey,
    voiceId,
    turnDetection,
    visionModel,
    instructions,
    fallbackMode,
    delegateExecutor,
    enabled,
  };

  _voiceConfig = Object.freeze(resolved);
  _configLoadedAt = Date.now();
  return _voiceConfig;
}
501
+
502
/**
 * Report whether any voice tier can be used with the current configuration.
 *
 * openai/azure with credentials report tier 1; claude/gemini with a key report
 * tier 2. Otherwise the "fallback" provider (tier 2) is reported unless
 * `fallbackMode` is "disabled".
 *
 * @returns {{ available: boolean, tier: ?number, provider?: string, reason?: string }}
 */
export function isVoiceAvailable() {
  const cfg = getVoiceConfig();
  if (!cfg.enabled) {
    return { available: false, tier: null, reason: "Voice disabled in config" };
  }

  switch (cfg.provider) {
    case "openai":
      if (cfg.openaiKey) return { available: true, tier: 1, provider: "openai" };
      break;
    case "azure":
      if (cfg.azureKey && cfg.azureEndpoint) {
        return { available: true, tier: 1, provider: "azure" };
      }
      break;
    case "claude":
      if (cfg.claudeKey) return { available: true, tier: 2, provider: "claude" };
      break;
    case "gemini":
      if (cfg.geminiKey) return { available: true, tier: 2, provider: "gemini" };
      break;
    default:
      break;
  }

  // The selected provider is unusable; the fallback tier applies unless disabled.
  if (cfg.fallbackMode === "disabled") {
    return {
      available: false,
      tier: null,
      reason: `Voice provider "${cfg.provider}" is not configured and fallback is disabled`,
    };
  }
  return { available: true, tier: 2, provider: "fallback" };
}
531
+
532
/**
 * Create an ephemeral client token for a realtime (WebRTC) voice session.
 *
 * Hands off to the Azure implementation when the configured provider is
 * "azure"; otherwise requires the "openai" provider and posts the session
 * configuration to the OpenAI realtime sessions endpoint.
 *
 * @param {Array<object>} [toolDefinitions=[]] - Tools exposed to the session.
 * @param {object} [callContext={}] - Raw call context; sanitized before use.
 * @returns {Promise<{ token: string, expiresAt: number, model: string, voiceId: string,
 *   provider: string, sessionConfig: object, callContext: object }>}
 * @throws {Error} For non-realtime providers, a missing key, or a non-OK response.
 */
export async function createEphemeralToken(toolDefinitions = [], callContext = {}) {
  const cfg = getVoiceConfig();
  const { provider } = cfg;

  if (provider === "azure") {
    return createAzureEphemeralToken(toolDefinitions, callContext);
  }
  if (provider !== "openai") {
    throw new Error(
      `Realtime WebRTC token is unavailable for provider "${cfg.provider}". ` +
        "Use VOICE_PROVIDER=openai|azure for Tier 1 realtime voice.",
    );
  }
  if (!cfg.openaiKey) {
    throw new Error("OPENAI_API_KEY not configured for voice");
  }

  const context = sanitizeVoiceCallContext(callContext);
  const instructions = buildSessionScopedInstructions(cfg.instructions, context);

  // Tuning fields are only added for the detection type they belong to.
  const turnDetection = { type: cfg.turnDetection };
  if (cfg.turnDetection === "server_vad") {
    turnDetection.threshold = 0.5;
    turnDetection.prefix_padding_ms = 300;
    turnDetection.silence_duration_ms = 500;
  } else if (cfg.turnDetection === "semantic_vad") {
    turnDetection.eagerness = "medium";
  }

  const sessionConfig = {
    model: cfg.model,
    voice: cfg.voiceId,
    instructions,
    tool_choice: resolveToolChoice(toolDefinitions, context),
    turn_detection: turnDetection,
    input_audio_transcription: { model: "gpt-4o-mini-transcribe" },
    tools: toolDefinitions,
  };

  const response = await fetch(`${OPENAI_REALTIME_URL}/sessions`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${cfg.openaiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(sessionConfig),
  });
  if (!response.ok) {
    const details = await buildProviderErrorDetails(response, "unknown");
    throw new Error(`OpenAI Realtime session failed (${response.status}): ${details}`);
  }

  const data = await response.json();
  const secret = data.client_secret;
  return {
    token: secret?.value || data.token,
    // Without a provider-supplied expiry, assume 60 seconds from now (epoch seconds).
    expiresAt: secret?.expires_at || (Date.now() / 1000 + 60),
    model: cfg.model,
    voiceId: cfg.voiceId,
    provider: "openai",
    sessionConfig,
    callContext: context,
  };
}
599
+
600
/**
 * Create an ephemeral client token for an Azure OpenAI realtime session.
 *
 * The deployment name comes from configuration or the environment, so it is
 * URL-encoded before being placed in the query string.
 *
 * @param {Array<object>} [toolDefinitions=[]] - Tools exposed to the session.
 * @param {object} [callContext={}] - Raw call context; sanitized before use.
 * @returns {Promise<{ token: string, expiresAt: number, model: string, voiceId: string,
 *   provider: "azure", sessionConfig: object, azureEndpoint: string,
 *   azureDeployment: string, callContext: object }>}
 * @throws {Error} When the endpoint or key is missing, or on a non-OK response.
 */
async function createAzureEphemeralToken(toolDefinitions = [], callContext = {}) {
  const cfg = getVoiceConfig();
  if (!cfg.azureKey || !cfg.azureEndpoint) {
    throw new Error("Azure OpenAI Realtime not configured (need endpoint + key)");
  }

  const context = sanitizeVoiceCallContext(callContext);
  const instructions = buildSessionScopedInstructions(cfg.instructions, context);
  // Trailing slashes are stripped so the joined URL has exactly one separator.
  const endpoint = cfg.azureEndpoint.replace(/\/+$/, "");
  const deploymentParam = encodeURIComponent(cfg.azureDeployment);
  const url = `${endpoint}/openai/realtime/sessions?api-version=${AZURE_API_VERSION}&deployment=${deploymentParam}`;

  const sessionConfig = {
    model: cfg.azureDeployment,
    voice: cfg.voiceId,
    instructions,
    tool_choice: resolveToolChoice(toolDefinitions, context),
    turn_detection: {
      type: cfg.turnDetection,
      ...(cfg.turnDetection === "server_vad" ? {
        threshold: 0.5,
        prefix_padding_ms: 300,
        silence_duration_ms: 500,
      } : {}),
    },
    input_audio_transcription: { model: "whisper-1" },
    tools: toolDefinitions,
  };

  const response = await fetch(url, {
    method: "POST",
    headers: {
      "api-key": cfg.azureKey,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(sessionConfig),
  });

  if (!response.ok) {
    const errorText = await buildProviderErrorDetails(response, "unknown");
    throw new Error(`Azure Realtime session failed (${response.status}): ${errorText}`);
  }

  const data = await response.json();
  return {
    token: data.client_secret?.value || data.token,
    // Without a provider-supplied expiry, assume 60 seconds from now (epoch seconds).
    expiresAt: data.client_secret?.expires_at || (Date.now() / 1000 + 60),
    model: cfg.azureDeployment,
    voiceId: cfg.voiceId,
    provider: "azure",
    sessionConfig,
    azureEndpoint: endpoint,
    azureDeployment: cfg.azureDeployment,
    callContext: context,
  };
}
658
+
659
/**
 * Analyze a camera/screen frame and return a concise summary.
 *
 * Providers are tried in order: the configured provider first, then every
 * other provider that has credentials (openai, azure, claude, gemini). The
 * first successful summary is returned.
 *
 * Model selection: a model passed via `options.model` / `options.visionModel`
 * is used for every attempt. Otherwise the configured vision model is used
 * for the first provider attempted and for providers of the same model family
 * (openai and azure share one). Providers of a different family use their own
 * default vision model, so a fallback attempt is not sent a model name that
 * belongs to another vendor.
 *
 * @param {string} frameDataUrl - data URL (image/jpeg|png|webp)
 * @param {object} options - { source, context, prompt, model, visionModel }
 * @returns {Promise<{ summary: string, provider: string, model: string }>}
 * @throws {Error} When the frame is invalid, every attempted provider fails
 *   (the last provider error is attached as `cause`), or no provider has credentials.
 */
export async function analyzeVisionFrame(frameDataUrl, options = {}) {
  const frame = parseImageDataUrl(frameDataUrl);
  const dataUrl = frame.dataUrl;

  const cfg = getVoiceConfig();
  const source = String(options?.source || "screen").trim().toLowerCase() || "screen";
  const callContext = sanitizeVoiceCallContext(options?.context || {});

  const explicitModel = String(options?.model || options?.visionModel || "").trim();
  const configuredModel = String(
    cfg.visionModel
      || process.env.VOICE_VISION_MODEL
      || OPENAI_DEFAULT_VISION_MODEL,
  ).trim();

  const prompt = String(options?.prompt || "").trim()
    || "Summarize what is visible in this live frame for a coding assistant. Focus on code, terminal output, errors, UI labels, and actionable context.";

  const contextText = [
    `Frame source: ${source}.`,
    `Bound chat session: ${callContext.sessionId || "none"}.`,
    callContext.executor ? `Preferred executor: ${callContext.executor}.` : "",
    callContext.mode ? `Preferred mode: ${callContext.mode}.` : "",
    callContext.model ? `Preferred model override: ${callContext.model}.` : "",
    "Respond in 1-3 concise sentences. Include likely next action if obvious.",
  ]
    .filter(Boolean)
    .join("\n");

  // Per-provider readiness, model family, family default model, and request function.
  const handlers = new Map([
    ["openai", {
      ready: Boolean(cfg.openaiKey),
      family: "openai",
      defaultModel: OPENAI_DEFAULT_VISION_MODEL,
      run: (model) => analyzeVisionWithOpenAI(dataUrl, model, prompt, contextText, cfg),
    }],
    ["azure", {
      ready: Boolean(cfg.azureKey && cfg.azureEndpoint),
      family: "openai",
      defaultModel: OPENAI_DEFAULT_VISION_MODEL,
      run: (model) => analyzeVisionWithAzure(dataUrl, model, prompt, contextText, cfg),
    }],
    ["claude", {
      ready: Boolean(cfg.claudeKey),
      family: "claude",
      defaultModel: CLAUDE_DEFAULT_VISION_MODEL,
      run: (model) => analyzeVisionWithClaude(frame, model, prompt, contextText, cfg),
    }],
    ["gemini", {
      ready: Boolean(cfg.geminiKey),
      family: "gemini",
      defaultModel: GEMINI_DEFAULT_VISION_MODEL,
      run: (model) => analyzeVisionWithGemini(frame, model, prompt, contextText, cfg),
    }],
  ]);

  // Configured provider first, then every provider with credentials (deduplicated).
  const preferredProviders = [];
  const pushProvider = (value) => {
    const provider = String(value || "").trim().toLowerCase();
    if (!provider || preferredProviders.includes(provider)) return;
    preferredProviders.push(provider);
  };
  pushProvider(cfg.provider);
  for (const [name, handler] of handlers) {
    if (handler.ready) pushProvider(name);
  }

  let primaryFamily = null; // family of the first provider actually attempted
  let lastError = null;
  for (const provider of preferredProviders) {
    const handler = handlers.get(provider);
    if (!handler || !handler.ready) continue;
    if (primaryFamily === null) primaryFamily = handler.family;

    let model = handler.defaultModel;
    if (explicitModel) {
      model = explicitModel;
    } else if (handler.family === primaryFamily) {
      model = configuredModel;
    }

    try {
      return await handler.run(model);
    } catch (err) {
      // Remember the failure and try the next provider.
      lastError = err;
    }
  }

  if (lastError) {
    throw new Error(`Vision request failed: ${lastError.message}`, { cause: lastError });
  }

  throw new Error(
    "Vision unavailable: configure OPENAI, Azure, Anthropic, or Gemini voice credentials",
  );
}
758
+
759
/**
 * Run a voice tool call on the server.
 *
 * Failures (including a failure to load the tools module) are logged and
 * reported in the returned object instead of being thrown.
 *
 * @param {string} toolName - Name of the tool to run.
 * @param {object} toolArgs - Arguments forwarded to the tool.
 * @param {object} [context={}] - Call context forwarded to the tool.
 * @returns {Promise<{ result: *, error?: string }>}
 */
export async function executeVoiceTool(toolName, toolArgs, context = {}) {
  try {
    // Loaded on demand rather than at module load to avoid circular deps.
    const { executeToolCall } = await import("./voice-tools.mjs");
    const outcome = await executeToolCall(toolName, toolArgs, context);
    return outcome;
  } catch (err) {
    console.error(`[voice-relay] tool execution error (${toolName}):`, err.message);
    return { result: null, error: err.message };
  }
}
773
+
774
/**
 * Load the tool definitions offered to voice sessions.
 *
 * @param {object} [options={}] - `{ delegateOnly: true }` keeps only the
 *   `delegate_to_agent` tool.
 * @returns {Promise<Array<object>>} Tool definitions; an empty array if loading fails.
 */
export async function getVoiceToolDefinitions(options = {}) {
  try {
    const { getToolDefinitions } = await import("./voice-tools.mjs");
    const definitions = getToolDefinitions();
    if (options?.delegateOnly === true) {
      return definitions.filter((definition) => definition?.name === "delegate_to_agent");
    }
    return definitions;
  } catch (err) {
    console.error("[voice-relay] failed to load voice tool definitions:", err.message);
    return [];
  }
}
789
+
790
/**
 * Get the WebRTC connection details for the client.
 *
 * The model / deployment names come from configuration or the environment,
 * so they are URL-encoded before being placed in the query string.
 *
 * @returns {{ provider: string, url: ?string, model: string, tier?: number }}
 *   For azure and openai, `url` is the realtime endpoint. For any other
 *   provider `url` is null and `tier` is 2.
 */
export function getRealtimeConnectionInfo() {
  const cfg = getVoiceConfig();
  if (cfg.provider === "azure") {
    // Trailing slashes are stripped so the joined URL has exactly one separator.
    const endpoint = cfg.azureEndpoint.replace(/\/+$/, "");
    const deploymentParam = encodeURIComponent(cfg.azureDeployment);
    return {
      provider: "azure",
      url: `${endpoint}/openai/realtime?api-version=${AZURE_API_VERSION}&deployment=${deploymentParam}`,
      model: cfg.azureDeployment,
    };
  }
  if (cfg.provider !== "openai") {
    return {
      provider: cfg.provider,
      url: null,
      model: cfg.model,
      tier: 2,
    };
  }
  return {
    provider: "openai",
    url: `${OPENAI_REALTIME_URL}?model=${encodeURIComponent(cfg.model)}`,
    model: cfg.model,
  };
}