bosun 0.36.2 → 0.36.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/agent-prompts.mjs +95 -0
- package/analyze-agent-work-helpers.mjs +308 -0
- package/analyze-agent-work.mjs +926 -0
- package/autofix.mjs +2 -0
- package/bosun.schema.json +101 -3
- package/codex-shell.mjs +85 -10
- package/desktop/main.mjs +871 -48
- package/desktop/preload.mjs +54 -1
- package/desktop-shortcut.mjs +90 -11
- package/git-editor-fix.mjs +273 -0
- package/mcp-registry.mjs +579 -0
- package/meeting-workflow-service.mjs +631 -0
- package/monitor.mjs +18 -103
- package/package.json +21 -2
- package/primary-agent.mjs +32 -12
- package/session-tracker.mjs +68 -0
- package/setup-web-server.mjs +20 -10
- package/setup.mjs +376 -83
- package/startup-service.mjs +51 -6
- package/stream-resilience.mjs +17 -7
- package/ui/app.js +164 -4
- package/ui/components/agent-selector.js +145 -1
- package/ui/components/chat-view.js +161 -15
- package/ui/components/session-list.js +2 -2
- package/ui/components/shared.js +188 -15
- package/ui/modules/icons.js +13 -0
- package/ui/modules/utils.js +44 -0
- package/ui/modules/voice-client-sdk.js +733 -0
- package/ui/modules/voice-overlay.js +128 -15
- package/ui/modules/voice.js +15 -6
- package/ui/setup.html +281 -81
- package/ui/styles/components.css +99 -3
- package/ui/styles/sessions.css +122 -14
- package/ui/styles.css +14 -0
- package/ui/tabs/agents.js +1 -1
- package/ui/tabs/chat.js +123 -14
- package/ui/tabs/control.js +16 -22
- package/ui/tabs/dashboard.js +85 -8
- package/ui/tabs/library.js +113 -17
- package/ui/tabs/settings.js +116 -2
- package/ui/tabs/tasks.js +388 -39
- package/ui/tabs/telemetry.js +0 -1
- package/ui/tabs/workflows.js +4 -0
- package/ui-server.mjs +400 -22
- package/update-check.mjs +41 -13
- package/voice-action-dispatcher.mjs +844 -0
- package/voice-agents-sdk.mjs +664 -0
- package/voice-auth-manager.mjs +164 -0
- package/voice-relay.mjs +1194 -0
- package/voice-tools.mjs +914 -0
- package/workflow-templates/agents.mjs +6 -2
- package/workflow-templates/github.mjs +154 -12
- package/workflow-templates.mjs +3 -0
- package/github-reconciler.mjs +0 -506
- package/merge-strategy.mjs +0 -1210
- package/pr-cleanup-daemon.mjs +0 -992
- package/workspace-reaper.mjs +0 -405
package/voice-relay.mjs
ADDED
|
@@ -0,0 +1,1194 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* voice-relay.mjs — Multi-provider voice relay for real-time voice sessions.
|
|
3
|
+
*
|
|
4
|
+
* Supports:
|
|
5
|
+
* - OpenAI Realtime API (WebRTC) — direct API key
|
|
6
|
+
* - Azure OpenAI Realtime API (WebRTC) — API key + endpoint
|
|
7
|
+
* - Claude/Gemini provider mode (Tier 2 speech fallback + provider vision)
|
|
8
|
+
* - Tier 2 fallback (browser STT → executor → browser TTS)
|
|
9
|
+
* - Direct JavaScript action dispatch (voice model returns JSON, Bosun executes)
|
|
10
|
+
*
|
|
11
|
+
* @module voice-relay
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { loadConfig } from "./config.mjs";
|
|
15
|
+
import { execPrimaryPrompt, getPrimaryAgentName } from "./primary-agent.mjs";
|
|
16
|
+
import { resolveVoiceOAuthToken } from "./voice-auth-manager.mjs";
|
|
17
|
+
|
|
18
|
+
// ── Module-scope state ──────────────────────────────────────────────────────
let _voiceConfig = null; // cached resolved config
let _configLoadedAt = 0; // timestamp of last config load

// How long a resolved config stays fresh before getVoiceConfig re-reads it.
const CONFIG_TTL_MS = 30_000; // re-read config every 30s

// Provider API endpoints and default model ids used by this module.
const OPENAI_REALTIME_URL = "https://api.openai.com/v1/realtime";
const OPENAI_REALTIME_MODEL = "gpt-realtime-1.5"; // Released 2026-02-23; replaces gpt-4o-realtime-preview
const OPENAI_RESPONSES_URL = "https://api.openai.com/v1/responses";
const OPENAI_DEFAULT_VISION_MODEL = "gpt-4.1-mini";

const AZURE_API_VERSION = "2025-04-01-preview";
const ANTHROPIC_MESSAGES_URL = "https://api.anthropic.com/v1/messages";
const ANTHROPIC_API_VERSION = "2023-06-01";
const CLAUDE_DEFAULT_MODEL = "claude-3-7-sonnet-latest";
const CLAUDE_DEFAULT_VISION_MODEL = "claude-3-7-sonnet-latest";
const GEMINI_GENERATE_CONTENT_URL = "https://generativelanguage.googleapis.com/v1beta/models";
const GEMINI_DEFAULT_MODEL = "gemini-2.5-pro";
const GEMINI_DEFAULT_VISION_MODEL = "gemini-2.5-flash";

// Executor SDK ids accepted in voice call context (see sanitizeVoiceCallContext).
const VALID_EXECUTORS = new Set([
  "codex-sdk",
  "copilot-sdk",
  "claude-sdk",
  "gemini-sdk",
  "opencode-sdk",
]);

// Delegation modes accepted in voice call context.
const VALID_AGENT_MODES = new Set([
  "ask",
  "agent",
  "plan",
  "code",
  "architect",
]);

// Voice provider ids accepted in provider chains; "fallback" is browser STT/TTS.
const VALID_VOICE_PROVIDERS = new Set([
  "openai",
  "azure",
  "claude",
  "gemini",
  "fallback",
]);

// Defaults applied when voice.failover / VOICE_FAILOVER_* are unset.
const DEFAULT_VOICE_FAILOVER = Object.freeze({
  enabled: true,
  maxAttempts: 2,
});
|
|
66
|
+
|
|
67
|
+
/**
 * Parse a positive integer out of an arbitrary raw value.
 * Returns `fallback` when the value is missing, non-numeric, zero, or negative.
 */
function parseFailoverInt(rawValue, fallback) {
  const candidate = Number.parseInt(String(rawValue ?? ""), 10);
  const isUsable = Number.isFinite(candidate) && candidate > 0;
  return isUsable ? candidate : fallback;
}
|
|
72
|
+
|
|
73
|
+
/**
 * Normalize one provider-chain entry (string shorthand or object form) into a
 * canonical record { provider, model, visionModel, voiceId, azureDeployment }.
 * Unknown providers and malformed entries yield null.
 */
function normalizeVoiceProviderEntry(entry) {
  const trimmedOrNull = (value) => String(value || "").trim() || null;

  if (typeof entry === "string") {
    const provider = String(entry || "").trim().toLowerCase();
    if (!VALID_VOICE_PROVIDERS.has(provider)) return null;
    return {
      provider,
      model: null,
      visionModel: null,
      voiceId: null,
      azureDeployment: null,
    };
  }

  if (!entry || typeof entry !== "object") return null;

  const provider = String(entry.provider || "").trim().toLowerCase();
  if (!VALID_VOICE_PROVIDERS.has(provider)) return null;

  return {
    provider,
    model: trimmedOrNull(entry.model),
    visionModel: trimmedOrNull(entry.visionModel),
    voiceId: trimmedOrNull(entry.voiceId),
    azureDeployment: trimmedOrNull(entry.azureDeployment),
  };
}
|
|
103
|
+
|
|
104
|
+
/**
 * Build an ordered, de-duplicated provider chain from config/env input.
 * Accepts an array of entries or a comma-separated string. When the primary
 * provider is valid and not already listed, it is prepended so it keeps
 * top priority.
 */
function normalizeVoiceProviderChain(rawProviders, primaryProvider) {
  const seen = new Set();
  const chain = [];

  const append = (entry) => {
    const normalized = normalizeVoiceProviderEntry(entry);
    if (!normalized || seen.has(normalized.provider)) return;
    seen.add(normalized.provider);
    chain.push(normalized);
  };

  if (Array.isArray(rawProviders)) {
    for (const entry of rawProviders) append(entry);
  } else if (typeof rawProviders === "string" && rawProviders.trim()) {
    for (const token of rawProviders.split(",")) {
      const provider = token.trim();
      if (provider) append({ provider });
    }
  }

  const primaryIsValid =
    primaryProvider && VALID_VOICE_PROVIDERS.has(primaryProvider);
  if (primaryIsValid && !seen.has(primaryProvider)) {
    chain.unshift({
      provider: primaryProvider,
      model: null,
      visionModel: null,
      voiceId: null,
      azureDeployment: null,
    });
  }

  return chain;
}
|
|
139
|
+
|
|
140
|
+
/**
 * Flatten a normalized chain into a list of provider names, then append any
 * providers whose credentials are available (fixed priority: azure, openai,
 * claude, gemini) and finish with the always-available browser "fallback".
 * The resulting order encodes failover priority.
 */
function getProviderChainWithCredentialFallbacks(chain, credentialState = {}) {
  const seen = new Set();
  const providers = [];

  const append = (provider) => {
    if (!provider || seen.has(provider)) return;
    if (!VALID_VOICE_PROVIDERS.has(provider)) return;
    seen.add(provider);
    providers.push(provider);
  };

  for (const entry of chain) append(entry.provider);

  // Credential-driven additions keep a deterministic priority order.
  if (credentialState.azureAvailable) append("azure");
  if (credentialState.openaiAvailable) append("openai");
  if (credentialState.claudeAvailable) append("claude");
  if (credentialState.geminiAvailable) append("gemini");
  append("fallback");

  return providers;
}
|
|
160
|
+
|
|
161
|
+
/**
 * Decide whether a realtime-token error should trigger failover to the next
 * candidate provider. Auth (401/403), timeout (408), conflict (409),
 * rate-limit (429), any 5xx, or network-level failures are retryable;
 * everything else is treated as fatal for this call.
 */
function shouldFailoverRealtimeError(err) {
  const message = String(err?.message || "");

  // Error messages embed the HTTP status as "(NNN)" — see the token creators.
  const statusMatch = message.match(/\((\d{3})\)/);
  if (statusMatch) {
    const status = Number.parseInt(statusMatch[1], 10);
    const retryable = [401, 403, 408, 409, 429];
    if (retryable.includes(status) || status >= 500) return true;
  }

  return /ECONNRESET|ETIMEDOUT|network|fetch failed|connection|connect/i.test(message);
}
|
|
173
|
+
|
|
174
|
+
/**
 * Scrub secret-looking substrings before text is surfaced in errors/logs:
 * provider-style keys (sk-/rk-/pk-), Bearer tokens, and credential-named
 * fields in JSON-ish text.
 */
function redactSecretLikeText(value) {
  const keyPattern = /\b(sk|rk|pk)-[A-Za-z0-9_-]{10,}\b/g;
  const bearerPattern = /\bBearer\s+[A-Za-z0-9._~+/=-]{8,}\b/gi;
  const credentialFieldPattern =
    /("?(?:api[_-]?key|access[_-]?token|client[_-]?secret|authorization)"?\s*[:=]\s*"?)([^",\s}{\]]+)/gi;

  return String(value || "")
    .replace(keyPattern, "$1-***REDACTED***")
    .replace(bearerPattern, "Bearer ***REDACTED***")
    .replace(credentialFieldPattern, "$1***REDACTED***");
}
|
|
184
|
+
|
|
185
|
+
/**
 * Read a provider HTTP error body as text and redact secret-like content.
 * Falls back to `fallback` when the body cannot be read or is empty.
 */
async function buildProviderErrorDetails(response, fallback = "unknown") {
  let raw;
  try {
    raw = await response.text();
  } catch {
    raw = fallback;
  }
  return redactSecretLikeText(raw || fallback);
}
|
|
189
|
+
|
|
190
|
+
/**
 * Sanitize caller-supplied voice-call context. Executor and mode must match
 * the allow-lists (unknown values become null); empty strings become null.
 */
function sanitizeVoiceCallContext(context = {}) {
  const sessionId = String(context?.sessionId || "").trim() || null;
  const executorCandidate = String(context?.executor || "").trim().toLowerCase();
  const modeCandidate = String(context?.mode || "").trim().toLowerCase();
  const model = String(context?.model || "").trim() || null;

  return {
    sessionId,
    executor: VALID_EXECUTORS.has(executorCandidate) ? executorCandidate : null,
    mode: VALID_AGENT_MODES.has(modeCandidate) ? modeCandidate : null,
    model,
  };
}
|
|
203
|
+
|
|
204
|
+
/**
 * Append per-call context (session id plus executor/mode/model preferences)
 * and mandatory delegation rules to the base system instructions. When no
 * context field is set, the base instructions are returned unchanged.
 */
function buildSessionScopedInstructions(baseInstructions, callContext = {}) {
  const context = sanitizeVoiceCallContext(callContext);
  const hasContext = Boolean(
    context.sessionId || context.executor || context.mode || context.model,
  );
  if (!hasContext) return baseInstructions;

  const lines = [];
  lines.push("");
  lines.push("## Bosun Voice Call Context");
  lines.push(`Active chat session id: ${context.sessionId || "none"}.`);
  if (context.executor) {
    lines.push(`Preferred executor for delegated work: ${context.executor}.`);
  } else {
    lines.push("Preferred executor for delegated work: use configured default.");
  }
  if (context.mode) {
    lines.push(`Preferred delegation mode: ${context.mode}.`);
  } else {
    lines.push("Preferred delegation mode: use configured default.");
  }
  if (context.model) {
    lines.push(`Preferred model override: ${context.model}.`);
  } else {
    lines.push("Preferred model override: none.");
  }
  lines.push("");
  lines.push("## Required Behavior");
  lines.push("- For every user turn in this call, invoke delegate_to_agent exactly once before any final spoken answer.");
  lines.push("- For coding, repo, task, debugging, automation, or workspace requests, call delegate_to_agent before finalizing your response.");
  lines.push("- Preserve user intent when delegating. Do not paraphrase away technical detail.");
  lines.push("- Keep responses concise after receiving delegate_to_agent output.");

  return `${baseInstructions}${lines.join("\n")}`;
}
|
|
233
|
+
|
|
234
|
+
/**
 * Pick the realtime tool_choice value: force the delegate_to_agent function
 * when a chat session is bound to the call and the tool is offered;
 * otherwise let the model decide ("auto").
 */
function resolveToolChoice(toolDefinitions, callContext = {}) {
  const context = sanitizeVoiceCallContext(callContext);
  const delegateOffered = Array.isArray(toolDefinitions)
    ? toolDefinitions.some((tool) => tool?.name === "delegate_to_agent")
    : false;

  if (!context.sessionId || !delegateOffered) return "auto";
  return { type: "function", name: "delegate_to_agent" };
}
|
|
246
|
+
|
|
247
|
+
/**
 * Extract the first non-empty text from an OpenAI Responses-style payload.
 * Lookup order: `output_text`, then `output[].content[].text`, then
 * chat-completions-style `choices[].message.content`. Returns "" otherwise.
 */
function extractModelResponseText(payload) {
  if (!payload || typeof payload !== "object") return "";

  const direct =
    typeof payload.output_text === "string" ? payload.output_text.trim() : "";
  if (direct) return direct;

  const outputItems = Array.isArray(payload.output) ? payload.output : [];
  for (const item of outputItems) {
    const parts = Array.isArray(item?.content) ? item.content : [];
    for (const part of parts) {
      const text = typeof part?.text === "string" ? part.text.trim() : "";
      if (text) return text;
    }
  }

  const choices = Array.isArray(payload.choices) ? payload.choices : [];
  for (const choice of choices) {
    const text = String(choice?.message?.content || "").trim();
    if (text) return text;
  }

  return "";
}
|
|
271
|
+
|
|
272
|
+
/**
 * Validate and decompose a base64 image data URL. Only jpeg/jpg/png/webp are
 * accepted. Returns { mimeType, base64Data, dataUrl } with the mime type
 * lower-cased; throws on any other format.
 */
function parseImageDataUrl(dataUrl) {
  const raw = String(dataUrl || "").trim();
  const pattern = /^data:(image\/(?:jpeg|jpg|png|webp));base64,([A-Za-z0-9+/=]+)$/i;
  const match = raw.match(pattern);
  if (!match) {
    throw new Error("Invalid frame format (expected data:image/*;base64,...)");
  }
  const [, mimeType, base64Data] = match;
  return {
    mimeType: String(mimeType || "").toLowerCase(),
    base64Data: String(base64Data || ""),
    dataUrl: raw,
  };
}
|
|
286
|
+
|
|
287
|
+
/**
 * Join all text parts of an Anthropic Messages API response into a single
 * newline-separated string. Non-text parts (tool_use etc.) are skipped.
 * Returns "" for malformed or empty payloads.
 */
function extractClaudeResponseText(payload) {
  if (!payload || typeof payload !== "object") return "";
  const parts = Array.isArray(payload.content) ? payload.content : [];
  const pieces = [];
  for (const part of parts) {
    if (part?.type !== "text") continue;
    const text = String(part?.text || "").trim();
    if (text) pieces.push(text);
  }
  return pieces.join("\n").trim();
}
|
|
299
|
+
|
|
300
|
+
/**
 * Extract text from a Gemini generateContent response: the first candidate
 * whose parts yield non-empty text wins; parts are joined with newlines.
 * Returns "" when no candidate has usable text.
 */
function extractGeminiResponseText(payload) {
  if (!payload || typeof payload !== "object") return "";
  const candidates = Array.isArray(payload.candidates) ? payload.candidates : [];
  for (const candidate of candidates) {
    const rawParts = candidate?.content?.parts;
    const parts = Array.isArray(rawParts) ? rawParts : [];
    const pieces = [];
    for (const part of parts) {
      const text = String(part?.text || "").trim();
      if (text) pieces.push(text);
    }
    const joined = pieces.join("\n").trim();
    if (joined) return joined;
  }
  return "";
}
|
|
316
|
+
|
|
317
|
+
/**
 * Analyze a single image frame via the OpenAI Responses API.
 *
 * @param {string} dataUrl - base64 image data URL, sent as an input_image part.
 * @param {string} model - vision-capable model id to request.
 * @param {string} prompt - analysis instruction for the model.
 * @param {string} contextText - extra context appended below the prompt.
 * @param {object} cfg - resolved voice config; only cfg.openaiKey is read here.
 * @returns {Promise<{summary: string, provider: "openai", model: string}>}
 * @throws {Error} on non-2xx responses (body redacted) or an empty summary.
 */
async function analyzeVisionWithOpenAI(dataUrl, model, prompt, contextText, cfg) {
  const response = await fetch(OPENAI_RESPONSES_URL, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${cfg.openaiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      model,
      temperature: 0.2, // low temperature: factual frame description, not creative text
      max_output_tokens: 220,
      input: [
        {
          role: "user",
          content: [
            {
              type: "input_text",
              text: `${prompt}\n\n${contextText}`,
            },
            {
              type: "input_image",
              image_url: dataUrl,
              detail: "high",
            },
          ],
        },
      ],
    }),
  });
  if (!response.ok) {
    // Error body is redacted (secrets stripped) before being surfaced.
    const errText = await buildProviderErrorDetails(response, "unknown");
    throw new Error(`Vision request failed (${response.status}): ${errText}`);
  }
  const payload = await response.json();
  const summary = extractModelResponseText(payload);
  if (!summary) {
    throw new Error("Vision model returned an empty summary");
  }
  return {
    summary,
    provider: "openai",
    model,
  };
}
|
|
361
|
+
|
|
362
|
+
/**
 * Analyze a single image frame via the Azure OpenAI Responses API.
 * Mirrors analyzeVisionWithOpenAI but authenticates with an "api-key" header
 * against the configured Azure endpoint.
 *
 * @param {string} dataUrl - base64 image data URL, sent as an input_image part.
 * @param {string} model - model/deployment id to request.
 * @param {string} prompt - analysis instruction for the model.
 * @param {string} contextText - extra context appended below the prompt.
 * @param {object} cfg - resolved voice config; reads cfg.azureEndpoint and cfg.azureKey.
 * @returns {Promise<{summary: string, provider: "azure", model: string}>}
 * @throws {Error} on non-2xx responses (body redacted) or an empty summary.
 */
async function analyzeVisionWithAzure(dataUrl, model, prompt, contextText, cfg) {
  // Strip trailing slashes so the composed URL has exactly one separator.
  const endpoint = cfg.azureEndpoint.replace(/\/+$/, "");
  const url = `${endpoint}/openai/responses?api-version=${AZURE_API_VERSION}`;
  const response = await fetch(url, {
    method: "POST",
    headers: {
      "api-key": cfg.azureKey,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      model,
      temperature: 0.2, // low temperature: factual frame description
      max_output_tokens: 220,
      input: [
        {
          role: "user",
          content: [
            {
              type: "input_text",
              text: `${prompt}\n\n${contextText}`,
            },
            {
              type: "input_image",
              image_url: dataUrl,
              detail: "high",
            },
          ],
        },
      ],
    }),
  });
  if (!response.ok) {
    // Error body is redacted (secrets stripped) before being surfaced.
    const errText = await buildProviderErrorDetails(response, "unknown");
    throw new Error(`Azure vision request failed (${response.status}): ${errText}`);
  }
  const payload = await response.json();
  const summary = extractModelResponseText(payload);
  if (!summary) {
    throw new Error("Azure vision model returned an empty summary");
  }
  return {
    summary,
    provider: "azure",
    model,
  };
}
|
|
408
|
+
|
|
409
|
+
/**
 * Analyze a single image frame via the Anthropic Messages API.
 *
 * @param {{mimeType: string, base64Data: string}} frame - parsed frame from parseImageDataUrl.
 * @param {string} model - Claude model id to request.
 * @param {string} prompt - analysis instruction for the model.
 * @param {string} contextText - extra context appended below the prompt.
 * @param {object} cfg - resolved voice config; only cfg.claudeKey is read here.
 * @returns {Promise<{summary: string, provider: "claude", model: string}>}
 * @throws {Error} on non-2xx responses (body redacted) or an empty summary.
 */
async function analyzeVisionWithClaude(frame, model, prompt, contextText, cfg) {
  // NOTE(review): only cfg.claudeKey is used here; cfg.claudeOAuthToken from
  // getVoiceConfig is not consulted — confirm whether OAuth-only setups
  // should also reach this path.
  const response = await fetch(ANTHROPIC_MESSAGES_URL, {
    method: "POST",
    headers: {
      "x-api-key": cfg.claudeKey,
      "anthropic-version": ANTHROPIC_API_VERSION,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      model,
      temperature: 0.2, // low temperature: factual frame description
      max_tokens: 260,
      messages: [
        {
          role: "user",
          content: [
            { type: "text", text: `${prompt}\n\n${contextText}` },
            {
              type: "image",
              source: {
                type: "base64",
                media_type: frame.mimeType,
                data: frame.base64Data,
              },
            },
          ],
        },
      ],
    }),
  });
  if (!response.ok) {
    // Error body is redacted (secrets stripped) before being surfaced.
    const errText = await buildProviderErrorDetails(response, "unknown");
    throw new Error(`Claude vision request failed (${response.status}): ${errText}`);
  }
  const payload = await response.json();
  const summary = extractClaudeResponseText(payload);
  if (!summary) {
    throw new Error("Claude vision model returned an empty summary");
  }
  return {
    summary,
    provider: "claude",
    model,
  };
}
|
|
454
|
+
|
|
455
|
+
/**
 * Analyze a single image frame via the Gemini generateContent API.
 *
 * @param {{mimeType: string, base64Data: string}} frame - parsed frame from parseImageDataUrl.
 * @param {string} model - Gemini model id to request.
 * @param {string} prompt - analysis instruction for the model.
 * @param {string} contextText - extra context appended below the prompt.
 * @param {object} cfg - resolved voice config; only cfg.geminiKey is read here.
 * @returns {Promise<{summary: string, provider: "gemini", model: string}>}
 * @throws {Error} on non-2xx responses (body redacted) or an empty summary.
 */
async function analyzeVisionWithGemini(frame, model, prompt, contextText, cfg) {
  const apiKey = String(cfg.geminiKey || "").trim();
  // NOTE(review): the API key travels in the query string, so it may surface
  // in proxy/server logs — consider the `x-goog-api-key` header instead.
  const endpoint =
    `${GEMINI_GENERATE_CONTENT_URL}/${encodeURIComponent(model)}:generateContent?key=${encodeURIComponent(apiKey)}`;
  const response = await fetch(endpoint, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      contents: [
        {
          role: "user",
          parts: [
            { text: `${prompt}\n\n${contextText}` },
            {
              inlineData: {
                mimeType: frame.mimeType,
                data: frame.base64Data,
              },
            },
          ],
        },
      ],
      generationConfig: {
        temperature: 0.2, // low temperature: factual frame description
        maxOutputTokens: 220,
      },
    }),
  });
  if (!response.ok) {
    // Error body is redacted (secrets stripped) before being surfaced.
    const errText = await buildProviderErrorDetails(response, "unknown");
    throw new Error(`Gemini vision request failed (${response.status}): ${errText}`);
  }
  const payload = await response.json();
  const summary = extractGeminiResponseText(payload);
  if (!summary) {
    throw new Error("Gemini vision model returned an empty summary");
  }
  return {
    summary,
    provider: "gemini",
    model,
  };
}
|
|
500
|
+
|
|
501
|
+
// ── Voice provider detection ────────────────────────────────────────────────
|
|
502
|
+
|
|
503
|
+
/**
|
|
504
|
+
* Resolve voice configuration from bosun config + env.
|
|
505
|
+
* Returns { provider, model, openaiKey, azureKey, azureEndpoint, azureDeployment,
|
|
506
|
+
* claudeKey, geminiKey, voiceId, turnDetection, instructions,
|
|
507
|
+
* fallbackMode, delegateExecutor, enabled, visionModel }
|
|
508
|
+
*/
|
|
509
|
+
/**
 * Resolve and cache the effective voice configuration from bosun config +
 * environment. The result is frozen and cached for CONFIG_TTL_MS; pass
 * `forceReload = true` to bypass the cache (also forwarded to OAuth token
 * resolution).
 *
 * Precedence for every setting is: config (`voice.*`) > environment variable >
 * built-in default. Credentials are resolved as OAuth token first, then API key.
 *
 * @param {boolean} [forceReload=false] - skip the TTL cache and re-resolve.
 * @returns {Readonly<object>} frozen config: provider/providerChain/failover,
 *   per-provider credentials, model ids, voiceId, turnDetection, instructions,
 *   fallbackMode, delegateExecutor, enabled flag, and diagnostics strings.
 */
export function getVoiceConfig(forceReload = false) {
  // Serve the cached snapshot while it is fresh.
  if (!forceReload && _voiceConfig && (Date.now() - _configLoadedAt < CONFIG_TTL_MS)) {
    return _voiceConfig;
  }

  const cfg = loadConfig();
  const voice = cfg.voice || {};

  // Provider priority: config > env > key autodetect.
  // "auto" resolves to azure/openai/claude/gemini/fallback based on available credentials.
  const rawProvider = String(
    voice.provider || process.env.VOICE_PROVIDER || "auto",
  )
    .trim()
    .toLowerCase();

  // API keys
  const openaiOAuthToken =
    String(voice.openaiAccessToken || "").trim()
    || resolveVoiceOAuthToken("openai", forceReload)?.token
    || "";
  const openaiKey = voice.openaiApiKey
    || process.env.OPENAI_REALTIME_API_KEY
    || process.env.OPENAI_API_KEY
    || "";

  const azureOAuthToken =
    String(voice.azureAccessToken || "").trim()
    || resolveVoiceOAuthToken("azure", forceReload)?.token
    || "";
  const azureKey = voice.azureApiKey
    || process.env.AZURE_OPENAI_REALTIME_API_KEY
    || process.env.AZURE_OPENAI_API_KEY
    || "";

  const azureEndpoint = voice.azureEndpoint
    || process.env.AZURE_OPENAI_REALTIME_ENDPOINT
    || process.env.AZURE_OPENAI_ENDPOINT
    || "";

  const azureDeployment = voice.azureDeployment
    || process.env.AZURE_OPENAI_REALTIME_DEPLOYMENT
    || "gpt-realtime-1.5";

  const claudeOAuthToken =
    String(voice.claudeAccessToken || "").trim()
    || resolveVoiceOAuthToken("claude", forceReload)?.token
    || "";
  const claudeKey = voice.claudeApiKey
    || process.env.ANTHROPIC_API_KEY
    || "";

  const geminiOAuthToken =
    String(voice.geminiAccessToken || "").trim()
    || resolveVoiceOAuthToken("gemini", forceReload)?.token
    || "";
  const geminiKey = voice.geminiApiKey
    || process.env.GEMINI_API_KEY
    || process.env.GOOGLE_API_KEY
    || "";

  // Azure requires both a credential and an endpoint to be usable.
  const openaiAvailable = Boolean(openaiOAuthToken || openaiKey);
  const azureAvailable = Boolean((azureOAuthToken || azureKey) && azureEndpoint);
  const claudeAvailable = Boolean(claudeKey || claudeOAuthToken);
  const geminiAvailable = Boolean(geminiKey || geminiOAuthToken);

  // Auto-detect order: azure > openai > claude > gemini > browser fallback.
  const autoProvider =
    azureAvailable
      ? "azure"
      : (openaiAvailable
        ? "openai"
        : (claudeAvailable
          ? "claude"
          : (geminiAvailable ? "gemini" : "fallback")));

  const provider = rawProvider === "auto" ? autoProvider : rawProvider;

  const providerChain = normalizeVoiceProviderChain(
    voice.providers || process.env.VOICE_PROVIDERS || [],
    provider,
  );
  const providerChainWithFallbacks = getProviderChainWithCredentialFallbacks(providerChain, {
    openaiAvailable,
    azureAvailable,
    claudeAvailable,
    geminiAvailable,
  });

  // Tier 1 (realtime WebRTC) candidates: only openai/azure entries qualify.
  const realtimeCandidates = providerChain
    .filter((entry) => entry.provider === "openai" || entry.provider === "azure")
    .map((entry) => ({ ...entry }));
  if (!realtimeCandidates.length && (provider === "openai" || provider === "azure")) {
    realtimeCandidates.push({
      provider,
      model: null,
      visionModel: null,
      voiceId: null,
      azureDeployment: null,
    });
  }

  // Failover settings: anything other than 0/false/no/off counts as enabled.
  const failoverEnabledRaw =
    voice?.failover?.enabled ?? process.env.VOICE_FAILOVER_ENABLED;
  const failoverEnabled =
    failoverEnabledRaw == null
      ? DEFAULT_VOICE_FAILOVER.enabled
      : !["0", "false", "no", "off"].includes(
        String(failoverEnabledRaw).trim().toLowerCase(),
      );
  const failoverMaxAttempts = parseFailoverInt(
    voice?.failover?.maxAttempts ?? process.env.VOICE_FAILOVER_MAX_ATTEMPTS,
    DEFAULT_VOICE_FAILOVER.maxAttempts,
  );

  // Non-fatal configuration warnings surfaced alongside the config.
  const diagnostics = [];
  if (
    process.env.OPENAI_REALTIME_API_KEY
    && process.env.OPENAI_API_KEY
    && process.env.OPENAI_REALTIME_API_KEY !== process.env.OPENAI_API_KEY
  ) {
    diagnostics.push(
      "Both OPENAI_REALTIME_API_KEY and OPENAI_API_KEY are set; realtime key takes precedence.",
    );
  }
  if (/^sk-test-/i.test(String(openaiKey || ""))) {
    diagnostics.push(
      "OpenAI realtime key appears to be a test/placeholder value (sk-test-*).",
    );
  }
  // Model defaults track the resolved provider.
  const defaultModel =
    provider === "claude"
      ? CLAUDE_DEFAULT_MODEL
      : provider === "gemini"
        ? GEMINI_DEFAULT_MODEL
        : OPENAI_REALTIME_MODEL;
  const model = voice.model || process.env.VOICE_MODEL || defaultModel;
  const voiceId = voice.voiceId || process.env.VOICE_ID || "alloy";
  const turnDetection =
    voice.turnDetection || process.env.VOICE_TURN_DETECTION || "server_vad";
  const defaultVisionModel =
    provider === "claude"
      ? CLAUDE_DEFAULT_VISION_MODEL
      : provider === "gemini"
        ? GEMINI_DEFAULT_VISION_MODEL
        : OPENAI_DEFAULT_VISION_MODEL;
  const visionModel =
    voice.visionModel || process.env.VOICE_VISION_MODEL || defaultVisionModel;
  const fallbackMode =
    voice.fallbackMode || process.env.VOICE_FALLBACK_MODE || "browser";
  const delegateExecutor =
    voice.delegateExecutor ||
    process.env.VOICE_DELEGATE_EXECUTOR ||
    cfg.primaryAgent ||
    "codex-sdk";
  // Enabled: explicit config wins; otherwise VOICE_ENABLED must not be an
  // "off" value (unset env means enabled).
  const enabled =
    voice.enabled != null
      ? voice.enabled !== false
      : !["0", "false", "no", "off"].includes(
        String(process.env.VOICE_ENABLED || "")
          .trim()
          .toLowerCase(),
      );

  const instructions = voice.instructions || `You are Bosun, a helpful voice assistant for the VirtEngine development platform.
You help developers manage tasks, steer coding agents, monitor builds, and navigate the workspace.
Be concise and conversational. When users ask about code or tasks, use the available tools.
For complex operations like writing code or creating PRs, delegate to the appropriate agent.`;

  // Freeze the snapshot so callers cannot mutate shared cached state.
  _voiceConfig = Object.freeze({
    provider,
    providerChain,
    providerChainWithFallbacks,
    realtimeCandidates,
    failover: {
      enabled: failoverEnabled,
      maxAttempts: failoverMaxAttempts,
    },
    model,
    openaiKey,
    openaiOAuthToken,
    azureKey,
    azureOAuthToken,
    azureEndpoint,
    azureDeployment,
    claudeKey,
    claudeOAuthToken,
    geminiKey,
    geminiOAuthToken,
    voiceId,
    turnDetection,
    visionModel,
    instructions,
    fallbackMode,
    delegateExecutor,
    enabled,
    diagnostics,
  });
  _configLoadedAt = Date.now();
  return _voiceConfig;
}
|
|
709
|
+
|
|
710
|
+
/**
|
|
711
|
+
* Check if any voice tier is available.
|
|
712
|
+
*/
|
|
713
|
+
/**
 * Report which voice tier is currently usable.
 * Tier 1 = realtime WebRTC (openai/azure with credentials); Tier 2 = provider
 * speech fallback (claude/gemini) or the browser STT/TTS fallback.
 * Returns { available, tier, provider? , reason? }.
 */
export function isVoiceAvailable() {
  const cfg = getVoiceConfig();
  if (!cfg.enabled) {
    return { available: false, tier: null, reason: "Voice disabled in config" };
  }

  const hasRealtimeCredentials = (candidate) => {
    switch (candidate.provider) {
      case "openai":
        return Boolean(cfg.openaiOAuthToken || cfg.openaiKey);
      case "azure":
        return Boolean((cfg.azureOAuthToken || cfg.azureKey) && cfg.azureEndpoint);
      default:
        return false;
    }
  };

  const realtimeProvider = cfg.realtimeCandidates.find(hasRealtimeCredentials);
  if (realtimeProvider) {
    return { available: true, tier: 1, provider: realtimeProvider.provider };
  }

  if (cfg.provider === "claude" && (cfg.claudeKey || cfg.claudeOAuthToken)) {
    return { available: true, tier: 2, provider: "claude" };
  }
  if (cfg.provider === "gemini" && (cfg.geminiKey || cfg.geminiOAuthToken)) {
    return { available: true, tier: 2, provider: "gemini" };
  }

  if (cfg.fallbackMode === "disabled") {
    return {
      available: false,
      tier: null,
      reason: `Voice provider "${cfg.provider}" is not configured and fallback is disabled`,
    };
  }

  // Browser-based Tier 2 fallback is available whenever not explicitly disabled.
  return { available: true, tier: 2, provider: "fallback" };
}
|
|
746
|
+
|
|
747
|
+
/**
 * Create an ephemeral token for the realtime voice API (WebRTC).
 *
 * Walks the configured realtime candidates (OpenAI / Azure) in preference
 * order, optionally failing over to the next candidate when the error looks
 * transient per shouldFailoverRealtimeError.
 *
 * @param {Array}  toolDefinitions - realtime tool schema for the session
 * @param {Object} callContext - { sessionId, executor, mode, model } hints
 * @returns {Promise<{ token, expiresAt, model, voiceId, provider }>}
 * @throws {Error} when no candidate has credentials, or every attempt fails
 */
export async function createEphemeralToken(toolDefinitions = [], callContext = {}) {
  const cfg = getVoiceConfig();

  // Keep only candidates whose credentials are actually configured.
  const usable = cfg.realtimeCandidates.filter((entry) => {
    switch (entry.provider) {
      case "openai":
        return Boolean(cfg.openaiOAuthToken || cfg.openaiKey);
      case "azure":
        return Boolean((cfg.azureOAuthToken || cfg.azureKey) && cfg.azureEndpoint);
      default:
        return false;
    }
  });

  if (usable.length === 0) {
    throw new Error(
      `Realtime WebRTC token is unavailable for provider "${cfg.provider}". ` +
      "Use VOICE_PROVIDER=openai|azure and configure OAuth/API credentials for Tier 1 realtime voice.",
    );
  }

  // Without failover we try a single candidate; with it, up to maxAttempts
  // (clamped to the number of usable candidates).
  const attempts = cfg.failover.enabled
    ? Math.min(Math.max(cfg.failover.maxAttempts, 1), usable.length)
    : 1;

  let lastError = null;
  for (let attempt = 0; attempt < attempts; attempt++) {
    const candidate = usable[attempt];
    try {
      return candidate.provider === "azure"
        ? await createAzureEphemeralToken(cfg, toolDefinitions, callContext, candidate)
        : await createOpenAIEphemeralToken(cfg, toolDefinitions, callContext, candidate);
    } catch (err) {
      lastError = err;
      const retryable =
        cfg.failover.enabled && attempt + 1 < attempts && shouldFailoverRealtimeError(err);
      if (!retryable) break;
    }
  }

  throw lastError || new Error("Failed to create realtime token");
}
|
|
787
|
+
|
|
788
|
+
/**
 * Create an ephemeral realtime token via the OpenAI /sessions endpoint.
 *
 * @param {Object} cfg - resolved voice config (credentials, model, VAD mode)
 * @param {Array}  toolDefinitions - realtime tool schema for the session
 * @param {Object} callContext - raw call context; sanitized before use
 * @param {Object} candidate - realtime candidate entry (may override model/voice)
 * @returns {Promise<Object>} token payload plus the session config used
 * @throws {Error} when credentials are missing or the API call fails
 */
async function createOpenAIEphemeralToken(cfg, toolDefinitions = [], callContext = {}, candidate = {}) {
  const credential = String(cfg.openaiOAuthToken || cfg.openaiKey || "").trim();
  if (!credential) {
    throw new Error("OpenAI voice credential not configured (OAuth token or API key required)");
  }

  const context = sanitizeVoiceCallContext(callContext);
  const instructions = buildSessionScopedInstructions(cfg.instructions, context);
  // Candidate overrides win; blank strings fall through to the defaults.
  const model = String(candidate?.model || cfg.model || OPENAI_REALTIME_MODEL).trim() || OPENAI_REALTIME_MODEL;
  const voiceId = String(candidate?.voiceId || cfg.voiceId || "alloy").trim() || "alloy";

  // Assemble turn-detection settings up front so the payload stays flat.
  const turnDetection = { type: cfg.turnDetection };
  if (cfg.turnDetection === "server_vad") {
    turnDetection.threshold = 0.5;
    turnDetection.prefix_padding_ms = 300;
    turnDetection.silence_duration_ms = 500;
  }
  if (cfg.turnDetection === "semantic_vad") {
    turnDetection.eagerness = "medium";
  }

  const sessionConfig = {
    model,
    voice: voiceId,
    instructions,
    tool_choice: resolveToolChoice(toolDefinitions, context),
    turn_detection: turnDetection,
    input_audio_transcription: { model: "gpt-4o-mini-transcribe" },
    tools: toolDefinitions,
  };

  const response = await fetch(`${OPENAI_REALTIME_URL}/sessions`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${credential}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(sessionConfig),
  });

  if (!response.ok) {
    const errorText = await buildProviderErrorDetails(response, "unknown");
    throw new Error(`OpenAI Realtime session failed (${response.status}): ${errorText}`);
  }

  const data = await response.json();
  return {
    token: data.client_secret?.value || data.token,
    // Fall back to a ~60s lifetime when the API omits expires_at.
    expiresAt: data.client_secret?.expires_at || (Date.now() / 1000 + 60),
    model,
    voiceId,
    provider: "openai",
    sessionConfig,
    callContext: context,
  };
}
|
|
844
|
+
|
|
845
|
+
/**
 * Create an ephemeral token for Azure OpenAI Realtime API.
 *
 * @param {Object} cfg - resolved voice config (endpoint, key/OAuth, VAD mode)
 * @param {Array}  toolDefinitions - realtime tool schema for the session
 * @param {Object} callContext - raw call context; sanitized before use
 * @param {Object} candidate - realtime candidate entry (may override deployment/voice)
 * @returns {Promise<Object>} token payload plus session config and endpoint info
 * @throws {Error} when Azure is not configured or the API call fails
 */
async function createAzureEphemeralToken(cfg, toolDefinitions = [], callContext = {}, candidate = {}) {
  if ((!cfg.azureKey && !cfg.azureOAuthToken) || !cfg.azureEndpoint) {
    throw new Error("Azure OpenAI Realtime not configured (need endpoint + key)");
  }

  const context = sanitizeVoiceCallContext(callContext);
  const instructions = buildSessionScopedInstructions(cfg.instructions, context);
  const endpoint = cfg.azureEndpoint.replace(/\/+$/, "");
  const deployment =
    String(candidate?.azureDeployment || cfg.azureDeployment || "").trim()
    || "gpt-realtime-1.5";
  const voiceId = String(candidate?.voiceId || cfg.voiceId || "alloy").trim() || "alloy";
  // Fix: query values were interpolated unescaped; encode them so deployment
  // names with reserved characters cannot corrupt the query string.
  const url = `${endpoint}/openai/realtime/sessions?api-version=${encodeURIComponent(AZURE_API_VERSION)}&deployment=${encodeURIComponent(deployment)}`;

  const headers = {
    "Content-Type": "application/json",
  };
  // OAuth bearer token takes precedence over the static api-key.
  if (cfg.azureOAuthToken) {
    headers.Authorization = `Bearer ${cfg.azureOAuthToken}`;
  } else {
    headers["api-key"] = cfg.azureKey;
  }

  const sessionConfig = {
    model: deployment,
    voice: voiceId,
    instructions,
    tool_choice: resolveToolChoice(toolDefinitions, context),
    turn_detection: {
      type: cfg.turnDetection,
      ...(cfg.turnDetection === "server_vad" ? {
        threshold: 0.5,
        prefix_padding_ms: 300,
        silence_duration_ms: 500,
      } : {}),
    },
    input_audio_transcription: { model: "whisper-1" },
    tools: toolDefinitions,
  };

  const response = await fetch(url, {
    method: "POST",
    headers,
    body: JSON.stringify(sessionConfig),
  });

  if (!response.ok) {
    const errorText = await buildProviderErrorDetails(response, "unknown");
    throw new Error(`Azure Realtime session failed (${response.status}): ${errorText}`);
  }

  const data = await response.json();
  return {
    token: data.client_secret?.value || data.token,
    // Fall back to a ~60s lifetime when the API omits expires_at.
    expiresAt: data.client_secret?.expires_at || (Date.now() / 1000 + 60),
    model: deployment,
    voiceId,
    provider: "azure",
    sessionConfig,
    azureEndpoint: endpoint,
    azureDeployment: deployment,
    callContext: context,
  };
}
|
|
912
|
+
|
|
913
|
+
/**
 * Analyze a camera/screen frame and return a concise summary.
 *
 * Tries vision providers in preference order (the configured provider first,
 * then any other provider whose API key is present) and returns the first
 * successful analysis.
 *
 * @param {string} frameDataUrl - data URL (image/jpeg|png|webp)
 * @param {object} options - { source, context, prompt, model, visionModel }
 * @returns {Promise<{ summary: string, provider: string, model: string }>}
 * @throws {Error} when no provider is configured or every provider fails
 */
export async function analyzeVisionFrame(frameDataUrl, options = {}) {
  const frame = parseImageDataUrl(frameDataUrl);
  const dataUrl = frame.dataUrl;

  const cfg = getVoiceConfig();
  const source = String(options?.source || "screen").trim().toLowerCase() || "screen";
  const callContext = sanitizeVoiceCallContext(options?.context || {});
  // Model resolution precedence: explicit option > config > env > default.
  const model =
    String(
      options?.model
        || options?.visionModel
        || cfg.visionModel
        || process.env.VOICE_VISION_MODEL
        || OPENAI_DEFAULT_VISION_MODEL,
    ).trim();
  const prompt = String(options?.prompt || "").trim()
    || "Summarize what is visible in this live frame for a coding assistant. Focus on code, terminal output, errors, UI labels, and actionable context.";

  const contextText = [
    `Frame source: ${source}.`,
    `Bound chat session: ${callContext.sessionId || "none"}.`,
    callContext.executor ? `Preferred executor: ${callContext.executor}.` : "",
    callContext.mode ? `Preferred mode: ${callContext.mode}.` : "",
    callContext.model ? `Preferred model override: ${callContext.model}.` : "",
    "Respond in 1-3 concise sentences. Include likely next action if obvious.",
  ]
    .filter(Boolean)
    .join("\n");

  // Preference order: configured provider first, then any keyed provider.
  const preferredProviders = [];
  const pushProvider = (value) => {
    const provider = String(value || "").trim().toLowerCase();
    if (!provider || preferredProviders.includes(provider)) return;
    preferredProviders.push(provider);
  };
  pushProvider(cfg.provider);
  if (cfg.openaiKey) pushProvider("openai");
  if (cfg.azureKey && cfg.azureEndpoint) pushProvider("azure");
  if (cfg.claudeKey) pushProvider("claude");
  if (cfg.geminiKey) pushProvider("gemini");

  let lastError = null;
  for (const provider of preferredProviders) {
    try {
      // Each branch re-checks credentials so an unconfigured preferred
      // provider is skipped rather than attempted.
      if (provider === "openai" && cfg.openaiKey) {
        return await analyzeVisionWithOpenAI(dataUrl, model, prompt, contextText, cfg);
      }
      if (provider === "azure" && cfg.azureKey && cfg.azureEndpoint) {
        return await analyzeVisionWithAzure(dataUrl, model, prompt, contextText, cfg);
      }
      if (provider === "claude" && cfg.claudeKey) {
        return await analyzeVisionWithClaude(frame, model, prompt, contextText, cfg);
      }
      if (provider === "gemini" && cfg.geminiKey) {
        return await analyzeVisionWithGemini(frame, model, prompt, contextText, cfg);
      }
    } catch (err) {
      // Remember the most recent failure and try the next provider.
      lastError = err;
    }
  }

  if (lastError) {
    // Fix: preserve the underlying error as `cause` so its stack and details
    // are not lost when the failure is re-wrapped.
    throw new Error(`Vision request failed: ${lastError.message}`, { cause: lastError });
  }

  throw new Error(
    "Vision unavailable: configure OPENAI, Azure, Anthropic, or Gemini voice credentials",
  );
}
|
|
1012
|
+
|
|
1013
|
+
/**
 * Execute a voice tool call server-side.
 *
 * voice-tools.mjs is imported lazily to avoid a circular dependency; any
 * failure (including a failed import) is reported as a structured error
 * result rather than thrown.
 *
 * @param {string} toolName - name of the tool to invoke
 * @param {Object} toolArgs - arguments forwarded to the tool
 * @param {Object} context - execution context (session binding, etc.)
 * @returns {Promise<{ result: *, error?: string }>}
 */
export async function executeVoiceTool(toolName, toolArgs, context = {}) {
  try {
    // Import voice-tools lazily to avoid circular deps
    const voiceTools = await import("./voice-tools.mjs");
    return await voiceTools.executeToolCall(toolName, toolArgs, context);
  } catch (err) {
    console.error(`[voice-relay] tool execution error (${toolName}):`, err.message);
    return { result: null, error: err.message };
  }
}
|
|
1027
|
+
|
|
1028
|
+
/**
 * Get the full tool definitions array for voice sessions.
 *
 * @param {Object} options - { delegateOnly?: boolean } when true, only the
 *   "delegate_to_agent" tool is returned.
 * @returns {Promise<Array>} tool definitions, or [] when loading fails
 */
export async function getVoiceToolDefinitions(options = {}) {
  try {
    const { getToolDefinitions } = await import("./voice-tools.mjs");
    const definitions = getToolDefinitions();
    // Restrict to the delegation tool when explicitly requested.
    return options?.delegateOnly === true
      ? definitions.filter((tool) => tool?.name === "delegate_to_agent")
      : definitions;
  } catch (err) {
    console.error("[voice-relay] failed to load voice tool definitions:", err.message);
    return [];
  }
}
|
|
1043
|
+
|
|
1044
|
+
/**
 * Get the WebRTC connection URL for the client.
 *
 * Picks the first realtime candidate with usable credentials; when none is
 * available, returns a tier-2 descriptor with url:null so the client falls
 * back to the non-realtime path.
 *
 * @returns {{ provider: string, url: (string|null), model: string, tier?: number }}
 */
export function getRealtimeConnectionInfo() {
  const cfg = getVoiceConfig();
  const candidate = cfg.realtimeCandidates.find((entry) => {
    if (entry.provider === "openai") return Boolean(cfg.openaiOAuthToken || cfg.openaiKey);
    if (entry.provider === "azure") return Boolean((cfg.azureOAuthToken || cfg.azureKey) && cfg.azureEndpoint);
    return false;
  });
  if (!candidate) {
    // No realtime credentials: signal the caller to use the Tier 2 fallback.
    return {
      provider: cfg.provider,
      url: null,
      model: cfg.model,
      tier: 2,
    };
  }

  if (candidate.provider === "azure") {
    const endpoint = cfg.azureEndpoint.replace(/\/+$/, "");
    const deployment =
      String(candidate?.azureDeployment || cfg.azureDeployment || "").trim()
      || "gpt-realtime-1.5";
    return {
      provider: "azure",
      // Fix: encode query values so deployment names with reserved
      // characters cannot corrupt the query string.
      url: `${endpoint}/openai/realtime?api-version=${encodeURIComponent(AZURE_API_VERSION)}&deployment=${encodeURIComponent(deployment)}`,
      model: deployment,
    };
  }
  const model = String(candidate?.model || cfg.model || OPENAI_REALTIME_MODEL).trim() || OPENAI_REALTIME_MODEL;
  return {
    provider: "openai",
    // Fix: encode the model query value for the same reason as above.
    url: `${OPENAI_REALTIME_URL}?model=${encodeURIComponent(model)}`,
    model,
  };
}
|
|
1081
|
+
|
|
1082
|
+
// ── Voice action dispatch (direct JavaScript, no MCP bridge) ────────────────
|
|
1083
|
+
|
|
1084
|
+
/**
 * Dispatch a voice action intent through the action dispatcher.
 * The voice model returns JSON action objects; Bosun processes them
 * directly via JavaScript and returns structured results.
 *
 * The dispatcher module is imported lazily; any failure (including a failed
 * import) yields a structured { ok: false } result instead of throwing.
 *
 * @param {Object} intent — { action, params, id? }
 * @param {Object} context — { sessionId, executor, mode, model }
 * @returns {Promise<Object>} Structured result
 */
export async function dispatchVoiceActionIntent(intent, context = {}) {
  try {
    const dispatcher = await import("./voice-action-dispatcher.mjs");
    return await dispatcher.dispatchVoiceAction(intent, context);
  } catch (err) {
    console.error("[voice-relay] action dispatch error:", err.message);
    // Mirror the dispatcher's result shape so callers need no special case.
    return {
      ok: false,
      action: intent?.action || "",
      data: null,
      error: err.message,
      durationMs: 0,
    };
  }
}
|
|
1108
|
+
|
|
1109
|
+
/**
 * Dispatch multiple voice action intents.
 *
 * Delegates to the batch dispatcher; a failed import or dispatcher error is
 * logged and reported as an empty result list.
 *
 * @param {Array} intents - action intent objects to dispatch
 * @param {Object} context - { sessionId, executor, mode, model }
 * @returns {Promise<Array>} per-intent results, or [] on failure
 */
export async function dispatchVoiceActionIntents(intents, context = {}) {
  try {
    const dispatcher = await import("./voice-action-dispatcher.mjs");
    return await dispatcher.dispatchVoiceActions(intents, context);
  } catch (err) {
    console.error("[voice-relay] batch action dispatch error:", err.message);
    return [];
  }
}
|
|
1124
|
+
|
|
1125
|
+
/**
 * Get the action manifest for voice prompt injection.
 *
 * Loads the dispatcher lazily; any failure is logged and an empty manifest
 * is returned so prompt building can continue.
 *
 * @returns {Promise<string>} manifest text, or "" on failure
 */
export async function getVoiceActionManifest() {
  try {
    const dispatcher = await import("./voice-action-dispatcher.mjs");
    return dispatcher.getVoiceActionPromptSection();
  } catch (err) {
    console.error("[voice-relay] action manifest error:", err.message);
    return "";
  }
}
|
|
1138
|
+
|
|
1139
|
+
/**
 * List all available voice actions.
 *
 * Best-effort: a failed dispatcher import yields an empty list.
 *
 * @returns {Promise<string[]>} action names, or [] on failure
 */
export async function listVoiceActions() {
  try {
    const dispatcher = await import("./voice-action-dispatcher.mjs");
    return dispatcher.listAvailableActions();
  } catch {
    return [];
  }
}
|
|
1151
|
+
|
|
1152
|
+
/**
 * Build the full voice agent prompt by resolving the voice prompt template
 * and injecting the action manifest.
 *
 * Resolution order: workspace prompt from the prompt library, then the
 * built-in default template, then the raw config instructions. An explicit
 * options.customInstructions always wins.
 *
 * @param {Object} options — { compact?, customInstructions? }
 * @returns {Promise<string>}
 */
export async function buildVoiceAgentPrompt(options = {}) {
  const cfg = getVoiceConfig();
  let baseInstructions = cfg.instructions || "";

  // Try to load the customizable voice prompt from the prompt library
  try {
    const { resolveAgentPrompts, renderPromptTemplate, getDefaultPromptTemplate } = await import("./agent-prompts.mjs");
    const promptKey = options.compact ? "voiceAgentCompact" : "voiceAgent";

    // Try workspace prompt first, fall back to default
    let template = "";
    try {
      const resolved = resolveAgentPrompts(null, process.cwd(), {});
      template = resolved.prompts?.[promptKey] || "";
    } catch {
      // Workspace resolution failed entirely; the default is tried below.
    }
    if (!template) {
      // Fix: the default template was previously only consulted when
      // resolveAgentPrompts threw — not when it succeeded without an entry
      // for promptKey. Always fall back when no workspace template exists.
      template = getDefaultPromptTemplate(promptKey) || "";
    }

    if (template) {
      const manifest = await getVoiceActionManifest();
      baseInstructions = renderPromptTemplate(template, {
        VOICE_ACTION_MANIFEST: manifest,
      });
    }
  } catch {
    // Fall back to config instructions
  }

  // Allow custom instructions override
  if (options.customInstructions) {
    baseInstructions = String(options.customInstructions);
  }

  return baseInstructions;
}
|
|
1194
|
+
|