bosun 0.36.2 → 0.36.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/analyze-agent-work-helpers.mjs +308 -0
- package/analyze-agent-work.mjs +926 -0
- package/autofix.mjs +2 -0
- package/codex-shell.mjs +85 -10
- package/git-editor-fix.mjs +273 -0
- package/mcp-registry.mjs +579 -0
- package/meeting-workflow-service.mjs +631 -0
- package/monitor.mjs +18 -103
- package/package.json +13 -2
- package/primary-agent.mjs +32 -12
- package/session-tracker.mjs +68 -0
- package/stream-resilience.mjs +17 -7
- package/ui/app.js +19 -4
- package/ui/components/chat-view.js +108 -5
- package/ui/components/session-list.js +1 -1
- package/ui/components/shared.js +188 -15
- package/ui/modules/icons.js +13 -0
- package/ui/modules/utils.js +44 -0
- package/ui/modules/voice.js +15 -6
- package/ui/styles/components.css +99 -3
- package/ui/styles/sessions.css +84 -12
- package/ui/tabs/chat.js +5 -1
- package/ui/tabs/control.js +16 -22
- package/ui/tabs/dashboard.js +85 -8
- package/ui/tabs/library.js +113 -17
- package/ui/tabs/settings.js +116 -2
- package/ui/tabs/tasks.js +388 -39
- package/ui/tabs/telemetry.js +0 -1
- package/ui/tabs/workflows.js +4 -0
- package/ui-server.mjs +193 -19
- package/update-check.mjs +41 -13
- package/voice-relay.mjs +816 -0
- package/voice-tools.mjs +679 -0
- package/workflow-templates/agents.mjs +6 -2
- package/workflow-templates/github.mjs +154 -12
- package/workflow-templates.mjs +3 -0
- package/github-reconciler.mjs +0 -506
- package/merge-strategy.mjs +0 -1210
- package/pr-cleanup-daemon.mjs +0 -992
- package/workspace-reaper.mjs +0 -405
package/voice-relay.mjs
ADDED
|
@@ -0,0 +1,816 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* voice-relay.mjs — Multi-provider voice relay for real-time voice sessions.
|
|
3
|
+
*
|
|
4
|
+
* Supports:
|
|
5
|
+
* - OpenAI Realtime API (WebRTC) — direct API key
|
|
6
|
+
* - Azure OpenAI Realtime API (WebRTC) — API key + endpoint
|
|
7
|
+
* - Claude/Gemini provider mode (Tier 2 speech fallback + provider vision)
|
|
8
|
+
* - Tier 2 fallback (browser STT → executor → browser TTS)
|
|
9
|
+
*
|
|
10
|
+
* @module voice-relay
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { loadConfig } from "./config.mjs";
|
|
14
|
+
import { execPrimaryPrompt, getPrimaryAgentName } from "./primary-agent.mjs";
|
|
15
|
+
|
|
16
|
+
// ── Module-scope state ──────────────────────────────────────────────────────
// Frozen result of the most recent getVoiceConfig() resolution; null before the first call.
let _voiceConfig = null; // cached resolved config
// Date.now() value captured when _voiceConfig was last assigned.
let _configLoadedAt = 0; // timestamp of last config load

// A cached config older than this is rebuilt on the next getVoiceConfig() call.
const CONFIG_TTL_MS = 30_000; // re-read config every 30s

// OpenAI endpoints and default model ids (realtime sessions + Responses API vision).
const OPENAI_REALTIME_URL = "https://api.openai.com/v1/realtime";
const OPENAI_REALTIME_MODEL = "gpt-4o-realtime-preview-2024-12-17";
const OPENAI_RESPONSES_URL = "https://api.openai.com/v1/responses";
const OPENAI_DEFAULT_VISION_MODEL = "gpt-4.1-mini";

// api-version query value used for every Azure OpenAI URL built in this module.
const AZURE_API_VERSION = "2025-04-01-preview";
// Anthropic Messages endpoint, the value sent as the `anthropic-version` header,
// and the default Claude model ids.
const ANTHROPIC_MESSAGES_URL = "https://api.anthropic.com/v1/messages";
const ANTHROPIC_API_VERSION = "2023-06-01";
const CLAUDE_DEFAULT_MODEL = "claude-3-7-sonnet-latest";
const CLAUDE_DEFAULT_VISION_MODEL = "claude-3-7-sonnet-latest";
// Base URL for Gemini model endpoints (`/<model>:generateContent` is appended)
// and the default Gemini model ids.
const GEMINI_GENERATE_CONTENT_URL = "https://generativelanguage.googleapis.com/v1beta/models";
const GEMINI_DEFAULT_MODEL = "gemini-2.5-pro";
const GEMINI_DEFAULT_VISION_MODEL = "gemini-2.5-flash";

// Executor ids accepted in a voice call context; anything else is dropped to null
// by sanitizeVoiceCallContext().
const VALID_EXECUTORS = new Set([
  "codex-sdk",
  "copilot-sdk",
  "claude-sdk",
  "gemini-sdk",
  "opencode-sdk",
]);

// Delegation modes accepted in a voice call context; anything else is dropped to null
// by sanitizeVoiceCallContext().
const VALID_AGENT_MODES = new Set([
  "ask",
  "agent",
  "plan",
  "code",
  "architect",
]);
|
|
51
|
+
|
|
52
|
+
/**
 * Mask substrings that look like credentials before text is surfaced in errors.
 *
 * Three rules are applied in order:
 *  1. `sk-`/`rk-`/`pk-` prefixed tokens of 10+ characters (the prefix is kept),
 *  2. `Bearer <token>` values of 8+ characters,
 *  3. values following an api key / access token / client secret / authorization
 *     label, in either `key=value` or JSON-ish `"key": "value"` form.
 *
 * @param {*} value - Anything; falsy input is treated as the empty string.
 * @returns {string} The stringified input with matches replaced by `***REDACTED***`.
 */
function redactSecretLikeText(value) {
  const redactionRules = [
    [/\b(sk|rk|pk)-[A-Za-z0-9_-]{10,}\b/g, "$1-***REDACTED***"],
    [/\bBearer\s+[A-Za-z0-9._~+/=-]{8,}\b/gi, "Bearer ***REDACTED***"],
    [
      /("?(?:api[_-]?key|access[_-]?token|client[_-]?secret|authorization)"?\s*[:=]\s*"?)([^",\s}{\]]+)/gi,
      "$1***REDACTED***",
    ],
  ];

  return redactionRules.reduce(
    (text, [pattern, replacement]) => text.replace(pattern, replacement),
    String(value || ""),
  );
}
|
|
62
|
+
|
|
63
|
+
/**
 * Read a failed provider response body and return it with secrets masked.
 *
 * @param {{ text: () => Promise<string> }} response - Response whose body is read as text.
 * @param {string} [fallback="unknown"] - Used when the body cannot be read or is empty.
 * @returns {Promise<string>} Redacted body text, or the redacted fallback.
 */
async function buildProviderErrorDetails(response, fallback = "unknown") {
  // A rejected body read is not an error here: fall back to the placeholder text.
  const bodyText = await response.text().then(
    (text) => text,
    () => fallback,
  );
  return redactSecretLikeText(bodyText || fallback);
}
|
|
67
|
+
|
|
68
|
+
/**
 * Normalize a caller-supplied voice call context into a fixed, safe shape.
 *
 * Every field is stringified and trimmed. `executor` and `mode` are lower-cased
 * and kept only when they appear in VALID_EXECUTORS / VALID_AGENT_MODES.
 *
 * @param {object} [context={}] - May carry sessionId, executor, mode, and model.
 * @returns {{ sessionId: string|null, executor: string|null, mode: string|null, model: string|null }}
 *   Each field is null when missing, blank, or (for executor/mode) not allowed.
 */
function sanitizeVoiceCallContext(context = {}) {
  const toTrimmed = (raw) => String(raw || "").trim();

  const sessionId = toTrimmed(context?.sessionId);
  const executor = toTrimmed(context?.executor).toLowerCase();
  const mode = toTrimmed(context?.mode).toLowerCase();
  const model = toTrimmed(context?.model);

  return {
    sessionId: sessionId === "" ? null : sessionId,
    executor: VALID_EXECUTORS.has(executor) ? executor : null,
    mode: VALID_AGENT_MODES.has(mode) ? mode : null,
    model: model === "" ? null : model,
  };
}
|
|
81
|
+
|
|
82
|
+
/**
 * Append call-specific context and delegation rules to the base instructions.
 *
 * When the sanitized context carries no session id, executor, mode, or model,
 * the base instructions are returned untouched.
 *
 * @param {string} baseInstructions - Instruction text to extend.
 * @param {object} [callContext={}] - Raw call context; sanitized before use.
 * @returns {string} Base instructions, optionally followed by the context section.
 */
function buildSessionScopedInstructions(baseInstructions, callContext = {}) {
  const { sessionId, executor, mode, model } = sanitizeVoiceCallContext(callContext);

  const hasAnyContext = Boolean(sessionId || executor || mode || model);
  if (!hasAnyContext) {
    return baseInstructions;
  }

  const executorLine = executor
    ? `Preferred executor for delegated work: ${executor}.`
    : "Preferred executor for delegated work: use configured default.";
  const modeLine = mode
    ? `Preferred delegation mode: ${mode}.`
    : "Preferred delegation mode: use configured default.";
  const modelLine = model
    ? `Preferred model override: ${model}.`
    : "Preferred model override: none.";

  // The leading empty entry makes the section start on a new line after the base text.
  const sectionLines = [
    "",
    "## Bosun Voice Call Context",
    `Active chat session id: ${sessionId || "none"}.`,
    executorLine,
    modeLine,
    modelLine,
    "",
    "## Required Behavior",
    "- For every user turn in this call, invoke delegate_to_agent exactly once before any final spoken answer.",
    "- For coding, repo, task, debugging, automation, or workspace requests, call delegate_to_agent before finalizing your response.",
    "- Preserve user intent when delegating. Do not paraphrase away technical detail.",
    "- Keep responses concise after receiving delegate_to_agent output.",
  ];

  return `${baseInstructions}${sectionLines.join("\n")}`;
}
|
|
111
|
+
|
|
112
|
+
/**
 * Decide the `tool_choice` value for a realtime session.
 *
 * The delegate tool is forced only when the call is bound to a chat session
 * and `delegate_to_agent` is among the supplied tools; otherwise "auto".
 *
 * @param {Array<object>} toolDefinitions - Tool definitions offered to the model.
 * @param {object} [callContext={}] - Raw call context; sanitized before use.
 * @returns {"auto"|{ type: string, name: string }}
 */
function resolveToolChoice(toolDefinitions, callContext = {}) {
  const { sessionId } = sanitizeVoiceCallContext(callContext);
  const delegateOffered = Array.isArray(toolDefinitions)
    ? toolDefinitions.some((tool) => tool?.name === "delegate_to_agent")
    : false;

  if (!sessionId || !delegateOffered) {
    return "auto";
  }
  return { type: "function", name: "delegate_to_agent" };
}
|
|
124
|
+
|
|
125
|
+
/**
 * Pull the first non-blank text out of an OpenAI-style response payload.
 *
 * Lookup order: `output_text`, then the first non-blank `text` part inside
 * `output[].content[]`, then the first non-blank `choices[].message.content`.
 *
 * @param {*} payload - Parsed JSON response; non-objects yield "".
 * @returns {string} Trimmed text, or "" when nothing usable is present.
 */
function extractModelResponseText(payload) {
  if (payload === null || typeof payload !== "object") {
    return "";
  }

  const direct = payload.output_text;
  if (typeof direct === "string" && direct.trim() !== "") {
    return direct.trim();
  }

  const outputItems = Array.isArray(payload.output) ? payload.output : [];
  const firstTextPart = outputItems
    .flatMap((item) => (Array.isArray(item?.content) ? item.content : []))
    .find((part) => typeof part?.text === "string" && part.text.trim() !== "");
  if (firstTextPart) {
    return firstTextPart.text.trim();
  }

  const choiceList = Array.isArray(payload.choices) ? payload.choices : [];
  for (let index = 0; index < choiceList.length; index += 1) {
    const candidate = String(choiceList[index]?.message?.content || "").trim();
    if (candidate !== "") {
      return candidate;
    }
  }

  return "";
}
|
|
149
|
+
|
|
150
|
+
/**
 * Validate a base64 image data URL and split it into its parts.
 *
 * Accepts jpeg/jpg/png/webp (case-insensitive). The non-standard alias
 * `image/jpg` is reported as `image/jpeg`, because the parsed `mimeType` is
 * forwarded verbatim to providers that only accept registered media types.
 *
 * @param {*} dataUrl - Expected form: `data:image/<type>;base64,<payload>`.
 * @returns {{ mimeType: string, base64Data: string, dataUrl: string }}
 *   Lower-cased canonical MIME type, the raw base64 payload, and the trimmed
 *   input URL (left exactly as supplied).
 * @throws {Error} When the input is not a supported base64 image data URL.
 */
function parseImageDataUrl(dataUrl) {
  const raw = String(dataUrl || "").trim();
  const match = /^data:(image\/(?:jpeg|jpg|png|webp));base64,([A-Za-z0-9+/=]+)$/i.exec(raw);
  if (!match) {
    throw new Error("Invalid frame format (expected data:image/*;base64,...)");
  }

  const declaredType = match[1].toLowerCase();
  return {
    // "image/jpg" is a common but unregistered alias; normalize it for downstream APIs.
    mimeType: declaredType === "image/jpg" ? "image/jpeg" : declaredType,
    base64Data: match[2],
    dataUrl: raw,
  };
}
|
|
164
|
+
|
|
165
|
+
/**
 * Join the text blocks of an Anthropic Messages response into one string.
 *
 * Only `content[]` entries whose `type` is "text" contribute; each is trimmed,
 * blank ones are dropped, and the rest are joined with newlines.
 *
 * @param {*} payload - Parsed JSON response; non-objects yield "".
 * @returns {string} Combined text, or "" when there is none.
 */
function extractClaudeResponseText(payload) {
  if (payload === null || typeof payload !== "object") {
    return "";
  }
  if (!Array.isArray(payload.content)) {
    return "";
  }

  const segments = [];
  for (const block of payload.content) {
    if (block?.type !== "text") continue;
    const segment = String(block?.text || "").trim();
    if (segment !== "") {
      segments.push(segment);
    }
  }
  return segments.join("\n").trim();
}
|
|
177
|
+
|
|
178
|
+
/**
 * Return the text of the first Gemini candidate that has any.
 *
 * For each entry in `candidates[]`, the `content.parts[].text` values are
 * trimmed, blank ones dropped, and the rest joined with newlines; the first
 * non-empty result wins.
 *
 * @param {*} payload - Parsed JSON response; non-objects yield "".
 * @returns {string} Candidate text, or "" when no candidate has text.
 */
function extractGeminiResponseText(payload) {
  if (payload === null || typeof payload !== "object") {
    return "";
  }

  const candidateList = Array.isArray(payload.candidates) ? payload.candidates : [];
  for (const candidate of candidateList) {
    const rawParts = candidate?.content?.parts;
    if (!Array.isArray(rawParts)) continue;

    const pieces = [];
    for (const part of rawParts) {
      const piece = String(part?.text || "").trim();
      if (piece !== "") {
        pieces.push(piece);
      }
    }

    const combined = pieces.join("\n").trim();
    if (combined !== "") {
      return combined;
    }
  }
  return "";
}
|
|
194
|
+
|
|
195
|
+
/**
 * Ask the OpenAI Responses API to summarize a single image frame.
 *
 * @param {string} dataUrl - Frame as a `data:image/...;base64,...` URL.
 * @param {string} model - Model id sent in the request body.
 * @param {string} prompt - Instruction text; contextText follows it after a blank line.
 * @param {string} contextText - Extra call context appended to the prompt.
 * @param {object} cfg - Voice config; only `openaiKey` is read here.
 * @returns {Promise<{ summary: string, provider: string, model: string }>}
 * @throws {Error} When `response.ok` is false or no summary text can be extracted.
 */
async function analyzeVisionWithOpenAI(dataUrl, model, prompt, contextText, cfg) {
  const userTurn = {
    role: "user",
    content: [
      { type: "input_text", text: `${prompt}\n\n${contextText}` },
      { type: "input_image", image_url: dataUrl, detail: "high" },
    ],
  };
  const requestBody = {
    model,
    temperature: 0.2,
    max_output_tokens: 220,
    input: [userTurn],
  };

  const response = await fetch(OPENAI_RESPONSES_URL, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${cfg.openaiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(requestBody),
  });

  if (!response.ok) {
    const details = await buildProviderErrorDetails(response, "unknown");
    throw new Error(`Vision request failed (${response.status}): ${details}`);
  }

  const summary = extractModelResponseText(await response.json());
  if (!summary) {
    throw new Error("Vision model returned an empty summary");
  }
  return { summary, provider: "openai", model };
}
|
|
239
|
+
|
|
240
|
+
/**
 * Ask the Azure OpenAI Responses endpoint to summarize a single image frame.
 *
 * @param {string} dataUrl - Frame as a `data:image/...;base64,...` URL.
 * @param {string} model - Value sent as `model` in the request body.
 * @param {string} prompt - Instruction text; contextText follows it after a blank line.
 * @param {string} contextText - Extra call context appended to the prompt.
 * @param {object} cfg - Voice config; `azureEndpoint` and `azureKey` are read here.
 * @returns {Promise<{ summary: string, provider: string, model: string }>}
 * @throws {Error} When `response.ok` is false or no summary text can be extracted.
 */
async function analyzeVisionWithAzure(dataUrl, model, prompt, contextText, cfg) {
  // Strip trailing slashes so the path below never starts with "//".
  const baseUrl = cfg.azureEndpoint.replace(/\/+$/, "");
  const requestUrl = `${baseUrl}/openai/responses?api-version=${AZURE_API_VERSION}`;

  const userTurn = {
    role: "user",
    content: [
      { type: "input_text", text: `${prompt}\n\n${contextText}` },
      { type: "input_image", image_url: dataUrl, detail: "high" },
    ],
  };
  const requestBody = {
    model,
    temperature: 0.2,
    max_output_tokens: 220,
    input: [userTurn],
  };

  const response = await fetch(requestUrl, {
    method: "POST",
    headers: {
      "api-key": cfg.azureKey,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(requestBody),
  });

  if (!response.ok) {
    const details = await buildProviderErrorDetails(response, "unknown");
    throw new Error(`Azure vision request failed (${response.status}): ${details}`);
  }

  const summary = extractModelResponseText(await response.json());
  if (!summary) {
    throw new Error("Azure vision model returned an empty summary");
  }
  return { summary, provider: "azure", model };
}
|
|
286
|
+
|
|
287
|
+
/**
 * Ask the Anthropic Messages API to summarize a single image frame.
 *
 * @param {{ mimeType: string, base64Data: string }} frame - Parsed frame; sent as a base64 image source.
 * @param {string} model - Model id sent in the request body.
 * @param {string} prompt - Instruction text; contextText follows it after a blank line.
 * @param {string} contextText - Extra call context appended to the prompt.
 * @param {object} cfg - Voice config; only `claudeKey` is read here.
 * @returns {Promise<{ summary: string, provider: string, model: string }>}
 * @throws {Error} When `response.ok` is false or no summary text can be extracted.
 */
async function analyzeVisionWithClaude(frame, model, prompt, contextText, cfg) {
  const imageBlock = {
    type: "image",
    source: {
      type: "base64",
      media_type: frame.mimeType,
      data: frame.base64Data,
    },
  };
  const requestBody = {
    model,
    temperature: 0.2,
    max_tokens: 260,
    messages: [
      {
        role: "user",
        content: [{ type: "text", text: `${prompt}\n\n${contextText}` }, imageBlock],
      },
    ],
  };

  const response = await fetch(ANTHROPIC_MESSAGES_URL, {
    method: "POST",
    headers: {
      "x-api-key": cfg.claudeKey,
      "anthropic-version": ANTHROPIC_API_VERSION,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(requestBody),
  });

  if (!response.ok) {
    const details = await buildProviderErrorDetails(response, "unknown");
    throw new Error(`Claude vision request failed (${response.status}): ${details}`);
  }

  const summary = extractClaudeResponseText(await response.json());
  if (!summary) {
    throw new Error("Claude vision model returned an empty summary");
  }
  return { summary, provider: "claude", model };
}
|
|
332
|
+
|
|
333
|
+
/**
 * Ask the Gemini `generateContent` endpoint to summarize a single image frame.
 *
 * The API key is sent in the `x-goog-api-key` request header rather than as a
 * `?key=` query parameter, so the credential never becomes part of the URL
 * (URLs are routinely captured by proxies, access logs, and error reports).
 *
 * @param {{ mimeType: string, base64Data: string }} frame - Parsed frame; sent as inline data.
 * @param {string} model - Model id; URL-encoded into the endpoint path.
 * @param {string} prompt - Instruction text; contextText follows it after a blank line.
 * @param {string} contextText - Extra call context appended to the prompt.
 * @param {object} cfg - Voice config; only `geminiKey` is read here.
 * @returns {Promise<{ summary: string, provider: string, model: string }>}
 * @throws {Error} When `response.ok` is false or no summary text can be extracted.
 */
async function analyzeVisionWithGemini(frame, model, prompt, contextText, cfg) {
  const apiKey = String(cfg.geminiKey || "").trim();
  const endpoint =
    `${GEMINI_GENERATE_CONTENT_URL}/${encodeURIComponent(model)}:generateContent`;

  const response = await fetch(endpoint, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      // Header-based auth keeps the key out of the request URL.
      "x-goog-api-key": apiKey,
    },
    body: JSON.stringify({
      contents: [
        {
          role: "user",
          parts: [
            { text: `${prompt}\n\n${contextText}` },
            {
              inlineData: {
                mimeType: frame.mimeType,
                data: frame.base64Data,
              },
            },
          ],
        },
      ],
      generationConfig: {
        temperature: 0.2,
        maxOutputTokens: 220,
      },
    }),
  });

  if (!response.ok) {
    const errText = await buildProviderErrorDetails(response, "unknown");
    throw new Error(`Gemini vision request failed (${response.status}): ${errText}`);
  }

  const payload = await response.json();
  const summary = extractGeminiResponseText(payload);
  if (!summary) {
    throw new Error("Gemini vision model returned an empty summary");
  }
  return {
    summary,
    provider: "gemini",
    model,
  };
}
|
|
378
|
+
|
|
379
|
+
// ── Voice provider detection ────────────────────────────────────────────────
|
|
380
|
+
|
|
381
|
+
/**
 * Resolve the voice configuration from the bosun config plus environment
 * variables, and cache the frozen result for CONFIG_TTL_MS.
 *
 * For each setting the `voice` section of the loaded config wins over the
 * environment, which wins over the built-in default. A provider of "auto"
 * resolves to the first of azure, openai, claude, gemini that has
 * credentials, else "fallback".
 *
 * @param {boolean} [forceReload=false] - Bypass the cache and rebuild now.
 * @returns {Readonly<object>} Frozen object with: provider, model, openaiKey,
 *   azureKey, azureEndpoint, azureDeployment, claudeKey, geminiKey, voiceId,
 *   turnDetection, visionModel, instructions, fallbackMode, delegateExecutor,
 *   enabled.
 */
export function getVoiceConfig(forceReload = false) {
  const cacheIsFresh = _voiceConfig && (Date.now() - _configLoadedAt < CONFIG_TTL_MS);
  if (!forceReload && cacheIsFresh) {
    return _voiceConfig;
  }

  const cfg = loadConfig();
  const voice = cfg.voice || {};
  const env = process.env;

  // Provider priority: config > env > key autodetect.
  const rawProvider = String(voice.provider || env.VOICE_PROVIDER || "auto")
    .trim()
    .toLowerCase();

  // Credentials and endpoints.
  const openaiKey =
    voice.openaiApiKey || env.OPENAI_REALTIME_API_KEY || env.OPENAI_API_KEY || "";
  const azureKey =
    voice.azureApiKey || env.AZURE_OPENAI_REALTIME_API_KEY || env.AZURE_OPENAI_API_KEY || "";
  const azureEndpoint =
    voice.azureEndpoint || env.AZURE_OPENAI_REALTIME_ENDPOINT || env.AZURE_OPENAI_ENDPOINT || "";
  const azureDeployment =
    voice.azureDeployment || env.AZURE_OPENAI_REALTIME_DEPLOYMENT || "gpt-4o-realtime-preview";
  const claudeKey = voice.claudeApiKey || env.ANTHROPIC_API_KEY || "";
  const geminiKey = voice.geminiApiKey || env.GEMINI_API_KEY || env.GOOGLE_API_KEY || "";

  // "auto" picks the first provider with usable credentials; anything else is taken as given.
  let provider = rawProvider;
  if (rawProvider === "auto") {
    if (azureKey && azureEndpoint) {
      provider = "azure";
    } else if (openaiKey) {
      provider = "openai";
    } else if (claudeKey) {
      provider = "claude";
    } else if (geminiKey) {
      provider = "gemini";
    } else {
      provider = "fallback";
    }
  }

  // Built-in model defaults depend on the resolved provider.
  let defaultModel = OPENAI_REALTIME_MODEL;
  let defaultVisionModel = OPENAI_DEFAULT_VISION_MODEL;
  if (provider === "claude") {
    defaultModel = CLAUDE_DEFAULT_MODEL;
    defaultVisionModel = CLAUDE_DEFAULT_VISION_MODEL;
  } else if (provider === "gemini") {
    defaultModel = GEMINI_DEFAULT_MODEL;
    defaultVisionModel = GEMINI_DEFAULT_VISION_MODEL;
  }

  const model = voice.model || env.VOICE_MODEL || defaultModel;
  const voiceId = voice.voiceId || env.VOICE_ID || "alloy";
  const turnDetection = voice.turnDetection || env.VOICE_TURN_DETECTION || "server_vad";
  const visionModel = voice.visionModel || env.VOICE_VISION_MODEL || defaultVisionModel;
  const fallbackMode = voice.fallbackMode || env.VOICE_FALLBACK_MODE || "browser";
  const delegateExecutor =
    voice.delegateExecutor || env.VOICE_DELEGATE_EXECUTOR || cfg.primaryAgent || "codex-sdk";

  // An explicit config value wins; otherwise voice is on unless VOICE_ENABLED is a "falsy" word.
  let enabled;
  if (voice.enabled === null || voice.enabled === undefined) {
    const envFlag = String(env.VOICE_ENABLED || "").trim().toLowerCase();
    enabled = !["0", "false", "no", "off"].includes(envFlag);
  } else {
    enabled = voice.enabled !== false;
  }

  const instructions = voice.instructions || `You are Bosun, a helpful voice assistant for the VirtEngine development platform.
You help developers manage tasks, steer coding agents, monitor builds, and navigate the workspace.
Be concise and conversational. When users ask about code or tasks, use the available tools.
For complex operations like writing code or creating PRs, delegate to the appropriate agent.`;

  _voiceConfig = Object.freeze({
    provider,
    model,
    openaiKey,
    azureKey,
    azureEndpoint,
    azureDeployment,
    claudeKey,
    geminiKey,
    voiceId,
    turnDetection,
    visionModel,
    instructions,
    fallbackMode,
    delegateExecutor,
    enabled,
  });
  _configLoadedAt = Date.now();
  return _voiceConfig;
}
|
|
501
|
+
|
|
502
|
+
/**
 * Report whether any voice tier can be used with the current configuration.
 *
 * openai/azure with credentials report tier 1; claude/gemini with a key report
 * tier 2. Otherwise the browser fallback (tier 2) is reported unless
 * `fallbackMode` is "disabled".
 *
 * @returns {{ available: boolean, tier: number|null, provider?: string, reason?: string }}
 */
export function isVoiceAvailable() {
  const cfg = getVoiceConfig();
  if (!cfg.enabled) {
    return { available: false, tier: null, reason: "Voice disabled in config" };
  }

  switch (cfg.provider) {
    case "openai":
      if (cfg.openaiKey) return { available: true, tier: 1, provider: "openai" };
      break;
    case "azure":
      if (cfg.azureKey && cfg.azureEndpoint) return { available: true, tier: 1, provider: "azure" };
      break;
    case "claude":
      if (cfg.claudeKey) return { available: true, tier: 2, provider: "claude" };
      break;
    case "gemini":
      if (cfg.geminiKey) return { available: true, tier: 2, provider: "gemini" };
      break;
    default:
      break;
  }

  // The selected provider is unusable: only the browser fallback remains, if allowed.
  if (cfg.fallbackMode === "disabled") {
    return {
      available: false,
      tier: null,
      reason: `Voice provider "${cfg.provider}" is not configured and fallback is disabled`,
    };
  }
  return { available: true, tier: 2, provider: "fallback" };
}
|
|
531
|
+
|
|
532
|
+
/**
 * Create an ephemeral client token for a realtime (WebRTC) voice session.
 *
 * Azure is handled by createAzureEphemeralToken(); any provider other than
 * openai/azure is rejected. For OpenAI, a session is created via
 * `POST {OPENAI_REALTIME_URL}/sessions`.
 *
 * @param {Array<object>} [toolDefinitions=[]] - Tools exposed to the session.
 * @param {object} [callContext={}] - Raw call context; sanitized before use.
 * @returns {Promise<object>} `{ token, expiresAt, model, voiceId, provider,
 *   sessionConfig, callContext }` (the Azure path adds endpoint/deployment fields).
 * @throws {Error} On an unsupported provider, a missing key, or when `response.ok` is false.
 */
export async function createEphemeralToken(toolDefinitions = [], callContext = {}) {
  const cfg = getVoiceConfig();

  if (cfg.provider === "azure") {
    return createAzureEphemeralToken(toolDefinitions, callContext);
  }
  if (cfg.provider !== "openai") {
    throw new Error(
      `Realtime WebRTC token is unavailable for provider "${cfg.provider}". ` +
        "Use VOICE_PROVIDER=openai|azure for Tier 1 realtime voice.",
    );
  }
  if (!cfg.openaiKey) {
    throw new Error("OPENAI_API_KEY not configured for voice");
  }

  const context = sanitizeVoiceCallContext(callContext);
  const instructions = buildSessionScopedInstructions(cfg.instructions, context);

  // Only the two known VAD modes get extra tuning fields; other values pass through as-is.
  const turnDetection = { type: cfg.turnDetection };
  if (cfg.turnDetection === "server_vad") {
    turnDetection.threshold = 0.5;
    turnDetection.prefix_padding_ms = 300;
    turnDetection.silence_duration_ms = 500;
  } else if (cfg.turnDetection === "semantic_vad") {
    turnDetection.eagerness = "medium";
  }

  const sessionConfig = {
    model: cfg.model,
    voice: cfg.voiceId,
    instructions,
    tool_choice: resolveToolChoice(toolDefinitions, context),
    turn_detection: turnDetection,
    input_audio_transcription: { model: "gpt-4o-mini-transcribe" },
    tools: toolDefinitions,
  };

  const response = await fetch(`${OPENAI_REALTIME_URL}/sessions`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${cfg.openaiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(sessionConfig),
  });

  if (!response.ok) {
    const details = await buildProviderErrorDetails(response, "unknown");
    throw new Error(`OpenAI Realtime session failed (${response.status}): ${details}`);
  }

  const session = await response.json();
  const secret = session.client_secret;
  return {
    token: secret?.value || session.token,
    // When no expiry is returned, assume the token is good for 60 seconds from now.
    expiresAt: secret?.expires_at || (Date.now() / 1000 + 60),
    model: cfg.model,
    voiceId: cfg.voiceId,
    provider: "openai",
    sessionConfig,
    callContext: context,
  };
}
|
|
599
|
+
|
|
600
|
+
/**
 * Create an ephemeral client token for the Azure OpenAI Realtime API.
 *
 * The session is created via
 * `POST {endpoint}/openai/realtime/sessions?api-version=…&deployment=…`.
 * Query values are URL-encoded so an unusual deployment name cannot corrupt
 * the query string.
 *
 * @param {Array<object>} [toolDefinitions=[]] - Tools exposed to the session.
 * @param {object} [callContext={}] - Raw call context; sanitized before use.
 * @returns {Promise<object>} `{ token, expiresAt, model, voiceId, provider,
 *   sessionConfig, azureEndpoint, azureDeployment, callContext }`.
 * @throws {Error} When endpoint/key are missing or `response.ok` is false.
 */
async function createAzureEphemeralToken(toolDefinitions = [], callContext = {}) {
  const cfg = getVoiceConfig();
  if (!cfg.azureKey || !cfg.azureEndpoint) {
    throw new Error("Azure OpenAI Realtime not configured (need endpoint + key)");
  }

  const context = sanitizeVoiceCallContext(callContext);
  const instructions = buildSessionScopedInstructions(cfg.instructions, context);
  // Strip trailing slashes so the path below never starts with "//".
  const endpoint = cfg.azureEndpoint.replace(/\/+$/, "");
  const query =
    `api-version=${encodeURIComponent(AZURE_API_VERSION)}` +
    `&deployment=${encodeURIComponent(cfg.azureDeployment)}`;
  const url = `${endpoint}/openai/realtime/sessions?${query}`;

  const sessionConfig = {
    model: cfg.azureDeployment,
    voice: cfg.voiceId,
    instructions,
    tool_choice: resolveToolChoice(toolDefinitions, context),
    turn_detection: {
      type: cfg.turnDetection,
      ...(cfg.turnDetection === "server_vad" ? {
        threshold: 0.5,
        prefix_padding_ms: 300,
        silence_duration_ms: 500,
      } : {}),
    },
    input_audio_transcription: { model: "whisper-1" },
    tools: toolDefinitions,
  };

  const response = await fetch(url, {
    method: "POST",
    headers: {
      "api-key": cfg.azureKey,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(sessionConfig),
  });

  if (!response.ok) {
    const errorText = await buildProviderErrorDetails(response, "unknown");
    throw new Error(`Azure Realtime session failed (${response.status}): ${errorText}`);
  }

  const data = await response.json();
  return {
    token: data.client_secret?.value || data.token,
    // When no expiry is returned, assume the token is good for 60 seconds from now.
    expiresAt: data.client_secret?.expires_at || (Date.now() / 1000 + 60),
    model: cfg.azureDeployment,
    voiceId: cfg.voiceId,
    provider: "azure",
    sessionConfig,
    azureEndpoint: endpoint,
    azureDeployment: cfg.azureDeployment,
    callContext: context,
  };
}
|
|
658
|
+
|
|
659
|
+
/**
 * Guess which provider family a model id belongs to from its name prefix.
 * Returns null when the name is not recognizably OpenAI, Claude, or Gemini.
 */
function inferVisionModelFamily(model) {
  const id = String(model || "").trim().toLowerCase();
  if (id.startsWith("claude")) return "claude";
  if (id.startsWith("gemini")) return "gemini";
  if (/^(?:gpt-|chatgpt-|o\d)/.test(id)) return "openai";
  return null;
}

/**
 * Pick the model id to send to a fallback provider: keep the requested model
 * unless it clearly belongs to a different provider family, in which case the
 * fallback provider's own default vision model is used instead.
 */
function pickVisionModelForFallback(provider, requestedModel) {
  const providerFamily = provider === "azure" ? "openai" : provider;
  const modelFamily = inferVisionModelFamily(requestedModel);
  if (modelFamily === null || modelFamily === providerFamily) return requestedModel;
  if (provider === "claude") return CLAUDE_DEFAULT_VISION_MODEL;
  if (provider === "gemini") return GEMINI_DEFAULT_VISION_MODEL;
  return OPENAI_DEFAULT_VISION_MODEL;
}

/**
 * Analyze a camera/screen frame and return a concise summary.
 *
 * Providers are tried in order: the configured provider first, then every
 * other provider that has credentials (openai, azure, claude, gemini). The
 * configured provider always receives the requested model unchanged. A
 * fallback provider receives it too, unless the model name clearly belongs to
 * another vendor (e.g. a `gpt-*` id headed for Claude); then that provider's
 * default vision model is substituted so the fallback can actually succeed.
 *
 * @param {string} frameDataUrl - data URL (image/jpeg|png|webp)
 * @param {object} options - { source, context, prompt, model, visionModel }
 * @returns {Promise<{ summary: string, provider: string, model: string }>}
 * @throws {Error} On an invalid frame, when every attempted provider fails
 *   (the last provider error is attached as `cause`), or when no provider has
 *   credentials.
 */
export async function analyzeVisionFrame(frameDataUrl, options = {}) {
  const frame = parseImageDataUrl(frameDataUrl);
  const dataUrl = frame.dataUrl;

  const cfg = getVoiceConfig();
  const source = String(options?.source || "screen").trim().toLowerCase() || "screen";
  const callContext = sanitizeVoiceCallContext(options?.context || {});
  const model =
    String(
      options?.model
        || options?.visionModel
        || cfg.visionModel
        || process.env.VOICE_VISION_MODEL
        || OPENAI_DEFAULT_VISION_MODEL,
    ).trim();
  const prompt = String(options?.prompt || "").trim()
    || "Summarize what is visible in this live frame for a coding assistant. Focus on code, terminal output, errors, UI labels, and actionable context.";

  const contextText = [
    `Frame source: ${source}.`,
    `Bound chat session: ${callContext.sessionId || "none"}.`,
    callContext.executor ? `Preferred executor: ${callContext.executor}.` : "",
    callContext.mode ? `Preferred mode: ${callContext.mode}.` : "",
    callContext.model ? `Preferred model override: ${callContext.model}.` : "",
    "Respond in 1-3 concise sentences. Include likely next action if obvious.",
  ]
    .filter(Boolean)
    .join("\n");

  // Configured provider first, then every other provider with credentials (deduplicated).
  const primaryProvider = String(cfg.provider || "").trim().toLowerCase();
  const preferredProviders = [];
  const pushProvider = (value) => {
    const provider = String(value || "").trim().toLowerCase();
    if (!provider || preferredProviders.includes(provider)) return;
    preferredProviders.push(provider);
  };
  pushProvider(cfg.provider);
  if (cfg.openaiKey) pushProvider("openai");
  if (cfg.azureKey && cfg.azureEndpoint) pushProvider("azure");
  if (cfg.claudeKey) pushProvider("claude");
  if (cfg.geminiKey) pushProvider("gemini");

  // Each entry says whether the provider has credentials and how to call it.
  const attempts = new Map([
    ["openai", {
      ready: Boolean(cfg.openaiKey),
      run: (modelId) => analyzeVisionWithOpenAI(dataUrl, modelId, prompt, contextText, cfg),
    }],
    ["azure", {
      ready: Boolean(cfg.azureKey && cfg.azureEndpoint),
      run: (modelId) => analyzeVisionWithAzure(dataUrl, modelId, prompt, contextText, cfg),
    }],
    ["claude", {
      ready: Boolean(cfg.claudeKey),
      run: (modelId) => analyzeVisionWithClaude(frame, modelId, prompt, contextText, cfg),
    }],
    ["gemini", {
      ready: Boolean(cfg.geminiKey),
      run: (modelId) => analyzeVisionWithGemini(frame, modelId, prompt, contextText, cfg),
    }],
  ]);

  let lastError = null;
  for (const provider of preferredProviders) {
    const attempt = attempts.get(provider);
    // Unknown providers (e.g. "fallback") and providers without credentials are skipped.
    if (!attempt || !attempt.ready) continue;

    const modelForProvider = provider === primaryProvider
      ? model
      : pickVisionModelForFallback(provider, model);
    try {
      return await attempt.run(modelForProvider);
    } catch (err) {
      // Remember the failure and move on to the next provider.
      lastError = err;
    }
  }

  if (lastError) {
    throw new Error(`Vision request failed: ${lastError.message}`, { cause: lastError });
  }

  throw new Error(
    "Vision unavailable: configure OPENAI, Azure, Anthropic, or Gemini voice credentials",
  );
}
|
|
758
|
+
|
|
759
|
+
/**
 * Execute a voice tool call server-side.
 *
 * The call is forwarded to `executeToolCall` from ./voice-tools.mjs, which is
 * imported on demand. Any failure (including a failed import) is logged and
 * reported in the return value instead of being thrown.
 *
 * @param {string} toolName - Name of the tool to run.
 * @param {*} toolArgs - Arguments forwarded unchanged to the tool.
 * @param {object} [context={}] - Call context forwarded unchanged to the tool.
 * @returns {Promise<object>} The tool's own result, or `{ result: null, error }` on failure.
 */
export async function executeVoiceTool(toolName, toolArgs, context = {}) {
  try {
    // Loaded lazily to avoid a circular dependency with voice-tools.
    const { executeToolCall: runToolCall } = await import("./voice-tools.mjs");
    const outcome = await runToolCall(toolName, toolArgs, context);
    return outcome;
  } catch (err) {
    const reason = err.message;
    console.error(`[voice-relay] tool execution error (${toolName}):`, reason);
    return { result: null, error: reason };
  }
}
|
|
773
|
+
|
|
774
|
+
/**
 * Get the tool definitions array for voice sessions.
 *
 * Definitions come from `getToolDefinitions` in ./voice-tools.mjs, imported on
 * demand. Any failure is logged and an empty array is returned.
 *
 * @param {object} [options={}] - `delegateOnly: true` keeps only `delegate_to_agent`.
 * @returns {Promise<Array<object>>} All tools, the delegate tool alone, or [] on failure.
 */
export async function getVoiceToolDefinitions(options = {}) {
  try {
    const { getToolDefinitions: loadDefinitions } = await import("./voice-tools.mjs");
    const definitions = loadDefinitions();
    if (options?.delegateOnly === true) {
      return definitions.filter((tool) => tool?.name === "delegate_to_agent");
    }
    return definitions;
  } catch (err) {
    console.error("[voice-relay] failed to load voice tool definitions:", err.message);
    return [];
  }
}
|
|
789
|
+
|
|
790
|
+
/**
 * Get the realtime (WebRTC) connection details for the client.
 *
 * Query values (API version, deployment, model) are URL-encoded so a
 * configured name containing reserved characters cannot corrupt the URL.
 *
 * @returns {{ provider: string, url: string|null, model: string, tier?: number }}
 *   For azure/openai, the realtime URL and model. For any other provider,
 *   `url` is null and `tier` is 2.
 */
export function getRealtimeConnectionInfo() {
  const cfg = getVoiceConfig();

  if (cfg.provider === "azure") {
    // Strip trailing slashes so the path below never starts with "//".
    const endpoint = cfg.azureEndpoint.replace(/\/+$/, "");
    const query =
      `api-version=${encodeURIComponent(AZURE_API_VERSION)}` +
      `&deployment=${encodeURIComponent(cfg.azureDeployment)}`;
    return {
      provider: "azure",
      url: `${endpoint}/openai/realtime?${query}`,
      model: cfg.azureDeployment,
    };
  }

  if (cfg.provider !== "openai") {
    return {
      provider: cfg.provider,
      url: null,
      model: cfg.model,
      tier: 2,
    };
  }

  return {
    provider: "openai",
    url: `${OPENAI_REALTIME_URL}?model=${encodeURIComponent(cfg.model)}`,
    model: cfg.model,
  };
}
|