bosun 0.36.2 → 0.36.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/agent-prompts.mjs +95 -0
- package/analyze-agent-work-helpers.mjs +308 -0
- package/analyze-agent-work.mjs +926 -0
- package/autofix.mjs +2 -0
- package/bosun.schema.json +101 -3
- package/codex-shell.mjs +85 -10
- package/desktop/main.mjs +871 -48
- package/desktop/preload.mjs +54 -1
- package/desktop-shortcut.mjs +90 -11
- package/git-editor-fix.mjs +273 -0
- package/mcp-registry.mjs +579 -0
- package/meeting-workflow-service.mjs +631 -0
- package/monitor.mjs +18 -103
- package/package.json +21 -2
- package/primary-agent.mjs +32 -12
- package/session-tracker.mjs +68 -0
- package/setup-web-server.mjs +20 -10
- package/setup.mjs +376 -83
- package/startup-service.mjs +51 -6
- package/stream-resilience.mjs +17 -7
- package/ui/app.js +164 -4
- package/ui/components/agent-selector.js +145 -1
- package/ui/components/chat-view.js +161 -15
- package/ui/components/session-list.js +2 -2
- package/ui/components/shared.js +188 -15
- package/ui/modules/icons.js +13 -0
- package/ui/modules/utils.js +44 -0
- package/ui/modules/voice-client-sdk.js +733 -0
- package/ui/modules/voice-overlay.js +128 -15
- package/ui/modules/voice.js +15 -6
- package/ui/setup.html +281 -81
- package/ui/styles/components.css +99 -3
- package/ui/styles/sessions.css +122 -14
- package/ui/styles.css +14 -0
- package/ui/tabs/agents.js +1 -1
- package/ui/tabs/chat.js +123 -14
- package/ui/tabs/control.js +16 -22
- package/ui/tabs/dashboard.js +85 -8
- package/ui/tabs/library.js +113 -17
- package/ui/tabs/settings.js +116 -2
- package/ui/tabs/tasks.js +388 -39
- package/ui/tabs/telemetry.js +0 -1
- package/ui/tabs/workflows.js +4 -0
- package/ui-server.mjs +400 -22
- package/update-check.mjs +41 -13
- package/voice-action-dispatcher.mjs +844 -0
- package/voice-agents-sdk.mjs +664 -0
- package/voice-auth-manager.mjs +164 -0
- package/voice-relay.mjs +1194 -0
- package/voice-tools.mjs +914 -0
- package/workflow-templates/agents.mjs +6 -2
- package/workflow-templates/github.mjs +154 -12
- package/workflow-templates.mjs +3 -0
- package/github-reconciler.mjs +0 -506
- package/merge-strategy.mjs +0 -1210
- package/pr-cleanup-daemon.mjs +0 -992
- package/workspace-reaper.mjs +0 -405
|
@@ -0,0 +1,733 @@
|
|
|
1
|
+
/**
 * voice-client-sdk.js — Client-side voice using @openai/agents SDK as primary,
 * with automatic fallback to legacy voice-client.js on failure.
 *
 * Provider strategy:
 * - OpenAI/Azure: @openai/agents RealtimeSession (WebRTC, auto mic/speaker)
 * - Gemini: WebSocket streaming via server proxy (Live API)
 * - Claude/fallback: Falls through to voice-fallback.js (Web Speech API)
 *
 * The module exposes the same signal-based API as voice-client.js so
 * voice-overlay.js can switch transparently.
 *
 * @module voice-client-sdk
 */

import { signal, computed } from "@preact/signals";

// ── State Signals (same shape as voice-client.js) ───────────────────────────

// Call lifecycle: "idle" | "connecting" | "connected" | "listening" |
// "thinking" | "error" (values assigned throughout this module).
export const sdkVoiceState = signal("idle");
// Latest final user transcript text.
export const sdkVoiceTranscript = signal("");
// Latest complete assistant response text.
export const sdkVoiceResponse = signal("");
// Last surfaced error message, or null when healthy.
export const sdkVoiceError = signal(null);
// Tool-call records: { callId, name, status, args?, result?, error? }.
export const sdkVoiceToolCalls = signal([]);
// Id of the live voice session (bound chat id or a generated "voice-*" id).
export const sdkVoiceSessionId = signal(null);
// Chat session id this voice call is bound to, if any.
export const sdkVoiceBoundSessionId = signal(null);
// Elapsed call duration in whole seconds (driven by startDurationTimer).
export const sdkVoiceDuration = signal(0);
// Active provider: "openai" | "azure" | "gemini" | null.
export const sdkVoiceProvider = signal(null);
// True while an SDK-backed session (as opposed to legacy fallback) is live.
export const sdkVoiceSdkActive = signal(false);

// Derived: a session exists in any non-terminal state.
export const isSdkVoiceActive = computed(() =>
  sdkVoiceState.value !== "idle" && sdkVoiceState.value !== "error"
);

// ── Module-scope state ──────────────────────────────────────────────────────

// Active session handle: a RealtimeSession (OpenAI/Azure) or a raw
// WebSocket (Gemini relay). Null when no call is in progress.
let _session = null;
// setInterval handle feeding sdkVoiceDuration.
let _durationTimer = null;
// Date.now() captured at connect time, for duration math.
let _sessionStartTime = 0;
// event name -> Set<handler>, registered via onSdkVoiceEvent().
let _eventHandlers = new Map();
// Normalized per-call options (see _normalizeCallContext).
let _callContext = {
  sessionId: null,
  executor: null,
  mode: null,
  model: null,
};
// Last response from /api/voice/sdk-config.
let _sdkConfig = null;
// True once this module has deferred to the legacy voice client.
let _usingLegacyFallback = false;
|
|
49
|
+
|
|
50
|
+
// ── Event System ────────────────────────────────────────────────────────────

/**
 * Subscribe to a named voice event.
 * @param {string} event - Event name (e.g. "connected", "transcript").
 * @param {Function} handler - Callback invoked with the event payload.
 * @returns {Function} Unsubscribe function for this handler.
 */
export function onSdkVoiceEvent(event, handler) {
  let bucket = _eventHandlers.get(event);
  if (bucket === undefined) {
    bucket = new Set();
    _eventHandlers.set(event, bucket);
  }
  bucket.add(handler);
  return () => {
    _eventHandlers.get(event)?.delete(handler);
  };
}
|
|
57
|
+
|
|
58
|
+
/**
 * Dispatch an event to every registered handler, isolating handler errors
 * so one failing listener cannot break the others.
 * @param {string} event - Event name.
 * @param {*} data - Payload passed to each handler.
 */
function emit(event, data) {
  const handlers = _eventHandlers.get(event) ?? [];
  for (const handler of handlers) {
    try {
      handler(data);
    } catch (err) {
      console.error(`[voice-client-sdk] event handler error (${event}):`, err);
    }
  }
}
|
|
70
|
+
|
|
71
|
+
/**
 * Normalize raw call options into a call-context record: every field is a
 * trimmed non-empty string or null.
 * @param {object} [options] - { sessionId, executor, mode, model }.
 * @returns {{sessionId: ?string, executor: ?string, mode: ?string, model: ?string}}
 */
function _normalizeCallContext(options = {}) {
  const clean = (value) => {
    const text = String(value || "").trim();
    return text.length > 0 ? text : null;
  };
  return {
    sessionId: clean(options?.sessionId),
    executor: clean(options?.executor),
    mode: clean(options?.mode),
    model: clean(options?.model),
  };
}
|
|
79
|
+
|
|
80
|
+
// ── SDK Configuration Fetch ────────────────────────────────────────────────

/**
 * Fetch SDK configuration from the server.
 * Determines whether to use the Agents SDK or legacy voice.
 * Never throws: any failure is reported as { useSdk: false, reason }.
 * @returns {Promise<object>} Server config, or a fallback descriptor.
 */
async function fetchSdkConfig() {
  try {
    const response = await fetch("/api/voice/sdk-config", {
      method: "GET",
      headers: { "Content-Type": "application/json" },
    });
    if (response.ok) {
      return await response.json();
    }
    return { useSdk: false, reason: `Server returned ${response.status}` };
  } catch (err) {
    return { useSdk: false, reason: err.message };
  }
}
|
|
100
|
+
|
|
101
|
+
// ── Transcript persistence ──────────────────────────────────────────────────

/**
 * Best-effort persistence of one transcript line to the server.
 * No-op when there is no session id or the text is empty; network failures
 * are logged as warnings and never thrown.
 * @param {string} role - "user" or "assistant".
 * @param {string} content - Transcript text.
 * @param {string} [eventType] - Source-event label for the server record.
 */
async function _recordTranscript(role, content, eventType = "") {
  const sessionId = String(_callContext?.sessionId || sdkVoiceSessionId.value || "").trim();
  const text = String(content || "").trim();
  if (!sessionId || !text) return;

  const payload = {
    sessionId,
    role,
    content: text,
    eventType,
    executor: _callContext?.executor || undefined,
    mode: _callContext?.mode || undefined,
    model: _callContext?.model || undefined,
    provider: sdkVoiceProvider.value || undefined,
  };

  try {
    await fetch("/api/voice/transcript", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(payload),
    });
  } catch (err) {
    console.warn("[voice-client-sdk] transcript persistence failed:", err?.message || err);
  }
}
|
|
126
|
+
|
|
127
|
+
// ── OpenAI/Azure Agents SDK Session ─────────────────────────────────────────

/**
 * Start a voice session using @openai/agents RealtimeSession.
 * This runs entirely client-side with WebRTC auto-mic handling.
 *
 * Flow: mint a token (plus instructions and tool schemas) from the server,
 * build a RealtimeAgent whose tools proxy execution back to the server,
 * wire SDK events into this module's signals, then connect.
 *
 * @param {object} config - SDK config from fetchSdkConfig() (model/voice defaults).
 * @param {object} [options] - Currently unused; call context is read from _callContext.
 * @returns {Promise<object>} The connected RealtimeSession (also stored in _session).
 * @throws {Error} When the SDK is unavailable or the token fetch fails.
 */
async function startAgentsSdkSession(config, options = {}) {
  // Dynamically import @openai/agents/realtime (browser bundle)
  const agentsMod = await import("@openai/agents/realtime");
  const { RealtimeAgent, RealtimeSession } = agentsMod;

  if (!RealtimeAgent || !RealtimeSession) {
    throw new Error("@openai/agents/realtime not available in browser");
  }

  // Fetch token and tools from server
  const tokenRes = await fetch("/api/voice/token", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      sessionId: _callContext.sessionId || undefined,
      executor: _callContext.executor || undefined,
      mode: _callContext.mode || undefined,
      model: _callContext.model || undefined,
      // Only set when bound to an existing chat session.
      delegateOnly: Boolean(_callContext.sessionId),
      sdkMode: true,
    }),
  });
  if (!tokenRes.ok) {
    // Fall back to a generic message if the error body isn't JSON.
    const err = await tokenRes.json().catch(() => ({ error: "Token fetch failed" }));
    throw new Error(err.error || `Token fetch failed (${tokenRes.status})`);
  }
  const tokenData = await tokenRes.json();

  // Create RealtimeAgent with server-provided instructions
  const agent = new RealtimeAgent({
    name: "Bosun Voice Agent",
    instructions: tokenData.instructions || "You are Bosun, a helpful voice assistant.",
    tools: (tokenData.tools || []).map((t) => ({
      type: "function",
      name: t.name,
      description: t.description || "",
      parameters: t.parameters || { type: "object", properties: {} },
      // Tool bodies run server-side; this stub only proxies the call
      // and returns a string result for the model.
      async execute(args) {
        // Execute tool via server
        const res = await fetch("/api/voice/tool", {
          method: "POST",
          headers: { "Content-Type": "application/json" },
          body: JSON.stringify({
            toolName: t.name,
            args,
            sessionId: sdkVoiceSessionId.value,
            executor: _callContext.executor || undefined,
            mode: _callContext.mode || undefined,
            model: _callContext.model || undefined,
          }),
        });
        const result = await res.json();
        return result.result || result.error || "No output";
      },
    })),
  });

  // Determine model and voice
  // Server values win over local config; hard-coded defaults are last resort.
  const model = String(tokenData.model || config.model || "gpt-realtime-1.5").trim();
  const voiceId = String(tokenData.voiceId || config.voiceId || "alloy").trim();
  const turnDetection = String(config.turnDetection || "server_vad").trim();

  // Create session with config
  const session = new RealtimeSession(agent, {
    model,
    config: {
      outputModalities: ["text", "audio"],
      audio: {
        input: {
          format: "pcm16",
          transcription: { model: "gpt-4o-mini-transcribe" },
          turnDetection: {
            type: turnDetection,
            // server_vad takes numeric thresholds; semantic_vad takes eagerness.
            ...(turnDetection === "server_vad"
              ? { threshold: 0.5, prefix_padding_ms: 300, silence_duration_ms: 500 }
              : {}),
            ...(turnDetection === "semantic_vad"
              ? { eagerness: "medium" }
              : {}),
          },
        },
        output: {
          format: "pcm16",
          voice: voiceId,
        },
      },
    },
  });

  // ── Wire up SDK events to our signals ──

  // Mirror the newest user/assistant messages into the transcript and
  // response signals. NOTE(review): this path does not call
  // _recordTranscript — presumably the server persists SDK transcripts
  // itself; confirm against ui-server.mjs.
  session.on("history_updated", (history) => {
    const items = history || [];
    const lastUserMsg = [...items].reverse().find(
      (item) => item.role === "user" && item.type === "message"
    );
    const lastAssistantMsg = [...items].reverse().find(
      (item) => item.role === "assistant" && item.type === "message"
    );

    if (lastUserMsg) {
      const transcript = lastUserMsg.content?.map((c) => c.transcript || c.text || "").join("") || "";
      if (transcript) {
        sdkVoiceTranscript.value = transcript;
        emit("transcript", { text: transcript, final: true });
      }
    }

    if (lastAssistantMsg) {
      const response = lastAssistantMsg.content?.map((c) => c.transcript || c.text || "").join("") || "";
      if (response) {
        sdkVoiceResponse.value = response;
        emit("response-complete", { text: response });
      }
    }

    emit("history-updated", { history: items });
  });

  // Barge-in: model audio was cut off by new user speech.
  session.on("audio_interrupted", () => {
    emit("interrupt", {});
  });

  session.on("tool_call_start", (event) => {
    // Event field names are normalized defensively (callId vs call_id).
    const callId = event?.callId || event?.call_id || `tc-${Date.now()}`;
    const name = event?.name || event?.toolName || "unknown";
    sdkVoiceToolCalls.value = [
      ...sdkVoiceToolCalls.value,
      { callId, name, status: "running" },
    ];
    sdkVoiceState.value = "thinking";
    emit("tool-call-start", { callId, name });
  });

  session.on("tool_call_done", (event) => {
    const callId = event?.callId || event?.call_id;
    sdkVoiceToolCalls.value = sdkVoiceToolCalls.value.map((tc) =>
      tc.callId === callId ? { ...tc, status: "complete" } : tc
    );
    emit("tool-call-complete", { callId });
  });

  session.on("error", (err) => {
    console.error("[voice-client-sdk] session error:", err);
    sdkVoiceError.value = err?.message || "Session error";
    emit("error", { message: err?.message });
  });

  session.on("guardrail_tripped", (event) => {
    emit("guardrail-tripped", event);
  });

  // Connect with the token
  const connectOpts = { apiKey: tokenData.token };

  // Azure: point the SDK at the account's realtime endpoint + deployment.
  if (tokenData.provider === "azure" && tokenData.azureEndpoint) {
    const endpoint = String(tokenData.azureEndpoint).replace(/\/+$/, "");
    const deployment = tokenData.azureDeployment || "gpt-realtime-1.5";
    connectOpts.url = `${endpoint}/openai/realtime?api-version=2025-04-01-preview&deployment=${deployment}`;
  }

  await session.connect(connectOpts);

  // Publish connected state for the UI and start the duration ticker.
  _session = session;
  sdkVoiceSdkActive.value = true;
  sdkVoiceState.value = "connected";
  sdkVoiceProvider.value = tokenData.provider || "openai";
  _sessionStartTime = Date.now();
  sdkVoiceSessionId.value = _callContext.sessionId || `voice-sdk-${Date.now()}`;
  startDurationTimer();

  emit("connected", {
    provider: tokenData.provider,
    sessionId: sdkVoiceSessionId.value,
    sdk: "openai-agents",
    callContext: { ..._callContext },
  });

  return session;
}
|
|
313
|
+
|
|
314
|
+
// ── Gemini Live Session (WebSocket via server proxy) ────────────────────────

/**
 * Start a Gemini Live voice session.
 * Since Gemini Live uses WebSocket and we can't directly use the @google/genai
 * SDK in the browser without exposing the API key, we use a server-proxied
 * approach: the server manages the Gemini Live WebSocket, and the client
 * sends/receives audio via a bosun WebSocket relay.
 *
 * @param {object} config - SDK config from the server (supplies `model`).
 * @param {object} [options] - Currently unused; call context is read from _callContext.
 * @returns {Promise<WebSocket>} The open relay socket (also stored in _session).
 * @throws {Error} On connection timeout (15 s) or WebSocket error.
 */
async function startGeminiLiveSession(config, options = {}) {
  // The client sends mic audio via WebSocket to our server, which forwards
  // it to the Gemini Live API and returns audio/control messages.
  const wsProtocol = globalThis.location?.protocol === "https:" ? "wss:" : "ws:";
  const wsUrl = `${wsProtocol}//${globalThis.location?.host}/api/voice/gemini-live`;

  const ws = new WebSocket(wsUrl);
  // (Removed unused `audioElement` local — playback goes through playGeminiAudio.)

  // new Promise adapts the callback-style WebSocket open/error events.
  return new Promise((resolve, reject) => {
    const timeout = setTimeout(() => {
      reject(new Error("Gemini Live connection timeout"));
    }, 15000);

    ws.onopen = () => {
      clearTimeout(timeout);

      // Send session config
      ws.send(JSON.stringify({
        type: "session.config",
        sessionId: _callContext.sessionId,
        executor: _callContext.executor,
        mode: _callContext.mode,
        model: config.model,
      }));

      _session = ws;
      sdkVoiceSdkActive.value = true;
      sdkVoiceState.value = "connected";
      sdkVoiceProvider.value = "gemini";
      _sessionStartTime = Date.now();
      sdkVoiceSessionId.value = _callContext.sessionId || `voice-gemini-${Date.now()}`;
      startDurationTimer();

      // Start mic capture and stream to server (async; failures surface
      // through the error signal rather than rejecting this promise).
      startGeminiMicCapture(ws).catch((err) => {
        console.error("[voice-client-sdk] Gemini mic capture failed:", err);
        sdkVoiceError.value = err.message;
        sdkVoiceState.value = "error";
      });

      emit("connected", {
        provider: "gemini",
        sessionId: sdkVoiceSessionId.value,
        sdk: "google-genai-live",
        callContext: { ..._callContext },
      });

      resolve(ws);
    };

    ws.onmessage = (event) => {
      try {
        // Control messages are JSON text frames.
        const msg = JSON.parse(event.data);
        handleGeminiServerEvent(msg);
      } catch {
        // Binary audio data — play it
        if (event.data instanceof Blob || event.data instanceof ArrayBuffer) {
          playGeminiAudio(event.data);
        }
      }
    };

    ws.onerror = (err) => {
      clearTimeout(timeout);
      reject(new Error("Gemini Live WebSocket error"));
    };

    ws.onclose = () => {
      // Only announce disconnect once; stopSdkVoiceSession() sets "idle" first.
      if (sdkVoiceState.value !== "idle") {
        sdkVoiceState.value = "idle";
        emit("disconnected", { reason: "Gemini Live connection closed" });
      }
    };
  });
}
|
|
400
|
+
|
|
401
|
+
// Active microphone MediaStream for the Gemini path, if any.
let _geminiMicStream = null;

/**
 * Capture microphone audio and stream it to the server over the given
 * WebSocket as webm/opus chunks.
 * @param {WebSocket} ws - Open relay socket to /api/voice/gemini-live.
 * @throws {Error} When the Microphone API is unavailable or permission is denied.
 */
async function startGeminiMicCapture(ws) {
  const mediaDevices = navigator?.mediaDevices;
  if (!mediaDevices?.getUserMedia) {
    throw new Error("Microphone API unavailable");
  }

  // Use the guarded reference (original re-read navigator.mediaDevices,
  // which the check above does not actually protect).
  // Request 16 kHz mono capture with standard voice processing.
  _geminiMicStream = await mediaDevices.getUserMedia({
    audio: {
      echoCancellation: true,
      noiseSuppression: true,
      autoGainControl: true,
      sampleRate: 16000,
      channelCount: 1,
    },
  });

  // Use MediaRecorder to stream chunks to server
  const recorder = new MediaRecorder(_geminiMicStream, {
    mimeType: MediaRecorder.isTypeSupported("audio/webm;codecs=opus")
      ? "audio/webm;codecs=opus"
      : "audio/webm",
  });

  recorder.ondataavailable = (event) => {
    // Drop chunks once the socket is gone; stopping the stream's tracks
    // (see stopSdkVoiceSession) ends the recorder.
    if (event.data.size > 0 && ws.readyState === WebSocket.OPEN) {
      ws.send(event.data);
    }
  };

  recorder.start(250); // Send chunks every 250ms
  sdkVoiceState.value = "listening";
}
|
|
435
|
+
|
|
436
|
+
/**
 * Route a JSON control message from the Gemini relay socket into the
 * module's signals and events. Unknown message types are ignored.
 * @param {object} msg - Parsed server message carrying a `type` field.
 */
function handleGeminiServerEvent(msg) {
  const { type } = msg;

  if (type === "transcript.user") {
    sdkVoiceTranscript.value = msg.text || "";
    emit("transcript", { text: msg.text, final: true });
    _recordTranscript("user", msg.text, "gemini.user_transcript");
    return;
  }

  if (type === "transcript.assistant") {
    sdkVoiceResponse.value = msg.text || "";
    emit("response-complete", { text: msg.text });
    _recordTranscript("assistant", msg.text, "gemini.assistant_transcript");
    return;
  }

  if (type === "audio.delta") {
    // Binary audio arrives separately in ws.onmessage; nothing to do here.
    return;
  }

  if (type === "tool_call") {
    // Fire-and-forget: tool failures are logged, never thrown.
    handleGeminiToolCall(msg).catch((err) => {
      console.error("[voice-client-sdk] Gemini tool call failed:", err);
    });
    return;
  }

  if (type === "speech_started") {
    sdkVoiceState.value = "listening";
    emit("speech-started", {});
    return;
  }

  if (type === "speech_stopped") {
    sdkVoiceState.value = "thinking";
    emit("speech-stopped", {});
    return;
  }

  if (type === "error") {
    sdkVoiceError.value = msg.message || "Gemini error";
    emit("error", { message: msg.message });
  }
}
|
|
481
|
+
|
|
482
|
+
/**
 * Execute a Gemini-requested tool via the server and relay the result
 * back to Gemini over the live WebSocket.
 * @param {object} msg - { callId, name, args } from the relay.
 */
async function handleGeminiToolCall(msg) {
  const callId = msg.callId || `gemini-tc-${Date.now()}`;
  const name = msg.name || "unknown";
  const args = msg.args || {};

  // Immutably update this call's entry in the tool-call list.
  const patchCall = (patch) => {
    sdkVoiceToolCalls.value = sdkVoiceToolCalls.value.map((tc) =>
      tc.callId === callId ? { ...tc, ...patch } : tc
    );
  };

  sdkVoiceToolCalls.value = [...sdkVoiceToolCalls.value, { callId, name, args, status: "running" }];
  sdkVoiceState.value = "thinking";
  emit("tool-call-start", { callId, name, args });

  try {
    const response = await fetch("/api/voice/tool", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        toolName: name,
        args,
        sessionId: sdkVoiceSessionId.value,
        executor: _callContext.executor || undefined,
        mode: _callContext.mode || undefined,
        model: _callContext.model || undefined,
      }),
    });
    const result = await response.json();

    patchCall({ status: "complete", result: result.result });

    // Relay the tool output back to Gemini over the live socket.
    if (_session && _session.readyState === WebSocket.OPEN) {
      _session.send(JSON.stringify({
        type: "tool_result",
        callId,
        name,
        result: result.result || result.error || "No output",
      }));
    }

    emit("tool-call-complete", { callId, name, result: result.result });
  } catch (err) {
    patchCall({ status: "error", error: err.message });
    emit("tool-call-error", { callId, name, error: err.message });
  }
}
|
|
528
|
+
|
|
529
|
+
/**
 * Play a chunk of Gemini audio output via the Web Audio API.
 * Accepts a Blob or an ArrayBuffer (both arrive via ws.onmessage).
 * A single AudioContext is lazily created and cached on the function.
 * Decode/playback failures are silently dropped — losing a chunk is
 * preferable to crashing mid-call.
 * @param {Blob|ArrayBuffer} data - Encoded audio frame from the relay.
 */
function playGeminiAudio(data) {
  try {
    const AudioCtx = globalThis.AudioContext || globalThis.webkitAudioContext;
    if (!AudioCtx) return; // Web Audio unavailable in this environment

    if (!playGeminiAudio._ctx) {
      // 24 kHz matches the sample rate of the audio the relay sends.
      playGeminiAudio._ctx = new AudioCtx({ sampleRate: 24000 });
    }
    const ctx = playGeminiAudio._ctx;

    // Normalize input to a Promise<ArrayBuffer>. The original only handled
    // Blob, silently dropping ArrayBuffer frames that ws.onmessage forwards.
    let bufPromise = null;
    if (data instanceof Blob) {
      bufPromise = data.arrayBuffer();
    } else if (data instanceof ArrayBuffer) {
      bufPromise = Promise.resolve(data);
    }
    if (!bufPromise) return;

    bufPromise
      // Promise form of decodeAudioData: the callback form returns void on
      // older Safari, so chaining .catch onto it could itself throw.
      .then((buf) => ctx.decodeAudioData(buf))
      .then((audioBuffer) => {
        const source = ctx.createBufferSource();
        source.buffer = audioBuffer;
        source.connect(ctx.destination);
        source.start();
      })
      .catch(() => { /* ignore decode/playback errors */ });
  } catch {
    // Audio playback not available
  }
}
|
|
554
|
+
|
|
555
|
+
// ── Public API ──────────────────────────────────────────────────────────────

/**
 * Start a voice session using the best available SDK.
 * Falls back to legacy voice-client.js if SDK initialization fails.
 * Never throws: failures are reported via the returned object and the
 * "sdk-unavailable" event.
 *
 * @param {object} options — { sessionId, executor, mode, model }
 * @returns {Promise<{ sdk: boolean, provider: string }>}
 */
export async function startSdkVoiceSession(options = {}) {
  // Guard against double-start: report the live session's status instead.
  if (_session) {
    console.warn("[voice-client-sdk] Session already active");
    return { sdk: sdkVoiceSdkActive.value, provider: sdkVoiceProvider.value };
  }

  // Reset per-call state before connecting.
  _callContext = _normalizeCallContext(options);
  sdkVoiceBoundSessionId.value = _callContext.sessionId;
  sdkVoiceState.value = "connecting";
  sdkVoiceError.value = null;
  sdkVoiceTranscript.value = "";
  sdkVoiceResponse.value = "";
  sdkVoiceToolCalls.value = [];
  _usingLegacyFallback = false;

  try {
    // 1. Fetch SDK config from server
    _sdkConfig = await fetchSdkConfig();

    // 2. Try SDK-based session based on provider
    if (_sdkConfig.useSdk) {
      const provider = _sdkConfig.provider || "openai";

      if (provider === "openai" || provider === "azure") {
        await startAgentsSdkSession(_sdkConfig, options);
        return { sdk: true, provider };
      }

      if (provider === "gemini") {
        await startGeminiLiveSession(_sdkConfig, options);
        return { sdk: true, provider: "gemini" };
      }
      // Any other provider falls through to the legacy path below.
    }

    // 3. SDK not available — signal fallback
    // NOTE(review): state stays "connecting" on this path (unlike the catch
    // branch, which resets it to "idle") — presumably the legacy client
    // takes over state; confirm in voice-overlay.js.
    _usingLegacyFallback = true;
    sdkVoiceSdkActive.value = false;
    emit("sdk-unavailable", {
      reason: _sdkConfig.fallbackReason || "SDK not available for provider",
      provider: _sdkConfig.provider,
    });

    return { sdk: false, provider: _sdkConfig.provider, reason: _sdkConfig.fallbackReason };
  } catch (err) {
    console.error("[voice-client-sdk] SDK session failed, signaling fallback:", err);
    _usingLegacyFallback = true;
    sdkVoiceSdkActive.value = false;
    sdkVoiceState.value = "idle";
    sdkVoiceError.value = null; // Don't show error — we'll fallback
    emit("sdk-unavailable", {
      reason: err.message,
      provider: _sdkConfig?.provider || "unknown",
    });

    return { sdk: false, provider: _sdkConfig?.provider || "unknown", reason: err.message };
  }
}
|
|
621
|
+
|
|
622
|
+
/**
 * Stop the current SDK voice session and reset all voice state.
 * Safe to call when no session is active.
 */
export function stopSdkVoiceSession() {
  emit("session-ending", { sessionId: sdkVoiceSessionId.value });

  // Close whichever transport is live: RealtimeSession or raw WebSocket.
  const session = _session;
  _session = null;
  if (session) {
    try {
      if (typeof session.close === "function") {
        session.close();
      } else if (typeof session.disconnect === "function") {
        session.disconnect();
      }
    } catch {
      // best effort
    }
  }

  // Stop Gemini mic stream if active
  if (_geminiMicStream) {
    _geminiMicStream.getTracks().forEach((track) => {
      try { track.stop(); } catch { /* ignore */ }
    });
    _geminiMicStream = null;
  }

  clearInterval(_durationTimer);
  _durationTimer = null;

  // Reset every signal and module-scope field to its initial value.
  sdkVoiceState.value = "idle";
  sdkVoiceTranscript.value = "";
  sdkVoiceResponse.value = "";
  sdkVoiceToolCalls.value = [];
  sdkVoiceSessionId.value = null;
  sdkVoiceBoundSessionId.value = null;
  sdkVoiceDuration.value = 0;
  sdkVoiceProvider.value = null;
  sdkVoiceSdkActive.value = false;
  _callContext = { sessionId: null, executor: null, mode: null, model: null };
  _usingLegacyFallback = false;

  emit("session-ended", {});
}
|
|
666
|
+
|
|
667
|
+
/**
 * Interrupt the current response (barge-in).
 * No-op when no session is active.
 */
export function interruptSdkResponse() {
  if (!_session) return;

  if (typeof _session.interrupt === "function") {
    // @openai/agents RealtimeSession
    _session.interrupt();
  } else if (_session.readyState === WebSocket.OPEN) {
    // Gemini relay WebSocket
    _session.send(JSON.stringify({ type: "response.cancel" }));
  }
  emit("interrupt", {});
}
|
|
682
|
+
|
|
683
|
+
/**
 * Send a text message to the voice agent.
 * Warns and returns when no session is active.
 * @param {string} text - Message content.
 */
export function sendSdkTextMessage(text) {
  if (!_session) {
    console.warn("[voice-client-sdk] Cannot send text — no active session");
    return;
  }

  if (typeof _session.sendMessage === "function") {
    // @openai/agents RealtimeSession
    _session.sendMessage(text);
    return;
  }

  if (_session.readyState === WebSocket.OPEN) {
    // Gemini relay WebSocket
    _session.send(JSON.stringify({ type: "text.input", text }));
  }
}
|
|
703
|
+
|
|
704
|
+
/**
 * Check if falling back to legacy voice.
 * @returns {boolean} True once this module has signaled fallback to
 *   voice-client.js for the current call attempt.
 */
export function isUsingLegacyFallback() {
  return _usingLegacyFallback;
}
|
|
710
|
+
|
|
711
|
+
/**
 * Get a snapshot of the current SDK session for diagnostics and UI.
 * @returns {{active: boolean, provider: ?string, sessionId: ?string,
 *   state: string, duration: number, usingLegacy: boolean, sdkConfig: ?object}}
 */
export function getSdkSessionInfo() {
  const snapshot = {};
  snapshot.active = sdkVoiceSdkActive.value;
  snapshot.provider = sdkVoiceProvider.value;
  snapshot.sessionId = sdkVoiceSessionId.value;
  snapshot.state = sdkVoiceState.value;
  snapshot.duration = sdkVoiceDuration.value;
  snapshot.usingLegacy = _usingLegacyFallback;
  snapshot.sdkConfig = _sdkConfig;
  return snapshot;
}
|
|
725
|
+
|
|
726
|
+
// ── Duration Timer ──────────────────────────────────────────────────────────

/**
 * (Re)start the 1-second ticker that publishes elapsed call seconds
 * into sdkVoiceDuration, measured from _sessionStartTime.
 */
function startDurationTimer() {
  clearInterval(_durationTimer);
  const tick = () => {
    const elapsedMs = Date.now() - _sessionStartTime;
    sdkVoiceDuration.value = Math.floor(elapsedMs / 1000);
  };
  _durationTimer = setInterval(tick, 1000);
}