bosun 0.36.2 → 0.36.4

Files changed (57)
  1. package/agent-prompts.mjs +95 -0
  2. package/analyze-agent-work-helpers.mjs +308 -0
  3. package/analyze-agent-work.mjs +926 -0
  4. package/autofix.mjs +2 -0
  5. package/bosun.schema.json +101 -3
  6. package/codex-shell.mjs +85 -10
  7. package/desktop/main.mjs +871 -48
  8. package/desktop/preload.mjs +54 -1
  9. package/desktop-shortcut.mjs +90 -11
  10. package/git-editor-fix.mjs +273 -0
  11. package/mcp-registry.mjs +579 -0
  12. package/meeting-workflow-service.mjs +631 -0
  13. package/monitor.mjs +18 -103
  14. package/package.json +21 -2
  15. package/primary-agent.mjs +32 -12
  16. package/session-tracker.mjs +68 -0
  17. package/setup-web-server.mjs +20 -10
  18. package/setup.mjs +376 -83
  19. package/startup-service.mjs +51 -6
  20. package/stream-resilience.mjs +17 -7
  21. package/ui/app.js +164 -4
  22. package/ui/components/agent-selector.js +145 -1
  23. package/ui/components/chat-view.js +161 -15
  24. package/ui/components/session-list.js +2 -2
  25. package/ui/components/shared.js +188 -15
  26. package/ui/modules/icons.js +13 -0
  27. package/ui/modules/utils.js +44 -0
  28. package/ui/modules/voice-client-sdk.js +733 -0
  29. package/ui/modules/voice-overlay.js +128 -15
  30. package/ui/modules/voice.js +15 -6
  31. package/ui/setup.html +281 -81
  32. package/ui/styles/components.css +99 -3
  33. package/ui/styles/sessions.css +122 -14
  34. package/ui/styles.css +14 -0
  35. package/ui/tabs/agents.js +1 -1
  36. package/ui/tabs/chat.js +123 -14
  37. package/ui/tabs/control.js +16 -22
  38. package/ui/tabs/dashboard.js +85 -8
  39. package/ui/tabs/library.js +113 -17
  40. package/ui/tabs/settings.js +116 -2
  41. package/ui/tabs/tasks.js +388 -39
  42. package/ui/tabs/telemetry.js +0 -1
  43. package/ui/tabs/workflows.js +4 -0
  44. package/ui-server.mjs +400 -22
  45. package/update-check.mjs +41 -13
  46. package/voice-action-dispatcher.mjs +844 -0
  47. package/voice-agents-sdk.mjs +664 -0
  48. package/voice-auth-manager.mjs +164 -0
  49. package/voice-relay.mjs +1194 -0
  50. package/voice-tools.mjs +914 -0
  51. package/workflow-templates/agents.mjs +6 -2
  52. package/workflow-templates/github.mjs +154 -12
  53. package/workflow-templates.mjs +3 -0
  54. package/github-reconciler.mjs +0 -506
  55. package/merge-strategy.mjs +0 -1210
  56. package/pr-cleanup-daemon.mjs +0 -992
  57. package/workspace-reaper.mjs +0 -405
package/ui/modules/voice-client-sdk.js (new file)
@@ -0,0 +1,733 @@
+ /**
+  * voice-client-sdk.js — Client-side voice using @openai/agents SDK as primary,
+  * with automatic fallback to legacy voice-client.js on failure.
+  *
+  * Provider strategy:
+  * - OpenAI/Azure: @openai/agents RealtimeSession (WebRTC, auto mic/speaker)
+  * - Gemini: WebSocket streaming via server proxy (Live API)
+  * - Claude/fallback: Falls through to voice-fallback.js (Web Speech API)
+  *
+  * The module exposes the same signal-based API as voice-client.js so
+  * voice-overlay.js can switch transparently.
+  *
+  * @module voice-client-sdk
+  */
+
+ import { signal, computed } from "@preact/signals";
+
+ // ── State Signals (same shape as voice-client.js) ───────────────────────────
+
+ export const sdkVoiceState = signal("idle");
+ export const sdkVoiceTranscript = signal("");
+ export const sdkVoiceResponse = signal("");
+ export const sdkVoiceError = signal(null);
+ export const sdkVoiceToolCalls = signal([]);
+ export const sdkVoiceSessionId = signal(null);
+ export const sdkVoiceBoundSessionId = signal(null);
+ export const sdkVoiceDuration = signal(0);
+ export const sdkVoiceProvider = signal(null);
+ export const sdkVoiceSdkActive = signal(false);
+
+ export const isSdkVoiceActive = computed(() =>
+   sdkVoiceState.value !== "idle" && sdkVoiceState.value !== "error"
+ );
+
+ // ── Module-scope state ──────────────────────────────────────────────────────
+
+ let _session = null;
+ let _durationTimer = null;
+ let _sessionStartTime = 0;
+ let _eventHandlers = new Map();
+ let _callContext = {
+   sessionId: null,
+   executor: null,
+   mode: null,
+   model: null,
+ };
+ let _sdkConfig = null;
+ let _usingLegacyFallback = false;
+
+ // ── Event System ────────────────────────────────────────────────────────────
+
+ export function onSdkVoiceEvent(event, handler) {
+   if (!_eventHandlers.has(event)) _eventHandlers.set(event, new Set());
+   _eventHandlers.get(event).add(handler);
+   return () => _eventHandlers.get(event)?.delete(handler);
+ }
+
+ function emit(event, data) {
+   const handlers = _eventHandlers.get(event);
+   if (handlers) {
+     for (const handler of handlers) {
+       try {
+         handler(data);
+       } catch (err) {
+         console.error(`[voice-client-sdk] event handler error (${event}):`, err);
+       }
+     }
+   }
+ }
+
+ function _normalizeCallContext(options = {}) {
+   return {
+     sessionId: String(options?.sessionId || "").trim() || null,
+     executor: String(options?.executor || "").trim() || null,
+     mode: String(options?.mode || "").trim() || null,
+     model: String(options?.model || "").trim() || null,
+   };
+ }
+
+ // ── SDK Configuration Fetch ────────────────────────────────────────────────
+
+ /**
+  * Fetch SDK configuration from the server.
+  * Determines if we should use Agents SDK or legacy voice.
+  */
+ async function fetchSdkConfig() {
+   try {
+     const res = await fetch("/api/voice/sdk-config", {
+       method: "GET",
+       headers: { "Content-Type": "application/json" },
+     });
+     if (!res.ok) {
+       return { useSdk: false, reason: `Server returned ${res.status}` };
+     }
+     return await res.json();
+   } catch (err) {
+     return { useSdk: false, reason: err.message };
+   }
+ }
+
+ // ── Transcript persistence ──────────────────────────────────────────────────
+
+ async function _recordTranscript(role, content, eventType = "") {
+   const sessionId = String(_callContext?.sessionId || sdkVoiceSessionId.value || "").trim();
+   const text = String(content || "").trim();
+   if (!sessionId || !text) return;
+   try {
+     await fetch("/api/voice/transcript", {
+       method: "POST",
+       headers: { "Content-Type": "application/json" },
+       body: JSON.stringify({
+         sessionId,
+         role,
+         content: text,
+         eventType,
+         executor: _callContext?.executor || undefined,
+         mode: _callContext?.mode || undefined,
+         model: _callContext?.model || undefined,
+         provider: sdkVoiceProvider.value || undefined,
+       }),
+     });
+   } catch (err) {
+     console.warn("[voice-client-sdk] transcript persistence failed:", err?.message || err);
+   }
+ }
+
+ // ── OpenAI/Azure Agents SDK Session ─────────────────────────────────────────
+
+ /**
+  * Start a voice session using @openai/agents RealtimeSession.
+  * This runs entirely client-side with WebRTC auto-mic handling.
+  */
+ async function startAgentsSdkSession(config, options = {}) {
+   // Dynamically import @openai/agents/realtime (browser bundle)
+   const agentsMod = await import("@openai/agents/realtime");
+   const { RealtimeAgent, RealtimeSession } = agentsMod;
+
+   if (!RealtimeAgent || !RealtimeSession) {
+     throw new Error("@openai/agents/realtime not available in browser");
+   }
+
+   // Fetch token and tools from server
+   const tokenRes = await fetch("/api/voice/token", {
+     method: "POST",
+     headers: { "Content-Type": "application/json" },
+     body: JSON.stringify({
+       sessionId: _callContext.sessionId || undefined,
+       executor: _callContext.executor || undefined,
+       mode: _callContext.mode || undefined,
+       model: _callContext.model || undefined,
+       delegateOnly: Boolean(_callContext.sessionId),
+       sdkMode: true,
+     }),
+   });
+   if (!tokenRes.ok) {
+     const err = await tokenRes.json().catch(() => ({ error: "Token fetch failed" }));
+     throw new Error(err.error || `Token fetch failed (${tokenRes.status})`);
+   }
+   const tokenData = await tokenRes.json();
+
+   // Create RealtimeAgent with server-provided instructions
+   const agent = new RealtimeAgent({
+     name: "Bosun Voice Agent",
+     instructions: tokenData.instructions || "You are Bosun, a helpful voice assistant.",
+     tools: (tokenData.tools || []).map((t) => ({
+       type: "function",
+       name: t.name,
+       description: t.description || "",
+       parameters: t.parameters || { type: "object", properties: {} },
+       async execute(args) {
+         // Execute tool via server
+         const res = await fetch("/api/voice/tool", {
+           method: "POST",
+           headers: { "Content-Type": "application/json" },
+           body: JSON.stringify({
+             toolName: t.name,
+             args,
+             sessionId: sdkVoiceSessionId.value,
+             executor: _callContext.executor || undefined,
+             mode: _callContext.mode || undefined,
+             model: _callContext.model || undefined,
+           }),
+         });
+         const result = await res.json();
+         return result.result || result.error || "No output";
+       },
+     })),
+   });
+
+   // Determine model and voice
+   const model = String(tokenData.model || config.model || "gpt-realtime-1.5").trim();
+   const voiceId = String(tokenData.voiceId || config.voiceId || "alloy").trim();
+   const turnDetection = String(config.turnDetection || "server_vad").trim();
+
+   // Create session with config
+   const session = new RealtimeSession(agent, {
+     model,
+     config: {
+       outputModalities: ["text", "audio"],
+       audio: {
+         input: {
+           format: "pcm16",
+           transcription: { model: "gpt-4o-mini-transcribe" },
+           turnDetection: {
+             type: turnDetection,
+             ...(turnDetection === "server_vad"
+               ? { threshold: 0.5, prefix_padding_ms: 300, silence_duration_ms: 500 }
+               : {}),
+             ...(turnDetection === "semantic_vad"
+               ? { eagerness: "medium" }
+               : {}),
+           },
+         },
+         output: {
+           format: "pcm16",
+           voice: voiceId,
+         },
+       },
+     },
+   });
+
+   // ── Wire up SDK events to our signals ──
+
+   session.on("history_updated", (history) => {
+     const items = history || [];
+     const lastUserMsg = [...items].reverse().find(
+       (item) => item.role === "user" && item.type === "message"
+     );
+     const lastAssistantMsg = [...items].reverse().find(
+       (item) => item.role === "assistant" && item.type === "message"
+     );
+
+     if (lastUserMsg) {
+       const transcript = lastUserMsg.content?.map((c) => c.transcript || c.text || "").join("") || "";
+       if (transcript) {
+         sdkVoiceTranscript.value = transcript;
+         emit("transcript", { text: transcript, final: true });
+       }
+     }
+
+     if (lastAssistantMsg) {
+       const response = lastAssistantMsg.content?.map((c) => c.transcript || c.text || "").join("") || "";
+       if (response) {
+         sdkVoiceResponse.value = response;
+         emit("response-complete", { text: response });
+       }
+     }
+
+     emit("history-updated", { history: items });
+   });
+
+   session.on("audio_interrupted", () => {
+     emit("interrupt", {});
+   });
+
+   session.on("tool_call_start", (event) => {
+     const callId = event?.callId || event?.call_id || `tc-${Date.now()}`;
+     const name = event?.name || event?.toolName || "unknown";
+     sdkVoiceToolCalls.value = [
+       ...sdkVoiceToolCalls.value,
+       { callId, name, status: "running" },
+     ];
+     sdkVoiceState.value = "thinking";
+     emit("tool-call-start", { callId, name });
+   });
+
+   session.on("tool_call_done", (event) => {
+     const callId = event?.callId || event?.call_id;
+     sdkVoiceToolCalls.value = sdkVoiceToolCalls.value.map((tc) =>
+       tc.callId === callId ? { ...tc, status: "complete" } : tc
+     );
+     emit("tool-call-complete", { callId });
+   });
+
+   session.on("error", (err) => {
+     console.error("[voice-client-sdk] session error:", err);
+     sdkVoiceError.value = err?.message || "Session error";
+     emit("error", { message: err?.message });
+   });
+
+   session.on("guardrail_tripped", (event) => {
+     emit("guardrail-tripped", event);
+   });
+
+   // Connect with the token
+   const connectOpts = { apiKey: tokenData.token };
+
+   if (tokenData.provider === "azure" && tokenData.azureEndpoint) {
+     const endpoint = String(tokenData.azureEndpoint).replace(/\/+$/, "");
+     const deployment = tokenData.azureDeployment || "gpt-realtime-1.5";
+     connectOpts.url = `${endpoint}/openai/realtime?api-version=2025-04-01-preview&deployment=${deployment}`;
+   }
+
+   await session.connect(connectOpts);
+
+   _session = session;
+   sdkVoiceSdkActive.value = true;
+   sdkVoiceState.value = "connected";
+   sdkVoiceProvider.value = tokenData.provider || "openai";
+   _sessionStartTime = Date.now();
+   sdkVoiceSessionId.value = _callContext.sessionId || `voice-sdk-${Date.now()}`;
+   startDurationTimer();
+
+   emit("connected", {
+     provider: tokenData.provider,
+     sessionId: sdkVoiceSessionId.value,
+     sdk: "openai-agents",
+     callContext: { ..._callContext },
+   });
+
+   return session;
+ }
+
+ // ── Gemini Live Session (WebSocket via server proxy) ────────────────────────
+
+ /**
+  * Start a Gemini Live voice session.
+  * Since Gemini Live uses WebSocket and we can't directly use the @google/genai
+  * SDK in the browser without exposing the API key, we use a server-proxied
+  * approach: the server manages the Gemini Live WebSocket, and the client
+  * sends/receives audio via a bosun WebSocket relay.
+  */
+ async function startGeminiLiveSession(config, options = {}) {
+   // For Gemini, fall back to server-mediated approach
+   // The client sends mic audio via WebSocket to our server,
+   // which forwards to Gemini Live API and returns audio.
+   const wsProtocol = globalThis.location?.protocol === "https:" ? "wss:" : "ws:";
+   const wsUrl = `${wsProtocol}//${globalThis.location?.host}/api/voice/gemini-live`;
+
+   const ws = new WebSocket(wsUrl);
+   let audioElement = null;
+
+   return new Promise((resolve, reject) => {
+     const timeout = setTimeout(() => {
+       reject(new Error("Gemini Live connection timeout"));
+     }, 15000);
+
+     ws.onopen = () => {
+       clearTimeout(timeout);
+
+       // Send session config
+       ws.send(JSON.stringify({
+         type: "session.config",
+         sessionId: _callContext.sessionId,
+         executor: _callContext.executor,
+         mode: _callContext.mode,
+         model: config.model,
+       }));
+
+       _session = ws;
+       sdkVoiceSdkActive.value = true;
+       sdkVoiceState.value = "connected";
+       sdkVoiceProvider.value = "gemini";
+       _sessionStartTime = Date.now();
+       sdkVoiceSessionId.value = _callContext.sessionId || `voice-gemini-${Date.now()}`;
+       startDurationTimer();
+
+       // Start mic capture and stream to server
+       startGeminiMicCapture(ws).catch((err) => {
+         console.error("[voice-client-sdk] Gemini mic capture failed:", err);
+         sdkVoiceError.value = err.message;
+         sdkVoiceState.value = "error";
+       });
+
+       emit("connected", {
+         provider: "gemini",
+         sessionId: sdkVoiceSessionId.value,
+         sdk: "google-genai-live",
+         callContext: { ..._callContext },
+       });
+
+       resolve(ws);
+     };
+
+     ws.onmessage = (event) => {
+       try {
+         const msg = JSON.parse(event.data);
+         handleGeminiServerEvent(msg);
+       } catch {
+         // Binary audio data — play it
+         if (event.data instanceof Blob || event.data instanceof ArrayBuffer) {
+           playGeminiAudio(event.data);
+         }
+       }
+     };
+
+     ws.onerror = (err) => {
+       clearTimeout(timeout);
+       reject(new Error("Gemini Live WebSocket error"));
+     };
+
+     ws.onclose = () => {
+       if (sdkVoiceState.value !== "idle") {
+         sdkVoiceState.value = "idle";
+         emit("disconnected", { reason: "Gemini Live connection closed" });
+       }
+     };
+   });
+ }
+
+ let _geminiMicStream = null;
+
+ async function startGeminiMicCapture(ws) {
+   const mediaDevices = navigator?.mediaDevices;
+   if (!mediaDevices?.getUserMedia) {
+     throw new Error("Microphone API unavailable");
+   }
+
+   _geminiMicStream = await navigator.mediaDevices.getUserMedia({
+     audio: {
+       echoCancellation: true,
+       noiseSuppression: true,
+       autoGainControl: true,
+       sampleRate: 16000,
+       channelCount: 1,
+     },
+   });
+
+   // Use MediaRecorder to stream chunks to server
+   const recorder = new MediaRecorder(_geminiMicStream, {
+     mimeType: MediaRecorder.isTypeSupported("audio/webm;codecs=opus")
+       ? "audio/webm;codecs=opus"
+       : "audio/webm",
+   });
+
+   recorder.ondataavailable = (event) => {
+     if (event.data.size > 0 && ws.readyState === WebSocket.OPEN) {
+       ws.send(event.data);
+     }
+   };
+
+   recorder.start(250); // Send chunks every 250ms
+   sdkVoiceState.value = "listening";
+ }
+
+ function handleGeminiServerEvent(msg) {
+   const type = msg.type;
+
+   switch (type) {
+     case "transcript.user":
+       sdkVoiceTranscript.value = msg.text || "";
+       emit("transcript", { text: msg.text, final: true });
+       _recordTranscript("user", msg.text, "gemini.user_transcript");
+       break;
+
+     case "transcript.assistant":
+       sdkVoiceResponse.value = msg.text || "";
+       emit("response-complete", { text: msg.text });
+       _recordTranscript("assistant", msg.text, "gemini.assistant_transcript");
+       break;
+
+     case "audio.delta":
+       // Binary audio handled in ws.onmessage
+       break;
+
+     case "tool_call":
+       handleGeminiToolCall(msg).catch((err) => {
+         console.error("[voice-client-sdk] Gemini tool call failed:", err);
+       });
+       break;
+
+     case "speech_started":
+       sdkVoiceState.value = "listening";
+       emit("speech-started", {});
+       break;
+
+     case "speech_stopped":
+       sdkVoiceState.value = "thinking";
+       emit("speech-stopped", {});
+       break;
+
+     case "error":
+       sdkVoiceError.value = msg.message || "Gemini error";
+       emit("error", { message: msg.message });
+       break;
+
+     default:
+       break;
+   }
+ }
+
+ async function handleGeminiToolCall(msg) {
+   const callId = msg.callId || `gemini-tc-${Date.now()}`;
+   const name = msg.name || "unknown";
+   const args = msg.args || {};
+
+   sdkVoiceToolCalls.value = [...sdkVoiceToolCalls.value, { callId, name, args, status: "running" }];
+   sdkVoiceState.value = "thinking";
+   emit("tool-call-start", { callId, name, args });
+
+   try {
+     const res = await fetch("/api/voice/tool", {
+       method: "POST",
+       headers: { "Content-Type": "application/json" },
+       body: JSON.stringify({
+         toolName: name,
+         args,
+         sessionId: sdkVoiceSessionId.value,
+         executor: _callContext.executor || undefined,
+         mode: _callContext.mode || undefined,
+         model: _callContext.model || undefined,
+       }),
+     });
+     const result = await res.json();
+
+     sdkVoiceToolCalls.value = sdkVoiceToolCalls.value.map((tc) =>
+       tc.callId === callId ? { ...tc, status: "complete", result: result.result } : tc
+     );
+
+     // Send tool result back to Gemini via WebSocket
+     if (_session && _session.readyState === WebSocket.OPEN) {
+       _session.send(JSON.stringify({
+         type: "tool_result",
+         callId,
+         name,
+         result: result.result || result.error || "No output",
+       }));
+     }
+
+     emit("tool-call-complete", { callId, name, result: result.result });
+   } catch (err) {
+     sdkVoiceToolCalls.value = sdkVoiceToolCalls.value.map((tc) =>
+       tc.callId === callId ? { ...tc, status: "error", error: err.message } : tc
+     );
+     emit("tool-call-error", { callId, name, error: err.message });
+   }
+ }
+
+ function playGeminiAudio(data) {
+   // Use Web Audio API to play PCM audio from Gemini
+   try {
+     if (typeof AudioContext !== "undefined" || typeof webkitAudioContext !== "undefined") {
+       const AudioCtx = globalThis.AudioContext || globalThis.webkitAudioContext;
+       if (!playGeminiAudio._ctx) {
+         playGeminiAudio._ctx = new AudioCtx({ sampleRate: 24000 });
+       }
+       const ctx = playGeminiAudio._ctx;
+
+       if (data instanceof Blob) {
+         data.arrayBuffer().then((buf) => {
+           ctx.decodeAudioData(buf, (audioBuffer) => {
+             const source = ctx.createBufferSource();
+             source.buffer = audioBuffer;
+             source.connect(ctx.destination);
+             source.start();
+           }).catch(() => { /* ignore decode errors */ });
+         });
+       }
+     }
+   } catch {
+     // Audio playback not available
+   }
+ }
+
+ // ── Public API ──────────────────────────────────────────────────────────────
+
+ /**
+  * Start a voice session using the best available SDK.
+  * Falls back to legacy voice-client.js if SDK initialization fails.
+  *
+  * @param {object} options — { sessionId, executor, mode, model }
+  * @returns {Promise<{ sdk: boolean, provider: string }>}
+  */
+ export async function startSdkVoiceSession(options = {}) {
+   if (_session) {
+     console.warn("[voice-client-sdk] Session already active");
+     return { sdk: sdkVoiceSdkActive.value, provider: sdkVoiceProvider.value };
+   }
+
+   _callContext = _normalizeCallContext(options);
+   sdkVoiceBoundSessionId.value = _callContext.sessionId;
+   sdkVoiceState.value = "connecting";
+   sdkVoiceError.value = null;
+   sdkVoiceTranscript.value = "";
+   sdkVoiceResponse.value = "";
+   sdkVoiceToolCalls.value = [];
+   _usingLegacyFallback = false;
+
+   try {
+     // 1. Fetch SDK config from server
+     _sdkConfig = await fetchSdkConfig();
+
+     // 2. Try SDK-based session based on provider
+     if (_sdkConfig.useSdk) {
+       const provider = _sdkConfig.provider || "openai";
+
+       if (provider === "openai" || provider === "azure") {
+         await startAgentsSdkSession(_sdkConfig, options);
+         return { sdk: true, provider };
+       }
+
+       if (provider === "gemini") {
+         await startGeminiLiveSession(_sdkConfig, options);
+         return { sdk: true, provider: "gemini" };
+       }
+     }
+
+     // 3. SDK not available — signal fallback
+     _usingLegacyFallback = true;
+     sdkVoiceSdkActive.value = false;
+     emit("sdk-unavailable", {
+       reason: _sdkConfig.fallbackReason || "SDK not available for provider",
+       provider: _sdkConfig.provider,
+     });
+
+     return { sdk: false, provider: _sdkConfig.provider, reason: _sdkConfig.fallbackReason };
+   } catch (err) {
+     console.error("[voice-client-sdk] SDK session failed, signaling fallback:", err);
+     _usingLegacyFallback = true;
+     sdkVoiceSdkActive.value = false;
+     sdkVoiceState.value = "idle";
+     sdkVoiceError.value = null; // Don't show error — we'll fallback
+     emit("sdk-unavailable", {
+       reason: err.message,
+       provider: _sdkConfig?.provider || "unknown",
+     });
+
+     return { sdk: false, provider: _sdkConfig?.provider || "unknown", reason: err.message };
+   }
+ }
+
+ /**
+  * Stop the current SDK voice session.
+  */
+ export function stopSdkVoiceSession() {
+   emit("session-ending", { sessionId: sdkVoiceSessionId.value });
+
+   if (_session) {
+     try {
+       if (typeof _session.close === "function") {
+         _session.close();
+       } else if (typeof _session.disconnect === "function") {
+         _session.disconnect();
+       }
+     } catch {
+       // best effort
+     }
+     _session = null;
+   }
+
+   // Stop Gemini mic stream if active
+   if (_geminiMicStream) {
+     for (const track of _geminiMicStream.getTracks()) {
+       try { track.stop(); } catch { /* ignore */ }
+     }
+     _geminiMicStream = null;
+   }
+
+   clearInterval(_durationTimer);
+   _durationTimer = null;
+
+   sdkVoiceState.value = "idle";
+   sdkVoiceTranscript.value = "";
+   sdkVoiceResponse.value = "";
+   sdkVoiceToolCalls.value = [];
+   sdkVoiceSessionId.value = null;
+   sdkVoiceBoundSessionId.value = null;
+   sdkVoiceDuration.value = 0;
+   sdkVoiceProvider.value = null;
+   sdkVoiceSdkActive.value = false;
+   _callContext = { sessionId: null, executor: null, mode: null, model: null };
+   _usingLegacyFallback = false;
+
+   emit("session-ended", {});
+ }
+
+ /**
+  * Interrupt the current response (barge-in).
+  */
+ export function interruptSdkResponse() {
+   if (_session) {
+     if (typeof _session.interrupt === "function") {
+       // @openai/agents SDK
+       _session.interrupt();
+     } else if (_session.readyState === WebSocket.OPEN) {
+       // Gemini WebSocket
+       _session.send(JSON.stringify({ type: "response.cancel" }));
+     }
+     emit("interrupt", {});
+   }
+ }
+
+ /**
+  * Send a text message to the voice agent.
+  */
+ export function sendSdkTextMessage(text) {
+   if (!_session) {
+     console.warn("[voice-client-sdk] Cannot send text — no active session");
+     return;
+   }
+
+   if (typeof _session.sendMessage === "function") {
+     // @openai/agents SDK
+     _session.sendMessage(text);
+   } else if (_session.readyState === WebSocket.OPEN) {
+     // Gemini WebSocket
+     _session.send(JSON.stringify({
+       type: "text.input",
+       text,
+     }));
+   }
+ }
+
+ /**
+  * Check if falling back to legacy voice.
+  */
+ export function isUsingLegacyFallback() {
+   return _usingLegacyFallback;
+ }
+
+ /**
+  * Get current SDK session info.
+  */
+ export function getSdkSessionInfo() {
+   return {
+     active: sdkVoiceSdkActive.value,
+     provider: sdkVoiceProvider.value,
+     sessionId: sdkVoiceSessionId.value,
+     state: sdkVoiceState.value,
+     duration: sdkVoiceDuration.value,
+     usingLegacy: _usingLegacyFallback,
+     sdkConfig: _sdkConfig,
+   };
+ }
+
+ // ── Duration Timer ──────────────────────────────────────────────────────────
+
+ function startDurationTimer() {
+   clearInterval(_durationTimer);
+   _durationTimer = setInterval(() => {
+     sdkVoiceDuration.value = Math.floor((Date.now() - _sessionStartTime) / 1000);
+   }, 1000);
+ }
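
Editor's note: the module above is consumed through its exports rather than instantiated, and SDK failure is reported (via the "sdk-unavailable" event and a { sdk: false } return value) rather than thrown. The sketch below is an editorial illustration, not code from the package; it assumes voice-overlay.js-style wiring, and the legacy entry point named in the comment is hypothetical.

// Sketch: driving voice-client-sdk.js from a UI module.
import {
  startSdkVoiceSession,
  stopSdkVoiceSession,
  onSdkVoiceEvent,
  isUsingLegacyFallback,
} from "./voice-client-sdk.js";

async function openVoice(taskSessionId) {
  // Subscribe before starting; onSdkVoiceEvent returns an unsubscribe fn.
  const offUnavailable = onSdkVoiceEvent("sdk-unavailable", ({ provider, reason }) => {
    console.warn(`[voice] SDK unavailable (${provider}): ${reason}`);
    // startLegacyVoiceSession(...) (hypothetical legacy voice-client.js entry point)
  });
  const offTranscript = onSdkVoiceEvent("transcript", ({ text }) => {
    console.log("[voice] user:", text);
  });

  const { sdk, provider } = await startSdkVoiceSession({
    sessionId: taskSessionId, // binds transcript persistence to a bosun session
    executor: "codex",        // illustrative values; see _normalizeCallContext
  });
  console.log(sdk ? `voice via ${provider}` : "legacy fallback", isUsingLegacyFallback());

  // Caller holds the teardown: stopSdkVoiceSession() closes the session,
  // stops mic tracks and the duration timer, and resets every signal.
  return () => {
    stopSdkVoiceSession();
    offUnavailable();
    offTranscript();
  };
}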
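
The Gemini path also implies a small client/server message protocol over the /api/voice/gemini-live WebSocket. The summary below is inferred purely from the client handlers in this diff (startGeminiLiveSession, startGeminiMicCapture, handleGeminiServerEvent, handleGeminiToolCall); the authoritative server half lives in voice-relay.mjs, which this release adds but this excerpt does not show.

// Inferred /api/voice/gemini-live message shapes (client's view only).
//
// Client → server (JSON unless noted):
//   { type: "session.config", sessionId, executor, mode, model }
//   <binary audio/webm;codecs=opus chunks from MediaRecorder, every 250ms>
//   { type: "tool_result", callId, name, result }
//   { type: "text.input", text }
//   { type: "response.cancel" }
//
// Server → client:
//   { type: "transcript.user", text }
//   { type: "transcript.assistant", text }
//   { type: "tool_call", callId, name, args }
//   { type: "speech_started" } / { type: "speech_stopped" }
//   { type: "error", message }
//   <binary audio frames, decoded at 24 kHz by playGeminiAudio>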