bosun 0.37.0 → 0.37.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +4 -1
- package/agent-tool-config.mjs +338 -0
- package/bosun-skills.mjs +59 -4
- package/bosun.schema.json +1 -1
- package/desktop/launch.mjs +18 -0
- package/desktop/main.mjs +52 -13
- package/fleet-coordinator.mjs +34 -1
- package/kanban-adapter.mjs +30 -3
- package/library-manager.mjs +66 -0
- package/maintenance.mjs +30 -5
- package/monitor.mjs +56 -0
- package/package.json +4 -1
- package/setup-web-server.mjs +73 -12
- package/setup.mjs +3 -3
- package/ui/app.js +40 -3
- package/ui/components/session-list.js +25 -7
- package/ui/components/workspace-switcher.js +48 -1
- package/ui/demo.html +176 -0
- package/ui/modules/mic-track-registry.js +83 -0
- package/ui/modules/settings-schema.js +4 -1
- package/ui/modules/state.js +25 -0
- package/ui/modules/streaming.js +1 -1
- package/ui/modules/voice-barge-in.js +27 -0
- package/ui/modules/voice-client-sdk.js +268 -42
- package/ui/modules/voice-client.js +665 -61
- package/ui/modules/voice-overlay.js +829 -47
- package/ui/setup.html +151 -9
- package/ui/styles.css +258 -0
- package/ui/tabs/chat.js +11 -0
- package/ui/tabs/library.js +890 -15
- package/ui/tabs/settings.js +51 -11
- package/ui/tabs/telemetry.js +327 -105
- package/ui/tabs/workflows.js +86 -0
- package/ui-server.mjs +1201 -107
- package/voice-action-dispatcher.mjs +81 -0
- package/voice-agents-sdk.mjs +2 -2
- package/voice-relay.mjs +131 -14
- package/voice-tools.mjs +475 -9
- package/workflow-engine.mjs +54 -0
- package/workflow-nodes.mjs +177 -28
- package/workflow-templates/github.mjs +205 -94
- package/workflow-templates/task-batch.mjs +247 -0
- package/workflow-templates.mjs +15 -0
package/ui/modules/streaming.js
CHANGED
|
@@ -629,7 +629,7 @@ export function startAgentStatusTracking() {
|
|
|
629
629
|
const content = String(message.content || "").toLowerCase();
|
|
630
630
|
const lifecycle = String(message?.meta?.lifecycle || "").toLowerCase();
|
|
631
631
|
const adapter = payload.session?.type || "";
|
|
632
|
-
const sessionId = payload.
|
|
632
|
+
const sessionId = payload.session?.id || payload.sessionId || payload.taskId || "";
|
|
633
633
|
const sessionStatus = payload.session?.status || "active";
|
|
634
634
|
|
|
635
635
|
if (sessionStatus !== "active") {
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* voice-barge-in.js
|
|
3
|
+
*
|
|
4
|
+
* Shared policy helpers for automatic barge-in (interrupt assistant playback
|
|
5
|
+
* when the user starts speaking).
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
 * Decide whether assistant playback may be interrupted automatically.
 *
 * @param {object} [options]
 * @param {boolean} [options.muted=false] - Truthy blocks the barge-in.
 * @param {boolean} [options.audioActive=false] - Falsy blocks the barge-in.
 * @param {number} [options.now=Date.now()] - Current timestamp in milliseconds.
 * @param {number} [options.lastTriggeredAt=0] - Timestamp of the previous
 *   trigger; falsy values are treated as 0.
 * @param {number} [options.minIntervalMs=700] - Minimum gap between triggers;
 *   falsy values are treated as 0.
 * @returns {boolean} True when not muted, audio is active, and at least
 *   `minIntervalMs` has elapsed since `lastTriggeredAt`.
 */
export function shouldAutoBargeIn(options = {}) {
  const {
    muted = false,
    audioActive = false,
    now = Date.now(),
    lastTriggeredAt = 0,
    minIntervalMs = 700,
  } = options;

  const blocked = muted || !audioActive;
  if (blocked) {
    return false;
  }

  const sinceLastTrigger = Number(now) - Number(lastTriggeredAt || 0);
  const cooldown = Number(minIntervalMs || 0);
  // A NaN on either side compares false, so malformed input never triggers.
  return sinceLastTrigger >= cooldown;
}
|
|
20
|
+
|
|
21
|
+
/**
 * Decide whether a local mic-level reading should count as a barge-in.
 *
 * @param {object} [options]
 * @param {boolean} [options.speaking=false] - Whether speech is currently detected.
 * @param {number} [options.level=0] - Measured mic level (coerced with Number()).
 * @param {number} [options.threshold=0.08] - Minimum level that qualifies.
 * @returns {boolean} True only when `speaking` is truthy and `level` is at
 *   or above `threshold`.
 */
export function shouldAutoBargeInFromMicLevel(options = {}) {
  const { speaking = false, level = 0, threshold = 0.08 } = options;
  if (!speaking) {
    return false;
  }
  const micLevel = Number(level);
  const minimumLevel = Number(threshold);
  return micLevel >= minimumLevel;
}
|
|
@@ -14,6 +14,13 @@
|
|
|
14
14
|
*/
|
|
15
15
|
|
|
16
16
|
import { signal, computed } from "@preact/signals";
|
|
17
|
+
import {
|
|
18
|
+
ensureMicTrackingPatched,
|
|
19
|
+
registerMicStream,
|
|
20
|
+
stopTrackedMicStreams,
|
|
21
|
+
} from "./mic-track-registry.js";
|
|
22
|
+
import { shouldAutoBargeIn } from "./voice-barge-in.js";
|
|
23
|
+
import { isVoiceMicMuted } from "./voice-client.js";
|
|
17
24
|
|
|
18
25
|
// ── State Signals (same shape as voice-client.js) ───────────────────────────
|
|
19
26
|
|
|
@@ -49,6 +56,7 @@ let _callContext = {
|
|
|
49
56
|
executor: null,
|
|
50
57
|
mode: null,
|
|
51
58
|
model: null,
|
|
59
|
+
voiceAgentId: null,
|
|
52
60
|
};
|
|
53
61
|
let _sdkConfig = null;
|
|
54
62
|
let _usingLegacyFallback = false;
|
|
@@ -64,6 +72,13 @@ let _pendingAssistantTranscriptText = "";
|
|
|
64
72
|
let _awaitingToolCompletionAck = false;
|
|
65
73
|
let _toolCompletionAckTimer = null;
|
|
66
74
|
let _assistantBaselineBeforeToolAck = "";
|
|
75
|
+
const _sdkCapturedMicStreams = new Set();
|
|
76
|
+
let _lastAutoBargeInAt = 0;
|
|
77
|
+
const AUTO_BARGE_IN_COOLDOWN_MS = 700;
|
|
78
|
+
// Set to true by stopSdkVoiceSession() so that any in-flight getUserMedia
|
|
79
|
+
// call in startAgentsSdkSession / startGeminiMicCapture releases the track
|
|
80
|
+
// immediately instead of leaving the browser mic indicator active.
|
|
81
|
+
let _sdkExplicitStop = false;
|
|
67
82
|
|
|
68
83
|
// ── Event System ────────────────────────────────────────────────────────────
|
|
69
84
|
|
|
@@ -86,12 +101,31 @@ function emit(event, data) {
|
|
|
86
101
|
}
|
|
87
102
|
}
|
|
88
103
|
|
|
104
|
+
/**
 * Interrupt the assistant response when the barge-in policy allows it.
 *
 * The policy is evaluated with the current mute flag, whether a session
 * object exists, and the cooldown since the last automatic interrupt. On
 * success the cooldown timestamp is updated, the response is interrupted,
 * the voice state becomes "listening" and an "auto-barge-in" event fires.
 *
 * @param {string} [reason="speech-started"] - Forwarded in the event payload.
 * @returns {boolean} True when an interrupt was issued, false otherwise.
 */
function maybeAutoInterruptSdkResponse(reason = "speech-started") {
  const triggeredAt = Date.now();
  const policyInput = {
    muted: isVoiceMicMuted.value,
    audioActive: Boolean(_session),
    now: triggeredAt,
    lastTriggeredAt: _lastAutoBargeInAt,
    minIntervalMs: AUTO_BARGE_IN_COOLDOWN_MS,
  };

  if (shouldAutoBargeIn(policyInput)) {
    _lastAutoBargeInAt = triggeredAt;
    interruptSdkResponse();
    sdkVoiceState.value = "listening";
    emit("auto-barge-in", { reason });
    return true;
  }
  return false;
}
|
|
121
|
+
|
|
89
122
|
/**
 * Build a call-context record from caller-supplied options.
 *
 * Each field is stringified and trimmed; falsy or whitespace-only values
 * become null.
 *
 * @param {object} [options] - May carry sessionId, executor, mode, model
 *   and voiceAgentId.
 * @returns {{sessionId: ?string, executor: ?string, mode: ?string,
 *   model: ?string, voiceAgentId: ?string}}
 */
function _normalizeCallContext(options = {}) {
  const fieldNames = ["sessionId", "executor", "mode", "model", "voiceAgentId"];
  const context = {};
  for (const field of fieldNames) {
    const text = String(options?.[field] || "").trim();
    context[field] = text || null;
  }
  return context;
}
|
|
97
131
|
|
|
@@ -122,6 +156,14 @@ function isNonFatalSdkSessionError(err) {
|
|
|
122
156
|
if (/setRemoteDescription/i.test(message) && /SessionDescription/i.test(message)) {
|
|
123
157
|
return true;
|
|
124
158
|
}
|
|
159
|
+
// Runtime item-level transcription failures should not hard-fail the live call.
|
|
160
|
+
if (
|
|
161
|
+
lower.includes("input transcription failed")
|
|
162
|
+
|| lower.includes("transcription failed for item")
|
|
163
|
+
|| lower.includes("input_audio_transcription")
|
|
164
|
+
) {
|
|
165
|
+
return true;
|
|
166
|
+
}
|
|
125
167
|
return false;
|
|
126
168
|
}
|
|
127
169
|
|
|
@@ -246,7 +288,7 @@ function _flushPendingTranscriptBuffers() {
|
|
|
246
288
|
}
|
|
247
289
|
|
|
248
290
|
const finalUser = String(_pendingUserTranscriptText || "").trim();
|
|
249
|
-
if (finalUser) {
|
|
291
|
+
if (finalUser && ENABLE_USER_TRANSCRIPT) {
|
|
250
292
|
_persistTranscriptIfNew("user", finalUser, "sdk.history_updated.user.flush");
|
|
251
293
|
}
|
|
252
294
|
|
|
@@ -314,10 +356,13 @@ function _scheduleUserTranscriptFinalize(text) {
|
|
|
314
356
|
if (ENABLE_USER_TRANSCRIPT) {
|
|
315
357
|
sdkVoiceTranscript.value = finalText;
|
|
316
358
|
emit("transcript", { text: finalText, final: true });
|
|
359
|
+
_persistTranscriptIfNew("user", finalText, "sdk.history_updated.user.final");
|
|
317
360
|
} else {
|
|
318
361
|
sdkVoiceTranscript.value = "";
|
|
362
|
+
// Skip persisting user transcript — ASR often hallucinates wrong
|
|
363
|
+
// languages from short fragments; the model still receives the raw
|
|
364
|
+
// audio correctly so nothing is lost.
|
|
319
365
|
}
|
|
320
|
-
_persistTranscriptIfNew("user", finalText, "sdk.history_updated.user.final");
|
|
321
366
|
}, 350);
|
|
322
367
|
}
|
|
323
368
|
|
|
@@ -364,6 +409,7 @@ async function startAgentsSdkSession(config, options = {}) {
|
|
|
364
409
|
executor: _callContext.executor || undefined,
|
|
365
410
|
mode: _callContext.mode || undefined,
|
|
366
411
|
model: _callContext.model || undefined,
|
|
412
|
+
voiceAgentId: _callContext.voiceAgentId || undefined,
|
|
367
413
|
delegateOnly: false,
|
|
368
414
|
sdkMode: true,
|
|
369
415
|
}),
|
|
@@ -396,6 +442,7 @@ async function startAgentsSdkSession(config, options = {}) {
|
|
|
396
442
|
executor: _callContext.executor || undefined,
|
|
397
443
|
mode: _callContext.mode || undefined,
|
|
398
444
|
model: _callContext.model || undefined,
|
|
445
|
+
voiceAgentId: _callContext.voiceAgentId || undefined,
|
|
399
446
|
}),
|
|
400
447
|
});
|
|
401
448
|
} catch (fetchErr) {
|
|
@@ -455,14 +502,20 @@ async function startAgentsSdkSession(config, options = {}) {
|
|
|
455
502
|
// Determine model and voice
|
|
456
503
|
const model = String(tokenData.model || resolvedConfig.model || "gpt-realtime-1.5").trim();
|
|
457
504
|
const voiceId = String(tokenData.voiceId || resolvedConfig.voiceId || "alloy").trim();
|
|
458
|
-
const turnDetection = String(resolvedConfig.turnDetection || "
|
|
505
|
+
const turnDetection = String(resolvedConfig.turnDetection || "semantic_vad").trim();
|
|
506
|
+
// Use server-provided transcription model from sessionConfig, fall back to default
|
|
507
|
+
const serverSessionConfig = tokenData?.sessionConfig || {};
|
|
508
|
+
const transcriptionModel =
|
|
509
|
+
serverSessionConfig?.input_audio_transcription?.model || "gpt-4o-transcribe";
|
|
510
|
+
const transcriptionEnabled =
|
|
511
|
+
serverSessionConfig?.input_audio_transcription !== undefined;
|
|
459
512
|
const turnDetectionConfig = {
|
|
460
513
|
type: turnDetection,
|
|
461
514
|
...(turnDetection === "server_vad"
|
|
462
515
|
? {
|
|
463
|
-
threshold: 0.
|
|
516
|
+
threshold: 0.7,
|
|
464
517
|
prefix_padding_ms: 400,
|
|
465
|
-
silence_duration_ms:
|
|
518
|
+
silence_duration_ms: 1300,
|
|
466
519
|
create_response: true,
|
|
467
520
|
interrupt_response: true,
|
|
468
521
|
createResponse: true,
|
|
@@ -488,12 +541,13 @@ async function startAgentsSdkSession(config, options = {}) {
|
|
|
488
541
|
audio: {
|
|
489
542
|
input: {
|
|
490
543
|
format: "pcm16",
|
|
491
|
-
transcription: { model:
|
|
544
|
+
...(transcriptionEnabled ? { transcription: { model: transcriptionModel } } : {}),
|
|
492
545
|
turnDetection: turnDetectionConfig,
|
|
493
546
|
},
|
|
494
547
|
output: {
|
|
495
548
|
format: "pcm16",
|
|
496
549
|
voice: voiceId,
|
|
550
|
+
...(transcriptionEnabled ? { transcription: { model: transcriptionModel } } : {}),
|
|
497
551
|
},
|
|
498
552
|
},
|
|
499
553
|
},
|
|
@@ -536,6 +590,11 @@ async function startAgentsSdkSession(config, options = {}) {
|
|
|
536
590
|
emit("interrupt", {});
|
|
537
591
|
});
|
|
538
592
|
|
|
593
|
+
session.on("speech_started", () => {
|
|
594
|
+
maybeAutoInterruptSdkResponse("speech-started");
|
|
595
|
+
emit("speech-started", {});
|
|
596
|
+
});
|
|
597
|
+
|
|
539
598
|
session.on("tool_call_start", (event) => {
|
|
540
599
|
const callId = event?.callId || event?.call_id || `tc-${Date.now()}`;
|
|
541
600
|
const name = event?.name || event?.toolName || "unknown";
|
|
@@ -626,7 +685,35 @@ async function startAgentsSdkSession(config, options = {}) {
|
|
|
626
685
|
// ignore URL logging issues
|
|
627
686
|
}
|
|
628
687
|
|
|
629
|
-
|
|
688
|
+
// Attempt WebRTC connection first. For Azure, if it fails (404 — WebRTC not
|
|
689
|
+
// supported), retry with the WebSocket URL so the SDK uses WS transport.
|
|
690
|
+
// Wrap getUserMedia during connect so we can always stop SDK-owned mic tracks
|
|
691
|
+
// on teardown, even if the SDK keeps hidden stream references.
|
|
692
|
+
await _withGetUserMediaCapture(async () => {
|
|
693
|
+
try {
|
|
694
|
+
await session.connect(connectOpts);
|
|
695
|
+
} catch (connectErr) {
|
|
696
|
+
const errMsg = String(connectErr?.message || "");
|
|
697
|
+
const isWebRtc404 = /404|not found|SDP/i.test(errMsg);
|
|
698
|
+
const hasWsUrl = Boolean(String(tokenData?.wsUrl || "").trim());
|
|
699
|
+
if (isWebRtc404 && hasWsUrl && tokenData.provider === "azure") {
|
|
700
|
+
console.warn("[voice-client-sdk] WebRTC connect failed (404) — retrying via Azure WebSocket");
|
|
701
|
+
await session.connect({ ...connectOpts, url: tokenData.wsUrl });
|
|
702
|
+
} else {
|
|
703
|
+
throw connectErr;
|
|
704
|
+
}
|
|
705
|
+
}
|
|
706
|
+
});
|
|
707
|
+
|
|
708
|
+
// Guard: stopSdkVoiceSession() may have been called while session.connect()
|
|
709
|
+
// was awaiting. Release any mic streams captured during connect so that the
|
|
710
|
+
// browser indicator goes away, then abort this session setup.
|
|
711
|
+
if (_sdkExplicitStop) {
|
|
712
|
+
_stopCapturedSdkMicStreams();
|
|
713
|
+
stopTrackedMicStreams();
|
|
714
|
+
try { session.close?.(); } catch { /* ignore */ }
|
|
715
|
+
throw new Error("SDK session was stopped during connection");
|
|
716
|
+
}
|
|
630
717
|
|
|
631
718
|
if (_agentsRealtimeModuleSource) {
|
|
632
719
|
console.info(`[voice-client-sdk] using OpenAI Realtime SDK from ${_agentsRealtimeModuleSource}`);
|
|
@@ -685,6 +772,7 @@ async function startGeminiLiveSession(config, options = {}) {
|
|
|
685
772
|
executor: _callContext.executor,
|
|
686
773
|
mode: _callContext.mode,
|
|
687
774
|
model: resolvedConfig.model,
|
|
775
|
+
voiceAgentId: _callContext.voiceAgentId || undefined,
|
|
688
776
|
}));
|
|
689
777
|
|
|
690
778
|
_session = ws;
|
|
@@ -753,6 +841,17 @@ async function startGeminiMicCapture(ws) {
|
|
|
753
841
|
channelCount: 1,
|
|
754
842
|
},
|
|
755
843
|
});
|
|
844
|
+
registerMicStream(_geminiMicStream);
|
|
845
|
+
|
|
846
|
+
// Guard: stopSdkVoiceSession() may have raced with this getUserMedia await.
|
|
847
|
+
// Release the mic immediately instead of leaving the indicator active.
|
|
848
|
+
if (_sdkExplicitStop) {
|
|
849
|
+
for (const track of _geminiMicStream.getTracks()) {
|
|
850
|
+
try { track.stop(); } catch { /* ignore */ }
|
|
851
|
+
}
|
|
852
|
+
_geminiMicStream = null;
|
|
853
|
+
throw new Error("SDK session was stopped during microphone acquisition");
|
|
854
|
+
}
|
|
756
855
|
|
|
757
856
|
// Use MediaRecorder to stream chunks to server
|
|
758
857
|
const recorder = new MediaRecorder(_geminiMicStream, {
|
|
@@ -772,45 +871,118 @@ async function startGeminiMicCapture(ws) {
|
|
|
772
871
|
sdkVoiceState.value = "listening";
|
|
773
872
|
}
|
|
774
873
|
|
|
775
|
-
function
|
|
776
|
-
if (!source) return;
|
|
777
|
-
const
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
874
|
+
/**
 * Walk an arbitrary object graph breadth-first and invoke `cb` once for every
 * distinct audio track found.
 *
 * Tracks are discovered on any visited node exposing `getTracks()` (each
 * returned item whose `kind` is "audio", case-insensitive) or `getSenders()`
 * (each sender's `track` with the same check). Child nodes are the own
 * enumerable property values that are objects or functions.
 *
 * The walk is bounded: at most 220 nodes are inspected and nodes deeper than
 * 4 levels below `source` are never inspected. Cycles are handled by
 * remembering visited nodes.
 *
 * Errors thrown by `getTracks()`, `getSenders()` or by `cb` itself are
 * swallowed; an error aborts enumeration of that node's remaining tracks
 * but the walk continues.
 *
 * @param {*} source - Root of the graph; falsy values make this a no-op.
 * @param {(track: object) => void} cb - Called once per distinct audio track;
 *   a non-function makes this a no-op.
 * @returns {void}
 */
function forEachAudioTrackInSource(source, cb) {
  if (!source || typeof cb !== "function") return;

  const MAX_VISITED_NODES = 220;
  const MAX_DEPTH = 4;

  const isTraversable = (value) =>
    Boolean(value) && (typeof value === "object" || typeof value === "function");

  const seenObjects = new Set();
  const seenTracks = new Set();
  const queue = [{ node: source, depth: 0 }];
  let visited = 0;

  const visitTrack = (track) => {
    if (!track || String(track?.kind || "").toLowerCase() !== "audio") return;
    if (seenTracks.has(track)) return;
    seenTracks.add(track);
    cb(track);
  };

  // An index cursor replaces queue.shift(), which is O(n) per dequeue.
  for (let head = 0; head < queue.length; head += 1) {
    const { node, depth } = queue[head];
    if (!isTraversable(node) || seenObjects.has(node)) continue;
    seenObjects.add(node);
    visited += 1;
    // Once the budget is spent no later node can be inspected, so stop
    // instead of draining the rest of the queue.
    if (visited > MAX_VISITED_NODES) break;

    if (typeof node?.getTracks === "function") {
      try {
        for (const track of node.getTracks()) {
          visitTrack(track);
        }
      } catch {
        // ignore stream enumeration failures
      }
    }

    if (typeof node?.getSenders === "function") {
      try {
        for (const sender of node.getSenders()) {
          visitTrack(sender?.track);
        }
      } catch {
        // ignore pc sender failures
      }
    }

    // Children of a node at the depth limit would never be inspected, so
    // they are not enqueued at all.
    if (depth >= MAX_DEPTH) continue;

    let values = null;
    try {
      values = Object.values(node);
    } catch {
      values = null;
    }
    if (!values) continue;

    for (const next of values) {
      if (isTraversable(next)) {
        queue.push({ node: next, depth: depth + 1 });
      }
    }
  }
}
|
|
797
929
|
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
930
|
+
/**
 * Stop every audio track reachable from `source`.
 *
 * Track discovery is delegated to forEachAudioTrackInSource; a failure to
 * stop one track is ignored.
 *
 * @param {*} source - Object graph to search for audio tracks.
 * @returns {void}
 */
function stopMicLikeTracks(source) {
  const stopQuietly = (track) => {
    try {
      track.stop();
    } catch {
      /* ignore */
    }
  };
  forEachAudioTrackInSource(source, stopQuietly);
}
|
|
935
|
+
|
|
936
|
+
/**
 * Remember a stream so its audio tracks can be stopped on teardown.
 *
 * A value is recorded only when it exposes `getTracks()` and its
 * `getAudioTracks()` reports at least one track; anything else is ignored.
 *
 * @param {*} stream - Candidate stream, typically a getUserMedia result.
 * @returns {void}
 */
function _captureSdkMicStream(stream) {
  const looksLikeStream = Boolean(stream) && typeof stream.getTracks === "function";
  if (!looksLikeStream) return;

  const audioTracks = stream.getAudioTracks?.() || [];
  if (audioTracks.length > 0) {
    _sdkCapturedMicStreams.add(stream);
  }
}
|
|
942
|
+
|
|
943
|
+
/**
 * Stop the audio tracks of every captured stream, then forget the streams.
 *
 * Best effort: a failure to enumerate one stream or stop one track does not
 * prevent the remaining ones from being processed.
 *
 * @returns {void}
 */
function _stopCapturedSdkMicStreams() {
  _sdkCapturedMicStreams.forEach((stream) => {
    try {
      for (const track of stream.getTracks()) {
        const isAudio = String(track?.kind || "").toLowerCase() === "audio";
        if (isAudio) {
          try {
            track.stop();
          } catch {
            /* ignore */
          }
        }
      }
    } catch {
      // best effort
    }
  });
  _sdkCapturedMicStreams.clear();
}
|
|
956
|
+
|
|
957
|
+
/**
 * Run `fn` while `navigator.mediaDevices.getUserMedia` is temporarily wrapped
 * so that every stream it resolves is passed to _captureSdkMicStream.
 *
 * When mediaDevices or getUserMedia is unavailable, `fn` simply runs
 * unwrapped. The previous getUserMedia value is assigned back once `fn`
 * settles, whether it resolved or threw.
 *
 * @param {() => Promise<*>} fn - Work to perform while capture is installed.
 * @returns {Promise<*>} Whatever `fn` resolves to; rejections propagate.
 */
async function _withGetUserMediaCapture(fn) {
  const devices = globalThis?.navigator?.mediaDevices;
  const previousGetUserMedia = devices?.getUserMedia;
  const canWrap = Boolean(devices) && typeof previousGetUserMedia === "function";
  if (!canWrap) {
    return await fn();
  }

  const capturingGetUserMedia = async (...args) => {
    const stream = await previousGetUserMedia.apply(devices, args);
    _captureSdkMicStream(stream);
    return stream;
  };

  devices.getUserMedia = capturingGetUserMedia;
  try {
    return await fn();
  } finally {
    devices.getUserMedia = previousGetUserMedia;
  }
}
|
|
974
|
+
|
|
975
|
+
/**
 * Set the `enabled` flag on every audio track reachable from `source`.
 *
 * @param {*} source - Object graph to search for audio tracks.
 * @param {*} enabled - Desired state; coerced to a boolean.
 * @returns {boolean} True when at least one track was updated.
 */
function setMicLikeTracksEnabled(source, enabled) {
  let updatedCount = 0;
  forEachAudioTrackInSource(source, (track) => {
    try {
      track.enabled = Boolean(enabled);
      updatedCount += 1;
    } catch {
      // ignore per-track failures
    }
  });
  return updatedCount > 0;
}
|
|
815
987
|
|
|
816
988
|
function handleGeminiServerEvent(msg) {
|
|
@@ -845,6 +1017,7 @@ function handleGeminiServerEvent(msg) {
|
|
|
845
1017
|
break;
|
|
846
1018
|
|
|
847
1019
|
case "speech_started":
|
|
1020
|
+
maybeAutoInterruptSdkResponse("speech-started");
|
|
848
1021
|
sdkVoiceState.value = "listening";
|
|
849
1022
|
emit("speech-started", {});
|
|
850
1023
|
break;
|
|
@@ -884,6 +1057,7 @@ async function handleGeminiToolCall(msg) {
|
|
|
884
1057
|
executor: _callContext.executor || undefined,
|
|
885
1058
|
mode: _callContext.mode || undefined,
|
|
886
1059
|
model: _callContext.model || undefined,
|
|
1060
|
+
voiceAgentId: _callContext.voiceAgentId || undefined,
|
|
887
1061
|
}),
|
|
888
1062
|
});
|
|
889
1063
|
const result = await res.json();
|
|
@@ -951,11 +1125,14 @@ function playGeminiAudio(data) {
|
|
|
951
1125
|
* @returns {Promise<{ sdk: boolean, provider: string }>}
|
|
952
1126
|
*/
|
|
953
1127
|
export async function startSdkVoiceSession(options = {}) {
|
|
1128
|
+
ensureMicTrackingPatched();
|
|
1129
|
+
_sdkExplicitStop = false; // reset before each new session attempt
|
|
954
1130
|
if (_session) {
|
|
955
1131
|
console.warn("[voice-client-sdk] Session already active");
|
|
956
1132
|
return { sdk: sdkVoiceSdkActive.value, provider: sdkVoiceProvider.value };
|
|
957
1133
|
}
|
|
958
1134
|
|
|
1135
|
+
isVoiceMicMuted.value = false;
|
|
959
1136
|
_callContext = _normalizeCallContext(options);
|
|
960
1137
|
sdkVoiceBoundSessionId.value = _callContext.sessionId;
|
|
961
1138
|
sdkVoiceState.value = "connecting";
|
|
@@ -964,6 +1141,7 @@ export async function startSdkVoiceSession(options = {}) {
|
|
|
964
1141
|
sdkVoiceResponse.value = "";
|
|
965
1142
|
sdkVoiceToolCalls.value = [];
|
|
966
1143
|
_usingLegacyFallback = false;
|
|
1144
|
+
_lastAutoBargeInAt = 0;
|
|
967
1145
|
_resetTranscriptPersistenceState();
|
|
968
1146
|
|
|
969
1147
|
try {
|
|
@@ -1014,6 +1192,7 @@ export async function startSdkVoiceSession(options = {}) {
|
|
|
1014
1192
|
sdkVoiceSdkActive.value = false;
|
|
1015
1193
|
sdkVoiceState.value = "idle";
|
|
1016
1194
|
sdkVoiceError.value = null; // Don't show error — we'll fallback
|
|
1195
|
+
_stopCapturedSdkMicStreams();
|
|
1017
1196
|
emit("sdk-unavailable", {
|
|
1018
1197
|
reason: reason || "SDK unavailable",
|
|
1019
1198
|
provider: _sdkConfig?.provider || "unknown",
|
|
@@ -1031,6 +1210,9 @@ export async function startSdkVoiceSession(options = {}) {
|
|
|
1031
1210
|
* Stop the current SDK voice session.
|
|
1032
1211
|
*/
|
|
1033
1212
|
export function stopSdkVoiceSession() {
|
|
1213
|
+
// Set before any cleanup so in-flight getUserMedia / session.connect awaiters
|
|
1214
|
+
// detect the cancellation and release acquired mic tracks immediately.
|
|
1215
|
+
_sdkExplicitStop = true;
|
|
1034
1216
|
emit("session-ending", { sessionId: sdkVoiceSessionId.value });
|
|
1035
1217
|
_flushPendingTranscriptBuffers();
|
|
1036
1218
|
if (_geminiRecorder) {
|
|
@@ -1051,6 +1233,7 @@ export function stopSdkVoiceSession() {
|
|
|
1051
1233
|
}
|
|
1052
1234
|
_session = null;
|
|
1053
1235
|
}
|
|
1236
|
+
_stopCapturedSdkMicStreams();
|
|
1054
1237
|
|
|
1055
1238
|
// Stop Gemini mic stream if active
|
|
1056
1239
|
if (_geminiMicStream) {
|
|
@@ -1059,6 +1242,9 @@ export function stopSdkVoiceSession() {
|
|
|
1059
1242
|
}
|
|
1060
1243
|
_geminiMicStream = null;
|
|
1061
1244
|
}
|
|
1245
|
+
// Force-stop any tracked audio input streams to avoid stale browser mic
|
|
1246
|
+
// capture indicators after call close (covers async/race teardown paths).
|
|
1247
|
+
stopTrackedMicStreams();
|
|
1062
1248
|
|
|
1063
1249
|
clearInterval(_durationTimer);
|
|
1064
1250
|
_durationTimer = null;
|
|
@@ -1072,7 +1258,14 @@ export function stopSdkVoiceSession() {
|
|
|
1072
1258
|
sdkVoiceDuration.value = 0;
|
|
1073
1259
|
sdkVoiceProvider.value = null;
|
|
1074
1260
|
sdkVoiceSdkActive.value = false;
|
|
1075
|
-
|
|
1261
|
+
isVoiceMicMuted.value = false;
|
|
1262
|
+
_callContext = {
|
|
1263
|
+
sessionId: null,
|
|
1264
|
+
executor: null,
|
|
1265
|
+
mode: null,
|
|
1266
|
+
model: null,
|
|
1267
|
+
voiceAgentId: null,
|
|
1268
|
+
};
|
|
1076
1269
|
_usingLegacyFallback = false;
|
|
1077
1270
|
_resetTranscriptPersistenceState();
|
|
1078
1271
|
|
|
@@ -1095,6 +1288,39 @@ export function interruptSdkResponse() {
|
|
|
1095
1288
|
}
|
|
1096
1289
|
}
|
|
1097
1290
|
|
|
1291
|
+
/**
 * Toggle microphone mute state for SDK-driven voice sessions.
 *
 * The new state is applied in three layers: the session's own mute/unmute
 * methods when present, then the `enabled` flag of any audio track reachable
 * from the session object, then the audio tracks of the Gemini mic stream.
 * Finally the shared `isVoiceMicMuted` signal is updated.
 *
 * @returns {boolean} The new muted state (true when the mic is now muted).
 */
export function toggleSdkMicMute() {
  const willBeMuted = !isVoiceMicMuted.value;
  const enabled = !willBeMuted;

  if (_session) {
    // Try SDK-native controls first when available.
    try {
      if (enabled && typeof _session.unmute === "function") {
        _session.unmute();
      } else if (typeof _session.mute === "function") {
        // Pass the target state explicitly. A session whose API is
        // mute(muted: boolean) would treat a bare mute() as mute(undefined),
        // and would never be unmuted if it has no unmute() method.
        // NOTE(review): signature assumed from the OpenAI Realtime SDK's
        // RealtimeSession.mute — confirm against the SDK version in use.
        _session.mute(willBeMuted);
      }
    } catch {
      // fall through to track-level toggles
    }
    setMicLikeTracksEnabled(_session, enabled);
  }

  if (_geminiMicStream) {
    for (const track of _geminiMicStream.getTracks()) {
      if (String(track?.kind || "").toLowerCase() !== "audio") continue;
      try { track.enabled = enabled; } catch { /* ignore */ }
    }
  }

  isVoiceMicMuted.value = willBeMuted;
  return isVoiceMicMuted.value;
}
|
|
1323
|
+
|
|
1098
1324
|
/**
|
|
1099
1325
|
* Send a text message to the voice agent.
|
|
1100
1326
|
* @param {string} text
|