bosun 0.37.0 → 0.37.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/.env.example +4 -1
  2. package/agent-tool-config.mjs +338 -0
  3. package/bosun-skills.mjs +59 -4
  4. package/bosun.schema.json +1 -1
  5. package/desktop/launch.mjs +18 -0
  6. package/desktop/main.mjs +52 -13
  7. package/fleet-coordinator.mjs +34 -1
  8. package/kanban-adapter.mjs +30 -3
  9. package/library-manager.mjs +66 -0
  10. package/maintenance.mjs +30 -5
  11. package/monitor.mjs +56 -0
  12. package/package.json +4 -1
  13. package/setup-web-server.mjs +73 -12
  14. package/setup.mjs +3 -3
  15. package/ui/app.js +40 -3
  16. package/ui/components/session-list.js +25 -7
  17. package/ui/components/workspace-switcher.js +48 -1
  18. package/ui/demo.html +176 -0
  19. package/ui/modules/mic-track-registry.js +83 -0
  20. package/ui/modules/settings-schema.js +4 -1
  21. package/ui/modules/state.js +25 -0
  22. package/ui/modules/streaming.js +1 -1
  23. package/ui/modules/voice-barge-in.js +27 -0
  24. package/ui/modules/voice-client-sdk.js +268 -42
  25. package/ui/modules/voice-client.js +665 -61
  26. package/ui/modules/voice-overlay.js +829 -47
  27. package/ui/setup.html +151 -9
  28. package/ui/styles.css +258 -0
  29. package/ui/tabs/chat.js +11 -0
  30. package/ui/tabs/library.js +890 -15
  31. package/ui/tabs/settings.js +51 -11
  32. package/ui/tabs/telemetry.js +327 -105
  33. package/ui/tabs/workflows.js +86 -0
  34. package/ui-server.mjs +1201 -107
  35. package/voice-action-dispatcher.mjs +81 -0
  36. package/voice-agents-sdk.mjs +2 -2
  37. package/voice-relay.mjs +131 -14
  38. package/voice-tools.mjs +475 -9
  39. package/workflow-engine.mjs +54 -0
  40. package/workflow-nodes.mjs +177 -28
  41. package/workflow-templates/github.mjs +205 -94
  42. package/workflow-templates/task-batch.mjs +247 -0
  43. package/workflow-templates.mjs +15 -0
@@ -8,6 +8,12 @@
8
8
  */
9
9
 
10
10
  import { signal, computed } from "@preact/signals";
11
+ import {
12
+ ensureMicTrackingPatched,
13
+ registerMicStream,
14
+ stopTrackedMicStreams,
15
+ } from "./mic-track-registry.js";
16
+ import { shouldAutoBargeIn } from "./voice-barge-in.js";
11
17
 
12
18
  // ── State Signals ───────────────────────────────────────────────────────────
13
19
 
@@ -25,16 +31,181 @@ export const isVoiceActive = computed(() =>
25
31
  );
26
32
  export const isVoiceMicMuted = signal(false);
27
33
 
34
// ── Audio Device Selection ──────────────────────────────────────────────────

/**
 * Microphones reported by the last device enumeration.
 * @type {import("@preact/signals").Signal<MediaDeviceInfo[]>}
 */
export const audioInputDevices = signal([]);

/**
 * Speakers/headphones reported by the last device enumeration.
 * @type {import("@preact/signals").Signal<MediaDeviceInfo[]>}
 */
export const audioOutputDevices = signal([]);

/**
 * Device ID of the chosen microphone; "" means the browser default.
 * @type {import("@preact/signals").Signal<string>}
 */
export const selectedAudioInput = signal("");

/**
 * Device ID of the chosen output device; "" means the browser default.
 * @type {import("@preact/signals").Signal<string>}
 */
export const selectedAudioOutput = signal("");

/**
 * Most recent microphone level, normalised to the range 0-1.
 * @type {import("@preact/signals").Signal<number>}
 */
export const micInputLevel = signal(0);

/**
 * Audio processing preferences used when requesting and constraining the mic.
 * NOTE(review): the original comment says these are persisted via the voice
 * overlay settings; that persistence is not visible in this module section.
 */
export const audioSettings = signal({
  echoCancellation: true,
  noiseSuppression: true,
  autoGainControl: true,
  sampleRate: 24000,
});

// Mic level meter state: { ctx, analyser, buffer } while running, else null.
let _micLevelAnalyser = null;
// Interval handle that polls the analyser; null while the meter is stopped.
let _micLevelTimer = null;
57
+
58
/**
 * Refresh the lists of available audio input and output devices.
 *
 * Writes the "audioinput" devices to `audioInputDevices` and the
 * "audiooutput" devices to `audioOutputDevices`. If enumeration fails for any
 * reason, both lists are reset to empty arrays instead of throwing.
 * Must be called after getUserMedia to get device labels.
 *
 * @returns {Promise<void>}
 */
export async function enumerateAudioDevices() {
  const ofKind = (list, kind) => list.filter((device) => device.kind === kind);
  try {
    const found = await navigator.mediaDevices.enumerateDevices();
    audioInputDevices.value = ofKind(found, "audioinput");
    audioOutputDevices.value = ofKind(found, "audiooutput");
  } catch {
    // Best effort: an unavailable or failing API simply yields no devices.
    audioInputDevices.value = [];
    audioOutputDevices.value = [];
  }
}
72
+
73
/**
 * Switch the microphone input device mid-session.
 *
 * Always records the choice in `selectedAudioInput`. When a session is active
 * (a mic stream exists) it additionally:
 *   1. stops the current mic tracks,
 *   2. acquires a new stream for `deviceId` using the current `audioSettings`,
 *   3. swaps the track on the WebRTC audio sender (WebRTC transport),
 *   4. re-points the PCM capture graph at the new stream (WebSocket transport),
 *   5. restarts the level meter and refreshes the device lists.
 *
 * Failures are logged and swallowed; the function never rejects.
 *
 * @param {string} deviceId  Target input device ID ("" = browser default).
 * @returns {Promise<void>}
 */
export async function switchAudioInput(deviceId) {
  selectedAudioInput.value = deviceId;
  if (!_mediaStream) return;
  try {
    ensureMicTrackingPatched();
    // Stop existing mic tracks before opening the replacement device.
    for (const track of _mediaStream.getAudioTracks()) {
      track.stop();
    }
    const { echoCancellation, noiseSuppression, autoGainControl, sampleRate } =
      audioSettings.value;
    const newStream = await navigator.mediaDevices.getUserMedia({
      audio: {
        deviceId: deviceId ? { exact: deviceId } : undefined,
        echoCancellation,
        noiseSuppression,
        autoGainControl,
        sampleRate,
      },
    });
    registerMicStream(newStream);

    // The session may have been stopped while getUserMedia() was pending.
    // Release the freshly acquired mic instead of resurrecting a stream that
    // nothing will ever stop (same guard as in startVoiceSession).
    if (_explicitStop) {
      for (const track of newStream.getTracks()) {
        try { track.stop(); } catch { /* ignore */ }
      }
      return;
    }

    const newTrack = newStream.getAudioTracks()[0];
    if (!newTrack) return;

    // WebRTC transport: replace the outgoing track on the peer connection.
    if (_pc) {
      const sender = _pc.getSenders().find((s) => s.track?.kind === "audio");
      if (sender) {
        await sender.replaceTrack(newTrack);
      }
    }

    // WebSocket transport: the capture graph was built from the old stream,
    // whose tracks are now stopped. Rebuild the source node on the new stream
    // so PCM capture keeps flowing into the existing processor.
    if (_wsAudioCtx && _wsMicProcessor) {
      if (_wsMicSource) {
        try { _wsMicSource.disconnect(); } catch { /* ignore */ }
      }
      _wsMicSource = _wsAudioCtx.createMediaStreamSource(newStream);
      _wsMicSource.connect(_wsMicProcessor);
    }

    // Replace in our saved reference
    _mediaStream = newStream;
    _startMicLevelMonitor(newStream);
    await enumerateAudioDevices();
  } catch (err) {
    console.warn("[voice-client] switchAudioInput failed:", err);
  }
}
116
+
117
/**
 * Switch the audio output device (speaker/headphone).
 *
 * Records the choice in `selectedAudioOutput`, then applies it to every active
 * playback sink that exposes `setSinkId()`: the WebRTC `<audio>` element, the
 * responses-audio element, and the WebSocket transport's AudioContext.
 * Each sink is updated independently so a failure on one does not prevent the
 * others from switching. Failures are logged, never thrown.
 *
 * @param {string} deviceId  Target output device ID ("" = browser default).
 * @returns {Promise<void>}
 */
export async function switchAudioOutput(deviceId) {
  selectedAudioOutput.value = deviceId;
  const sinks = [_audioElement, _responsesAudioElement, _wsAudioCtx];
  for (const sink of sinks) {
    // setSinkId is not available in every browser; skip silently when absent.
    if (!sink || typeof sink.setSinkId !== "function") continue;
    try {
      await sink.setSinkId(deviceId);
    } catch (err) {
      console.warn("[voice-client] switchAudioOutput failed:", err);
    }
  }
}
135
+
136
/**
 * Merge new audio processing preferences and push them to the live mic.
 *
 * The merged object replaces `audioSettings.value`. If a mic stream is active,
 * echo cancellation, noise suppression and auto gain are re-applied to each of
 * its audio tracks; rejected constraint updates are ignored. `sampleRate` is
 * stored but not re-applied to running tracks.
 *
 * @param {Partial<typeof audioSettings.value>} updates
 */
export function updateAudioSettings(updates) {
  const merged = { ...audioSettings.value, ...updates };
  audioSettings.value = merged;

  if (!_mediaStream) return;

  const { echoCancellation, noiseSuppression, autoGainControl } = merged;
  _mediaStream.getAudioTracks().forEach((track) => {
    track
      .applyConstraints({ echoCancellation, noiseSuppression, autoGainControl })
      .catch(() => {});
  });
}
154
+
155
/**
 * Start (or restart) the microphone level meter for a stream.
 *
 * Any previous meter is stopped first. An analyser node is attached to the
 * stream and polled every 100 ms; the mean frequency-bin magnitude, divided by
 * 128 and capped at 1, is published to `micInputLevel`.
 *
 * If the Web Audio API is unavailable or the graph cannot be built, the meter
 * is silently not started — and any AudioContext that was already created is
 * closed so it does not leak.
 *
 * @param {MediaStream} stream  Microphone stream to monitor.
 */
function _startMicLevelMonitor(stream) {
  _stopMicLevelMonitor();
  let ctx = null;
  try {
    ctx = new (globalThis.AudioContext || globalThis.webkitAudioContext)();
    const src = ctx.createMediaStreamSource(stream);
    const analyser = ctx.createAnalyser();
    analyser.fftSize = 256;
    analyser.smoothingTimeConstant = 0.5;
    src.connect(analyser);
    _micLevelAnalyser = { ctx, analyser, buffer: new Uint8Array(analyser.frequencyBinCount) };
    _micLevelTimer = setInterval(() => {
      if (!_micLevelAnalyser) return;
      _micLevelAnalyser.analyser.getByteFrequencyData(_micLevelAnalyser.buffer);
      const sum = _micLevelAnalyser.buffer.reduce((a, v) => a + v, 0);
      const avg = sum / _micLevelAnalyser.buffer.length;
      const level = Math.min(1, avg / 128);
      micInputLevel.value = level;
    }, 100);
  } catch {
    // AudioContext might not be available, or graph setup failed part-way.
    // Only close the context here if ownership was never handed to
    // _micLevelAnalyser (otherwise _stopMicLevelMonitor will close it).
    if (ctx && (!_micLevelAnalyser || _micLevelAnalyser.ctx !== ctx)) {
      try {
        Promise.resolve(ctx.close()).catch(() => {});
      } catch { /* ignore */ }
    }
  }
}
177
+
178
/**
 * Stop the microphone level meter and reset the published level to 0.
 * Safe to call when no meter is running.
 */
function _stopMicLevelMonitor() {
  const timer = _micLevelTimer;
  if (timer) {
    _micLevelTimer = null;
    clearInterval(timer);
  }

  const monitor = _micLevelAnalyser;
  if (monitor) {
    _micLevelAnalyser = null;
    try { monitor.ctx.close(); } catch { /* ignore */ }
  }

  micInputLevel.value = 0;
}
189
+
28
190
  // ── Module-scope state ──────────────────────────────────────────────────────
29
191
 
30
192
  let _pc = null; // RTCPeerConnection
31
193
  let _dc = null; // DataChannel for events
32
194
  let _mediaStream = null; // User mic MediaStream
33
195
  let _audioElement = null; // <audio> for playback
34
- let _transport = "webrtc"; // webrtc | responses-audio
196
+ let _transport = "webrtc"; // webrtc | websocket | responses-audio
35
197
  let _responsesTokenData = null;
36
198
  let _responsesRecognition = null;
37
199
  let _responsesAudioElement = null;
200
+
201
+ // ── WebSocket transport state ───────────────────────────────────────────────
202
+ let _ws = null; // WebSocket for Azure Realtime
203
+ let _wsAudioCtx = null; // AudioContext for WebSocket PCM16 I/O
204
+ let _wsMicProcessor = null; // ScriptProcessorNode for mic capture
205
+ let _wsMicSource = null; // MediaStreamAudioSourceNode
206
+ let _wsPlaybackQueue = []; // Queued PCM16 Float32 chunks for playback
207
+ let _wsPlaybackScheduled = 0; // AudioContext time of next scheduled chunk
208
+ let _wsPlaybackPlaying = false; // Whether audio playback loop is running
38
209
  let _responsesAbortController = null;
39
210
  let _responsesRecognitionRestartTimer = null;
40
211
  let _reconnectTimer = null; // 28-min reconnect timer
@@ -49,6 +220,7 @@ let _callContext = {
49
220
  executor: null,
50
221
  mode: null,
51
222
  model: null,
223
+ voiceAgentId: null,
52
224
  };
53
225
  let _lastPersistedUserTranscript = "";
54
226
  let _lastPersistedAssistantTranscript = "";
@@ -57,9 +229,13 @@ let _lastPersistedAssistantAt = 0;
57
229
  let _awaitingToolCompletionAck = false;
58
230
  let _assistantRespondedAfterTool = false;
59
231
  let _toolCompletionAckTimer = null;
232
+ let _lastAutoBargeInAt = 0;
233
+ let _autoBargeInTimer = null;
60
234
 
61
235
  const RECONNECT_AT_MS = 28 * 60 * 1000; // 28 minutes
62
236
  const MAX_RECONNECT_ATTEMPTS = 3;
237
+ const AUTO_BARGE_IN_COOLDOWN_MS = 700;
238
+ const AUTO_BARGE_IN_FADE_MS = 220;
63
239
  // Noise-control default: disable user-side live ASR transcript output/persistence.
64
240
  // Assistant response text remains enabled.
65
241
  const ENABLE_USER_TRANSCRIPT = false;
@@ -75,7 +251,8 @@ function _normalizeCallContext(options = {}) {
75
251
  const executor = String(options?.executor || "").trim() || null;
76
252
  const mode = String(options?.mode || "").trim() || null;
77
253
  const model = String(options?.model || "").trim() || null;
78
- return { sessionId, executor, mode, model };
254
+ const voiceAgentId = String(options?.voiceAgentId || "").trim() || null;
255
+ return { sessionId, executor, mode, model, voiceAgentId };
79
256
  }
80
257
 
81
258
  function _isResponsesAudioTransport(tokenData) {
@@ -385,6 +562,9 @@ function emit(event, data) {
385
562
  }
386
563
 
387
564
  function sendRealtimeEvent(payload) {
565
+ // WebSocket transport: send over WS
566
+ if (_transport === "websocket") return _sendWsEvent(payload);
567
+ // WebRTC transport: send over data channel
388
568
  if (!_dc || _dc.readyState !== "open") return false;
389
569
  try {
390
570
  _dc.send(JSON.stringify(payload));
@@ -404,9 +584,11 @@ function clearPendingResponseCreate() {
404
584
  }
405
585
 
406
586
  function scheduleManualResponseCreate(reason = "speech-stopped") {
407
- if (_transport !== "webrtc") return;
587
+ if (_transport !== "webrtc" && _transport !== "websocket") return;
408
588
  if (_awaitingAutoResponse) return;
409
- if (!_dc || _dc.readyState !== "open") return;
589
+ // Check appropriate channel is open
590
+ if (_transport === "webrtc" && (!_dc || _dc.readyState !== "open")) return;
591
+ if (_transport === "websocket" && (!_ws || _ws.readyState !== WebSocket.OPEN)) return;
410
592
  _awaitingAutoResponse = true;
411
593
  if (_pendingResponseCreateTimer) clearTimeout(_pendingResponseCreateTimer);
412
594
  _pendingResponseCreateTimer = setTimeout(() => {
@@ -434,14 +616,14 @@ function sendSessionUpdate(tokenData = {}) {
434
616
  sessionConfig?.turn_detection?.type ||
435
617
  sessionConfig?.audio?.input?.turnDetection?.type ||
436
618
  sessionConfig?.audio?.input?.turn_detection?.type ||
437
- "server_vad";
619
+ "semantic_vad";
438
620
  const turnDetectionConfig = {
439
621
  type: turnDetection,
440
622
  ...(turnDetection === "server_vad"
441
623
  ? {
442
- threshold: 0.35,
624
+ threshold: 0.7,
443
625
  prefix_padding_ms: 400,
444
- silence_duration_ms: 700,
626
+ silence_duration_ms: 1200,
445
627
  create_response: true,
446
628
  interrupt_response: true,
447
629
  }
@@ -455,6 +637,12 @@ function sendSessionUpdate(tokenData = {}) {
455
637
  : {}),
456
638
  };
457
639
 
640
+ // Use server-provided transcription model from sessionConfig, fall back to default
641
+ const transcriptionModel =
642
+ sessionConfig?.input_audio_transcription?.model || "gpt-4o-transcribe";
643
+ const transcriptionEnabled =
644
+ sessionConfig?.input_audio_transcription !== undefined;
645
+
458
646
  sendRealtimeEvent({
459
647
  type: "session.update",
460
648
  session: {
@@ -462,12 +650,258 @@ function sendSessionUpdate(tokenData = {}) {
462
650
  voice: voiceId,
463
651
  input_audio_format: "pcm16",
464
652
  output_audio_format: "pcm16",
465
- input_audio_transcription: { model: "gpt-4o-transcribe" },
653
+ ...(transcriptionEnabled
654
+ ? { input_audio_transcription: { model: transcriptionModel } }
655
+ : {}),
466
656
  turn_detection: turnDetectionConfig,
467
657
  },
468
658
  });
469
659
  }
470
660
 
661
+ // ── WebSocket Realtime Transport ─────────────────────────────────────────────
662
+ //
663
+ // Azure OpenAI Realtime API only supports WebSocket in many deployments
664
+ // (WebRTC returns 404). This transport captures mic audio as PCM16 chunks,
665
+ // sends them over WebSocket, receives response audio as PCM16 deltas, and
666
+ // plays them through AudioContext — giving the same real-time conversational
667
+ // voice experience as WebRTC.
668
+
669
/**
 * Convert Float32 audio samples to Int16 PCM.
 *
 * Samples are clamped to [-1, 1]; negative values are scaled by 0x8000 and
 * non-negative values by 0x7FFF. Fractions are dropped by the Int16 store.
 *
 * @param {Float32Array|number[]} float32Array
 * @returns {Int16Array}
 */
function _float32ToInt16(float32Array) {
  const count = float32Array.length;
  const pcm = new Int16Array(count);
  let idx = 0;
  while (idx < count) {
    const clamped = Math.max(-1, Math.min(1, float32Array[idx]));
    const scale = clamped < 0 ? 0x8000 : 0x7FFF;
    pcm[idx] = clamped * scale;
    idx += 1;
  }
  return pcm;
}
678
+
679
/**
 * Convert Int16 PCM to Float32 audio samples.
 *
 * Negative values are divided by 0x8000 and non-negative values by 0x7FFF, so
 * the full Int16 range maps onto [-1, 1].
 *
 * @param {Int16Array|number[]} int16Array
 * @returns {Float32Array}
 */
function _int16ToFloat32(int16Array) {
  const count = int16Array.length;
  const samples = new Float32Array(count);
  let idx = 0;
  while (idx < count) {
    const value = int16Array[idx];
    const divisor = value < 0 ? 0x8000 : 0x7FFF;
    samples[idx] = value / divisor;
    idx += 1;
  }
  return samples;
}
687
+
688
/**
 * Encode Int16Array to base64 string (browser).
 *
 * Only the bytes the view actually covers are encoded: `byteOffset` and
 * `byteLength` are honoured, so a `subarray()` of a larger buffer encodes just
 * its own samples rather than the whole backing buffer. Bytes are emitted in
 * the platform's native byte order.
 *
 * @param {Int16Array} int16Array
 * @returns {string} base64 text ("" for an empty input)
 */
function _int16ToBase64(int16Array) {
  const bytes = new Uint8Array(
    int16Array.buffer,
    int16Array.byteOffset,
    int16Array.byteLength,
  );
  // Convert in slices: one fromCharCode call per byte is slow on the mic
  // capture path, while a single call over the whole buffer could exceed the
  // engine's argument-count limit for large inputs.
  const SLICE_BYTES = 0x2000;
  let binary = "";
  for (let offset = 0; offset < bytes.length; offset += SLICE_BYTES) {
    binary += String.fromCharCode.apply(
      null,
      bytes.subarray(offset, offset + SLICE_BYTES),
    );
  }
  return btoa(binary);
}
697
+
698
/**
 * Decode base64 string to Int16Array.
 *
 * PCM16 samples are two bytes each. If the decoded payload has an odd number
 * of bytes, the trailing byte is dropped instead of letting the Int16Array
 * constructor throw a RangeError on the misaligned length.
 *
 * @param {string} base64
 * @returns {Int16Array} decoded samples in the platform's native byte order
 * @throws {DOMException} if `base64` is not valid base64 (raised by `atob`)
 */
function _base64ToInt16(base64) {
  const binary = atob(base64);
  const usableLength = binary.length - (binary.length % 2);
  const bytes = new Uint8Array(usableLength);
  for (let i = 0; i < usableLength; i++) {
    bytes[i] = binary.charCodeAt(i);
  }
  return new Int16Array(bytes.buffer);
}
707
+
708
/**
 * Send a JSON event over the WebSocket transport.
 *
 * @param {object} payload  Event object; serialised with JSON.stringify.
 * @returns {boolean} true when the frame was handed to the socket, false when
 *   the socket is missing/not open or sending threw (a warning is logged).
 */
function _sendWsEvent(payload) {
  const socketOpen = Boolean(_ws) && _ws.readyState === WebSocket.OPEN;
  if (!socketOpen) return false;

  let delivered = true;
  try {
    _ws.send(JSON.stringify(payload));
  } catch (err) {
    console.warn("[voice-client] WS send failed:", err?.message || err);
    delivered = false;
  }
  return delivered;
}
719
+
720
/**
 * Play queued PCM16 audio chunks via AudioContext.
 *
 * Starts a drain loop over `_wsPlaybackQueue` unless one is already running.
 * Each chunk is copied into a mono 24 kHz AudioBuffer and started no earlier
 * than the end of the previously scheduled chunk; the next chunk is scheduled
 * from the source's `onended` callback. When the queue runs dry the loop
 * stops and a "speaking" voice state is returned to "connected".
 *
 * Robustness notes:
 *  - Empty chunks are skipped: creating a zero-length AudioBuffer throws, and
 *    an exception here used to leave the "playing" flag stuck, silencing all
 *    later audio.
 *  - Any scheduling failure is logged and resets the loop so the next incoming
 *    chunk can restart playback.
 *  - `AudioContext.setSinkId()` is asynchronous; its rejection is handled on
 *    the promise, and it is only re-issued when the sink actually differs.
 */
function _scheduleWsPlayback() {
  if (_wsPlaybackPlaying) return;
  _wsPlaybackPlaying = true;

  // Pop chunks until one with samples is found; null when none remain.
  const takePlayableChunk = () => {
    while (_wsPlaybackQueue.length > 0) {
      const candidate = _wsPlaybackQueue.shift();
      if (candidate && candidate.length > 0) return candidate;
    }
    return null;
  };

  // Mark the loop idle and leave the "speaking" state if we were in it.
  const finishPlayback = () => {
    _wsPlaybackPlaying = false;
    if (voiceState.value === "speaking") {
      voiceState.value = "connected";
    }
  };

  const drain = () => {
    if (!_wsAudioCtx || _wsPlaybackQueue.length === 0 || _explicitStop) {
      _wsPlaybackPlaying = false;
      return;
    }

    const samples = takePlayableChunk();
    if (!samples) {
      finishPlayback();
      return;
    }

    try {
      const buffer = _wsAudioCtx.createBuffer(1, samples.length, 24000);
      buffer.copyToChannel(samples, 0);
      const sourceNode = _wsAudioCtx.createBufferSource();
      sourceNode.buffer = buffer;

      // Route through selected output device if supported
      const sinkId = selectedAudioOutput.value;
      if (
        sinkId &&
        typeof _wsAudioCtx.setSinkId === "function" &&
        _wsAudioCtx.sinkId !== sinkId
      ) {
        try {
          Promise.resolve(_wsAudioCtx.setSinkId(sinkId)).catch(() => {});
        } catch { /* ignore */ }
      }

      sourceNode.connect(_wsAudioCtx.destination);

      sourceNode.onended = () => {
        if (_wsPlaybackQueue.length > 0) {
          drain();
        } else {
          finishPlayback();
        }
      };

      const now = _wsAudioCtx.currentTime;
      const startTime = Math.max(now, _wsPlaybackScheduled);
      sourceNode.start(startTime);
      _wsPlaybackScheduled = startTime + buffer.duration;
    } catch (err) {
      console.warn("[voice-client] WS playback failed:", err?.message || err);
      finishPlayback();
    }
  };

  drain();
}
763
+
764
/**
 * Clean up WebSocket transport resources.
 *
 * Disconnects the mic processor and source nodes, closes the socket and the
 * AudioContext (errors from each step are ignored), then resets the playback
 * queue, schedule cursor and playing flag.
 */
function _cleanupWsTransport() {
  const quietly = (action) => {
    try { action(); } catch { /* ignore */ }
  };

  if (_wsMicProcessor) {
    quietly(() => _wsMicProcessor.disconnect());
    _wsMicProcessor = null;
  }
  if (_wsMicSource) {
    quietly(() => _wsMicSource.disconnect());
    _wsMicSource = null;
  }
  if (_ws) {
    quietly(() => _ws.close());
    _ws = null;
  }
  if (_wsAudioCtx) {
    quietly(() => _wsAudioCtx.close());
    _wsAudioCtx = null;
  }

  _wsPlaybackPlaying = false;
  _wsPlaybackScheduled = 0;
  _wsPlaybackQueue = [];
}
786
+
787
/**
 * Start a WebSocket-based Realtime session.
 * Used as fallback when Azure WebRTC SDP exchange returns 404.
 *
 * Opens `tokenData.wsUrl`, sends the session configuration, streams the mic
 * as base64 PCM16 `input_audio_buffer.append` events, and plays
 * `response.audio.delta` events through a 24 kHz AudioContext. Every other
 * server event is forwarded to `handleServerEvent`.
 *
 * The returned promise always settles:
 *  - resolves once the socket is open and capture is wired up;
 *  - rejects on connect timeout (15 s), on a socket error or close while the
 *    voice state is still "connecting", when setup inside `onopen` throws,
 *    or when the socket closes before opening because the session was
 *    explicitly stopped.
 *
 * @param {object} tokenData     Token payload; `wsUrl` is required, `provider`
 *                               is reported in the "connected" event.
 * @param {MediaStream} mediaStream  Microphone stream to capture from.
 * @returns {Promise<void>}
 * @throws {Error} if `tokenData.wsUrl` is missing or blank.
 */
async function _startWebSocketTransport(tokenData, mediaStream) {
  const wsUrl = String(tokenData?.wsUrl || "").trim();
  if (!wsUrl) {
    throw new Error("WebSocket URL not available for Azure Realtime fallback");
  }

  _transport = "websocket";

  // Set up AudioContext for PCM16 I/O at 24kHz (Realtime API native rate)
  _wsAudioCtx = new (globalThis.AudioContext || globalThis.webkitAudioContext)({
    sampleRate: 24000,
  });
  // A context created outside a user gesture may start suspended; without
  // resuming, neither capture callbacks nor playback would run.
  if (_wsAudioCtx.state === "suspended" && typeof _wsAudioCtx.resume === "function") {
    try {
      Promise.resolve(_wsAudioCtx.resume()).catch(() => {});
    } catch { /* ignore */ }
  }

  return new Promise((resolve, reject) => {
    let opened = false;      // true once onopen completed successfully
    let setupFailed = false; // true when onopen threw and we closed the socket

    _ws = new WebSocket(wsUrl);

    const connectTimeout = setTimeout(() => {
      reject(new Error("Azure Realtime WebSocket connection timed out"));
      if (_ws) { try { _ws.close(); } catch { /* ignore */ } }
    }, 15000);

    _ws.onopen = () => {
      clearTimeout(connectTimeout);

      try {
        // Send session configuration (same as WebRTC data channel session.update)
        sendSessionUpdate(tokenData);

        // Start mic capture → PCM16 → WebSocket
        _wsMicSource = _wsAudioCtx.createMediaStreamSource(mediaStream);
        // ScriptProcessorNode deprecated but widely supported; buffer = 4096 samples
        _wsMicProcessor = _wsAudioCtx.createScriptProcessor(4096, 1, 1);
        _wsMicProcessor.onaudioprocess = (e) => {
          if (_explicitStop || !_ws || _ws.readyState !== WebSocket.OPEN) return;
          if (isVoiceMicMuted.value) return;
          const float32 = e.inputBuffer.getChannelData(0);
          const int16 = _float32ToInt16(float32);
          const base64 = _int16ToBase64(int16);
          _sendWsEvent({
            type: "input_audio_buffer.append",
            audio: base64,
          });
        };
        _wsMicSource.connect(_wsMicProcessor);
        _wsMicProcessor.connect(_wsAudioCtx.destination); // required for processing
      } catch (err) {
        // The timeout is already cleared, so without this the caller would
        // wait forever on a half-initialised transport.
        setupFailed = true;
        reject(err instanceof Error ? err : new Error(String(err)));
        if (_ws) { try { _ws.close(); } catch { /* ignore */ } }
        return;
      }

      opened = true;
      voiceState.value = "connected";
      voiceSessionId.value = _callContext.sessionId || `voice-ws-${Date.now()}`;
      _sessionStartTime = Date.now();
      startDurationTimer();

      emit("connected", {
        provider: tokenData.provider || "azure",
        sessionId: voiceSessionId.value,
        callContext: { ..._callContext },
        transport: "websocket",
      });

      resolve();
    };

    _ws.onmessage = (event) => {
      try {
        const msg = JSON.parse(event.data);

        // Handle audio deltas — play PCM16 through AudioContext
        if (msg.type === "response.audio.delta" && msg.delta) {
          if (voiceState.value !== "speaking") {
            voiceState.value = "speaking";
          }
          const int16 = _base64ToInt16(msg.delta);
          const float32 = _int16ToFloat32(int16);
          _wsPlaybackQueue.push(float32);
          _scheduleWsPlayback();
          return;
        }

        if (msg.type === "response.audio.done") {
          // Audio stream complete — playback will finish via onended callback
          return;
        }

        // All other events go through the standard handler
        handleServerEvent(msg);
      } catch (err) {
        console.error("[voice-client] WS message parse error:", err);
      }
    };

    _ws.onerror = (event) => {
      clearTimeout(connectTimeout);
      const msg = "Azure Realtime WebSocket error";
      console.error("[voice-client] WebSocket error:", event);
      if (voiceState.value === "connecting") {
        reject(new Error(msg));
      } else {
        voiceState.value = "error";
        voiceError.value = msg;
        emit("error", { message: msg });
      }
    };

    _ws.onclose = (event) => {
      clearTimeout(connectTimeout);
      if (_explicitStop) {
        // Stopped before the socket ever opened: unblock the awaiting caller
        // (no-op if the promise has already settled).
        if (!opened) {
          reject(new Error("voice session was stopped during WebSocket connect"));
        }
        return;
      }
      // Closed by our own onopen failure path; already rejected above.
      if (setupFailed) return;
      const reason = `WebSocket closed (code=${event.code})`;
      if (voiceState.value === "connecting") {
        reject(new Error(reason));
      } else {
        handleDisconnect(reason);
      }
    };
  });
}
904
+
471
905
  // ── Core Connection ─────────────────────────────────────────────────────────
472
906
 
473
907
  /**
@@ -479,6 +913,7 @@ function sendSessionUpdate(tokenData = {}) {
479
913
  * 5. Create offer, set remote answer
480
914
  */
481
915
  export async function startVoiceSession(options = {}) {
916
+ ensureMicTrackingPatched();
482
917
  if (_pc) {
483
918
  console.warn("[voice-client] Session already active");
484
919
  return;
@@ -511,6 +946,7 @@ export async function startVoiceSession(options = {}) {
511
946
  executor: _callContext.executor || undefined,
512
947
  mode: _callContext.mode || undefined,
513
948
  model: _callContext.model || undefined,
949
+ voiceAgentId: _callContext.voiceAgentId || undefined,
514
950
  delegateOnly: false,
515
951
  }),
516
952
  });
@@ -546,12 +982,30 @@ export async function startVoiceSession(options = {}) {
546
982
 
547
983
  _mediaStream = await navigator.mediaDevices.getUserMedia({
548
984
  audio: {
549
- echoCancellation: true,
550
- noiseSuppression: true,
551
- autoGainControl: true,
552
- sampleRate: 24000,
985
+ deviceId: selectedAudioInput.value ? { exact: selectedAudioInput.value } : undefined,
986
+ echoCancellation: audioSettings.value.echoCancellation,
987
+ noiseSuppression: audioSettings.value.noiseSuppression,
988
+ autoGainControl: audioSettings.value.autoGainControl,
989
+ sampleRate: audioSettings.value.sampleRate,
553
990
  },
554
991
  });
992
+ registerMicStream(_mediaStream);
993
+
994
+ // Guard: stopVoiceSession() may have been called while getUserMedia() was
995
+ // still awaiting (e.g. the user pressed hang-up during the permission
996
+ // prompt or network delay). cleanup() already ran without this stream
997
+ // in the registry — release the mic immediately so the browser indicator
998
+ // goes away instead of staying lit indefinitely.
999
+ if (_explicitStop) {
1000
+ for (const track of _mediaStream.getTracks()) {
1001
+ try { track.stop(); } catch { /* ignore */ }
1002
+ }
1003
+ _mediaStream = null;
1004
+ throw new Error("voice session was stopped during microphone acquisition");
1005
+ }
1006
+
1007
+ await enumerateAudioDevices();
1008
+ _startMicLevelMonitor(_mediaStream);
555
1009
 
556
1010
  // 3. Create RTCPeerConnection
557
1011
  _pc = new RTCPeerConnection();
@@ -570,6 +1024,10 @@ export async function startVoiceSession(options = {}) {
570
1024
  _audioElement.autoplay = true;
571
1025
  _audioElement.playsInline = true;
572
1026
  _audioElement.muted = true;
1027
+ // Apply selected output device
1028
+ if (selectedAudioOutput.value && typeof _audioElement.setSinkId === "function") {
1029
+ try { await _audioElement.setSinkId(selectedAudioOutput.value); } catch { /* ignore */ }
1030
+ }
573
1031
  _pc.ontrack = (event) => {
574
1032
  _audioElement.srcObject = event.streams[0];
575
1033
  // Unmute now that the element is already playing (avoids autoplay block)
@@ -635,23 +1093,59 @@ export async function startVoiceSession(options = {}) {
635
1093
  ? `${tokenData.azureEndpoint}/openai/realtime?api-version=2025-04-01-preview&deployment=${tokenData.azureDeployment}`
636
1094
  : `https://api.openai.com/v1/realtime?model=${tokenData.model}`);
637
1095
 
638
- const sdpResponse = await fetch(baseUrl, {
639
- method: "POST",
640
- headers: {
641
- Authorization: `Bearer ${tokenData.token}`,
642
- "Content-Type": "application/sdp",
643
- },
644
- body: offer.sdp,
645
- });
1096
+ let webrtcFailed = false;
1097
+ let webrtcFailStatus = 0;
1098
+ try {
1099
+ const sdpResponse = await fetch(baseUrl, {
1100
+ method: "POST",
1101
+ headers: {
1102
+ Authorization: `Bearer ${tokenData.token}`,
1103
+ "Content-Type": "application/sdp",
1104
+ },
1105
+ body: offer.sdp,
1106
+ });
1107
+
1108
+ if (!sdpResponse.ok) {
1109
+ webrtcFailStatus = sdpResponse.status;
1110
+ const errBody = await sdpResponse.text().catch(() => "");
1111
+ const detail = errBody ? ` — ${errBody.slice(0, 300)}` : "";
1112
+ // For Azure, 404 means the resource doesn't support WebRTC — try WebSocket
1113
+ if (sdpResponse.status === 404 && tokenData.wsUrl) {
1114
+ console.warn("[voice-client] WebRTC SDP 404 — falling back to Azure WebSocket transport");
1115
+ webrtcFailed = true;
1116
+ } else {
1117
+ throw new Error(`WebRTC SDP exchange failed (${sdpResponse.status})${detail}`);
1118
+ }
1119
+ }
646
1120
 
647
- if (!sdpResponse.ok) {
648
- const errBody = await sdpResponse.text().catch(() => "");
649
- const detail = errBody ? ` — ${errBody.slice(0, 300)}` : "";
650
- throw new Error(`WebRTC SDP exchange failed (${sdpResponse.status})${detail}`);
1121
+ if (!webrtcFailed) {
1122
+ const answerSdp = await sdpResponse.text();
1123
+ await _pc.setRemoteDescription({ type: "answer", sdp: answerSdp });
1124
+ }
1125
+ } catch (sdpErr) {
1126
+ if (!webrtcFailed) throw sdpErr;
651
1127
  }
652
1128
 
653
- const answerSdp = await sdpResponse.text();
654
- await _pc.setRemoteDescription({ type: "answer", sdp: answerSdp });
1129
+ // ── WebSocket fallback for Azure when WebRTC returns 404 ────────────
1130
+ if (webrtcFailed) {
1131
+ // Clean up the WebRTC objects — we won't need them
1132
+ if (_dc) { try { _dc.close(); } catch { /* ignore */ } _dc = null; }
1133
+ if (_pc) { try { _pc.close(); } catch { /* ignore */ } _pc = null; }
1134
+ if (_audioElement) {
1135
+ try { _audioElement.pause(); _audioElement.srcObject = null; } catch { /* ignore */ }
1136
+ _audioElement = null;
1137
+ }
1138
+
1139
+ console.info("[voice-client] Starting Azure Realtime WebSocket transport");
1140
+ await _startWebSocketTransport(tokenData, _mediaStream);
1141
+
1142
+ emit("session-started", {
1143
+ sessionId: voiceSessionId.value,
1144
+ callContext: { ..._callContext },
1145
+ transport: "websocket",
1146
+ });
1147
+ return;
1148
+ }
655
1149
 
656
1150
  emit("session-started", {
657
1151
  sessionId: voiceSessionId.value,
@@ -672,6 +1166,7 @@ export async function startVoiceSession(options = {}) {
672
1166
  export function stopVoiceSession() {
673
1167
  _explicitStop = true;
674
1168
  emit("session-ending", { sessionId: voiceSessionId.value });
1169
+ _stopMicLevelMonitor();
675
1170
  cleanup();
676
1171
  voiceState.value = "idle";
677
1172
  voiceTranscript.value = "";
@@ -680,7 +1175,13 @@ export function stopVoiceSession() {
680
1175
  voiceSessionId.value = null;
681
1176
  voiceBoundSessionId.value = null;
682
1177
  voiceDuration.value = 0;
683
- _callContext = { sessionId: null, executor: null, mode: null, model: null };
1178
+ _callContext = {
1179
+ sessionId: null,
1180
+ executor: null,
1181
+ mode: null,
1182
+ model: null,
1183
+ voiceAgentId: null,
1184
+ };
684
1185
  emit("session-ended", {});
685
1186
  }
686
1187
 
@@ -696,6 +1197,7 @@ function handleServerEvent(event) {
696
1197
  break;
697
1198
 
698
1199
  case "input_audio_buffer.speech_started":
1200
+ triggerAutoBargeIn("speech-started");
699
1201
  voiceState.value = "listening";
700
1202
  emit("speech-started", {});
701
1203
  break;
@@ -807,7 +1309,9 @@ function handleServerEvent(event) {
807
1309
  break;
808
1310
 
809
1311
  case "response.audio.delta":
810
- // Audio is handled via WebRTC tracks, not data channel
1312
+ // WebRTC: audio is handled via media tracks, not data channel.
1313
+ // WebSocket: audio deltas are handled in the ws.onmessage handler
1314
+ // before reaching handleServerEvent, so this case is a no-op.
811
1315
  break;
812
1316
 
813
1317
  case "conversation.item.input_audio_transcription.failed":
@@ -910,6 +1414,7 @@ async function handleToolCall(event) {
910
1414
  executor: _callContext.executor || undefined,
911
1415
  mode: _callContext.mode || undefined,
912
1416
  model: _callContext.model || undefined,
1417
+ voiceAgentId: _callContext.voiceAgentId || undefined,
913
1418
  }),
914
1419
  });
915
1420
  const result = await res.json();
@@ -919,19 +1424,17 @@ async function handleToolCall(event) {
919
1424
  tc.callId === callId ? { ...tc, status: "complete", result: result.result } : tc
920
1425
  );
921
1426
 
922
- // Send result back to model via data channel
923
- if (_dc && _dc.readyState === "open") {
924
- _dc.send(JSON.stringify({
925
- type: "conversation.item.create",
926
- item: {
927
- type: "function_call_output",
928
- call_id: callId,
929
- output: result.result || result.error || "No output",
930
- },
931
- }));
932
- // Trigger response generation
933
- _dc.send(JSON.stringify({ type: "response.create" }));
934
- }
1427
+ // Send result back to model via data channel or WebSocket
1428
+ sendRealtimeEvent({
1429
+ type: "conversation.item.create",
1430
+ item: {
1431
+ type: "function_call_output",
1432
+ call_id: callId,
1433
+ output: result.result || result.error || "No output",
1434
+ },
1435
+ });
1436
+ // Trigger response generation
1437
+ sendRealtimeEvent({ type: "response.create" });
935
1438
 
936
1439
  const stillRunning = voiceToolCalls.value.some((tc) => tc.status === "running");
937
1440
  if (!stillRunning) {
@@ -945,22 +1448,87 @@ async function handleToolCall(event) {
945
1448
  emit("tool-call-error", { callId, name, error: err.message });
946
1449
 
947
1450
  // Send error result back
948
- if (_dc && _dc.readyState === "open") {
949
- _dc.send(JSON.stringify({
950
- type: "conversation.item.create",
951
- item: {
952
- type: "function_call_output",
953
- call_id: callId,
954
- output: `Error: ${err.message}`,
955
- },
956
- }));
957
- _dc.send(JSON.stringify({ type: "response.create" }));
958
- }
1451
+ sendRealtimeEvent({
1452
+ type: "conversation.item.create",
1453
+ item: {
1454
+ type: "function_call_output",
1455
+ call_id: callId,
1456
+ output: `Error: ${err.message}`,
1457
+ },
1458
+ });
1459
+ sendRealtimeEvent({ type: "response.create" });
959
1460
  }
960
1461
  }
961
1462
 
962
1463
  // ── Barge-in ────────────────────────────────────────────────────────────────
963
1464
 
1465
/**
 * Report whether assistant audio is currently playing on the active transport.
 *
 * - "responses-audio": the responses audio element exists and is neither
 *   paused nor ended.
 * - "websocket": a chunk is playing or chunks are still queued.
 * - any other transport: the main audio element exists and is not paused.
 *
 * @returns {boolean} true when playback is considered active.
 */
function isAssistantPlaybackActive() {
  switch (_transport) {
    case "responses-audio": {
      const player = _responsesAudioElement;
      if (!player) return false;
      return !player.paused && !player.ended;
    }
    case "websocket": {
      const hasQueuedAudio = _wsPlaybackQueue.length > 0;
      return Boolean(_wsPlaybackPlaying || hasQueuedAudio);
    }
    default: {
      const player = _audioElement;
      if (!player) return false;
      return !player.paused;
    }
  }
}
1474
+
1475
// Interval handles of fades that are still running, keyed by the element they
// drive, so a newer fade can cancel an older one instead of both writing
// `volume` at the same time.
const _volumeFadeTimers = new WeakMap();

/**
 * Linearly ramp a media element's `volume` to `targetVolume` in five steps.
 *
 * The fade is best-effort: it does nothing for a missing element or for
 * non-finite volumes, and it stops early when something else changes the
 * element's volume mid-fade, so a late tick cannot undo a volume reset made
 * by other code.
 *
 * @param {{ volume: number } | null | undefined} el - Element whose `volume` is ramped.
 * @param {number} targetVolume - Desired final volume; clamped to [0, 1].
 * @param {number} [durationMs] - Total fade time; defaults to 180 and is never
 *   shorter than 40. Each step takes at least 10 ms.
 * @returns {void}
 */
function fadeElementVolumeTo(el, targetVolume, durationMs) {
  if (!el || typeof el !== "object") return;

  const clampUnit = (value) => Math.max(0, Math.min(1, value));
  const requestedVolume = Number(targetVolume);
  const initialVolume = Number(el.volume);
  // A NaN/Infinity volume would make every write throw; bail out instead of
  // running a timer that can only fail.
  if (!Number.isFinite(requestedVolume) || !Number.isFinite(initialVolume)) return;

  const target = clampUnit(requestedVolume);
  const start = clampUnit(initialVolume);
  const requestedMs = Number(durationMs);
  const duration = Math.max(40, (Number.isFinite(requestedMs) && requestedMs) || 180);
  const steps = 5;
  const stepMs = Math.max(10, Math.floor(duration / steps));
  const tolerance = 1e-3;

  // Only one fade per element: a new request supersedes the running one.
  const previousTimer = _volumeFadeTimers.get(el);
  if (previousTimer !== undefined) clearInterval(previousTimer);

  let step = 0;
  let lastWritten = initialVolume;

  const stop = () => {
    clearInterval(timer);
    if (_volumeFadeTimers.get(el) === timer) _volumeFadeTimers.delete(el);
  };

  const timer = setInterval(() => {
    // Someone else changed the volume since our last write: yield to them
    // rather than overwrite their value.
    if (Math.abs(Number(el.volume) - lastWritten) > tolerance) {
      stop();
      return;
    }
    step += 1;
    const progress = Math.min(1, step / steps);
    const next = clampUnit(start + (target - start) * progress);
    try {
      el.volume = next;
      lastWritten = next;
    } catch { /* ignore: the element may reject writes while tearing down */ }
    if (progress >= 1) stop();
  }, stepMs);

  _volumeFadeTimers.set(el, timer);
}
1491
+
1492
/**
 * Automatically interrupt the assistant when the user starts speaking.
 *
 * The attempt is gated by `shouldAutoBargeIn`, which receives the mic-mute
 * state, whether assistant playback is active, the current time, the time of
 * the last auto barge-in and the cooldown. When the gate passes, any pending
 * barge-in timer is cancelled. For the "responses-audio" and "webrtc"
 * transports with an audio element, the element is first faded down and the
 * interrupt is deferred by AUTO_BARGE_IN_FADE_MS; otherwise the interrupt
 * happens immediately.
 *
 * @param {string} [reason="speech-started"] - Forwarded in the
 *   "auto-barge-in" event payload.
 * @returns {boolean} false when the gate rejected the attempt, true otherwise.
 */
function triggerAutoBargeIn(reason = "speech-started") {
  const now = Date.now();
  const audioActive = isAssistantPlaybackActive();
  const allowed = shouldAutoBargeIn({
    muted: isVoiceMicMuted.value,
    audioActive,
    now,
    lastTriggeredAt: _lastAutoBargeInAt,
    minIntervalMs: AUTO_BARGE_IN_COOLDOWN_MS,
  });
  if (!allowed) return false;

  _lastAutoBargeInAt = now;
  if (_autoBargeInTimer) {
    clearTimeout(_autoBargeInTimer);
    _autoBargeInTimer = null;
  }

  const interruptAndNotify = () => {
    interruptResponse();
    emit("auto-barge-in", { reason });
  };

  // Pick the element to duck (and how far) for transports that play through
  // an audio element; other cases interrupt without a fade.
  let fadeTarget = null;
  let duckedVolume = 0;
  if (_transport === "responses-audio" && _responsesAudioElement) {
    fadeTarget = _responsesAudioElement;
    duckedVolume = 0.1;
  } else if (_transport === "webrtc" && _audioElement) {
    fadeTarget = _audioElement;
    duckedVolume = 0.12;
  }

  if (fadeTarget === null) {
    interruptAndNotify();
    return true;
  }

  fadeElementVolumeTo(fadeTarget, duckedVolume, AUTO_BARGE_IN_FADE_MS);
  _autoBargeInTimer = setTimeout(() => {
    _autoBargeInTimer = null;
    interruptAndNotify();
  }, AUTO_BARGE_IN_FADE_MS);
  return true;
}
1531
+
964
1532
  /**
965
1533
  * Interrupt the current response (barge-in).
966
1534
  */
@@ -974,14 +1542,28 @@ export function interruptResponse() {
974
1542
  try {
975
1543
  _responsesAudioElement.pause();
976
1544
  _responsesAudioElement.currentTime = 0;
1545
+ _responsesAudioElement.volume = 1;
977
1546
  } catch { /* ignore */ }
978
1547
  }
979
1548
  voiceState.value = "listening";
980
1549
  emit("interrupt", {});
981
1550
  return;
982
1551
  }
1552
+ // WebSocket transport: cancel response and clear playback queue
1553
+ if (_transport === "websocket") {
1554
+ _sendWsEvent({ type: "response.cancel" });
1555
+ _wsPlaybackQueue = [];
1556
+ _wsPlaybackPlaying = false;
1557
+ voiceState.value = "listening";
1558
+ emit("interrupt", {});
1559
+ return;
1560
+ }
983
1561
  if (_dc && _dc.readyState === "open") {
984
1562
  _dc.send(JSON.stringify({ type: "response.cancel" }));
1563
+ if (_audioElement) {
1564
+ try { _audioElement.volume = 1; } catch { /* ignore */ }
1565
+ }
1566
+ voiceState.value = "listening";
985
1567
  emit("interrupt", {});
986
1568
  }
987
1569
  }
@@ -1000,20 +1582,25 @@ export function sendTextMessage(text) {
1000
1582
  });
1001
1583
  return;
1002
1584
  }
1003
- if (!_dc || _dc.readyState !== "open") {
1585
+ // WebRTC or WebSocket: send via the shared sendRealtimeEvent helper
1586
+ if (_transport === "websocket" && (!_ws || _ws.readyState !== WebSocket.OPEN)) {
1587
+ console.warn("[voice-client] Cannot send text — WebSocket not open");
1588
+ return;
1589
+ }
1590
+ if (_transport === "webrtc" && (!_dc || _dc.readyState !== "open")) {
1004
1591
  console.warn("[voice-client] Cannot send text — data channel not open");
1005
1592
  return;
1006
1593
  }
1007
- _dc.send(JSON.stringify({
1594
+ sendRealtimeEvent({
1008
1595
  type: "conversation.item.create",
1009
1596
  item: {
1010
1597
  type: "message",
1011
1598
  role: "user",
1012
1599
  content: [{ type: "input_text", text: inputText }],
1013
1600
  },
1014
- }));
1601
+ });
1015
1602
  _recordVoiceTranscriptIfNew("user", inputText, "send_text_message");
1016
- _dc.send(JSON.stringify({ type: "response.create" }));
1603
+ sendRealtimeEvent({ type: "response.create" });
1017
1604
  }
1018
1605
 
1019
1606
  /**
@@ -1024,14 +1611,14 @@ export function sendImageFrame(imageDataUrl, options = {}) {
1024
1611
  if (_transport === "responses-audio") return false;
1025
1612
  const imageUrl = String(imageDataUrl || "").trim();
1026
1613
  if (!imageUrl) return false;
1027
- if (!_dc || _dc.readyState !== "open") {
1028
- return false;
1029
- }
1614
+ // WebSocket transport: use sendRealtimeEvent
1615
+ if (_transport === "websocket" && (!_ws || _ws.readyState !== WebSocket.OPEN)) return false;
1616
+ if (_transport === "webrtc" && (!_dc || _dc.readyState !== "open")) return false;
1030
1617
  const source = String(options?.source || "screen").trim() || "screen";
1031
1618
  const width = Number(options?.width) || undefined;
1032
1619
  const height = Number(options?.height) || undefined;
1033
1620
  try {
1034
- _dc.send(JSON.stringify({
1621
+ sendRealtimeEvent({
1035
1622
  type: "conversation.item.create",
1036
1623
  item: {
1037
1624
  type: "message",
@@ -1050,7 +1637,7 @@ export function sendImageFrame(imageDataUrl, options = {}) {
1050
1637
  width,
1051
1638
  height,
1052
1639
  },
1053
- }));
1640
+ });
1054
1641
  return true;
1055
1642
  } catch (err) {
1056
1643
  console.warn("[voice-client] failed to send realtime image frame:", err?.message || err);
@@ -1173,6 +1760,12 @@ export function toggleMicMute() {
1173
1760
  }
1174
1761
  return willBeMuted;
1175
1762
  }
1763
+ // websocket transport: mic muting is handled by the onaudioprocess guard
1764
+ if (_transport === "websocket") {
1765
+ const willBeMuted = !isVoiceMicMuted.value;
1766
+ isVoiceMicMuted.value = willBeMuted;
1767
+ return willBeMuted;
1768
+ }
1176
1769
  return isVoiceMicMuted.value;
1177
1770
  }
1178
1771
 
@@ -1231,10 +1824,15 @@ function cleanupConnection() {
1231
1824
  }
1232
1825
 
1233
1826
  function cleanup() {
1827
+ // Always close the mic-level AudioContext first so no AudioContext
1828
+ // holds a live MediaStreamAudioSourceNode after teardown. This path
1829
+ // is reached both by stopVoiceSession() and by handleDisconnect().
1830
+ _stopMicLevelMonitor();
1234
1831
  _reconnectInFlight = false;
1235
1832
  _audioAutoplayWarned = false;
1236
1833
  isVoiceMicMuted.value = false;
1237
1834
  cleanupConnection();
1835
+ _cleanupWsTransport();
1238
1836
 
1239
1837
  clearInterval(_durationTimer);
1240
1838
  _durationTimer = null;
@@ -1245,6 +1843,7 @@ function cleanup() {
1245
1843
  }
1246
1844
  _mediaStream = null;
1247
1845
  }
1846
+ stopTrackedMicStreams();
1248
1847
  _stopResponsesRecognition();
1249
1848
  if (_responsesAbortController) {
1250
1849
  try { _responsesAbortController.abort(); } catch { /* ignore */ }
@@ -1266,4 +1865,9 @@ function cleanup() {
1266
1865
  _awaitingToolCompletionAck = false;
1267
1866
  _assistantRespondedAfterTool = false;
1268
1867
  _clearToolCompletionAckTimer();
1868
+ if (_autoBargeInTimer) {
1869
+ clearTimeout(_autoBargeInTimer);
1870
+ _autoBargeInTimer = null;
1871
+ }
1872
+ _lastAutoBargeInAt = 0;
1269
1873
  }