bosun 0.37.0 → 0.37.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +4 -1
- package/agent-tool-config.mjs +338 -0
- package/bosun-skills.mjs +59 -4
- package/bosun.schema.json +1 -1
- package/desktop/launch.mjs +18 -0
- package/desktop/main.mjs +52 -13
- package/fleet-coordinator.mjs +34 -1
- package/kanban-adapter.mjs +30 -3
- package/library-manager.mjs +66 -0
- package/maintenance.mjs +30 -5
- package/monitor.mjs +56 -0
- package/package.json +4 -1
- package/setup-web-server.mjs +73 -12
- package/setup.mjs +3 -3
- package/ui/app.js +40 -3
- package/ui/components/session-list.js +25 -7
- package/ui/components/workspace-switcher.js +48 -1
- package/ui/demo.html +176 -0
- package/ui/modules/mic-track-registry.js +83 -0
- package/ui/modules/settings-schema.js +4 -1
- package/ui/modules/state.js +25 -0
- package/ui/modules/streaming.js +1 -1
- package/ui/modules/voice-barge-in.js +27 -0
- package/ui/modules/voice-client-sdk.js +268 -42
- package/ui/modules/voice-client.js +665 -61
- package/ui/modules/voice-overlay.js +829 -47
- package/ui/setup.html +151 -9
- package/ui/styles.css +258 -0
- package/ui/tabs/chat.js +11 -0
- package/ui/tabs/library.js +890 -15
- package/ui/tabs/settings.js +51 -11
- package/ui/tabs/telemetry.js +327 -105
- package/ui/tabs/workflows.js +86 -0
- package/ui-server.mjs +1201 -107
- package/voice-action-dispatcher.mjs +81 -0
- package/voice-agents-sdk.mjs +2 -2
- package/voice-relay.mjs +131 -14
- package/voice-tools.mjs +475 -9
- package/workflow-engine.mjs +54 -0
- package/workflow-nodes.mjs +177 -28
- package/workflow-templates/github.mjs +205 -94
- package/workflow-templates/task-batch.mjs +247 -0
- package/workflow-templates.mjs +15 -0
|
@@ -8,6 +8,12 @@
|
|
|
8
8
|
*/
|
|
9
9
|
|
|
10
10
|
import { signal, computed } from "@preact/signals";
|
|
11
|
+
import {
|
|
12
|
+
ensureMicTrackingPatched,
|
|
13
|
+
registerMicStream,
|
|
14
|
+
stopTrackedMicStreams,
|
|
15
|
+
} from "./mic-track-registry.js";
|
|
16
|
+
import { shouldAutoBargeIn } from "./voice-barge-in.js";
|
|
11
17
|
|
|
12
18
|
// ── State Signals ───────────────────────────────────────────────────────────
|
|
13
19
|
|
|
@@ -25,16 +31,181 @@ export const isVoiceActive = computed(() =>
|
|
|
25
31
|
);
|
|
26
32
|
export const isVoiceMicMuted = signal(false);
|
|
27
33
|
|
|
34
|
+
// ── Audio Device Selection ──────────────────────────────────────────────────
|
|
35
|
+
|
|
36
|
+
/** @type {import("@preact/signals").Signal<MediaDeviceInfo[]>} */
|
|
37
|
+
export const audioInputDevices = signal([]);
|
|
38
|
+
/** @type {import("@preact/signals").Signal<MediaDeviceInfo[]>} */
|
|
39
|
+
export const audioOutputDevices = signal([]);
|
|
40
|
+
/** @type {import("@preact/signals").Signal<string>} selected input device ID ("" = default) */
|
|
41
|
+
export const selectedAudioInput = signal("");
|
|
42
|
+
/** @type {import("@preact/signals").Signal<string>} selected output device ID ("" = default) */
|
|
43
|
+
export const selectedAudioOutput = signal("");
|
|
44
|
+
/** @type {import("@preact/signals").Signal<number>} mic input level 0-1 */
|
|
45
|
+
export const micInputLevel = signal(0);
|
|
46
|
+
|
|
47
|
+
/** Audio processing preferences (persisted via voice overlay settings) */
|
|
48
|
+
export const audioSettings = signal({
|
|
49
|
+
echoCancellation: true,
|
|
50
|
+
noiseSuppression: true,
|
|
51
|
+
autoGainControl: true,
|
|
52
|
+
sampleRate: 24000,
|
|
53
|
+
});
|
|
54
|
+
|
|
55
|
+
let _micLevelAnalyser = null;
|
|
56
|
+
let _micLevelTimer = null;
|
|
57
|
+
|
|
58
|
+
/**
 * Refresh the `audioInputDevices` / `audioOutputDevices` signals from the
 * browser's device list.
 *
 * Call this after getUserMedia so that device labels are available.
 * If enumeration fails for any reason both signals are reset to empty lists.
 *
 * @returns {Promise<void>}
 */
export async function enumerateAudioDevices() {
  const ofKind = (list, kind) => list.filter((device) => device.kind === kind);
  try {
    const allDevices = await navigator.mediaDevices.enumerateDevices();
    audioInputDevices.value = ofKind(allDevices, "audioinput");
    audioOutputDevices.value = ofKind(allDevices, "audiooutput");
  } catch {
    audioInputDevices.value = [];
    audioOutputDevices.value = [];
  }
}
|
|
72
|
+
|
|
73
|
+
/**
 * Switch the microphone input device mid-session.
 *
 * The selection is always recorded in `selectedAudioInput`. When no mic
 * stream is active nothing else happens. Otherwise the current mic tracks are
 * stopped, a new stream is acquired using the current `audioSettings`, and
 * every consumer is re-pointed at it: the WebRTC audio sender, the WebSocket
 * capture graph, and the input level monitor.
 *
 * Failures are logged with console.warn and never thrown.
 *
 * @param {string} deviceId  Device ID to capture from ("" = browser default).
 * @returns {Promise<void>}
 */
export async function switchAudioInput(deviceId) {
  selectedAudioInput.value = deviceId;
  if (!_mediaStream) return;

  // Best-effort release of a stream we decided not to keep.
  const releaseStream = (stream) => {
    for (const track of stream.getTracks()) {
      try { track.stop(); } catch { /* ignore */ }
    }
  };

  try {
    ensureMicTrackingPatched();

    // Stop existing mic tracks, remembering the enabled flag so that a
    // disabled track stays disabled after the swap.
    const oldTracks = _mediaStream.getAudioTracks();
    const wasEnabled = oldTracks.length > 0 ? oldTracks[0].enabled : true;
    for (const track of oldTracks) {
      track.stop();
    }

    const settings = audioSettings.value;
    const newStream = await navigator.mediaDevices.getUserMedia({
      audio: {
        deviceId: deviceId ? { exact: deviceId } : undefined,
        echoCancellation: settings.echoCancellation,
        noiseSuppression: settings.noiseSuppression,
        autoGainControl: settings.autoGainControl,
        sampleRate: settings.sampleRate,
      },
    });
    registerMicStream(newStream);

    // The session may have been stopped while getUserMedia() was pending;
    // release the freshly opened mic instead of keeping it live.
    if (_explicitStop) {
      releaseStream(newStream);
      return;
    }

    const newTrack = newStream.getAudioTracks()[0];
    if (!newTrack) {
      // Nothing usable was captured: do not leave the stream open.
      releaseStream(newStream);
      return;
    }
    newTrack.enabled = wasEnabled;

    // Replace track in the peer connection (WebRTC transport)
    if (_pc) {
      const sender = _pc.getSenders().find((s) => s.track?.kind === "audio");
      if (sender) {
        await sender.replaceTrack(newTrack);
      }
    }

    // Re-wire the capture graph (WebSocket transport): the existing source
    // node is bound to the old, now stopped, stream.
    if (_wsAudioCtx && _wsMicProcessor) {
      if (_wsMicSource) {
        try { _wsMicSource.disconnect(); } catch { /* ignore */ }
      }
      _wsMicSource = _wsAudioCtx.createMediaStreamSource(newStream);
      _wsMicSource.connect(_wsMicProcessor);
    }

    // Replace in our saved reference
    _mediaStream = newStream;
    _startMicLevelMonitor(newStream);
    await enumerateAudioDevices();
  } catch (err) {
    console.warn("[voice-client] switchAudioInput failed:", err);
  }
}
|
|
116
|
+
|
|
117
|
+
/**
 * Switch the audio output device (speaker/headphone).
 *
 * Records the selection in `selectedAudioOutput`, then applies it with
 * setSinkId() to every active output that supports it: the WebRTC playback
 * element, the responses-audio element, and the WebSocket playback
 * AudioContext. Each sink is handled independently so one failure does not
 * prevent the others from switching. Failures are logged, never thrown.
 *
 * @param {string} deviceId  Output device ID ("" = browser default).
 * @returns {Promise<void>}
 */
export async function switchAudioOutput(deviceId) {
  selectedAudioOutput.value = deviceId;
  const sinks = [_audioElement, _responsesAudioElement, _wsAudioCtx];
  for (const sink of sinks) {
    if (!sink || typeof sink.setSinkId !== "function") continue;
    try {
      await sink.setSinkId(deviceId);
    } catch (err) {
      console.warn("[voice-client] switchAudioOutput failed:", err);
    }
  }
}
|
|
135
|
+
|
|
136
|
+
/**
 * Merge `updates` into the `audioSettings` signal and, when a mic stream is
 * active, push the processing flags onto its audio tracks.
 *
 * Only echoCancellation, noiseSuppression and autoGainControl are applied to
 * live tracks; applyConstraints() rejections are deliberately ignored.
 *
 * @param {Partial<typeof audioSettings.value>} updates
 */
export function updateAudioSettings(updates) {
  audioSettings.value = { ...audioSettings.value, ...updates };
  if (!_mediaStream) return;

  const { echoCancellation, noiseSuppression, autoGainControl } = audioSettings.value;
  _mediaStream.getAudioTracks().forEach((track) => {
    track
      .applyConstraints({ echoCancellation, noiseSuppression, autoGainControl })
      .catch(() => {});
  });
}
|
|
154
|
+
|
|
155
|
+
/**
 * Start polling the input level of `stream` into the `micInputLevel` signal.
 *
 * Any previous monitor is stopped first. A dedicated AudioContext feeds the
 * stream into an AnalyserNode; every 100 ms the mean byte-frequency magnitude
 * is divided by 128 and clamped to 1. If the Web Audio setup fails the
 * monitor is simply not started, and a context that was already created is
 * closed so it does not leak.
 *
 * @param {MediaStream} stream  Microphone stream to measure.
 */
function _startMicLevelMonitor(stream) {
  _stopMicLevelMonitor();
  let ctx = null;
  try {
    ctx = new (globalThis.AudioContext || globalThis.webkitAudioContext)();
    const src = ctx.createMediaStreamSource(stream);
    const analyser = ctx.createAnalyser();
    analyser.fftSize = 256;
    analyser.smoothingTimeConstant = 0.5;
    src.connect(analyser);
    _micLevelAnalyser = { ctx, analyser, buffer: new Uint8Array(analyser.frequencyBinCount) };
    _micLevelTimer = setInterval(() => {
      if (!_micLevelAnalyser) return;
      _micLevelAnalyser.analyser.getByteFrequencyData(_micLevelAnalyser.buffer);
      const sum = _micLevelAnalyser.buffer.reduce((a, v) => a + v, 0);
      const avg = sum / _micLevelAnalyser.buffer.length;
      micInputLevel.value = Math.min(1, avg / 128);
    }, 100);
  } catch {
    // AudioContext might not be available, or the stream could not be wired.
    // A context that was created but never handed to _micLevelAnalyser would
    // otherwise stay open, because _stopMicLevelMonitor() cannot see it.
    if (ctx && !_micLevelAnalyser) {
      try {
        Promise.resolve(ctx.close()).catch(() => {});
      } catch { /* ignore */ }
    }
  }
}
|
|
177
|
+
|
|
178
|
+
/**
 * Stop the mic level monitor: cancel the polling timer, close the monitor's
 * AudioContext (errors ignored) and reset `micInputLevel` to 0.
 * Safe to call when no monitor is running.
 */
function _stopMicLevelMonitor() {
  const timer = _micLevelTimer;
  if (timer) {
    clearInterval(timer);
    _micLevelTimer = null;
  }

  const monitor = _micLevelAnalyser;
  if (monitor) {
    try {
      monitor.ctx.close();
    } catch {
      /* ignore */
    }
    _micLevelAnalyser = null;
  }

  micInputLevel.value = 0;
}
|
|
189
|
+
|
|
28
190
|
// ── Module-scope state ──────────────────────────────────────────────────────
|
|
29
191
|
|
|
30
192
|
let _pc = null; // RTCPeerConnection
|
|
31
193
|
let _dc = null; // DataChannel for events
|
|
32
194
|
let _mediaStream = null; // User mic MediaStream
|
|
33
195
|
let _audioElement = null; // <audio> for playback
|
|
34
|
-
let _transport = "webrtc"; // webrtc | responses-audio
|
|
196
|
+
let _transport = "webrtc"; // webrtc | websocket | responses-audio
|
|
35
197
|
let _responsesTokenData = null;
|
|
36
198
|
let _responsesRecognition = null;
|
|
37
199
|
let _responsesAudioElement = null;
|
|
200
|
+
|
|
201
|
+
// ── WebSocket transport state ───────────────────────────────────────────────
|
|
202
|
+
let _ws = null; // WebSocket for Azure Realtime
|
|
203
|
+
let _wsAudioCtx = null; // AudioContext for WebSocket PCM16 I/O
|
|
204
|
+
let _wsMicProcessor = null; // ScriptProcessorNode for mic capture
|
|
205
|
+
let _wsMicSource = null; // MediaStreamAudioSourceNode
|
|
206
|
+
let _wsPlaybackQueue = []; // Queued PCM16 Float32 chunks for playback
|
|
207
|
+
let _wsPlaybackScheduled = 0; // AudioContext time of next scheduled chunk
|
|
208
|
+
let _wsPlaybackPlaying = false; // Whether audio playback loop is running
|
|
38
209
|
let _responsesAbortController = null;
|
|
39
210
|
let _responsesRecognitionRestartTimer = null;
|
|
40
211
|
let _reconnectTimer = null; // 28-min reconnect timer
|
|
@@ -49,6 +220,7 @@ let _callContext = {
|
|
|
49
220
|
executor: null,
|
|
50
221
|
mode: null,
|
|
51
222
|
model: null,
|
|
223
|
+
voiceAgentId: null,
|
|
52
224
|
};
|
|
53
225
|
let _lastPersistedUserTranscript = "";
|
|
54
226
|
let _lastPersistedAssistantTranscript = "";
|
|
@@ -57,9 +229,13 @@ let _lastPersistedAssistantAt = 0;
|
|
|
57
229
|
let _awaitingToolCompletionAck = false;
|
|
58
230
|
let _assistantRespondedAfterTool = false;
|
|
59
231
|
let _toolCompletionAckTimer = null;
|
|
232
|
+
let _lastAutoBargeInAt = 0;
|
|
233
|
+
let _autoBargeInTimer = null;
|
|
60
234
|
|
|
61
235
|
const RECONNECT_AT_MS = 28 * 60 * 1000; // 28 minutes
|
|
62
236
|
const MAX_RECONNECT_ATTEMPTS = 3;
|
|
237
|
+
const AUTO_BARGE_IN_COOLDOWN_MS = 700;
|
|
238
|
+
const AUTO_BARGE_IN_FADE_MS = 220;
|
|
63
239
|
// Noise-control default: disable user-side live ASR transcript output/persistence.
|
|
64
240
|
// Assistant response text remains enabled.
|
|
65
241
|
const ENABLE_USER_TRANSCRIPT = false;
|
|
@@ -75,7 +251,8 @@ function _normalizeCallContext(options = {}) {
|
|
|
75
251
|
const executor = String(options?.executor || "").trim() || null;
|
|
76
252
|
const mode = String(options?.mode || "").trim() || null;
|
|
77
253
|
const model = String(options?.model || "").trim() || null;
|
|
78
|
-
|
|
254
|
+
const voiceAgentId = String(options?.voiceAgentId || "").trim() || null;
|
|
255
|
+
return { sessionId, executor, mode, model, voiceAgentId };
|
|
79
256
|
}
|
|
80
257
|
|
|
81
258
|
function _isResponsesAudioTransport(tokenData) {
|
|
@@ -385,6 +562,9 @@ function emit(event, data) {
|
|
|
385
562
|
}
|
|
386
563
|
|
|
387
564
|
function sendRealtimeEvent(payload) {
|
|
565
|
+
// WebSocket transport: send over WS
|
|
566
|
+
if (_transport === "websocket") return _sendWsEvent(payload);
|
|
567
|
+
// WebRTC transport: send over data channel
|
|
388
568
|
if (!_dc || _dc.readyState !== "open") return false;
|
|
389
569
|
try {
|
|
390
570
|
_dc.send(JSON.stringify(payload));
|
|
@@ -404,9 +584,11 @@ function clearPendingResponseCreate() {
|
|
|
404
584
|
}
|
|
405
585
|
|
|
406
586
|
function scheduleManualResponseCreate(reason = "speech-stopped") {
|
|
407
|
-
if (_transport !== "webrtc") return;
|
|
587
|
+
if (_transport !== "webrtc" && _transport !== "websocket") return;
|
|
408
588
|
if (_awaitingAutoResponse) return;
|
|
409
|
-
|
|
589
|
+
// Check appropriate channel is open
|
|
590
|
+
if (_transport === "webrtc" && (!_dc || _dc.readyState !== "open")) return;
|
|
591
|
+
if (_transport === "websocket" && (!_ws || _ws.readyState !== WebSocket.OPEN)) return;
|
|
410
592
|
_awaitingAutoResponse = true;
|
|
411
593
|
if (_pendingResponseCreateTimer) clearTimeout(_pendingResponseCreateTimer);
|
|
412
594
|
_pendingResponseCreateTimer = setTimeout(() => {
|
|
@@ -434,14 +616,14 @@ function sendSessionUpdate(tokenData = {}) {
|
|
|
434
616
|
sessionConfig?.turn_detection?.type ||
|
|
435
617
|
sessionConfig?.audio?.input?.turnDetection?.type ||
|
|
436
618
|
sessionConfig?.audio?.input?.turn_detection?.type ||
|
|
437
|
-
"
|
|
619
|
+
"semantic_vad";
|
|
438
620
|
const turnDetectionConfig = {
|
|
439
621
|
type: turnDetection,
|
|
440
622
|
...(turnDetection === "server_vad"
|
|
441
623
|
? {
|
|
442
|
-
threshold: 0.
|
|
624
|
+
threshold: 0.7,
|
|
443
625
|
prefix_padding_ms: 400,
|
|
444
|
-
silence_duration_ms:
|
|
626
|
+
silence_duration_ms: 1200,
|
|
445
627
|
create_response: true,
|
|
446
628
|
interrupt_response: true,
|
|
447
629
|
}
|
|
@@ -455,6 +637,12 @@ function sendSessionUpdate(tokenData = {}) {
|
|
|
455
637
|
: {}),
|
|
456
638
|
};
|
|
457
639
|
|
|
640
|
+
// Use server-provided transcription model from sessionConfig, fall back to default
|
|
641
|
+
const transcriptionModel =
|
|
642
|
+
sessionConfig?.input_audio_transcription?.model || "gpt-4o-transcribe";
|
|
643
|
+
const transcriptionEnabled =
|
|
644
|
+
sessionConfig?.input_audio_transcription !== undefined;
|
|
645
|
+
|
|
458
646
|
sendRealtimeEvent({
|
|
459
647
|
type: "session.update",
|
|
460
648
|
session: {
|
|
@@ -462,12 +650,258 @@ function sendSessionUpdate(tokenData = {}) {
|
|
|
462
650
|
voice: voiceId,
|
|
463
651
|
input_audio_format: "pcm16",
|
|
464
652
|
output_audio_format: "pcm16",
|
|
465
|
-
|
|
653
|
+
...(transcriptionEnabled
|
|
654
|
+
? { input_audio_transcription: { model: transcriptionModel } }
|
|
655
|
+
: {}),
|
|
466
656
|
turn_detection: turnDetectionConfig,
|
|
467
657
|
},
|
|
468
658
|
});
|
|
469
659
|
}
|
|
470
660
|
|
|
661
|
+
// ── WebSocket Realtime Transport ─────────────────────────────────────────────
|
|
662
|
+
//
|
|
663
|
+
// Azure OpenAI Realtime API only supports WebSocket in many deployments
|
|
664
|
+
// (WebRTC returns 404). This transport captures mic audio as PCM16 chunks,
|
|
665
|
+
// sends them over WebSocket, receives response audio as PCM16 deltas, and
|
|
666
|
+
// plays them through AudioContext — giving the same real-time conversational
|
|
667
|
+
// voice experience as WebRTC.
|
|
668
|
+
|
|
669
|
+
/**
 * Convert Float32 audio samples to Int16 PCM.
 *
 * Each sample is clamped to [-1, 1]; negative values are scaled by 0x8000 and
 * non-negative values by 0x7FFF, so both ends of the int16 range are reachable.
 *
 * @param {Float32Array} float32Array
 * @returns {Int16Array}
 */
function _float32ToInt16(float32Array) {
  const toPcm = (sample) => {
    const clamped = Math.max(-1, Math.min(1, sample));
    return clamped < 0 ? clamped * 0x8000 : clamped * 0x7FFF;
  };
  const pcm = new Int16Array(float32Array.length);
  float32Array.forEach((sample, index) => {
    pcm[index] = toPcm(sample);
  });
  return pcm;
}
|
|
678
|
+
|
|
679
|
+
/**
 * Convert Int16 PCM to Float32 audio samples.
 *
 * Negative values are divided by 0x8000 and non-negative values by 0x7FFF,
 * the inverse of the scaling used when encoding.
 *
 * @param {Int16Array} int16Array
 * @returns {Float32Array}
 */
function _int16ToFloat32(int16Array) {
  const samples = new Float32Array(int16Array.length);
  let index = 0;
  for (const value of int16Array) {
    const scale = value < 0 ? 0x8000 : 0x7FFF;
    samples[index] = value / scale;
    index += 1;
  }
  return samples;
}
|
|
687
|
+
|
|
688
|
+
/**
 * Encode an Int16Array to a base64 string (browser).
 *
 * Only the bytes the view actually covers are encoded, so a subarray of a
 * larger buffer encodes just its own samples. Bytes are in platform byte
 * order. The intermediate binary string is built in bounded chunks rather
 * than one character at a time.
 *
 * @param {Int16Array} int16Array
 * @returns {string} base64 text; "" for an empty array.
 */
function _int16ToBase64(int16Array) {
  const bytes = new Uint8Array(
    int16Array.buffer,
    int16Array.byteOffset,
    int16Array.byteLength,
  );
  // Stay well below engine limits on the number of call arguments.
  const CHUNK_BYTES = 0x8000;
  const parts = [];
  for (let offset = 0; offset < bytes.length; offset += CHUNK_BYTES) {
    parts.push(String.fromCharCode.apply(null, bytes.subarray(offset, offset + CHUNK_BYTES)));
  }
  return btoa(parts.join(""));
}
|
|
697
|
+
|
|
698
|
+
/**
 * Decode a base64 string to an Int16Array.
 *
 * PCM16 samples are two bytes each. If the decoded payload has an odd number
 * of bytes, the trailing byte is dropped instead of letting the Int16Array
 * constructor throw a RangeError. Bytes are interpreted in platform byte order.
 *
 * @param {string} base64
 * @returns {Int16Array}
 */
function _base64ToInt16(base64) {
  const binary = atob(base64);
  const usableLength = binary.length - (binary.length % 2);
  const bytes = new Uint8Array(usableLength);
  for (let i = 0; i < usableLength; i++) {
    bytes[i] = binary.charCodeAt(i);
  }
  return new Int16Array(bytes.buffer);
}
|
|
707
|
+
|
|
708
|
+
/**
 * Serialize `payload` as JSON and send it over the WebSocket transport.
 *
 * @param {object} payload  Event to send.
 * @returns {boolean} true when the message was handed to the socket; false
 *   when the socket is missing or not open, or when sending threw (the
 *   failure is logged with console.warn).
 */
function _sendWsEvent(payload) {
  const socketReady = Boolean(_ws) && _ws.readyState === WebSocket.OPEN;
  if (!socketReady) return false;

  let delivered = false;
  try {
    _ws.send(JSON.stringify(payload));
    delivered = true;
  } catch (err) {
    console.warn("[voice-client] WS send failed:", err?.message || err);
  }
  return delivered;
}
|
|
719
|
+
|
|
720
|
+
/**
 * Play queued PCM16 audio chunks via AudioContext.
 *
 * Starts the playback loop unless it is already running. One chunk is taken
 * from `_wsPlaybackQueue` at a time, wrapped in a mono 24 kHz AudioBuffer and
 * started no earlier than the end of the previously scheduled chunk. When a
 * chunk ends, the next one is scheduled. Once the queue is empty the loop
 * stops, and a "speaking" voice state returns to "connected".
 *
 * The loop exits without touching the voice state when the AudioContext is
 * gone, the queue is empty on entry, or the session was explicitly stopped.
 */
function _scheduleWsPlayback() {
  if (_wsPlaybackPlaying) return;
  _wsPlaybackPlaying = true;

  // Shared "chunk finished" step: continue with the queue or wind down.
  const advance = () => {
    if (_wsPlaybackQueue.length > 0) {
      drain();
      return;
    }
    _wsPlaybackPlaying = false;
    if (voiceState.value === "speaking") {
      voiceState.value = "connected";
    }
  };

  // Route through the selected output device if supported. setSinkId()
  // returns a promise, so a rejection must be handled on the promise itself;
  // it is only called when the context is not already on the desired sink.
  const applyOutputDevice = (ctx) => {
    if (typeof ctx.setSinkId !== "function") return;
    const desiredSink = selectedAudioOutput.value || "";
    const currentSink = typeof ctx.sinkId === "string" ? ctx.sinkId : "";
    if (desiredSink === currentSink) return;
    try {
      const pending = ctx.setSinkId(desiredSink);
      if (pending && typeof pending.catch === "function") {
        pending.catch(() => { /* ignore */ });
      }
    } catch { /* ignore */ }
  };

  function drain() {
    if (!_wsAudioCtx || _wsPlaybackQueue.length === 0 || _explicitStop) {
      _wsPlaybackPlaying = false;
      return;
    }

    const samples = _wsPlaybackQueue.shift();
    if (!samples || samples.length === 0) {
      // createBuffer() rejects a zero-length buffer; treat an empty chunk as
      // one that has already finished playing.
      advance();
      return;
    }

    try {
      const buffer = _wsAudioCtx.createBuffer(1, samples.length, 24000);
      buffer.copyToChannel(samples, 0);
      const sourceNode = _wsAudioCtx.createBufferSource();
      sourceNode.buffer = buffer;

      applyOutputDevice(_wsAudioCtx);
      sourceNode.connect(_wsAudioCtx.destination);

      const now = _wsAudioCtx.currentTime;
      const startTime = Math.max(now, _wsPlaybackScheduled);
      sourceNode.start(startTime);
      _wsPlaybackScheduled = startTime + buffer.duration;

      sourceNode.onended = advance;
    } catch (err) {
      // Never leave the loop flagged as running after a failure, otherwise
      // every later audio delta would be queued but never played.
      _wsPlaybackPlaying = false;
      console.warn("[voice-client] WS playback failed:", err?.message || err);
    }
  }

  drain();
}
|
|
763
|
+
|
|
764
|
+
/**
 * Clean up WebSocket transport resources.
 *
 * Disconnects the mic processor and mic source nodes, closes the socket and
 * the AudioContext (each step ignoring errors), clears the module references
 * and resets the playback queue and scheduling state.
 */
function _cleanupWsTransport() {
  // Run one teardown step, ignoring anything it throws.
  const quietly = (step) => {
    try {
      step();
    } catch {
      /* ignore */
    }
  };

  if (_wsMicProcessor) {
    quietly(() => _wsMicProcessor.disconnect());
    _wsMicProcessor = null;
  }
  if (_wsMicSource) {
    quietly(() => _wsMicSource.disconnect());
    _wsMicSource = null;
  }
  if (_ws) {
    quietly(() => _ws.close());
    _ws = null;
  }
  if (_wsAudioCtx) {
    quietly(() => _wsAudioCtx.close());
    _wsAudioCtx = null;
  }

  _wsPlaybackQueue = [];
  _wsPlaybackScheduled = 0;
  _wsPlaybackPlaying = false;
}
|
|
786
|
+
|
|
787
|
+
/**
 * Start a WebSocket-based Realtime session.
 * Used as fallback when Azure WebRTC SDP exchange returns 404.
 *
 * Switches `_transport` to "websocket", creates a 24 kHz AudioContext, opens
 * the socket and, once it is open, sends the session configuration and wires
 * mic capture: MediaStream -> ScriptProcessor -> PCM16 -> base64 ->
 * `input_audio_buffer.append` events. Incoming `response.audio.delta`
 * messages are queued for playback; every other message is passed to
 * handleServerEvent().
 *
 * @param {object} tokenData  Token payload; `wsUrl` is required, `provider`
 *   is reported in the "connected" event, and the whole object is passed to
 *   sendSessionUpdate().
 * @param {MediaStream} mediaStream  Microphone stream to capture from.
 * @returns {Promise<void>} Resolves once the socket is open and capture is
 *   wired. Rejects when `wsUrl` is missing, the connection times out (15 s),
 *   the socket errors or closes while the voice state is "connecting", or the
 *   post-open setup throws.
 */
async function _startWebSocketTransport(tokenData, mediaStream) {
  const wsUrl = String(tokenData?.wsUrl || "").trim();
  if (!wsUrl) {
    throw new Error("WebSocket URL not available for Azure Realtime fallback");
  }

  _transport = "websocket";

  // Set up AudioContext for PCM16 I/O at 24kHz (Realtime API native rate)
  _wsAudioCtx = new (globalThis.AudioContext || globalThis.webkitAudioContext)({
    sampleRate: 24000,
  });

  return new Promise((resolve, reject) => {
    _ws = new WebSocket(wsUrl);

    const connectTimeout = setTimeout(() => {
      reject(new Error("Azure Realtime WebSocket connection timed out"));
      if (_ws) { try { _ws.close(); } catch { /* ignore */ } }
    }, 15000);

    _ws.onopen = () => {
      clearTimeout(connectTimeout);

      // The timeout is gone from here on, so any exception during setup must
      // reject explicitly or the caller would wait forever.
      try {
        // Send session configuration (same as WebRTC data channel session.update)
        sendSessionUpdate(tokenData);

        // Start mic capture → PCM16 → WebSocket
        _wsMicSource = _wsAudioCtx.createMediaStreamSource(mediaStream);
        // ScriptProcessorNode deprecated but widely supported; buffer = 4096 samples
        _wsMicProcessor = _wsAudioCtx.createScriptProcessor(4096, 1, 1);
        _wsMicProcessor.onaudioprocess = (e) => {
          if (_explicitStop || !_ws || _ws.readyState !== WebSocket.OPEN) return;
          if (isVoiceMicMuted.value) return;
          const float32 = e.inputBuffer.getChannelData(0);
          const int16 = _float32ToInt16(float32);
          const base64 = _int16ToBase64(int16);
          _sendWsEvent({
            type: "input_audio_buffer.append",
            audio: base64,
          });
        };
        _wsMicSource.connect(_wsMicProcessor);
        _wsMicProcessor.connect(_wsAudioCtx.destination); // required for processing

        voiceState.value = "connected";
        voiceSessionId.value = _callContext.sessionId || `voice-ws-${Date.now()}`;
        _sessionStartTime = Date.now();
        startDurationTimer();

        emit("connected", {
          provider: tokenData.provider || "azure",
          sessionId: voiceSessionId.value,
          callContext: { ..._callContext },
          transport: "websocket",
        });

        resolve();
      } catch (err) {
        reject(err instanceof Error ? err : new Error(String(err)));
        if (_ws) { try { _ws.close(); } catch { /* ignore */ } }
      }
    };

    _ws.onmessage = (event) => {
      try {
        const msg = JSON.parse(event.data);

        // Handle audio deltas — play PCM16 through AudioContext
        if (msg.type === "response.audio.delta" && msg.delta) {
          if (voiceState.value !== "speaking") {
            voiceState.value = "speaking";
          }
          const int16 = _base64ToInt16(msg.delta);
          const float32 = _int16ToFloat32(int16);
          _wsPlaybackQueue.push(float32);
          _scheduleWsPlayback();
          return;
        }

        if (msg.type === "response.audio.done") {
          // Audio stream complete — playback will finish via onended callback
          return;
        }

        // All other events go through the standard handler
        handleServerEvent(msg);
      } catch (err) {
        console.error("[voice-client] WS message parse error:", err);
      }
    };

    _ws.onerror = (event) => {
      clearTimeout(connectTimeout);
      const msg = "Azure Realtime WebSocket error";
      console.error("[voice-client] WebSocket error:", event);
      if (voiceState.value === "connecting") {
        reject(new Error(msg));
      } else {
        voiceState.value = "error";
        voiceError.value = msg;
        emit("error", { message: msg });
      }
    };

    _ws.onclose = (event) => {
      clearTimeout(connectTimeout);
      if (_explicitStop) return;
      const reason = `WebSocket closed (code=${event.code})`;
      if (voiceState.value === "connecting") {
        reject(new Error(reason));
      } else {
        handleDisconnect(reason);
      }
    };
  });
}
|
|
904
|
+
|
|
471
905
|
// ── Core Connection ─────────────────────────────────────────────────────────
|
|
472
906
|
|
|
473
907
|
/**
|
|
@@ -479,6 +913,7 @@ function sendSessionUpdate(tokenData = {}) {
|
|
|
479
913
|
* 5. Create offer, set remote answer
|
|
480
914
|
*/
|
|
481
915
|
export async function startVoiceSession(options = {}) {
|
|
916
|
+
ensureMicTrackingPatched();
|
|
482
917
|
if (_pc) {
|
|
483
918
|
console.warn("[voice-client] Session already active");
|
|
484
919
|
return;
|
|
@@ -511,6 +946,7 @@ export async function startVoiceSession(options = {}) {
|
|
|
511
946
|
executor: _callContext.executor || undefined,
|
|
512
947
|
mode: _callContext.mode || undefined,
|
|
513
948
|
model: _callContext.model || undefined,
|
|
949
|
+
voiceAgentId: _callContext.voiceAgentId || undefined,
|
|
514
950
|
delegateOnly: false,
|
|
515
951
|
}),
|
|
516
952
|
});
|
|
@@ -546,12 +982,30 @@ export async function startVoiceSession(options = {}) {
|
|
|
546
982
|
|
|
547
983
|
_mediaStream = await navigator.mediaDevices.getUserMedia({
|
|
548
984
|
audio: {
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
985
|
+
deviceId: selectedAudioInput.value ? { exact: selectedAudioInput.value } : undefined,
|
|
986
|
+
echoCancellation: audioSettings.value.echoCancellation,
|
|
987
|
+
noiseSuppression: audioSettings.value.noiseSuppression,
|
|
988
|
+
autoGainControl: audioSettings.value.autoGainControl,
|
|
989
|
+
sampleRate: audioSettings.value.sampleRate,
|
|
553
990
|
},
|
|
554
991
|
});
|
|
992
|
+
registerMicStream(_mediaStream);
|
|
993
|
+
|
|
994
|
+
// Guard: stopVoiceSession() may have been called while getUserMedia() was
|
|
995
|
+
// still awaiting (e.g. the user pressed hang-up during the permission
|
|
996
|
+
// prompt or network delay). cleanup() already ran without this stream
|
|
997
|
+
// in the registry — release the mic immediately so the browser indicator
|
|
998
|
+
// goes away instead of staying lit indefinitely.
|
|
999
|
+
if (_explicitStop) {
|
|
1000
|
+
for (const track of _mediaStream.getTracks()) {
|
|
1001
|
+
try { track.stop(); } catch { /* ignore */ }
|
|
1002
|
+
}
|
|
1003
|
+
_mediaStream = null;
|
|
1004
|
+
throw new Error("voice session was stopped during microphone acquisition");
|
|
1005
|
+
}
|
|
1006
|
+
|
|
1007
|
+
await enumerateAudioDevices();
|
|
1008
|
+
_startMicLevelMonitor(_mediaStream);
|
|
555
1009
|
|
|
556
1010
|
// 3. Create RTCPeerConnection
|
|
557
1011
|
_pc = new RTCPeerConnection();
|
|
@@ -570,6 +1024,10 @@ export async function startVoiceSession(options = {}) {
|
|
|
570
1024
|
_audioElement.autoplay = true;
|
|
571
1025
|
_audioElement.playsInline = true;
|
|
572
1026
|
_audioElement.muted = true;
|
|
1027
|
+
// Apply selected output device
|
|
1028
|
+
if (selectedAudioOutput.value && typeof _audioElement.setSinkId === "function") {
|
|
1029
|
+
try { await _audioElement.setSinkId(selectedAudioOutput.value); } catch { /* ignore */ }
|
|
1030
|
+
}
|
|
573
1031
|
_pc.ontrack = (event) => {
|
|
574
1032
|
_audioElement.srcObject = event.streams[0];
|
|
575
1033
|
// Unmute now that the element is already playing (avoids autoplay block)
|
|
@@ -635,23 +1093,59 @@ export async function startVoiceSession(options = {}) {
|
|
|
635
1093
|
? `${tokenData.azureEndpoint}/openai/realtime?api-version=2025-04-01-preview&deployment=${tokenData.azureDeployment}`
|
|
636
1094
|
: `https://api.openai.com/v1/realtime?model=${tokenData.model}`);
|
|
637
1095
|
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
1096
|
+
let webrtcFailed = false;
|
|
1097
|
+
let webrtcFailStatus = 0;
|
|
1098
|
+
try {
|
|
1099
|
+
const sdpResponse = await fetch(baseUrl, {
|
|
1100
|
+
method: "POST",
|
|
1101
|
+
headers: {
|
|
1102
|
+
Authorization: `Bearer ${tokenData.token}`,
|
|
1103
|
+
"Content-Type": "application/sdp",
|
|
1104
|
+
},
|
|
1105
|
+
body: offer.sdp,
|
|
1106
|
+
});
|
|
1107
|
+
|
|
1108
|
+
if (!sdpResponse.ok) {
|
|
1109
|
+
webrtcFailStatus = sdpResponse.status;
|
|
1110
|
+
const errBody = await sdpResponse.text().catch(() => "");
|
|
1111
|
+
const detail = errBody ? ` — ${errBody.slice(0, 300)}` : "";
|
|
1112
|
+
// For Azure, 404 means the resource doesn't support WebRTC — try WebSocket
|
|
1113
|
+
if (sdpResponse.status === 404 && tokenData.wsUrl) {
|
|
1114
|
+
console.warn("[voice-client] WebRTC SDP 404 — falling back to Azure WebSocket transport");
|
|
1115
|
+
webrtcFailed = true;
|
|
1116
|
+
} else {
|
|
1117
|
+
throw new Error(`WebRTC SDP exchange failed (${sdpResponse.status})${detail}`);
|
|
1118
|
+
}
|
|
1119
|
+
}
|
|
646
1120
|
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
1121
|
+
if (!webrtcFailed) {
|
|
1122
|
+
const answerSdp = await sdpResponse.text();
|
|
1123
|
+
await _pc.setRemoteDescription({ type: "answer", sdp: answerSdp });
|
|
1124
|
+
}
|
|
1125
|
+
} catch (sdpErr) {
|
|
1126
|
+
if (!webrtcFailed) throw sdpErr;
|
|
651
1127
|
}
|
|
652
1128
|
|
|
653
|
-
|
|
654
|
-
|
|
1129
|
+
// ── WebSocket fallback for Azure when WebRTC returns 404 ────────────
|
|
1130
|
+
if (webrtcFailed) {
|
|
1131
|
+
// Clean up the WebRTC objects — we won't need them
|
|
1132
|
+
if (_dc) { try { _dc.close(); } catch { /* ignore */ } _dc = null; }
|
|
1133
|
+
if (_pc) { try { _pc.close(); } catch { /* ignore */ } _pc = null; }
|
|
1134
|
+
if (_audioElement) {
|
|
1135
|
+
try { _audioElement.pause(); _audioElement.srcObject = null; } catch { /* ignore */ }
|
|
1136
|
+
_audioElement = null;
|
|
1137
|
+
}
|
|
1138
|
+
|
|
1139
|
+
console.info("[voice-client] Starting Azure Realtime WebSocket transport");
|
|
1140
|
+
await _startWebSocketTransport(tokenData, _mediaStream);
|
|
1141
|
+
|
|
1142
|
+
emit("session-started", {
|
|
1143
|
+
sessionId: voiceSessionId.value,
|
|
1144
|
+
callContext: { ..._callContext },
|
|
1145
|
+
transport: "websocket",
|
|
1146
|
+
});
|
|
1147
|
+
return;
|
|
1148
|
+
}
|
|
655
1149
|
|
|
656
1150
|
emit("session-started", {
|
|
657
1151
|
sessionId: voiceSessionId.value,
|
|
@@ -672,6 +1166,7 @@ export async function startVoiceSession(options = {}) {
|
|
|
672
1166
|
export function stopVoiceSession() {
|
|
673
1167
|
_explicitStop = true;
|
|
674
1168
|
emit("session-ending", { sessionId: voiceSessionId.value });
|
|
1169
|
+
_stopMicLevelMonitor();
|
|
675
1170
|
cleanup();
|
|
676
1171
|
voiceState.value = "idle";
|
|
677
1172
|
voiceTranscript.value = "";
|
|
@@ -680,7 +1175,13 @@ export function stopVoiceSession() {
|
|
|
680
1175
|
voiceSessionId.value = null;
|
|
681
1176
|
voiceBoundSessionId.value = null;
|
|
682
1177
|
voiceDuration.value = 0;
|
|
683
|
-
_callContext = {
|
|
1178
|
+
_callContext = {
|
|
1179
|
+
sessionId: null,
|
|
1180
|
+
executor: null,
|
|
1181
|
+
mode: null,
|
|
1182
|
+
model: null,
|
|
1183
|
+
voiceAgentId: null,
|
|
1184
|
+
};
|
|
684
1185
|
emit("session-ended", {});
|
|
685
1186
|
}
|
|
686
1187
|
|
|
@@ -696,6 +1197,7 @@ function handleServerEvent(event) {
|
|
|
696
1197
|
break;
|
|
697
1198
|
|
|
698
1199
|
case "input_audio_buffer.speech_started":
|
|
1200
|
+
triggerAutoBargeIn("speech-started");
|
|
699
1201
|
voiceState.value = "listening";
|
|
700
1202
|
emit("speech-started", {});
|
|
701
1203
|
break;
|
|
@@ -807,7 +1309,9 @@ function handleServerEvent(event) {
|
|
|
807
1309
|
break;
|
|
808
1310
|
|
|
809
1311
|
case "response.audio.delta":
|
|
810
|
-
//
|
|
1312
|
+
// WebRTC: audio is handled via media tracks, not data channel.
|
|
1313
|
+
// WebSocket: audio deltas are handled in the ws.onmessage handler
|
|
1314
|
+
// before reaching handleServerEvent, so this case is a no-op.
|
|
811
1315
|
break;
|
|
812
1316
|
|
|
813
1317
|
case "conversation.item.input_audio_transcription.failed":
|
|
@@ -910,6 +1414,7 @@ async function handleToolCall(event) {
|
|
|
910
1414
|
executor: _callContext.executor || undefined,
|
|
911
1415
|
mode: _callContext.mode || undefined,
|
|
912
1416
|
model: _callContext.model || undefined,
|
|
1417
|
+
voiceAgentId: _callContext.voiceAgentId || undefined,
|
|
913
1418
|
}),
|
|
914
1419
|
});
|
|
915
1420
|
const result = await res.json();
|
|
@@ -919,19 +1424,17 @@ async function handleToolCall(event) {
|
|
|
919
1424
|
tc.callId === callId ? { ...tc, status: "complete", result: result.result } : tc
|
|
920
1425
|
);
|
|
921
1426
|
|
|
922
|
-
// Send result back to model via data channel
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
_dc.send(JSON.stringify({ type: "response.create" }));
|
|
934
|
-
}
|
|
1427
|
+
// Send result back to model via data channel or WebSocket
|
|
1428
|
+
sendRealtimeEvent({
|
|
1429
|
+
type: "conversation.item.create",
|
|
1430
|
+
item: {
|
|
1431
|
+
type: "function_call_output",
|
|
1432
|
+
call_id: callId,
|
|
1433
|
+
output: result.result || result.error || "No output",
|
|
1434
|
+
},
|
|
1435
|
+
});
|
|
1436
|
+
// Trigger response generation
|
|
1437
|
+
sendRealtimeEvent({ type: "response.create" });
|
|
935
1438
|
|
|
936
1439
|
const stillRunning = voiceToolCalls.value.some((tc) => tc.status === "running");
|
|
937
1440
|
if (!stillRunning) {
|
|
@@ -945,22 +1448,87 @@ async function handleToolCall(event) {
|
|
|
945
1448
|
emit("tool-call-error", { callId, name, error: err.message });
|
|
946
1449
|
|
|
947
1450
|
// Send error result back
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
_dc.send(JSON.stringify({ type: "response.create" }));
|
|
958
|
-
}
|
|
1451
|
+
sendRealtimeEvent({
|
|
1452
|
+
type: "conversation.item.create",
|
|
1453
|
+
item: {
|
|
1454
|
+
type: "function_call_output",
|
|
1455
|
+
call_id: callId,
|
|
1456
|
+
output: `Error: ${err.message}`,
|
|
1457
|
+
},
|
|
1458
|
+
});
|
|
1459
|
+
sendRealtimeEvent({ type: "response.create" });
|
|
959
1460
|
}
|
|
960
1461
|
}
|
|
961
1462
|
|
|
962
1463
|
// ── Barge-in ────────────────────────────────────────────────────────────────
|
|
963
1464
|
|
|
1465
|
+
/**
 * Report whether assistant audio is currently playing (or queued to play)
 * on the active transport.
 *
 * - "responses-audio": the responses audio element exists and is neither
 *   paused nor ended.
 * - "websocket": a chunk is playing or the playback queue is non-empty.
 * - any other transport: the shared audio element exists and is not paused.
 *
 * @returns {boolean} true when assistant playback is considered active.
 */
function isAssistantPlaybackActive() {
  switch (_transport) {
    case "responses-audio": {
      const player = _responsesAudioElement;
      return Boolean(player && !player.paused && !player.ended);
    }
    case "websocket": {
      const hasQueuedAudio = _wsPlaybackQueue.length > 0;
      return Boolean(_wsPlaybackPlaying || hasQueuedAudio);
    }
    default: {
      const player = _audioElement;
      return Boolean(player && !player.paused);
    }
  }
}
|
|
1474
|
+
|
|
1475
|
+
// Interval handles of in-flight fades, keyed by element, so that starting a
// new fade on an element replaces the previous one instead of racing it.
const _activeVolumeFades = new WeakMap();

/**
 * Linearly ramp `el.volume` towards `targetVolume` in five timer steps.
 *
 * The fade is best-effort and cooperative:
 * - A new fade on the same element cancels the one already running.
 * - If something else writes `el.volume` while the fade is running (for
 *   example interruptResponse() restoring the volume to 1), the fade stops
 *   and leaves that value in place rather than overwriting it on a later
 *   tick.
 * - A non-finite target or a non-finite current volume makes the call a
 *   no-op instead of scheduling writes that can never succeed.
 *
 * @param {{ volume: number } | null | undefined} el - Element whose volume
 *   is ramped; a falsy value is ignored.
 * @param {number} targetVolume - Desired volume, clamped to [0, 1].
 * @param {number} [durationMs] - Approximate fade length in milliseconds;
 *   defaults to 180 and is never shorter than 40.
 * @returns {void}
 */
function fadeElementVolumeTo(el, targetVolume, durationMs) {
  if (!el) return;

  const requested = Number(targetVolume);
  const current = Number(el.volume);
  if (!Number.isFinite(requested) || !Number.isFinite(current)) return;

  const target = Math.max(0, Math.min(1, requested));
  const start = Math.max(0, Math.min(1, current));
  const duration = Math.max(40, Number(durationMs) || 180);
  const steps = 5;
  const stepMs = Math.max(10, Math.floor(duration / steps));

  // Only one fade may drive a given element at a time.
  const previous = _activeVolumeFades.get(el);
  if (previous !== undefined) clearInterval(previous);

  // Last value this fade observed/wrote; any other value means an external
  // writer has taken over the volume.
  let lastWritten = el.volume;
  let step = 0;

  const finish = () => {
    clearInterval(timer);
    if (_activeVolumeFades.get(el) === timer) _activeVolumeFades.delete(el);
  };

  const timer = setInterval(() => {
    if (el.volume !== lastWritten) {
      finish();
      return;
    }
    step += 1;
    const t = Math.min(1, step / steps);
    const next = start + (target - start) * t;
    try {
      el.volume = Math.max(0, Math.min(1, next));
      // Read back so any rounding applied by the element is what we compare
      // against on the next tick.
      lastWritten = el.volume;
    } catch { /* ignore */ }
    if (t >= 1) finish();
  }, stepMs);

  _activeVolumeFades.set(el, timer);
}
|
|
1491
|
+
|
|
1492
|
+
/**
 * Automatically interrupt the assistant when the user starts speaking.
 *
 * shouldAutoBargeIn() decides whether to act, based on the mic mute state,
 * whether assistant playback is active, and the cooldown since the last
 * automatic barge-in. When it approves:
 * - on "responses-audio" / "webrtc" with an audio element present, the
 *   element is ducked first and the interrupt fires after the fade delay;
 * - otherwise the interrupt fires immediately.
 *
 * @param {string} [reason="speech-started"] - Forwarded in the
 *   "auto-barge-in" event payload.
 * @returns {boolean} true when a barge-in was triggered or scheduled,
 *   false when shouldAutoBargeIn() rejected it.
 */
function triggerAutoBargeIn(reason = "speech-started") {
  const timestamp = Date.now();
  const playbackActive = isAssistantPlaybackActive();
  const permitted = shouldAutoBargeIn({
    muted: isVoiceMicMuted.value,
    audioActive: playbackActive,
    now: timestamp,
    lastTriggeredAt: _lastAutoBargeInAt,
    minIntervalMs: AUTO_BARGE_IN_COOLDOWN_MS,
  });
  if (!permitted) return false;

  _lastAutoBargeInAt = timestamp;

  // Drop any interrupt still pending from an earlier barge-in.
  if (_autoBargeInTimer) {
    clearTimeout(_autoBargeInTimer);
    _autoBargeInTimer = null;
  }

  const interruptAndNotify = () => {
    interruptResponse();
    emit("auto-barge-in", { reason });
  };

  // Pick the element to duck (and how far) for transports that play through
  // an audio element; other cases interrupt without a fade.
  let duck = null;
  if (_transport === "responses-audio" && _responsesAudioElement) {
    duck = { element: _responsesAudioElement, level: 0.1 };
  } else if (_transport === "webrtc" && _audioElement) {
    duck = { element: _audioElement, level: 0.12 };
  }

  if (duck === null) {
    interruptAndNotify();
    return true;
  }

  fadeElementVolumeTo(duck.element, duck.level, AUTO_BARGE_IN_FADE_MS);
  _autoBargeInTimer = setTimeout(() => {
    _autoBargeInTimer = null;
    interruptAndNotify();
  }, AUTO_BARGE_IN_FADE_MS);
  return true;
}
|
|
1531
|
+
|
|
964
1532
|
/**
|
|
965
1533
|
* Interrupt the current response (barge-in).
|
|
966
1534
|
*/
|
|
@@ -974,14 +1542,28 @@ export function interruptResponse() {
|
|
|
974
1542
|
try {
|
|
975
1543
|
_responsesAudioElement.pause();
|
|
976
1544
|
_responsesAudioElement.currentTime = 0;
|
|
1545
|
+
_responsesAudioElement.volume = 1;
|
|
977
1546
|
} catch { /* ignore */ }
|
|
978
1547
|
}
|
|
979
1548
|
voiceState.value = "listening";
|
|
980
1549
|
emit("interrupt", {});
|
|
981
1550
|
return;
|
|
982
1551
|
}
|
|
1552
|
+
// WebSocket transport: cancel response and clear playback queue
|
|
1553
|
+
if (_transport === "websocket") {
|
|
1554
|
+
_sendWsEvent({ type: "response.cancel" });
|
|
1555
|
+
_wsPlaybackQueue = [];
|
|
1556
|
+
_wsPlaybackPlaying = false;
|
|
1557
|
+
voiceState.value = "listening";
|
|
1558
|
+
emit("interrupt", {});
|
|
1559
|
+
return;
|
|
1560
|
+
}
|
|
983
1561
|
if (_dc && _dc.readyState === "open") {
|
|
984
1562
|
_dc.send(JSON.stringify({ type: "response.cancel" }));
|
|
1563
|
+
if (_audioElement) {
|
|
1564
|
+
try { _audioElement.volume = 1; } catch { /* ignore */ }
|
|
1565
|
+
}
|
|
1566
|
+
voiceState.value = "listening";
|
|
985
1567
|
emit("interrupt", {});
|
|
986
1568
|
}
|
|
987
1569
|
}
|
|
@@ -1000,20 +1582,25 @@ export function sendTextMessage(text) {
|
|
|
1000
1582
|
});
|
|
1001
1583
|
return;
|
|
1002
1584
|
}
|
|
1003
|
-
|
|
1585
|
+
// WebRTC or WebSocket: send via the shared sendRealtimeEvent helper
|
|
1586
|
+
if (_transport === "websocket" && (!_ws || _ws.readyState !== WebSocket.OPEN)) {
|
|
1587
|
+
console.warn("[voice-client] Cannot send text — WebSocket not open");
|
|
1588
|
+
return;
|
|
1589
|
+
}
|
|
1590
|
+
if (_transport === "webrtc" && (!_dc || _dc.readyState !== "open")) {
|
|
1004
1591
|
console.warn("[voice-client] Cannot send text — data channel not open");
|
|
1005
1592
|
return;
|
|
1006
1593
|
}
|
|
1007
|
-
|
|
1594
|
+
sendRealtimeEvent({
|
|
1008
1595
|
type: "conversation.item.create",
|
|
1009
1596
|
item: {
|
|
1010
1597
|
type: "message",
|
|
1011
1598
|
role: "user",
|
|
1012
1599
|
content: [{ type: "input_text", text: inputText }],
|
|
1013
1600
|
},
|
|
1014
|
-
})
|
|
1601
|
+
});
|
|
1015
1602
|
_recordVoiceTranscriptIfNew("user", inputText, "send_text_message");
|
|
1016
|
-
|
|
1603
|
+
sendRealtimeEvent({ type: "response.create" });
|
|
1017
1604
|
}
|
|
1018
1605
|
|
|
1019
1606
|
/**
|
|
@@ -1024,14 +1611,14 @@ export function sendImageFrame(imageDataUrl, options = {}) {
|
|
|
1024
1611
|
if (_transport === "responses-audio") return false;
|
|
1025
1612
|
const imageUrl = String(imageDataUrl || "").trim();
|
|
1026
1613
|
if (!imageUrl) return false;
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1614
|
+
// WebSocket transport: use sendRealtimeEvent
|
|
1615
|
+
if (_transport === "websocket" && (!_ws || _ws.readyState !== WebSocket.OPEN)) return false;
|
|
1616
|
+
if (_transport === "webrtc" && (!_dc || _dc.readyState !== "open")) return false;
|
|
1030
1617
|
const source = String(options?.source || "screen").trim() || "screen";
|
|
1031
1618
|
const width = Number(options?.width) || undefined;
|
|
1032
1619
|
const height = Number(options?.height) || undefined;
|
|
1033
1620
|
try {
|
|
1034
|
-
|
|
1621
|
+
sendRealtimeEvent({
|
|
1035
1622
|
type: "conversation.item.create",
|
|
1036
1623
|
item: {
|
|
1037
1624
|
type: "message",
|
|
@@ -1050,7 +1637,7 @@ export function sendImageFrame(imageDataUrl, options = {}) {
|
|
|
1050
1637
|
width,
|
|
1051
1638
|
height,
|
|
1052
1639
|
},
|
|
1053
|
-
})
|
|
1640
|
+
});
|
|
1054
1641
|
return true;
|
|
1055
1642
|
} catch (err) {
|
|
1056
1643
|
console.warn("[voice-client] failed to send realtime image frame:", err?.message || err);
|
|
@@ -1173,6 +1760,12 @@ export function toggleMicMute() {
|
|
|
1173
1760
|
}
|
|
1174
1761
|
return willBeMuted;
|
|
1175
1762
|
}
|
|
1763
|
+
// websocket transport: mic muting is handled by the onaudioprocess guard
|
|
1764
|
+
if (_transport === "websocket") {
|
|
1765
|
+
const willBeMuted = !isVoiceMicMuted.value;
|
|
1766
|
+
isVoiceMicMuted.value = willBeMuted;
|
|
1767
|
+
return willBeMuted;
|
|
1768
|
+
}
|
|
1176
1769
|
return isVoiceMicMuted.value;
|
|
1177
1770
|
}
|
|
1178
1771
|
|
|
@@ -1231,10 +1824,15 @@ function cleanupConnection() {
|
|
|
1231
1824
|
}
|
|
1232
1825
|
|
|
1233
1826
|
function cleanup() {
|
|
1827
|
+
// Always close the mic-level AudioContext first so no AudioContext
|
|
1828
|
+
// holds a live MediaStreamAudioSourceNode after teardown. This path
|
|
1829
|
+
// is reached both by stopVoiceSession() and by handleDisconnect().
|
|
1830
|
+
_stopMicLevelMonitor();
|
|
1234
1831
|
_reconnectInFlight = false;
|
|
1235
1832
|
_audioAutoplayWarned = false;
|
|
1236
1833
|
isVoiceMicMuted.value = false;
|
|
1237
1834
|
cleanupConnection();
|
|
1835
|
+
_cleanupWsTransport();
|
|
1238
1836
|
|
|
1239
1837
|
clearInterval(_durationTimer);
|
|
1240
1838
|
_durationTimer = null;
|
|
@@ -1245,6 +1843,7 @@ function cleanup() {
|
|
|
1245
1843
|
}
|
|
1246
1844
|
_mediaStream = null;
|
|
1247
1845
|
}
|
|
1846
|
+
stopTrackedMicStreams();
|
|
1248
1847
|
_stopResponsesRecognition();
|
|
1249
1848
|
if (_responsesAbortController) {
|
|
1250
1849
|
try { _responsesAbortController.abort(); } catch { /* ignore */ }
|
|
@@ -1266,4 +1865,9 @@ function cleanup() {
|
|
|
1266
1865
|
_awaitingToolCompletionAck = false;
|
|
1267
1866
|
_assistantRespondedAfterTool = false;
|
|
1268
1867
|
_clearToolCompletionAckTimer();
|
|
1868
|
+
if (_autoBargeInTimer) {
|
|
1869
|
+
clearTimeout(_autoBargeInTimer);
|
|
1870
|
+
_autoBargeInTimer = null;
|
|
1871
|
+
}
|
|
1872
|
+
_lastAutoBargeInAt = 0;
|
|
1269
1873
|
}
|