@craftedxp/voice-js 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CONSUMING.md +1 -1
- package/README.md +8 -7
- package/dist/browser.d.mts +20 -4
- package/dist/browser.d.ts +334 -250
- package/dist/browser.js +818 -541
- package/dist/browser.js.map +1 -1
- package/dist/browser.mjs +278 -9
- package/dist/browser.mjs.map +1 -1
- package/dist/embed.iife.js +1094 -4
- package/dist/node.d.mts +20 -4
- package/dist/node.d.ts +324 -247
- package/dist/node.js +480 -369
- package/dist/node.js.map +1 -1
- package/dist/node.mjs +103 -6
- package/dist/node.mjs.map +1 -1
- package/package.json +1 -1
package/dist/browser.js
CHANGED
|
@@ -1,24 +1,26 @@
|
|
|
1
|
-
|
|
2
|
-
var __defProp = Object.defineProperty
|
|
3
|
-
var __getOwnPropDesc = Object.getOwnPropertyDescriptor
|
|
4
|
-
var __getOwnPropNames = Object.getOwnPropertyNames
|
|
5
|
-
var __hasOwnProp = Object.prototype.hasOwnProperty
|
|
1
|
+
'use strict'
|
|
2
|
+
var __defProp = Object.defineProperty
|
|
3
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor
|
|
4
|
+
var __getOwnPropNames = Object.getOwnPropertyNames
|
|
5
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty
|
|
6
6
|
var __export = (target, all) => {
|
|
7
|
-
for (var name in all)
|
|
8
|
-
|
|
9
|
-
};
|
|
7
|
+
for (var name in all) __defProp(target, name, { get: all[name], enumerable: true })
|
|
8
|
+
}
|
|
10
9
|
var __copyProps = (to, from, except, desc) => {
|
|
11
|
-
if (from && typeof from ===
|
|
10
|
+
if ((from && typeof from === 'object') || typeof from === 'function') {
|
|
12
11
|
for (let key of __getOwnPropNames(from))
|
|
13
12
|
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
14
|
-
__defProp(to, key, {
|
|
13
|
+
__defProp(to, key, {
|
|
14
|
+
get: () => from[key],
|
|
15
|
+
enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable,
|
|
16
|
+
})
|
|
15
17
|
}
|
|
16
|
-
return to
|
|
17
|
-
}
|
|
18
|
-
var __toCommonJS = (mod) => __copyProps(__defProp({},
|
|
18
|
+
return to
|
|
19
|
+
}
|
|
20
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, '__esModule', { value: true }), mod)
|
|
19
21
|
|
|
20
22
|
// src/browser.ts
|
|
21
|
-
var browser_exports = {}
|
|
23
|
+
var browser_exports = {}
|
|
22
24
|
__export(browser_exports, {
|
|
23
25
|
buildWsUrl: () => buildWsUrl,
|
|
24
26
|
configureVoiceClient: () => configureVoiceClient,
|
|
@@ -26,57 +28,64 @@ __export(browser_exports, {
|
|
|
26
28
|
createAudioPlayback: () => createAudioPlayback,
|
|
27
29
|
createProtocolState: () => createProtocolState,
|
|
28
30
|
createReconnectingWebSocket: () => createReconnectingWebSocket,
|
|
29
|
-
handleServerMessage: () => handleServerMessage
|
|
30
|
-
})
|
|
31
|
-
module.exports = __toCommonJS(browser_exports)
|
|
31
|
+
handleServerMessage: () => handleServerMessage,
|
|
32
|
+
})
|
|
33
|
+
module.exports = __toCommonJS(browser_exports)
|
|
32
34
|
|
|
33
35
|
// src/config.ts
|
|
34
36
|
function normalizeConfig(config) {
|
|
35
|
-
if (!config) throw new Error(
|
|
36
|
-
if (
|
|
37
|
+
if (!config) throw new Error('configureVoiceClient: config is required')
|
|
38
|
+
if ('apiKey' in config) {
|
|
37
39
|
throw new Error(
|
|
38
|
-
|
|
39
|
-
)
|
|
40
|
+
'configureVoiceClient: `apiKey` is no longer supported. Embedding sk_ in JS code ships server-grade credentials to every client. Pass `fetchToken: async ({ agentId }) => { /* call YOUR backend mint */ }` instead \u2014 see the @craftedxp/voice-js README for the migration recipe.',
|
|
41
|
+
)
|
|
40
42
|
}
|
|
41
43
|
if (!config.apiBase) {
|
|
42
|
-
throw new Error(
|
|
44
|
+
throw new Error('configureVoiceClient: apiBase is required')
|
|
43
45
|
}
|
|
44
|
-
if (typeof config.fetchToken !==
|
|
45
|
-
throw new Error(
|
|
46
|
+
if (typeof config.fetchToken !== 'function') {
|
|
47
|
+
throw new Error('configureVoiceClient: fetchToken must be a function')
|
|
46
48
|
}
|
|
47
49
|
return {
|
|
48
50
|
...config,
|
|
49
|
-
apiBase: config.apiBase.replace(/\/+$/,
|
|
50
|
-
}
|
|
51
|
+
apiBase: config.apiBase.replace(/\/+$/, ''),
|
|
52
|
+
}
|
|
51
53
|
}
|
|
52
54
|
function mergeStartCallContext(factory, call) {
|
|
53
|
-
const context =
|
|
54
|
-
|
|
55
|
-
|
|
55
|
+
const context =
|
|
56
|
+
factory.defaultContext || call.context
|
|
57
|
+
? { ...(factory.defaultContext ?? {}), ...(call.context ?? {}) }
|
|
58
|
+
: void 0
|
|
59
|
+
const metadata =
|
|
60
|
+
factory.defaultMetadata || call.metadata
|
|
61
|
+
? { ...(factory.defaultMetadata ?? {}), ...(call.metadata ?? {}) }
|
|
62
|
+
: void 0
|
|
63
|
+
return { context, metadata }
|
|
56
64
|
}
|
|
57
65
|
|
|
58
66
|
// src/worklets/mic-downsampler.worklet.js
|
|
59
|
-
var mic_downsampler_worklet_default =
|
|
67
|
+
var mic_downsampler_worklet_default =
|
|
68
|
+
"// AudioWorklet \u2014 runs off the main thread in the audio rendering graph.\n//\n// Input: Float32 samples at the AudioContext's native sampleRate (typically\n// 48000 Hz on desktop, 44100 Hz on some iOS devices).\n// Output: 16 kHz mono Int16 PCM, shipped to the main thread via\n// `port.postMessage(ArrayBuffer, [ArrayBuffer])` (transferred, not copied).\n//\n// Why AudioWorklet instead of ScriptProcessorNode: ScriptProcessorNode is\n// deprecated + main-thread-bound, so any JS jank produces audible audio\n// glitches. AudioWorklet's `process()` runs on the audio rendering thread\n// at the graph's block cadence (128 frames by default) and backpressures\n// via returning `true` / `false`.\n//\n// This file is loaded as text (see tsup.config.ts loader) and registered\n// at runtime via `audioWorklet.addModule(blobUrl)`.\n\nclass MicDownsampler extends AudioWorkletProcessor {\n constructor() {\n super()\n // Target sample rate for STT. Matches Deepgram Nova-3 + the platform's\n // server-side SAMPLE_RATE constant in AgentCallHandler.\n this.targetRate = 16000\n // Accumulator for the downsample. We collect incoming samples and emit\n // an Int16 chunk when we've accumulated ~1024 target-rate samples\n // (~64 ms at 16 kHz) \u2014 matches the mobile SDK's chunk size so both\n // platforms have the same server-side framing.\n this.outputFrames = 1024\n this.acc = []\n // Running index used for fractional resampling.\n this.readCursor = 0\n }\n\n // `inputs[0][0]` = first channel of first input. 128 Float32 samples per\n // call at the context's sampleRate. Return true = keep processing.\n process(inputs) {\n const input = inputs[0]\n if (!input || input.length === 0) return true\n const channel = input[0]\n if (!channel || channel.length === 0) return true\n\n const ctxRate = sampleRate // global inside AudioWorkletProcessor\n const ratio = ctxRate / this.targetRate\n\n // Simple linear-interp downsample. 
For 48000 \u2192 16000 that's 3:1, which\n // linear handles fine for voice. Anti-alias filtering would be\n // theoretically better but inaudible for speech.\n for (let i = 0; i < channel.length; i++) {\n this.acc.push(channel[i])\n }\n\n while (this.acc.length - this.readCursor >= ratio * this.outputFrames) {\n const out = new Int16Array(this.outputFrames)\n let readIdx = this.readCursor\n for (let i = 0; i < this.outputFrames; i++) {\n // Linear interp between floor(readIdx) and ceil(readIdx)\n const low = Math.floor(readIdx)\n const high = Math.min(low + 1, this.acc.length - 1)\n const frac = readIdx - low\n const sample = this.acc[low] * (1 - frac) + this.acc[high] * frac\n // Clip + convert to int16\n const clipped = Math.max(-1, Math.min(1, sample))\n out[i] = clipped < 0 ? clipped * 0x8000 : clipped * 0x7fff\n readIdx += ratio\n }\n // Transfer the ArrayBuffer (zero-copy) to the main thread.\n this.port.postMessage(out.buffer, [out.buffer])\n this.readCursor = readIdx\n }\n\n // Garbage-collect the consumed portion of `acc` every so often so it\n // doesn't grow without bound. Leave ~one chunk of headroom.\n if (this.readCursor > ratio * this.outputFrames) {\n this.acc = this.acc.slice(Math.floor(this.readCursor))\n this.readCursor -= Math.floor(this.readCursor)\n }\n\n return true\n }\n}\n\nregisterProcessor('mic-downsampler', MicDownsampler)\n"
|
|
60
69
|
|
|
61
70
|
// src/AudioCapture.ts
|
|
62
|
-
var VOLUME_INTERVAL_MS = 100
|
|
71
|
+
var VOLUME_INTERVAL_MS = 100
|
|
63
72
|
var createAudioCapture = (options) => {
|
|
64
|
-
let audioContext = null
|
|
65
|
-
let mediaStream = null
|
|
66
|
-
let sourceNode = null
|
|
67
|
-
let workletNode = null
|
|
68
|
-
let analyser = null
|
|
69
|
-
let volumeTimer = null
|
|
70
|
-
let muted = false
|
|
71
|
-
let capturing = false
|
|
73
|
+
let audioContext = null
|
|
74
|
+
let mediaStream = null
|
|
75
|
+
let sourceNode = null
|
|
76
|
+
let workletNode = null
|
|
77
|
+
let analyser = null
|
|
78
|
+
let volumeTimer = null
|
|
79
|
+
let muted = false
|
|
80
|
+
let capturing = false
|
|
72
81
|
const computeRms = (buf) => {
|
|
73
|
-
let sum = 0
|
|
74
|
-
for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i]
|
|
75
|
-
const rms = Math.sqrt(sum / buf.length)
|
|
76
|
-
return Math.min(1, rms * 1.8)
|
|
77
|
-
}
|
|
82
|
+
let sum = 0
|
|
83
|
+
for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i]
|
|
84
|
+
const rms = Math.sqrt(sum / buf.length)
|
|
85
|
+
return Math.min(1, rms * 1.8)
|
|
86
|
+
}
|
|
78
87
|
const start = async () => {
|
|
79
|
-
if (capturing) return
|
|
88
|
+
if (capturing) return
|
|
80
89
|
try {
|
|
81
90
|
mediaStream = await navigator.mediaDevices.getUserMedia({
|
|
82
91
|
audio: {
|
|
@@ -87,658 +96,748 @@ var createAudioCapture = (options) => {
|
|
|
87
96
|
echoCancellation: true,
|
|
88
97
|
noiseSuppression: true,
|
|
89
98
|
autoGainControl: true,
|
|
90
|
-
channelCount: 1
|
|
91
|
-
}
|
|
92
|
-
})
|
|
93
|
-
audioContext = new AudioContext()
|
|
94
|
-
if (audioContext.state ===
|
|
95
|
-
const blob = new Blob([mic_downsampler_worklet_default], { type:
|
|
96
|
-
const url = URL.createObjectURL(blob)
|
|
99
|
+
channelCount: 1,
|
|
100
|
+
},
|
|
101
|
+
})
|
|
102
|
+
audioContext = new AudioContext()
|
|
103
|
+
if (audioContext.state === 'suspended') await audioContext.resume()
|
|
104
|
+
const blob = new Blob([mic_downsampler_worklet_default], { type: 'application/javascript' })
|
|
105
|
+
const url = URL.createObjectURL(blob)
|
|
97
106
|
try {
|
|
98
|
-
await audioContext.audioWorklet.addModule(url)
|
|
107
|
+
await audioContext.audioWorklet.addModule(url)
|
|
99
108
|
} finally {
|
|
100
|
-
URL.revokeObjectURL(url)
|
|
109
|
+
URL.revokeObjectURL(url)
|
|
101
110
|
}
|
|
102
|
-
sourceNode = audioContext.createMediaStreamSource(mediaStream)
|
|
103
|
-
workletNode = new AudioWorkletNode(audioContext,
|
|
111
|
+
sourceNode = audioContext.createMediaStreamSource(mediaStream)
|
|
112
|
+
workletNode = new AudioWorkletNode(audioContext, 'mic-downsampler')
|
|
104
113
|
workletNode.port.onmessage = (event) => {
|
|
105
|
-
if (muted) return
|
|
106
|
-
options.onChunk(event.data)
|
|
107
|
-
}
|
|
114
|
+
if (muted) return
|
|
115
|
+
options.onChunk(event.data)
|
|
116
|
+
}
|
|
108
117
|
if (options.onVolume) {
|
|
109
|
-
analyser = audioContext.createAnalyser()
|
|
110
|
-
analyser.fftSize = 256
|
|
111
|
-
sourceNode.connect(analyser)
|
|
112
|
-
const buf = new Float32Array(analyser.fftSize)
|
|
118
|
+
analyser = audioContext.createAnalyser()
|
|
119
|
+
analyser.fftSize = 256
|
|
120
|
+
sourceNode.connect(analyser)
|
|
121
|
+
const buf = new Float32Array(analyser.fftSize)
|
|
113
122
|
volumeTimer = setInterval(() => {
|
|
114
|
-
if (!analyser) return
|
|
115
|
-
analyser.getFloatTimeDomainData(buf)
|
|
116
|
-
options.onVolume?.(computeRms(buf))
|
|
117
|
-
}, VOLUME_INTERVAL_MS)
|
|
123
|
+
if (!analyser) return
|
|
124
|
+
analyser.getFloatTimeDomainData(buf)
|
|
125
|
+
options.onVolume?.(computeRms(buf))
|
|
126
|
+
}, VOLUME_INTERVAL_MS)
|
|
118
127
|
}
|
|
119
|
-
sourceNode.connect(workletNode)
|
|
120
|
-
const sink = audioContext.createGain()
|
|
121
|
-
sink.gain.value = 0
|
|
122
|
-
workletNode.connect(sink).connect(audioContext.destination)
|
|
123
|
-
capturing = true
|
|
128
|
+
sourceNode.connect(workletNode)
|
|
129
|
+
const sink = audioContext.createGain()
|
|
130
|
+
sink.gain.value = 0
|
|
131
|
+
workletNode.connect(sink).connect(audioContext.destination)
|
|
132
|
+
capturing = true
|
|
124
133
|
} catch (err) {
|
|
125
|
-
const wrapped =
|
|
126
|
-
|
|
127
|
-
|
|
134
|
+
const wrapped =
|
|
135
|
+
err instanceof Error ? err : new Error(typeof err === 'string' ? err : 'capture failed')
|
|
136
|
+
options.onError?.(wrapped)
|
|
137
|
+
throw wrapped
|
|
128
138
|
}
|
|
129
|
-
}
|
|
139
|
+
}
|
|
130
140
|
const stop = () => {
|
|
131
|
-
if (!capturing) return
|
|
132
|
-
capturing = false
|
|
141
|
+
if (!capturing) return
|
|
142
|
+
capturing = false
|
|
133
143
|
if (volumeTimer) {
|
|
134
|
-
clearInterval(volumeTimer)
|
|
135
|
-
volumeTimer = null
|
|
144
|
+
clearInterval(volumeTimer)
|
|
145
|
+
volumeTimer = null
|
|
136
146
|
}
|
|
137
147
|
try {
|
|
138
|
-
workletNode?.disconnect()
|
|
139
|
-
analyser?.disconnect()
|
|
140
|
-
sourceNode?.disconnect()
|
|
141
|
-
} catch {
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
sourceNode = null;
|
|
148
|
+
workletNode?.disconnect()
|
|
149
|
+
analyser?.disconnect()
|
|
150
|
+
sourceNode?.disconnect()
|
|
151
|
+
} catch {}
|
|
152
|
+
workletNode = null
|
|
153
|
+
analyser = null
|
|
154
|
+
sourceNode = null
|
|
146
155
|
if (mediaStream) {
|
|
147
|
-
for (const track of mediaStream.getTracks()) track.stop()
|
|
148
|
-
mediaStream = null
|
|
156
|
+
for (const track of mediaStream.getTracks()) track.stop()
|
|
157
|
+
mediaStream = null
|
|
149
158
|
}
|
|
150
|
-
if (audioContext && audioContext.state !==
|
|
151
|
-
void audioContext.close().catch(() => void 0)
|
|
159
|
+
if (audioContext && audioContext.state !== 'closed') {
|
|
160
|
+
void audioContext.close().catch(() => void 0)
|
|
152
161
|
}
|
|
153
|
-
audioContext = null
|
|
154
|
-
}
|
|
162
|
+
audioContext = null
|
|
163
|
+
}
|
|
155
164
|
return {
|
|
156
165
|
start,
|
|
157
166
|
stop,
|
|
158
167
|
mute: (v) => {
|
|
159
|
-
muted = v
|
|
168
|
+
muted = v
|
|
160
169
|
},
|
|
161
|
-
isCapturing: () => capturing
|
|
162
|
-
}
|
|
163
|
-
}
|
|
170
|
+
isCapturing: () => capturing,
|
|
171
|
+
}
|
|
172
|
+
}
|
|
164
173
|
|
|
165
174
|
// src/AudioPlayback.ts
|
|
166
|
-
var DEFAULT_SAMPLE_RATE = 16e3
|
|
167
|
-
var VOLUME_INTERVAL_MS2 = 100
|
|
175
|
+
var DEFAULT_SAMPLE_RATE = 16e3
|
|
176
|
+
var VOLUME_INTERVAL_MS2 = 100
|
|
168
177
|
var createAudioPlayback = (options = {}) => {
|
|
169
|
-
const sampleRate = options.sampleRate ?? DEFAULT_SAMPLE_RATE
|
|
170
|
-
let audioContext = null
|
|
171
|
-
let gainNode = null
|
|
172
|
-
let analyser = null
|
|
173
|
-
let volumeTimer = null
|
|
174
|
-
let nextStartTime = 0
|
|
175
|
-
let scheduledNodes = []
|
|
176
|
-
let speaking = false
|
|
178
|
+
const sampleRate = options.sampleRate ?? DEFAULT_SAMPLE_RATE
|
|
179
|
+
let audioContext = null
|
|
180
|
+
let gainNode = null
|
|
181
|
+
let analyser = null
|
|
182
|
+
let volumeTimer = null
|
|
183
|
+
let nextStartTime = 0
|
|
184
|
+
let scheduledNodes = []
|
|
185
|
+
let speaking = false
|
|
177
186
|
const ensureContext = async () => {
|
|
178
187
|
if (audioContext) {
|
|
179
|
-
if (audioContext.state ===
|
|
180
|
-
return
|
|
188
|
+
if (audioContext.state === 'suspended') await audioContext.resume()
|
|
189
|
+
return
|
|
181
190
|
}
|
|
182
|
-
audioContext = new AudioContext({ sampleRate })
|
|
183
|
-
gainNode = audioContext.createGain()
|
|
191
|
+
audioContext = new AudioContext({ sampleRate })
|
|
192
|
+
gainNode = audioContext.createGain()
|
|
184
193
|
if (options.onVolume) {
|
|
185
|
-
analyser = audioContext.createAnalyser()
|
|
186
|
-
analyser.fftSize = 256
|
|
187
|
-
gainNode.connect(analyser)
|
|
188
|
-
const buf = new Float32Array(analyser.fftSize)
|
|
194
|
+
analyser = audioContext.createAnalyser()
|
|
195
|
+
analyser.fftSize = 256
|
|
196
|
+
gainNode.connect(analyser)
|
|
197
|
+
const buf = new Float32Array(analyser.fftSize)
|
|
189
198
|
volumeTimer = setInterval(() => {
|
|
190
|
-
if (!analyser) return
|
|
191
|
-
analyser.getFloatTimeDomainData(buf)
|
|
192
|
-
let sum = 0
|
|
193
|
-
for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i]
|
|
194
|
-
const rms = Math.sqrt(sum / buf.length)
|
|
195
|
-
options.onVolume?.(Math.min(1, rms * 1.8))
|
|
196
|
-
}, VOLUME_INTERVAL_MS2)
|
|
197
|
-
}
|
|
198
|
-
gainNode.connect(audioContext.destination)
|
|
199
|
-
nextStartTime = audioContext.currentTime
|
|
200
|
-
}
|
|
199
|
+
if (!analyser) return
|
|
200
|
+
analyser.getFloatTimeDomainData(buf)
|
|
201
|
+
let sum = 0
|
|
202
|
+
for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i]
|
|
203
|
+
const rms = Math.sqrt(sum / buf.length)
|
|
204
|
+
options.onVolume?.(Math.min(1, rms * 1.8))
|
|
205
|
+
}, VOLUME_INTERVAL_MS2)
|
|
206
|
+
}
|
|
207
|
+
gainNode.connect(audioContext.destination)
|
|
208
|
+
nextStartTime = audioContext.currentTime
|
|
209
|
+
}
|
|
201
210
|
const setSpeaking = (v) => {
|
|
202
|
-
if (v === speaking) return
|
|
203
|
-
speaking = v
|
|
204
|
-
options.onSpeakingChange?.(v)
|
|
205
|
-
}
|
|
211
|
+
if (v === speaking) return
|
|
212
|
+
speaking = v
|
|
213
|
+
options.onSpeakingChange?.(v)
|
|
214
|
+
}
|
|
206
215
|
const pruneFinished = () => {
|
|
207
|
-
const now = audioContext?.currentTime ?? 0
|
|
216
|
+
const now = audioContext?.currentTime ?? 0
|
|
208
217
|
scheduledNodes = scheduledNodes.filter((n) => {
|
|
209
|
-
const node = n
|
|
210
|
-
return (node._endsAt ?? 0) > now
|
|
211
|
-
})
|
|
212
|
-
if (scheduledNodes.length === 0) setSpeaking(false)
|
|
213
|
-
}
|
|
218
|
+
const node = n
|
|
219
|
+
return (node._endsAt ?? 0) > now
|
|
220
|
+
})
|
|
221
|
+
if (scheduledNodes.length === 0) setSpeaking(false)
|
|
222
|
+
}
|
|
214
223
|
const enqueue = (pcm) => {
|
|
215
224
|
if (!audioContext) {
|
|
216
|
-
void ensureContext().then(() => enqueue(pcm))
|
|
217
|
-
return
|
|
218
|
-
}
|
|
219
|
-
if (!audioContext || !gainNode) return
|
|
220
|
-
const int16 = new Int16Array(pcm)
|
|
221
|
-
if (int16.length === 0) return
|
|
222
|
-
const audioBuffer = audioContext.createBuffer(1, int16.length, sampleRate)
|
|
223
|
-
const float32 = audioBuffer.getChannelData(0)
|
|
225
|
+
void ensureContext().then(() => enqueue(pcm))
|
|
226
|
+
return
|
|
227
|
+
}
|
|
228
|
+
if (!audioContext || !gainNode) return
|
|
229
|
+
const int16 = new Int16Array(pcm)
|
|
230
|
+
if (int16.length === 0) return
|
|
231
|
+
const audioBuffer = audioContext.createBuffer(1, int16.length, sampleRate)
|
|
232
|
+
const float32 = audioBuffer.getChannelData(0)
|
|
224
233
|
for (let i = 0; i < int16.length; i++) {
|
|
225
|
-
float32[i] = int16[i] / 32768
|
|
226
|
-
}
|
|
227
|
-
const node = audioContext.createBufferSource()
|
|
228
|
-
node.buffer = audioBuffer
|
|
229
|
-
node.connect(gainNode)
|
|
230
|
-
const now = audioContext.currentTime
|
|
231
|
-
const startAt = Math.max(now, nextStartTime)
|
|
232
|
-
node.start(startAt)
|
|
233
|
-
const duration = int16.length / sampleRate
|
|
234
|
-
node._endsAt = startAt + duration
|
|
235
|
-
nextStartTime = startAt + duration
|
|
236
|
-
scheduledNodes.push(node)
|
|
237
|
-
setSpeaking(true)
|
|
238
|
-
node.onended = () => pruneFinished()
|
|
239
|
-
}
|
|
234
|
+
float32[i] = int16[i] / 32768
|
|
235
|
+
}
|
|
236
|
+
const node = audioContext.createBufferSource()
|
|
237
|
+
node.buffer = audioBuffer
|
|
238
|
+
node.connect(gainNode)
|
|
239
|
+
const now = audioContext.currentTime
|
|
240
|
+
const startAt = Math.max(now, nextStartTime)
|
|
241
|
+
node.start(startAt)
|
|
242
|
+
const duration = int16.length / sampleRate
|
|
243
|
+
node._endsAt = startAt + duration
|
|
244
|
+
nextStartTime = startAt + duration
|
|
245
|
+
scheduledNodes.push(node)
|
|
246
|
+
setSpeaking(true)
|
|
247
|
+
node.onended = () => pruneFinished()
|
|
248
|
+
}
|
|
240
249
|
const flush = () => {
|
|
241
|
-
if (!audioContext || !gainNode) return
|
|
250
|
+
if (!audioContext || !gainNode) return
|
|
242
251
|
for (const node of scheduledNodes) {
|
|
243
252
|
try {
|
|
244
|
-
node.stop()
|
|
245
|
-
} catch {
|
|
246
|
-
}
|
|
253
|
+
node.stop()
|
|
254
|
+
} catch {}
|
|
247
255
|
}
|
|
248
|
-
scheduledNodes = []
|
|
249
|
-
gainNode.disconnect()
|
|
250
|
-
gainNode = audioContext.createGain()
|
|
256
|
+
scheduledNodes = []
|
|
257
|
+
gainNode.disconnect()
|
|
258
|
+
gainNode = audioContext.createGain()
|
|
251
259
|
if (analyser) {
|
|
252
|
-
analyser.disconnect()
|
|
253
|
-
gainNode.connect(analyser)
|
|
260
|
+
analyser.disconnect()
|
|
261
|
+
gainNode.connect(analyser)
|
|
254
262
|
}
|
|
255
|
-
gainNode.connect(audioContext.destination)
|
|
256
|
-
nextStartTime = audioContext.currentTime
|
|
257
|
-
setSpeaking(false)
|
|
258
|
-
}
|
|
263
|
+
gainNode.connect(audioContext.destination)
|
|
264
|
+
nextStartTime = audioContext.currentTime
|
|
265
|
+
setSpeaking(false)
|
|
266
|
+
}
|
|
259
267
|
const close = () => {
|
|
260
|
-
flush()
|
|
268
|
+
flush()
|
|
261
269
|
if (volumeTimer) {
|
|
262
|
-
clearInterval(volumeTimer)
|
|
263
|
-
volumeTimer = null
|
|
270
|
+
clearInterval(volumeTimer)
|
|
271
|
+
volumeTimer = null
|
|
264
272
|
}
|
|
265
|
-
if (audioContext && audioContext.state !==
|
|
266
|
-
void audioContext.close().catch(() => void 0)
|
|
273
|
+
if (audioContext && audioContext.state !== 'closed') {
|
|
274
|
+
void audioContext.close().catch(() => void 0)
|
|
267
275
|
}
|
|
268
|
-
audioContext = null
|
|
269
|
-
gainNode = null
|
|
270
|
-
analyser = null
|
|
271
|
-
}
|
|
276
|
+
audioContext = null
|
|
277
|
+
gainNode = null
|
|
278
|
+
analyser = null
|
|
279
|
+
}
|
|
272
280
|
const resume = async () => {
|
|
273
|
-
await ensureContext()
|
|
274
|
-
}
|
|
275
|
-
return { enqueue, flush, close, resume }
|
|
276
|
-
}
|
|
281
|
+
await ensureContext()
|
|
282
|
+
}
|
|
283
|
+
return { enqueue, flush, close, resume }
|
|
284
|
+
}
|
|
277
285
|
|
|
278
286
|
// src/ReconnectingWebSocket.ts
|
|
279
|
-
var READYSTATE_OPEN = 1
|
|
280
|
-
var READYSTATE_CLOSED = 3
|
|
287
|
+
var READYSTATE_OPEN = 1
|
|
288
|
+
var READYSTATE_CLOSED = 3
|
|
281
289
|
var createReconnectingWebSocket = (options, onEvent) => {
|
|
282
|
-
const maxRetries = options.maxRetries ?? 3
|
|
283
|
-
const initialBackoff = options.initialBackoffMs ?? 500
|
|
284
|
-
const maxBackoff = options.maxBackoffMs ?? 8e3
|
|
285
|
-
let ws = null
|
|
286
|
-
let intentionalClose = false
|
|
287
|
-
let retries = 0
|
|
288
|
-
let backoff = initialBackoff
|
|
289
|
-
let reconnectTimer = null
|
|
290
|
+
const maxRetries = options.maxRetries ?? 3
|
|
291
|
+
const initialBackoff = options.initialBackoffMs ?? 500
|
|
292
|
+
const maxBackoff = options.maxBackoffMs ?? 8e3
|
|
293
|
+
let ws = null
|
|
294
|
+
let intentionalClose = false
|
|
295
|
+
let retries = 0
|
|
296
|
+
let backoff = initialBackoff
|
|
297
|
+
let reconnectTimer = null
|
|
290
298
|
const openOnce = () => {
|
|
291
|
-
ws = options.wsFactory(options.url)
|
|
292
|
-
ws.binaryType =
|
|
299
|
+
ws = options.wsFactory(options.url)
|
|
300
|
+
ws.binaryType = 'arraybuffer'
|
|
293
301
|
ws.onopen = () => {
|
|
294
|
-
if (retries === 0) onEvent({ type:
|
|
295
|
-
else onEvent({ type:
|
|
296
|
-
retries = 0
|
|
297
|
-
backoff = initialBackoff
|
|
298
|
-
}
|
|
302
|
+
if (retries === 0) onEvent({ type: 'open' })
|
|
303
|
+
else onEvent({ type: 'reconnected' })
|
|
304
|
+
retries = 0
|
|
305
|
+
backoff = initialBackoff
|
|
306
|
+
}
|
|
299
307
|
ws.onmessage = (ev) => {
|
|
300
|
-
onEvent({ type:
|
|
301
|
-
}
|
|
308
|
+
onEvent({ type: 'message', data: ev.data })
|
|
309
|
+
}
|
|
302
310
|
ws.onerror = () => {
|
|
303
|
-
onEvent({ type:
|
|
304
|
-
}
|
|
311
|
+
onEvent({ type: 'error', error: new Error('WebSocket error') })
|
|
312
|
+
}
|
|
305
313
|
ws.onclose = (ev) => {
|
|
306
|
-
ws = null
|
|
307
|
-
const shouldRetry = !intentionalClose && retries < maxRetries
|
|
314
|
+
ws = null
|
|
315
|
+
const shouldRetry = !intentionalClose && retries < maxRetries
|
|
308
316
|
if (!shouldRetry) {
|
|
309
317
|
onEvent({
|
|
310
|
-
type:
|
|
318
|
+
type: 'close',
|
|
311
319
|
code: ev.code,
|
|
312
320
|
reason: ev.reason,
|
|
313
|
-
permanent: true
|
|
314
|
-
})
|
|
315
|
-
return
|
|
321
|
+
permanent: true,
|
|
322
|
+
})
|
|
323
|
+
return
|
|
316
324
|
}
|
|
317
325
|
onEvent({
|
|
318
|
-
type:
|
|
326
|
+
type: 'close',
|
|
319
327
|
code: ev.code,
|
|
320
328
|
reason: ev.reason,
|
|
321
|
-
permanent: false
|
|
322
|
-
})
|
|
323
|
-
retries
|
|
324
|
-
const delay = Math.min(backoff, maxBackoff)
|
|
325
|
-
backoff = Math.min(backoff * 2, maxBackoff)
|
|
326
|
-
reconnectTimer = setTimeout(openOnce, delay)
|
|
327
|
-
}
|
|
328
|
-
}
|
|
329
|
-
openOnce()
|
|
329
|
+
permanent: false,
|
|
330
|
+
})
|
|
331
|
+
retries++
|
|
332
|
+
const delay = Math.min(backoff, maxBackoff)
|
|
333
|
+
backoff = Math.min(backoff * 2, maxBackoff)
|
|
334
|
+
reconnectTimer = setTimeout(openOnce, delay)
|
|
335
|
+
}
|
|
336
|
+
}
|
|
337
|
+
openOnce()
|
|
330
338
|
return {
|
|
331
339
|
send: (data) => {
|
|
332
|
-
if (ws && ws.readyState === READYSTATE_OPEN) ws.send(data)
|
|
340
|
+
if (ws && ws.readyState === READYSTATE_OPEN) ws.send(data)
|
|
333
341
|
},
|
|
334
|
-
close: (code = 1e3, reason =
|
|
335
|
-
intentionalClose = true
|
|
342
|
+
close: (code = 1e3, reason = 'client-requested') => {
|
|
343
|
+
intentionalClose = true
|
|
336
344
|
if (reconnectTimer) {
|
|
337
|
-
clearTimeout(reconnectTimer)
|
|
338
|
-
reconnectTimer = null
|
|
345
|
+
clearTimeout(reconnectTimer)
|
|
346
|
+
reconnectTimer = null
|
|
339
347
|
}
|
|
340
348
|
try {
|
|
341
|
-
ws?.close(code, reason)
|
|
342
|
-
} catch {
|
|
343
|
-
}
|
|
349
|
+
ws?.close(code, reason)
|
|
350
|
+
} catch {}
|
|
344
351
|
},
|
|
345
|
-
readyState: () => ws?.readyState ?? READYSTATE_CLOSED
|
|
346
|
-
}
|
|
347
|
-
}
|
|
352
|
+
readyState: () => ws?.readyState ?? READYSTATE_CLOSED,
|
|
353
|
+
}
|
|
354
|
+
}
|
|
348
355
|
|
|
349
356
|
// src/protocol.ts
|
|
350
357
|
var createProtocolState = () => ({
|
|
351
|
-
state:
|
|
358
|
+
state: 'idle',
|
|
352
359
|
transcript: [],
|
|
353
360
|
agentBubbleId: null,
|
|
354
361
|
idCounter: 0,
|
|
355
|
-
endReason: null
|
|
356
|
-
})
|
|
362
|
+
endReason: null,
|
|
363
|
+
})
|
|
357
364
|
var mapEndReason = (raw) => {
|
|
358
|
-
if (raw ===
|
|
359
|
-
if (raw ===
|
|
360
|
-
if (raw ===
|
|
361
|
-
return
|
|
362
|
-
}
|
|
365
|
+
if (raw === 'agent_ended') return 'agent_ended'
|
|
366
|
+
if (raw === 'caller_hung_up') return 'user_hangup'
|
|
367
|
+
if (raw === 'silence_timeout' || raw === 'max_duration') return 'timeout'
|
|
368
|
+
return 'error'
|
|
369
|
+
}
|
|
363
370
|
function handleServerMessage(raw, state, cb) {
|
|
364
|
-
let msg
|
|
371
|
+
let msg
|
|
365
372
|
try {
|
|
366
|
-
msg = JSON.parse(raw)
|
|
373
|
+
msg = JSON.parse(raw)
|
|
367
374
|
} catch {
|
|
368
|
-
return
|
|
375
|
+
return
|
|
369
376
|
}
|
|
370
377
|
switch (msg.type) {
|
|
371
|
-
case
|
|
372
|
-
cb.onConnected()
|
|
373
|
-
setState(state,
|
|
374
|
-
return
|
|
375
|
-
case
|
|
376
|
-
const text = msg.text ??
|
|
377
|
-
if (!text) return
|
|
378
|
-
const isFinal = !!msg.isFinal
|
|
379
|
-
if (!isFinal) setState(state,
|
|
380
|
-
upsertUserPartial(state, text, isFinal)
|
|
381
|
-
cb.onTranscript(state.transcript)
|
|
382
|
-
return
|
|
383
|
-
}
|
|
384
|
-
case
|
|
385
|
-
const id = `m${state.idCounter++}
|
|
386
|
-
state.agentBubbleId = id
|
|
387
|
-
state.transcript = [...state.transcript, { id, role:
|
|
388
|
-
cb.onTranscript(state.transcript)
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
cb
|
|
409
|
-
return
|
|
410
|
-
|
|
411
|
-
|
|
378
|
+
case 'connected':
|
|
379
|
+
cb.onConnected()
|
|
380
|
+
setState(state, 'listening', cb)
|
|
381
|
+
return
|
|
382
|
+
case 'transcript': {
|
|
383
|
+
const text = msg.text ?? ''
|
|
384
|
+
if (!text) return
|
|
385
|
+
const isFinal = !!msg.isFinal
|
|
386
|
+
if (!isFinal) setState(state, 'user_speaking', cb)
|
|
387
|
+
upsertUserPartial(state, text, isFinal)
|
|
388
|
+
cb.onTranscript(state.transcript)
|
|
389
|
+
return
|
|
390
|
+
}
|
|
391
|
+
case 'agent_turn_start': {
|
|
392
|
+
const id = `m${state.idCounter++}`
|
|
393
|
+
state.agentBubbleId = id
|
|
394
|
+
state.transcript = [...state.transcript, { id, role: 'agent', text: '' }]
|
|
395
|
+
cb.onTranscript(state.transcript)
|
|
396
|
+
const seq = typeof msg.seq === 'number' ? msg.seq : void 0
|
|
397
|
+
cb.onAgentTurnStart(seq)
|
|
398
|
+
setState(state, 'agent_speaking', cb)
|
|
399
|
+
return
|
|
400
|
+
}
|
|
401
|
+
case 'agent_text': {
|
|
402
|
+
const delta = msg.text ?? ''
|
|
403
|
+
if (!delta || !state.agentBubbleId) return
|
|
404
|
+
const id = state.agentBubbleId
|
|
405
|
+
state.transcript = state.transcript.map((e) =>
|
|
406
|
+
e.id === id && e.role === 'agent' ? { ...e, text: e.text + delta } : e,
|
|
407
|
+
)
|
|
408
|
+
cb.onTranscript(state.transcript)
|
|
409
|
+
return
|
|
410
|
+
}
|
|
411
|
+
case 'agent_turn_end': {
|
|
412
|
+
state.agentBubbleId = null
|
|
413
|
+
const seq = typeof msg.seq === 'number' ? msg.seq : void 0
|
|
414
|
+
cb.onAgentTurnEnd(seq)
|
|
415
|
+
setState(state, 'listening', cb)
|
|
416
|
+
return
|
|
417
|
+
}
|
|
418
|
+
case 'interrupt':
|
|
419
|
+
cb.onInterrupt()
|
|
420
|
+
return
|
|
421
|
+
case 'agent_turn_abort': {
|
|
422
|
+
const committed = (msg.committedText ?? '').trim()
|
|
412
423
|
if (state.agentBubbleId) {
|
|
413
|
-
const id = state.agentBubbleId
|
|
424
|
+
const id = state.agentBubbleId
|
|
414
425
|
if (committed) {
|
|
415
|
-
state.transcript = state.transcript.map(
|
|
416
|
-
|
|
417
|
-
)
|
|
426
|
+
state.transcript = state.transcript.map((e) =>
|
|
427
|
+
e.id === id && e.role === 'agent' ? { ...e, text: committed, interrupted: true } : e,
|
|
428
|
+
)
|
|
418
429
|
} else {
|
|
419
|
-
state.transcript = state.transcript.filter((e) => e.id !== id)
|
|
430
|
+
state.transcript = state.transcript.filter((e) => e.id !== id)
|
|
420
431
|
}
|
|
421
|
-
cb.onTranscript(state.transcript)
|
|
432
|
+
cb.onTranscript(state.transcript)
|
|
422
433
|
}
|
|
423
|
-
state.agentBubbleId = null
|
|
424
|
-
return
|
|
434
|
+
state.agentBubbleId = null
|
|
435
|
+
return
|
|
425
436
|
}
|
|
426
|
-
case
|
|
437
|
+
case 'tool_call':
|
|
427
438
|
state.transcript = [
|
|
428
439
|
...state.transcript,
|
|
429
440
|
{
|
|
430
441
|
id: `m${state.idCounter++}`,
|
|
431
|
-
role:
|
|
432
|
-
text: `\u2192 ${String(msg.tool ??
|
|
433
|
-
}
|
|
434
|
-
]
|
|
435
|
-
cb.onTranscript(state.transcript)
|
|
436
|
-
return
|
|
437
|
-
case
|
|
442
|
+
role: 'tool',
|
|
443
|
+
text: `\u2192 ${String(msg.tool ?? '?')}(${msg.args ? JSON.stringify(msg.args) : ''})`,
|
|
444
|
+
},
|
|
445
|
+
]
|
|
446
|
+
cb.onTranscript(state.transcript)
|
|
447
|
+
return
|
|
448
|
+
case 'tool_result':
|
|
438
449
|
state.transcript = [
|
|
439
450
|
...state.transcript,
|
|
440
451
|
{
|
|
441
452
|
id: `m${state.idCounter++}`,
|
|
442
|
-
role:
|
|
443
|
-
text: `${msg.ok ?
|
|
444
|
-
}
|
|
445
|
-
]
|
|
446
|
-
cb.onTranscript(state.transcript)
|
|
447
|
-
return
|
|
448
|
-
case
|
|
449
|
-
const toolCallId = String(msg.toolCallId ??
|
|
450
|
-
const name = String(msg.name ??
|
|
451
|
-
const args = msg.args ?? {}
|
|
452
|
-
if (!toolCallId || !name) return
|
|
453
|
-
cb.onClientToolCall({ toolCallId, name, args })
|
|
454
|
-
return
|
|
455
|
-
}
|
|
456
|
-
case
|
|
457
|
-
const reasonRaw = String(msg.reason ??
|
|
458
|
-
const reason = mapEndReason(reasonRaw)
|
|
459
|
-
state.endReason = reason
|
|
453
|
+
role: 'tool',
|
|
454
|
+
text: `${msg.ok ? '\u2713' : '\u2717'} ${String(msg.tool ?? '?')}`,
|
|
455
|
+
},
|
|
456
|
+
]
|
|
457
|
+
cb.onTranscript(state.transcript)
|
|
458
|
+
return
|
|
459
|
+
case 'client_tool_call': {
|
|
460
|
+
const toolCallId = String(msg.toolCallId ?? '')
|
|
461
|
+
const name = String(msg.name ?? '')
|
|
462
|
+
const args = msg.args ?? {}
|
|
463
|
+
if (!toolCallId || !name) return
|
|
464
|
+
cb.onClientToolCall({ toolCallId, name, args })
|
|
465
|
+
return
|
|
466
|
+
}
|
|
467
|
+
case 'call_end': {
|
|
468
|
+
const reasonRaw = String(msg.reason ?? '')
|
|
469
|
+
const reason = mapEndReason(reasonRaw)
|
|
470
|
+
state.endReason = reason
|
|
460
471
|
state.transcript = [
|
|
461
472
|
...state.transcript,
|
|
462
473
|
{
|
|
463
474
|
id: `m${state.idCounter++}`,
|
|
464
|
-
role:
|
|
465
|
-
text: `call ended${reasonRaw ? ` (${reasonRaw})` :
|
|
466
|
-
}
|
|
467
|
-
]
|
|
468
|
-
cb.onTranscript(state.transcript)
|
|
469
|
-
cb.onCallEnd(reason)
|
|
470
|
-
return
|
|
475
|
+
role: 'system',
|
|
476
|
+
text: `call ended${reasonRaw ? ` (${reasonRaw})` : ''}`,
|
|
477
|
+
},
|
|
478
|
+
]
|
|
479
|
+
cb.onTranscript(state.transcript)
|
|
480
|
+
cb.onCallEnd(reason)
|
|
481
|
+
return
|
|
471
482
|
}
|
|
472
|
-
case
|
|
473
|
-
const code = msg.code ??
|
|
474
|
-
const message = msg.message ??
|
|
475
|
-
cb.onError({ code, message })
|
|
476
|
-
return
|
|
483
|
+
case 'error': {
|
|
484
|
+
const code = msg.code ?? 'server_error'
|
|
485
|
+
const message = msg.message ?? 'server error'
|
|
486
|
+
cb.onError({ code, message })
|
|
487
|
+
return
|
|
477
488
|
}
|
|
478
489
|
}
|
|
479
490
|
}
|
|
480
491
|
/**
 * Report a protocol state transition through `cb.onState`.
 *
 * Nothing happens when `state.state` already equals `next`. This helper does
 * not assign `state.state` itself; it only invokes the callback.
 * NOTE(review): presumably the `onState` callback records the new state —
 * confirm against the callers that supply it.
 *
 * @param {{ state: string }} state - protocol state object to compare against
 * @param {string} next - state being entered
 * @param {{ onState: (next: string) => void }} cb - callback bundle
 */
var setState = (state, next, cb) => {
  if (state.state === next) return
  cb.onState(next)
}
|
|
485
495
|
/**
 * Insert or update the in-progress user transcript entry.
 *
 * The transcript is searched from the end for the most recent entry with
 * `role === 'user'` and `committed === false`. If one exists it is replaced
 * by a copy carrying the new `text` and `committed: isFinal`; otherwise a new
 * user entry is appended with an id taken from `state.idCounter`.
 * `state.transcript` is always reassigned to a fresh array, so previously
 * handed-out arrays are never mutated.
 *
 * @param {{ transcript: Array<object>, idCounter: number }} state
 * @param {string} text - latest recognised text
 * @param {boolean} isFinal - stored as the entry's `committed` flag
 */
var upsertUserPartial = (state, text, isFinal) => {
  let openIndex = -1
  for (let i = state.transcript.length - 1; i >= 0; i--) {
    const entry = state.transcript[i]
    if (entry.role === 'user' && entry.committed === false) {
      openIndex = i
      break
    }
  }
  if (openIndex === -1) {
    state.transcript = [
      ...state.transcript,
      { id: `m${state.idCounter++}`, role: 'user', text, committed: isFinal },
    ]
    return
  }
  const updated = [...state.transcript]
  updated[openIndex] = { ...state.transcript[openIndex], text, committed: isFinal }
  state.transcript = updated
}
|
|
506
516
|
/**
 * Build the WebSocket URL for an agent call.
 *
 * Only the host (including port) of `args.apiBase` is used; any path on the
 * base URL is ignored. An `https:` base maps to `wss:`, every other protocol
 * maps to `ws:`. `&barge=off` is appended only when `args.bargeIn` is
 * strictly `false`.
 *
 * @param {{ apiBase: string, agentId: string, token: string, bargeIn?: boolean }} args
 * @returns {string} the call URL
 * @throws {TypeError} when `args.apiBase` is not a valid absolute URL
 */
function buildWsUrl(args) {
  const base = new URL(args.apiBase)
  const scheme = base.protocol === 'https:' ? 'wss:' : 'ws:'
  const agent = encodeURIComponent(args.agentId)
  const token = encodeURIComponent(args.token)
  const bargeQS = args.bargeIn === false ? '&barge=off' : ''
  return `${scheme}//${base.host}/v1/agents/${agent}/call?token=${token}${bargeQS}`
}
|
|
512
522
|
|
|
513
523
|
// src/clientTools.ts
|
|
514
|
-
// Tool names must be plain identifiers.
var NAME_RE = /^[a-zA-Z_][a-zA-Z0-9_]*$/
// Upper bounds enforced by validateClientToolMap.
var MAX_TOOLS = 64
var MAX_USAGE = 500
var MAX_TIMEOUT_MS = 3e4
/**
 * Validate a `clientTools` map before it is registered.
 *
 * `undefined` is accepted and means "no client tools". Otherwise the value
 * must be a non-null, non-array object whose keys are identifier-shaped tool
 * names and whose values provide a non-empty string `description` and a
 * `handler` function. Optional `usage` must be a string of at most MAX_USAGE
 * characters; optional `timeoutMs` must be a finite number in
 * (0, MAX_TIMEOUT_MS].
 *
 * @param {Record<string, object> | undefined} tools
 * @throws {Error} describing the first violation found
 */
var validateClientToolMap = (tools) => {
  if (tools === void 0) return
  if (typeof tools !== 'object' || tools === null || Array.isArray(tools)) {
    throw new Error('clientTools must be an object keyed by tool name')
  }
  const entries = Object.entries(tools)
  if (entries.length > MAX_TOOLS) {
    throw new Error(`clientTools may declare at most ${MAX_TOOLS} tools (got ${entries.length})`)
  }
  for (const [name, def] of entries) {
    if (!NAME_RE.test(name)) {
      throw new Error(
        `clientTools["${name}"]: name must be a valid identifier (^[a-zA-Z_][a-zA-Z0-9_]*$)`,
      )
    }
    if (!def || typeof def !== 'object') {
      throw new Error(`clientTools["${name}"]: must be an object`)
    }
    if (typeof def.description !== 'string' || def.description.length === 0) {
      throw new Error(`clientTools["${name}"]: must have a description`)
    }
    if (typeof def.handler !== 'function') {
      throw new Error(`clientTools["${name}"]: must have a handler function`)
    }
    if (def.usage !== void 0) {
      // Reject non-strings explicitly: `null.length` would otherwise surface as
      // a raw TypeError and a number would slip through the length check.
      if (typeof def.usage !== 'string') {
        throw new Error(`clientTools["${name}"]: usage must be a string`)
      }
      if (def.usage.length > MAX_USAGE) {
        throw new Error(`clientTools["${name}"]: usage must be \u2264${MAX_USAGE} chars`)
      }
    }
    if (
      def.timeoutMs !== void 0 &&
      (!Number.isFinite(def.timeoutMs) || def.timeoutMs <= 0 || def.timeoutMs > MAX_TIMEOUT_MS)
    ) {
      throw new Error(`clientTools["${name}"]: timeoutMs must be in (0, ${MAX_TIMEOUT_MS}]`)
    }
  }
}
|
|
550
563
|
/**
 * Build the `client_tools_register` frame for a client tool map.
 *
 * Each tool contributes `name`, `description` and `parameters`
 * (`parameters` is copied as-is, even when undefined). `usage` and
 * `timeoutMs` are included only when defined on the tool. Handlers are not
 * part of the frame.
 *
 * @param {Record<string, object>} tools
 * @returns {{ type: 'client_tools_register', tools: Array<object> }}
 */
var buildRegisterFrame = (tools) => {
  const describeTool = ([name, def]) => {
    const entry = { name, description: def.description, parameters: def.parameters }
    if (def.usage !== void 0) entry.usage = def.usage
    if (def.timeoutMs !== void 0) entry.timeoutMs = def.timeoutMs
    return entry
  }
  return {
    type: 'client_tools_register',
    tools: Object.entries(tools).map(describeTool),
  }
}
|
|
560
573
|
/**
 * Run the client-side handler for a `client_tool_call` frame and send back a
 * `client_tool_result` frame.
 *
 * - Unknown tool: an error result `No handler for <name>` is sent
 *   synchronously.
 * - Known tool: the handler is awaited in the background. A string return
 *   value is sent verbatim as `result`; anything else is passed through
 *   `JSON.stringify`. A thrown value becomes the `error` field.
 *
 * Failures of `send` itself are deliberately swallowed, so this function
 * never throws and never produces an unhandled rejection.
 *
 * @param {(payload: object) => void} send - transmits one frame
 * @param {Record<string, { handler: Function }>} tools - registered tools
 * @param {{ toolCallId: string, name: string, args: unknown }} frame
 */
var dispatchClientToolCall = (send, tools, frame) => {
  const safeSend = (payload) => {
    try {
      send(payload)
    } catch {
      // Best effort: the transport may already be closed.
    }
  }
  // Own-property lookup only, so inherited names such as `toString` or
  // `constructor` are treated as unknown tools instead of resolving to
  // Object.prototype members.
  const tool = Object.prototype.hasOwnProperty.call(tools, frame.name)
    ? tools[frame.name]
    : void 0
  if (!tool) {
    safeSend({
      type: 'client_tool_result',
      toolCallId: frame.toolCallId,
      error: `No handler for ${frame.name}`,
    })
    return
  }
  void (async () => {
    try {
      const out = await tool.handler(frame.args)
      safeSend({
        type: 'client_tool_result',
        toolCallId: frame.toolCallId,
        result: typeof out === 'string' ? out : JSON.stringify(out),
      })
    } catch (err) {
      safeSend({
        type: 'client_tool_result',
        toolCallId: frame.toolCallId,
        error: err instanceof Error ? err.message : String(err),
      })
    }
  })()
}
|
|
605
|
+
|
|
606
|
+
// src/ClientMarksBuffer.ts
|
|
607
|
+
/**
 * Create a buffer that collects per-turn client timing marks and emits them
 * as `client_marks` frames through `args.send`.
 *
 * Timing model, as implemented below:
 * - `markFirstOutboundAudio()` records the first call's timestamp while no
 *   turn has claimed it yet; later calls are ignored until it is consumed.
 * - `onAgentTurnStart(seq)` opens a slot for `seq`, moves the pending
 *   outbound timestamp into it and clears the pending value.
 * - `markFirstAudibleOutput()` stamps the most recently opened slot that has
 *   not ended, once.
 * - `onAgentTurnEnd(seq)` emits the frame for `seq` and drops the slot. An
 *   unknown `seq` still emits a frame with empty marks.
 * - `flush()` ends and emits every open slot and clears the pending stamp.
 *
 * `marks.client_mic_to_first_audible_ms` is included only when both
 * timestamps were captured for the slot.
 *
 * @param {{ send: (frame: object) => void, now?: () => number }} args -
 *   `now` defaults to `performance.now()`.
 */
var createClientMarksBuffer = (args) => {
  const now = args.now ?? (() => performance.now())
  let pendingFirstOutboundAt = null
  const inFlight = /* @__PURE__ */ new Map()

  // Emit and forget the slot for `seq`, but only once it has ended.
  const tryEmit = (seq) => {
    const slot = inFlight.get(seq)
    if (!slot || !slot.ended) return
    const marks = {}
    if (slot.firstOutboundAt !== null && slot.firstAudibleAt !== null) {
      marks.client_mic_to_first_audible_ms = slot.firstAudibleAt - slot.firstOutboundAt
    }
    args.send({
      type: 'client_marks',
      seq,
      marks,
      clientNow: Date.now(),
    })
    inFlight.delete(seq)
  }

  const markFirstOutboundAudio = () => {
    if (pendingFirstOutboundAt === null) pendingFirstOutboundAt = now()
  }

  const markFirstAudibleOutput = () => {
    // Map iteration follows insertion order, so the last open slot wins.
    let latestOpen
    for (const slot of inFlight.values()) {
      if (!slot.ended) latestOpen = slot
    }
    if (!latestOpen || latestOpen.firstAudibleAt !== null) return
    latestOpen.firstAudibleAt = now()
  }

  const onAgentTurnStart = (seq) => {
    inFlight.set(seq, {
      firstOutboundAt: pendingFirstOutboundAt,
      firstAudibleAt: null,
      ended: false,
    })
    pendingFirstOutboundAt = null
  }

  const onAgentTurnEnd = (seq) => {
    const slot = inFlight.get(seq)
    if (!slot) {
      // No start was seen for this turn: still acknowledge it with empty marks.
      args.send({ type: 'client_marks', seq, marks: {}, clientNow: Date.now() })
      return
    }
    slot.ended = true
    tryEmit(seq)
  }

  const flush = () => {
    // Snapshot the keys because tryEmit deletes from the map.
    for (const seq of [...inFlight.keys()]) {
      inFlight.get(seq).ended = true
      tryEmit(seq)
    }
    pendingFirstOutboundAt = null
  }

  return {
    markFirstOutboundAudio,
    markFirstAudibleOutput,
    onAgentTurnStart,
    onAgentTurnEnd,
    flush,
  }
}
|
|
593
675
|
|
|
594
676
|
// src/VoiceClient.ts
|
|
595
677
|
/**
 * WebSocket-based voice call client for the browser.
 *
 * Owns the reconnecting socket (`rws`), microphone capture, audio playback,
 * the shared protocol state (`proto`) and the client timing marks buffer.
 * All handlers are arrow functions assigned in the constructor so they can be
 * passed around as callbacks without losing `this`.
 *
 * Constructor argument shape, as read by the code below:
 * `{ config: { apiBase }, options, token, wsFactory }`, where `options`
 * carries `agentId`, `bargeIn`, `clientTools` and the optional `on*`
 * callbacks.
 *
 * @throws {Error} from the constructor when `options.clientTools` fails
 *   `validateClientToolMap`.
 */
var BrowserVoiceClient = class {
  constructor(args) {
    this.rws = null
    this.capture = null
    this.playback = null
    this.muted = false
    this.inputVolume = 0
    this.outputVolume = 0
    this.startedAt = null
    this.endedFired = false
    this.lastError = null
    this.end = () => {
      this.teardown('user_hangup')
    }
    this.mute = () => {
      if (this.muted) return
      this.muted = true
      this.capture?.mute(true)
    }
    this.unmute = () => {
      if (!this.muted) return
      this.muted = false
      this.capture?.mute(false)
    }
    // ---------------------------------------------------------------
    // Internal
    // ---------------------------------------------------------------
    this.sendClientToolsRegister = () => {
      const frame = buildRegisterFrame(this.args.options.clientTools ?? {})
      this.rws?.send(JSON.stringify(frame))
    }
    // Records the new state and notifies the consumer; repeated states are ignored.
    this.setState = (next) => {
      if (this.proto.state === next) return
      this.proto.state = next
      this.args.options.onStateChange?.(next)
    }
    // Remembers the most recent error so teardown can report its code.
    this.emitError = (err) => {
      this.lastError = err
      this.args.options.onError?.(err)
    }
    this.handleSocketEvent = (ev) => {
      switch (ev.type) {
        case 'open':
          void this.startCapture()
          break
        case 'reconnected':
          // The transcript and the open agent bubble are reset after a reconnect.
          this.proto.transcript = []
          this.proto.agentBubbleId = null
          this.args.options.onTranscript?.(this.proto.transcript)
          void this.startCapture()
          this.setState('listening')
          break
        case 'message':
          if (typeof ev.data === 'string') {
            // Text frames carry protocol messages.
            handleServerMessage(ev.data, this.proto, {
              onState: this.setState,
              onTranscript: (entries) => this.args.options.onTranscript?.(entries),
              onError: this.emitError,
              onInterrupt: () => {
                this.playback?.flush()
                this.args.options.onInterrupt?.()
              },
              onAgentTurnStart: (seq) => {
                if (typeof seq === 'number') this.marks.onAgentTurnStart(seq)
                this.args.options.onAgentTurnStart?.()
              },
              onAgentTurnEnd: (seq) => {
                if (typeof seq === 'number') this.marks.onAgentTurnEnd(seq)
              },
              onCallEnd: (reason) => this.teardown(reason),
              onConnected: () => this.sendClientToolsRegister(),
              onClientToolCall: (frame) =>
                dispatchClientToolCall(
                  (f) => this.rws?.send(JSON.stringify(f)),
                  this.args.options.clientTools ?? {},
                  frame,
                ),
            })
          } else {
            // Non-string frames are handed to playback as audio.
            this.marks.markFirstAudibleOutput()
            this.playback?.enqueue(ev.data)
          }
          break
        case 'close':
          if (ev.permanent) {
            const reason = this.proto.endReason ?? (this.lastError ? 'error' : 'user_hangup')
            this.teardown(reason)
          }
          break
        case 'error':
          this.emitError({ code: 'socket_error', message: ev.error.message })
          break
      }
    }
    this.startCapture = async () => {
      if (this.capture?.isCapturing()) return
      this.capture = createAudioCapture({
        onChunk: (pcm) => {
          this.marks.markFirstOutboundAudio()
          this.rws?.send(pcm)
        },
        onVolume: (v) => {
          this.inputVolume = v
          this.args.options.onVolume?.({ input: v, output: this.outputVolume })
        },
        onError: (err) => {
          this.emitError({
            code: err.name === 'NotAllowedError' ? 'mic_denied' : 'mic_start_failed',
            message: err.message,
          })
        },
      })
      if (this.muted) this.capture.mute(true)
      try {
        await this.capture.start()
      } catch {
        // Swallowed here. NOTE(review): presumably start failures are also
        // delivered through the capture's onError callback above — confirm.
      }
    }
    this.teardown = (reason) => {
      // Flush pending marks first, while the socket is still available.
      try {
        this.marks.flush()
      } catch {}
      this.capture?.stop()
      this.capture = null
      this.playback?.close()
      this.playback = null
      try {
        this.rws?.close(1e3, reason)
      } catch {}
      this.rws = null
      this.setState('ended')
      this.fireEndOnce(reason)
    }
    // Invokes options.onEnd at most once per client.
    this.fireEndOnce = (reason) => {
      if (this.endedFired) return
      this.endedFired = true
      const startedAt = this.startedAt ?? Date.now()
      this.args.options.onEnd?.({
        reason,
        errorCode: reason === 'error' ? this.lastError?.code : void 0,
        durationMs: Date.now() - startedAt,
      })
    }
    this.args = args
    this.proto = createProtocolState()
    validateClientToolMap(args.options.clientTools)
    this.marks = createClientMarksBuffer({
      send: (frame) => {
        try {
          this.rws?.send(JSON.stringify(frame))
        } catch {}
      },
    })
  }
  // ---------------------------------------------------------------
  // Call interface
  // ---------------------------------------------------------------
  get state() {
    return this.proto.state
  }
  /** A shallow copy, so callers cannot mutate the internal transcript array. */
  get transcript() {
    return this.proto.transcript.slice()
  }
  get isMuted() {
    return this.muted
  }
  // ---------------------------------------------------------------
  // Lifecycle — called by the factory immediately after construction.
  // Playback resume failures are swallowed; socket and microphone
  // failures arrive via `onError`.
  // ---------------------------------------------------------------
  async start() {
    this.setState('connecting')
    this.startedAt = Date.now()
    const url = buildWsUrl({
      apiBase: this.args.config.apiBase,
      agentId: this.args.options.agentId,
      token: this.args.token,
      bargeIn: this.args.options.bargeIn,
    })
    this.playback = createAudioPlayback({
      onVolume: (v) => {
        this.outputVolume = v
        this.args.options.onVolume?.({ input: this.inputVolume, output: v })
      },
    })
    try {
      await this.playback.resume()
    } catch {}
    this.rws = createReconnectingWebSocket(
      {
        url,
        wsFactory: this.args.wsFactory,
        maxRetries: 3,
      },
      (ev) => this.handleSocketEvent(ev),
    )
  }
}
|
|
875
|
+
|
|
876
|
+
// src/webrtc/createWebRtcCall.ts
|
|
877
|
+
/**
 * Start a voice call over WebRTC and resolve to a call handle.
 *
 * Flow: create the peer connection and a hidden autoplay <audio> element,
 * acquire the microphone, open an ordered `control` data channel for protocol
 * messages, POST the SDP offer to the signaling endpoint, apply the answer,
 * then trickle ICE candidates to the ICE endpoint.
 *
 * Setup failures call `opts.onError` once, move the state to `error`, release
 * the microphone / peer connection / audio element, and reject:
 * - microphone: `mic_denied` or `mic_start_failed`;
 * - non-2xx signaling response: `unauthorized` (401) or `server_error`;
 * - network / answer failures: `network_unreachable`.
 * A failure while creating the local offer releases resources and rejects
 * without calling `opts.onError`.
 *
 * Only `onStateChange`, `onTranscript`, `onError`, `onInterrupt`,
 * `onAgentTurnStart` and `onEnd` are wired up; agent-turn-end, connected and
 * client-tool-call messages are ignored on this transport.
 *
 * @param {object} opts - `agentId`, `apiBase`, `token`, optional
 *   `webrtcGatewayBase`, and the optional callbacks listed above.
 * @returns {Promise<{ state: string, transcript: Array<object>, isMuted: boolean,
 *   end: () => void, mute: () => void, unmute: () => void }>}
 */
async function createWebRtcCall(opts) {
  const proto = createProtocolState()
  let muted = false
  let ended = false
  let mic
  let callId
  // Candidates gathered before the answer is applied are queued, then sent.
  let signalingReady = false
  const queuedCandidates = []

  const fireState = (next) => {
    if (proto.state === next) return
    proto.state = next
    opts.onStateChange?.(next)
  }

  fireState('connecting')
  const pc = new RTCPeerConnection({
    iceServers: [{ urls: 'stun:stun.l.google.com:19302' }],
  })
  const audioEl = document.createElement('audio')
  audioEl.autoplay = true
  audioEl.style.display = 'none'
  document.body.appendChild(audioEl)
  pc.ontrack = (event) => {
    audioEl.srcObject = event.streams[0] ?? new MediaStream([event.track])
  }

  // Stop the microphone, close the connection and detach the audio element.
  const releaseResources = () => {
    try {
      mic?.getTracks().forEach((t) => t.stop())
    } catch {}
    try {
      pc.close()
    } catch {}
    try {
      audioEl.remove()
    } catch {}
  }

  // Declared before every closure that can call it. Runs at most once.
  const teardown = () => {
    if (ended) return
    ended = true
    releaseResources()
    fireState('ended')
    opts.onEnd?.()
  }

  // Report a setup failure exactly once and free everything acquired so far.
  const failSetup = (error) => {
    opts.onError?.(error)
    fireState('error')
    releaseResources()
  }

  const dispatch = (raw) => {
    handleServerMessage(raw, proto, {
      onState: fireState,
      onTranscript: (entries) => opts.onTranscript?.(entries),
      onError: (err) => opts.onError?.(err),
      onInterrupt: () => opts.onInterrupt?.(),
      onAgentTurnStart: () => opts.onAgentTurnStart?.(),
      onAgentTurnEnd: () => {},
      onCallEnd: () => teardown(),
      onConnected: () => {},
      onClientToolCall: () => {},
    })
  }

  try {
    mic = await navigator.mediaDevices.getUserMedia({ audio: true })
  } catch (err) {
    const code =
      err instanceof DOMException && err.name === 'NotAllowedError'
        ? 'mic_denied'
        : 'mic_start_failed'
    failSetup({
      code,
      message: err instanceof Error ? err.message : 'getUserMedia failed',
    })
    throw err
  }
  for (const track of mic.getAudioTracks()) pc.addTrack(track, mic)

  const dc = pc.createDataChannel('control', { ordered: true })
  dc.onmessage = (e) => {
    if (typeof e.data === 'string') dispatch(e.data)
  }
  dc.onerror = () => {
    opts.onError?.({ code: 'socket_error', message: 'control channel error' })
  }

  const gateway = opts.webrtcGatewayBase || ''
  const tokenQS = `token=${encodeURIComponent(opts.token)}`
  const agentBase = `${opts.apiBase}/v1/agents/${encodeURIComponent(opts.agentId)}`
  const offerUrl = gateway
    ? `${gateway}/webrtc/offer?${tokenQS}`
    : `${agentBase}/webrtc/offer?${tokenQS}`
  const iceUrl = gateway ? `${gateway}/webrtc/ice?${tokenQS}` : `${agentBase}/webrtc/ice?${tokenQS}`

  const postCandidate = (candidate) => {
    // Fire-and-forget: a lost candidate POST must not break the call.
    void fetch(iceUrl, {
      method: 'POST',
      headers: { 'content-type': 'application/json' },
      body: JSON.stringify({ callId, candidate }),
    }).catch(() => {})
  }

  // Attach handlers BEFORE setLocalDescription: candidate gathering starts as
  // soon as the local description is set, and events fired while no handler is
  // attached are lost.
  pc.onicecandidate = (e) => {
    if (!e.candidate) return
    if (signalingReady) postCandidate(e.candidate)
    else queuedCandidates.push(e.candidate)
  }
  pc.onconnectionstatechange = () => {
    const s = pc.connectionState
    if (s === 'connected') fireState('listening')
    // NOTE(review): 'disconnected' can be transient in WebRTC; the call is
    // ended on it here, matching the existing behaviour — confirm intent.
    if (s === 'failed' || s === 'disconnected') {
      opts.onError?.({ code: 'socket_error', message: `webrtc connection ${s}` })
      teardown()
    }
    if (s === 'closed' && !ended) teardown()
  }

  try {
    await pc.setLocalDescription(await pc.createOffer())
  } catch (err) {
    // Do not leak the microphone or the audio element on a local SDP failure.
    if (!ended) {
      fireState('error')
      releaseResources()
    }
    throw err
  }

  let offerRes
  try {
    offerRes = await fetch(offerUrl, {
      method: 'POST',
      headers: { 'content-type': 'application/json' },
      body: JSON.stringify({ sdp: pc.localDescription.sdp, type: 'offer', agentId: opts.agentId }),
    })
    if (offerRes.ok) {
      const body = await offerRes.json()
      callId = body.callId
      await pc.setRemoteDescription({ type: 'answer', sdp: body.sdp })
    }
  } catch (err) {
    if (!ended) {
      failSetup({
        code: 'network_unreachable',
        message: err instanceof Error ? err.message : 'signaling failed',
      })
    }
    throw err
  }
  if (!offerRes.ok) {
    // Handled outside the try block so an HTTP failure is reported once, with
    // its own code, instead of being re-reported as `network_unreachable`.
    failSetup({
      code: offerRes.status === 401 ? 'unauthorized' : 'server_error',
      message: `signaling failed: HTTP ${offerRes.status}`,
    })
    throw new Error(`webrtc offer failed: ${offerRes.status}`)
  }

  signalingReady = true
  for (const candidate of queuedCandidates.splice(0)) postCandidate(candidate)

  return {
    get state() {
      return proto.state
    },
    get transcript() {
      return proto.transcript.slice()
    },
    get isMuted() {
      return muted
    },
    end: () => teardown(),
    mute: () => {
      if (muted) return
      muted = true
      mic.getAudioTracks().forEach((t) => (t.enabled = false))
    },
    unmute: () => {
      if (!muted) return
      muted = false
      mic.getAudioTracks().forEach((t) => (t.enabled = true))
    },
  }
}
|
|
777
1030
|
|
|
778
1031
|
// src/browser.ts
|
|
779
|
-
/**
 * Default socket factory for the browser build: opens a native WebSocket.
 * `globalThis.WebSocket` is looked up at call time, not at module load.
 *
 * @param {string} url
 * @returns {WebSocket}
 */
var browserWsFactory = (url) => new globalThis.WebSocket(url)
|
|
780
1033
|
/**
 * Factory returned by `configureVoiceClient`; `startCall` creates one call.
 *
 * `startCall(options)`:
 * 1. requires `options.agentId`;
 * 2. merges context/metadata from the factory config and the call options;
 * 3. resolves a token: `options.token` is used directly (WebSocket
 *    transport); otherwise `config.fetchToken` is awaited and may return
 *    either a token string or an object with `token`, optional `transport`
 *    and optional `webrtcGatewayBase`;
 * 4. starts a WebRTC call when `transport === 'webrtc'`, otherwise a
 *    `BrowserVoiceClient` over WebSocket.
 *
 * On the WebRTC path only `onStateChange`, `onTranscript`, `onError`,
 * `onEnd`, `onInterrupt` and `onAgentTurnStart` are forwarded.
 */
var BrowserVoiceFactory = class {
  constructor(config) {
    this.startCall = async (options) => {
      if (!options.agentId) {
        throw new Error('startCall: agentId is required')
      }
      const { context, metadata } = mergeStartCallContext(this.config, options)
      const fetchArgs = {
        agentId: options.agentId,
        userId: options.userId,
        context,
        metadata,
      }
      let resolved
      if (options.token) {
        resolved = { token: options.token, transport: 'ws' }
      } else {
        const fetched = await this.config.fetchToken(fetchArgs)
        if (!fetched) {
          throw new Error('configureVoiceClient.fetchToken returned empty token')
        }
        resolved = typeof fetched === 'string' ? { token: fetched, transport: 'ws' } : fetched
        if (!resolved.token) {
          throw new Error('configureVoiceClient.fetchToken returned an object without `token`')
        }
      }
      if (resolved.transport === 'webrtc') {
        // Measured from the start of WebRTC setup so onEnd reports a real
        // duration instead of a constant 0.
        const startedAt = Date.now()
        return createWebRtcCall({
          agentId: options.agentId,
          apiBase: this.config.apiBase,
          token: resolved.token,
          webrtcGatewayBase: resolved.webrtcGatewayBase,
          onStateChange: options.onStateChange,
          onTranscript: options.onTranscript,
          onError: options.onError,
          // Synthesise a minimal CallEndEvent. WebRTC doesn't carry an end reason
          // from the server yet — 'agent_ended' is a placeholder.
          onEnd: options.onEnd
            ? () => options.onEnd({ reason: 'agent_ended', durationMs: Date.now() - startedAt })
            : void 0,
          onInterrupt: options.onInterrupt,
          onAgentTurnStart: options.onAgentTurnStart,
        })
      }
      const client = new BrowserVoiceClient({
        config: this.config,
        // Carry merged context/metadata through to startCall so server can
        // see what the SDK saw.
        options: { ...options, context, metadata },
        token: resolved.token,
        wsFactory: browserWsFactory,
      })
      await client.start()
      return client
    }
    this.config = config
  }
}
|
|
816
1092
|
/**
 * Public entry point: normalise the SDK configuration and return a factory
 * whose `startCall` opens calls with it.
 *
 * @param {object} config - passed through `normalizeConfig` before use
 * @returns {BrowserVoiceFactory}
 */
function configureVoiceClient(config) {
  const normalized = normalizeConfig(config)
  return new BrowserVoiceFactory(normalized)
}
|
|
819
1095
|
// Annotate the CommonJS export names for ESM import in node:
|
|
820
|
-
0 &&
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
1096
|
+
0 &&
|
|
1097
|
+
(module.exports = {
|
|
1098
|
+
buildWsUrl,
|
|
1099
|
+
configureVoiceClient,
|
|
1100
|
+
createAudioCapture,
|
|
1101
|
+
createAudioPlayback,
|
|
1102
|
+
createProtocolState,
|
|
1103
|
+
createReconnectingWebSocket,
|
|
1104
|
+
handleServerMessage,
|
|
1105
|
+
})
|
|
1106
|
+
//# sourceMappingURL=browser.js.map
|