@craftedxp/voice-js 0.3.2 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CONSUMING.md +1 -1
- package/README.md +8 -7
- package/dist/browser.d.mts +20 -4
- package/dist/browser.d.ts +334 -250
- package/dist/browser.js +838 -540
- package/dist/browser.js.map +1 -1
- package/dist/browser.mjs +297 -8
- package/dist/browser.mjs.map +1 -1
- package/dist/embed.iife.js +1110 -4
- package/dist/node.d.mts +20 -4
- package/dist/node.d.ts +324 -247
- package/dist/node.js +480 -368
- package/dist/node.js.map +1 -1
- package/dist/node.mjs +103 -5
- package/dist/node.mjs.map +1 -1
- package/package.json +1 -1
package/dist/browser.js
CHANGED
|
@@ -1,24 +1,26 @@
|
|
|
1
|
-
|
|
2
|
-
var __defProp = Object.defineProperty
|
|
3
|
-
var __getOwnPropDesc = Object.getOwnPropertyDescriptor
|
|
4
|
-
var __getOwnPropNames = Object.getOwnPropertyNames
|
|
5
|
-
var __hasOwnProp = Object.prototype.hasOwnProperty
|
|
1
|
+
'use strict'
|
|
2
|
+
var __defProp = Object.defineProperty
|
|
3
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor
|
|
4
|
+
var __getOwnPropNames = Object.getOwnPropertyNames
|
|
5
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty
|
|
6
6
|
var __export = (target, all) => {
|
|
7
|
-
for (var name in all)
|
|
8
|
-
|
|
9
|
-
};
|
|
7
|
+
for (var name in all) __defProp(target, name, { get: all[name], enumerable: true })
|
|
8
|
+
}
|
|
10
9
|
var __copyProps = (to, from, except, desc) => {
|
|
11
|
-
if (from && typeof from ===
|
|
10
|
+
if ((from && typeof from === 'object') || typeof from === 'function') {
|
|
12
11
|
for (let key of __getOwnPropNames(from))
|
|
13
12
|
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
14
|
-
__defProp(to, key, {
|
|
13
|
+
__defProp(to, key, {
|
|
14
|
+
get: () => from[key],
|
|
15
|
+
enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable,
|
|
16
|
+
})
|
|
15
17
|
}
|
|
16
|
-
return to
|
|
17
|
-
}
|
|
18
|
-
var __toCommonJS = (mod) => __copyProps(__defProp({},
|
|
18
|
+
return to
|
|
19
|
+
}
|
|
20
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, '__esModule', { value: true }), mod)
|
|
19
21
|
|
|
20
22
|
// src/browser.ts
|
|
21
|
-
var browser_exports = {}
|
|
23
|
+
var browser_exports = {}
|
|
22
24
|
__export(browser_exports, {
|
|
23
25
|
buildWsUrl: () => buildWsUrl,
|
|
24
26
|
configureVoiceClient: () => configureVoiceClient,
|
|
@@ -26,57 +28,64 @@ __export(browser_exports, {
|
|
|
26
28
|
createAudioPlayback: () => createAudioPlayback,
|
|
27
29
|
createProtocolState: () => createProtocolState,
|
|
28
30
|
createReconnectingWebSocket: () => createReconnectingWebSocket,
|
|
29
|
-
handleServerMessage: () => handleServerMessage
|
|
30
|
-
})
|
|
31
|
-
module.exports = __toCommonJS(browser_exports)
|
|
31
|
+
handleServerMessage: () => handleServerMessage,
|
|
32
|
+
})
|
|
33
|
+
module.exports = __toCommonJS(browser_exports)
|
|
32
34
|
|
|
33
35
|
// src/config.ts
|
|
34
36
|
function normalizeConfig(config) {
|
|
35
|
-
if (!config) throw new Error(
|
|
36
|
-
if (
|
|
37
|
+
if (!config) throw new Error('configureVoiceClient: config is required')
|
|
38
|
+
if ('apiKey' in config) {
|
|
37
39
|
throw new Error(
|
|
38
|
-
|
|
39
|
-
)
|
|
40
|
+
'configureVoiceClient: `apiKey` is no longer supported. Embedding sk_ in JS code ships server-grade credentials to every client. Pass `fetchToken: async ({ agentId }) => { /* call YOUR backend mint */ }` instead \u2014 see the @craftedxp/voice-js README for the migration recipe.',
|
|
41
|
+
)
|
|
40
42
|
}
|
|
41
43
|
if (!config.apiBase) {
|
|
42
|
-
throw new Error(
|
|
44
|
+
throw new Error('configureVoiceClient: apiBase is required')
|
|
43
45
|
}
|
|
44
|
-
if (typeof config.fetchToken !==
|
|
45
|
-
throw new Error(
|
|
46
|
+
if (typeof config.fetchToken !== 'function') {
|
|
47
|
+
throw new Error('configureVoiceClient: fetchToken must be a function')
|
|
46
48
|
}
|
|
47
49
|
return {
|
|
48
50
|
...config,
|
|
49
|
-
apiBase: config.apiBase.replace(/\/+$/,
|
|
50
|
-
}
|
|
51
|
+
apiBase: config.apiBase.replace(/\/+$/, ''),
|
|
52
|
+
}
|
|
51
53
|
}
|
|
52
54
|
function mergeStartCallContext(factory, call) {
|
|
53
|
-
const context =
|
|
54
|
-
|
|
55
|
-
|
|
55
|
+
const context =
|
|
56
|
+
factory.defaultContext || call.context
|
|
57
|
+
? { ...(factory.defaultContext ?? {}), ...(call.context ?? {}) }
|
|
58
|
+
: void 0
|
|
59
|
+
const metadata =
|
|
60
|
+
factory.defaultMetadata || call.metadata
|
|
61
|
+
? { ...(factory.defaultMetadata ?? {}), ...(call.metadata ?? {}) }
|
|
62
|
+
: void 0
|
|
63
|
+
return { context, metadata }
|
|
56
64
|
}
|
|
57
65
|
|
|
58
66
|
// src/worklets/mic-downsampler.worklet.js
|
|
59
|
-
var mic_downsampler_worklet_default =
|
|
67
|
+
var mic_downsampler_worklet_default =
|
|
68
|
+
"// AudioWorklet \u2014 runs off the main thread in the audio rendering graph.\n//\n// Input: Float32 samples at the AudioContext's native sampleRate (typically\n// 48000 Hz on desktop, 44100 Hz on some iOS devices).\n// Output: 16 kHz mono Int16 PCM, shipped to the main thread via\n// `port.postMessage(ArrayBuffer, [ArrayBuffer])` (transferred, not copied).\n//\n// Why AudioWorklet instead of ScriptProcessorNode: ScriptProcessorNode is\n// deprecated + main-thread-bound, so any JS jank produces audible audio\n// glitches. AudioWorklet's `process()` runs on the audio rendering thread\n// at the graph's block cadence (128 frames by default) and backpressures\n// via returning `true` / `false`.\n//\n// This file is loaded as text (see tsup.config.ts loader) and registered\n// at runtime via `audioWorklet.addModule(blobUrl)`.\n\nclass MicDownsampler extends AudioWorkletProcessor {\n constructor() {\n super()\n // Target sample rate for STT. Matches Deepgram Nova-3 + the platform's\n // server-side SAMPLE_RATE constant in AgentCallHandler.\n this.targetRate = 16000\n // Accumulator for the downsample. We collect incoming samples and emit\n // an Int16 chunk when we've accumulated ~1024 target-rate samples\n // (~64 ms at 16 kHz) \u2014 matches the mobile SDK's chunk size so both\n // platforms have the same server-side framing.\n this.outputFrames = 1024\n this.acc = []\n // Running index used for fractional resampling.\n this.readCursor = 0\n }\n\n // `inputs[0][0]` = first channel of first input. 128 Float32 samples per\n // call at the context's sampleRate. Return true = keep processing.\n process(inputs) {\n const input = inputs[0]\n if (!input || input.length === 0) return true\n const channel = input[0]\n if (!channel || channel.length === 0) return true\n\n const ctxRate = sampleRate // global inside AudioWorkletProcessor\n const ratio = ctxRate / this.targetRate\n\n // Simple linear-interp downsample. For 48000 \u2192 16000 that's 3:1, which\n // linear handles fine for voice. Anti-alias filtering would be\n // theoretically better but inaudible for speech.\n for (let i = 0; i < channel.length; i++) {\n this.acc.push(channel[i])\n }\n\n while (this.acc.length - this.readCursor >= ratio * this.outputFrames) {\n const out = new Int16Array(this.outputFrames)\n let readIdx = this.readCursor\n for (let i = 0; i < this.outputFrames; i++) {\n // Linear interp between floor(readIdx) and ceil(readIdx)\n const low = Math.floor(readIdx)\n const high = Math.min(low + 1, this.acc.length - 1)\n const frac = readIdx - low\n const sample = this.acc[low] * (1 - frac) + this.acc[high] * frac\n // Clip + convert to int16\n const clipped = Math.max(-1, Math.min(1, sample))\n out[i] = clipped < 0 ? clipped * 0x8000 : clipped * 0x7fff\n readIdx += ratio\n }\n // Transfer the ArrayBuffer (zero-copy) to the main thread.\n this.port.postMessage(out.buffer, [out.buffer])\n this.readCursor = readIdx\n }\n\n // Garbage-collect the consumed portion of `acc` every so often so it\n // doesn't grow without bound. Leave ~one chunk of headroom.\n if (this.readCursor > ratio * this.outputFrames) {\n this.acc = this.acc.slice(Math.floor(this.readCursor))\n this.readCursor -= Math.floor(this.readCursor)\n }\n\n return true\n }\n}\n\nregisterProcessor('mic-downsampler', MicDownsampler)\n"
|
|
60
69
|
|
|
61
70
|
// src/AudioCapture.ts
|
|
62
|
-
var VOLUME_INTERVAL_MS = 100
|
|
71
|
+
var VOLUME_INTERVAL_MS = 100
|
|
63
72
|
var createAudioCapture = (options) => {
|
|
64
|
-
let audioContext = null
|
|
65
|
-
let mediaStream = null
|
|
66
|
-
let sourceNode = null
|
|
67
|
-
let workletNode = null
|
|
68
|
-
let analyser = null
|
|
69
|
-
let volumeTimer = null
|
|
70
|
-
let muted = false
|
|
71
|
-
let capturing = false
|
|
73
|
+
let audioContext = null
|
|
74
|
+
let mediaStream = null
|
|
75
|
+
let sourceNode = null
|
|
76
|
+
let workletNode = null
|
|
77
|
+
let analyser = null
|
|
78
|
+
let volumeTimer = null
|
|
79
|
+
let muted = false
|
|
80
|
+
let capturing = false
|
|
72
81
|
const computeRms = (buf) => {
|
|
73
|
-
let sum = 0
|
|
74
|
-
for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i]
|
|
75
|
-
const rms = Math.sqrt(sum / buf.length)
|
|
76
|
-
return Math.min(1, rms * 1.8)
|
|
77
|
-
}
|
|
82
|
+
let sum = 0
|
|
83
|
+
for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i]
|
|
84
|
+
const rms = Math.sqrt(sum / buf.length)
|
|
85
|
+
return Math.min(1, rms * 1.8)
|
|
86
|
+
}
|
|
78
87
|
const start = async () => {
|
|
79
|
-
if (capturing) return
|
|
88
|
+
if (capturing) return
|
|
80
89
|
try {
|
|
81
90
|
mediaStream = await navigator.mediaDevices.getUserMedia({
|
|
82
91
|
audio: {
|
|
@@ -87,657 +96,748 @@ var createAudioCapture = (options) => {
|
|
|
87
96
|
echoCancellation: true,
|
|
88
97
|
noiseSuppression: true,
|
|
89
98
|
autoGainControl: true,
|
|
90
|
-
channelCount: 1
|
|
91
|
-
}
|
|
92
|
-
})
|
|
93
|
-
audioContext = new AudioContext()
|
|
94
|
-
if (audioContext.state ===
|
|
95
|
-
const blob = new Blob([mic_downsampler_worklet_default], { type:
|
|
96
|
-
const url = URL.createObjectURL(blob)
|
|
99
|
+
channelCount: 1,
|
|
100
|
+
},
|
|
101
|
+
})
|
|
102
|
+
audioContext = new AudioContext()
|
|
103
|
+
if (audioContext.state === 'suspended') await audioContext.resume()
|
|
104
|
+
const blob = new Blob([mic_downsampler_worklet_default], { type: 'application/javascript' })
|
|
105
|
+
const url = URL.createObjectURL(blob)
|
|
97
106
|
try {
|
|
98
|
-
await audioContext.audioWorklet.addModule(url)
|
|
107
|
+
await audioContext.audioWorklet.addModule(url)
|
|
99
108
|
} finally {
|
|
100
|
-
URL.revokeObjectURL(url)
|
|
109
|
+
URL.revokeObjectURL(url)
|
|
101
110
|
}
|
|
102
|
-
sourceNode = audioContext.createMediaStreamSource(mediaStream)
|
|
103
|
-
workletNode = new AudioWorkletNode(audioContext,
|
|
111
|
+
sourceNode = audioContext.createMediaStreamSource(mediaStream)
|
|
112
|
+
workletNode = new AudioWorkletNode(audioContext, 'mic-downsampler')
|
|
104
113
|
workletNode.port.onmessage = (event) => {
|
|
105
|
-
if (muted) return
|
|
106
|
-
options.onChunk(event.data)
|
|
107
|
-
}
|
|
114
|
+
if (muted) return
|
|
115
|
+
options.onChunk(event.data)
|
|
116
|
+
}
|
|
108
117
|
if (options.onVolume) {
|
|
109
|
-
analyser = audioContext.createAnalyser()
|
|
110
|
-
analyser.fftSize = 256
|
|
111
|
-
sourceNode.connect(analyser)
|
|
112
|
-
const buf = new Float32Array(analyser.fftSize)
|
|
118
|
+
analyser = audioContext.createAnalyser()
|
|
119
|
+
analyser.fftSize = 256
|
|
120
|
+
sourceNode.connect(analyser)
|
|
121
|
+
const buf = new Float32Array(analyser.fftSize)
|
|
113
122
|
volumeTimer = setInterval(() => {
|
|
114
|
-
if (!analyser) return
|
|
115
|
-
analyser.getFloatTimeDomainData(buf)
|
|
116
|
-
options.onVolume?.(computeRms(buf))
|
|
117
|
-
}, VOLUME_INTERVAL_MS)
|
|
123
|
+
if (!analyser) return
|
|
124
|
+
analyser.getFloatTimeDomainData(buf)
|
|
125
|
+
options.onVolume?.(computeRms(buf))
|
|
126
|
+
}, VOLUME_INTERVAL_MS)
|
|
118
127
|
}
|
|
119
|
-
sourceNode.connect(workletNode)
|
|
120
|
-
const sink = audioContext.createGain()
|
|
121
|
-
sink.gain.value = 0
|
|
122
|
-
workletNode.connect(sink).connect(audioContext.destination)
|
|
123
|
-
capturing = true
|
|
128
|
+
sourceNode.connect(workletNode)
|
|
129
|
+
const sink = audioContext.createGain()
|
|
130
|
+
sink.gain.value = 0
|
|
131
|
+
workletNode.connect(sink).connect(audioContext.destination)
|
|
132
|
+
capturing = true
|
|
124
133
|
} catch (err) {
|
|
125
|
-
const wrapped =
|
|
126
|
-
|
|
127
|
-
|
|
134
|
+
const wrapped =
|
|
135
|
+
err instanceof Error ? err : new Error(typeof err === 'string' ? err : 'capture failed')
|
|
136
|
+
options.onError?.(wrapped)
|
|
137
|
+
throw wrapped
|
|
128
138
|
}
|
|
129
|
-
}
|
|
139
|
+
}
|
|
130
140
|
const stop = () => {
|
|
131
|
-
if (!capturing) return
|
|
132
|
-
capturing = false
|
|
141
|
+
if (!capturing) return
|
|
142
|
+
capturing = false
|
|
133
143
|
if (volumeTimer) {
|
|
134
|
-
clearInterval(volumeTimer)
|
|
135
|
-
volumeTimer = null
|
|
144
|
+
clearInterval(volumeTimer)
|
|
145
|
+
volumeTimer = null
|
|
136
146
|
}
|
|
137
147
|
try {
|
|
138
|
-
workletNode?.disconnect()
|
|
139
|
-
analyser?.disconnect()
|
|
140
|
-
sourceNode?.disconnect()
|
|
141
|
-
} catch {
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
sourceNode = null;
|
|
148
|
+
workletNode?.disconnect()
|
|
149
|
+
analyser?.disconnect()
|
|
150
|
+
sourceNode?.disconnect()
|
|
151
|
+
} catch {}
|
|
152
|
+
workletNode = null
|
|
153
|
+
analyser = null
|
|
154
|
+
sourceNode = null
|
|
146
155
|
if (mediaStream) {
|
|
147
|
-
for (const track of mediaStream.getTracks()) track.stop()
|
|
148
|
-
mediaStream = null
|
|
156
|
+
for (const track of mediaStream.getTracks()) track.stop()
|
|
157
|
+
mediaStream = null
|
|
149
158
|
}
|
|
150
|
-
if (audioContext && audioContext.state !==
|
|
151
|
-
void audioContext.close().catch(() => void 0)
|
|
159
|
+
if (audioContext && audioContext.state !== 'closed') {
|
|
160
|
+
void audioContext.close().catch(() => void 0)
|
|
152
161
|
}
|
|
153
|
-
audioContext = null
|
|
154
|
-
}
|
|
162
|
+
audioContext = null
|
|
163
|
+
}
|
|
155
164
|
return {
|
|
156
165
|
start,
|
|
157
166
|
stop,
|
|
158
167
|
mute: (v) => {
|
|
159
|
-
muted = v
|
|
168
|
+
muted = v
|
|
160
169
|
},
|
|
161
|
-
isCapturing: () => capturing
|
|
162
|
-
}
|
|
163
|
-
}
|
|
170
|
+
isCapturing: () => capturing,
|
|
171
|
+
}
|
|
172
|
+
}
|
|
164
173
|
|
|
165
174
|
// src/AudioPlayback.ts
|
|
166
|
-
var DEFAULT_SAMPLE_RATE = 16e3
|
|
167
|
-
var VOLUME_INTERVAL_MS2 = 100
|
|
175
|
+
var DEFAULT_SAMPLE_RATE = 16e3
|
|
176
|
+
var VOLUME_INTERVAL_MS2 = 100
|
|
168
177
|
var createAudioPlayback = (options = {}) => {
|
|
169
|
-
const sampleRate = options.sampleRate ?? DEFAULT_SAMPLE_RATE
|
|
170
|
-
let audioContext = null
|
|
171
|
-
let gainNode = null
|
|
172
|
-
let analyser = null
|
|
173
|
-
let volumeTimer = null
|
|
174
|
-
let nextStartTime = 0
|
|
175
|
-
let scheduledNodes = []
|
|
176
|
-
let speaking = false
|
|
178
|
+
const sampleRate = options.sampleRate ?? DEFAULT_SAMPLE_RATE
|
|
179
|
+
let audioContext = null
|
|
180
|
+
let gainNode = null
|
|
181
|
+
let analyser = null
|
|
182
|
+
let volumeTimer = null
|
|
183
|
+
let nextStartTime = 0
|
|
184
|
+
let scheduledNodes = []
|
|
185
|
+
let speaking = false
|
|
177
186
|
const ensureContext = async () => {
|
|
178
187
|
if (audioContext) {
|
|
179
|
-
if (audioContext.state ===
|
|
180
|
-
return
|
|
188
|
+
if (audioContext.state === 'suspended') await audioContext.resume()
|
|
189
|
+
return
|
|
181
190
|
}
|
|
182
|
-
audioContext = new AudioContext({ sampleRate })
|
|
183
|
-
gainNode = audioContext.createGain()
|
|
191
|
+
audioContext = new AudioContext({ sampleRate })
|
|
192
|
+
gainNode = audioContext.createGain()
|
|
184
193
|
if (options.onVolume) {
|
|
185
|
-
analyser = audioContext.createAnalyser()
|
|
186
|
-
analyser.fftSize = 256
|
|
187
|
-
gainNode.connect(analyser)
|
|
188
|
-
const buf = new Float32Array(analyser.fftSize)
|
|
194
|
+
analyser = audioContext.createAnalyser()
|
|
195
|
+
analyser.fftSize = 256
|
|
196
|
+
gainNode.connect(analyser)
|
|
197
|
+
const buf = new Float32Array(analyser.fftSize)
|
|
189
198
|
volumeTimer = setInterval(() => {
|
|
190
|
-
if (!analyser) return
|
|
191
|
-
analyser.getFloatTimeDomainData(buf)
|
|
192
|
-
let sum = 0
|
|
193
|
-
for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i]
|
|
194
|
-
const rms = Math.sqrt(sum / buf.length)
|
|
195
|
-
options.onVolume?.(Math.min(1, rms * 1.8))
|
|
196
|
-
}, VOLUME_INTERVAL_MS2)
|
|
197
|
-
}
|
|
198
|
-
gainNode.connect(audioContext.destination)
|
|
199
|
-
nextStartTime = audioContext.currentTime
|
|
200
|
-
}
|
|
199
|
+
if (!analyser) return
|
|
200
|
+
analyser.getFloatTimeDomainData(buf)
|
|
201
|
+
let sum = 0
|
|
202
|
+
for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i]
|
|
203
|
+
const rms = Math.sqrt(sum / buf.length)
|
|
204
|
+
options.onVolume?.(Math.min(1, rms * 1.8))
|
|
205
|
+
}, VOLUME_INTERVAL_MS2)
|
|
206
|
+
}
|
|
207
|
+
gainNode.connect(audioContext.destination)
|
|
208
|
+
nextStartTime = audioContext.currentTime
|
|
209
|
+
}
|
|
201
210
|
const setSpeaking = (v) => {
|
|
202
|
-
if (v === speaking) return
|
|
203
|
-
speaking = v
|
|
204
|
-
options.onSpeakingChange?.(v)
|
|
205
|
-
}
|
|
211
|
+
if (v === speaking) return
|
|
212
|
+
speaking = v
|
|
213
|
+
options.onSpeakingChange?.(v)
|
|
214
|
+
}
|
|
206
215
|
const pruneFinished = () => {
|
|
207
|
-
const now = audioContext?.currentTime ?? 0
|
|
216
|
+
const now = audioContext?.currentTime ?? 0
|
|
208
217
|
scheduledNodes = scheduledNodes.filter((n) => {
|
|
209
|
-
const node = n
|
|
210
|
-
return (node._endsAt ?? 0) > now
|
|
211
|
-
})
|
|
212
|
-
if (scheduledNodes.length === 0) setSpeaking(false)
|
|
213
|
-
}
|
|
218
|
+
const node = n
|
|
219
|
+
return (node._endsAt ?? 0) > now
|
|
220
|
+
})
|
|
221
|
+
if (scheduledNodes.length === 0) setSpeaking(false)
|
|
222
|
+
}
|
|
214
223
|
const enqueue = (pcm) => {
|
|
215
224
|
if (!audioContext) {
|
|
216
|
-
void ensureContext().then(() => enqueue(pcm))
|
|
217
|
-
return
|
|
218
|
-
}
|
|
219
|
-
if (!audioContext || !gainNode) return
|
|
220
|
-
const int16 = new Int16Array(pcm)
|
|
221
|
-
if (int16.length === 0) return
|
|
222
|
-
const audioBuffer = audioContext.createBuffer(1, int16.length, sampleRate)
|
|
223
|
-
const float32 = audioBuffer.getChannelData(0)
|
|
225
|
+
void ensureContext().then(() => enqueue(pcm))
|
|
226
|
+
return
|
|
227
|
+
}
|
|
228
|
+
if (!audioContext || !gainNode) return
|
|
229
|
+
const int16 = new Int16Array(pcm)
|
|
230
|
+
if (int16.length === 0) return
|
|
231
|
+
const audioBuffer = audioContext.createBuffer(1, int16.length, sampleRate)
|
|
232
|
+
const float32 = audioBuffer.getChannelData(0)
|
|
224
233
|
for (let i = 0; i < int16.length; i++) {
|
|
225
|
-
float32[i] = int16[i] / 32768
|
|
226
|
-
}
|
|
227
|
-
const node = audioContext.createBufferSource()
|
|
228
|
-
node.buffer = audioBuffer
|
|
229
|
-
node.connect(gainNode)
|
|
230
|
-
const now = audioContext.currentTime
|
|
231
|
-
const startAt = Math.max(now, nextStartTime)
|
|
232
|
-
node.start(startAt)
|
|
233
|
-
const duration = int16.length / sampleRate
|
|
234
|
-
node._endsAt = startAt + duration
|
|
235
|
-
nextStartTime = startAt + duration
|
|
236
|
-
scheduledNodes.push(node)
|
|
237
|
-
setSpeaking(true)
|
|
238
|
-
node.onended = () => pruneFinished()
|
|
239
|
-
}
|
|
234
|
+
float32[i] = int16[i] / 32768
|
|
235
|
+
}
|
|
236
|
+
const node = audioContext.createBufferSource()
|
|
237
|
+
node.buffer = audioBuffer
|
|
238
|
+
node.connect(gainNode)
|
|
239
|
+
const now = audioContext.currentTime
|
|
240
|
+
const startAt = Math.max(now, nextStartTime)
|
|
241
|
+
node.start(startAt)
|
|
242
|
+
const duration = int16.length / sampleRate
|
|
243
|
+
node._endsAt = startAt + duration
|
|
244
|
+
nextStartTime = startAt + duration
|
|
245
|
+
scheduledNodes.push(node)
|
|
246
|
+
setSpeaking(true)
|
|
247
|
+
node.onended = () => pruneFinished()
|
|
248
|
+
}
|
|
240
249
|
const flush = () => {
|
|
241
|
-
if (!audioContext || !gainNode) return
|
|
250
|
+
if (!audioContext || !gainNode) return
|
|
242
251
|
for (const node of scheduledNodes) {
|
|
243
252
|
try {
|
|
244
|
-
node.stop()
|
|
245
|
-
} catch {
|
|
246
|
-
}
|
|
253
|
+
node.stop()
|
|
254
|
+
} catch {}
|
|
247
255
|
}
|
|
248
|
-
scheduledNodes = []
|
|
249
|
-
gainNode.disconnect()
|
|
250
|
-
gainNode = audioContext.createGain()
|
|
256
|
+
scheduledNodes = []
|
|
257
|
+
gainNode.disconnect()
|
|
258
|
+
gainNode = audioContext.createGain()
|
|
251
259
|
if (analyser) {
|
|
252
|
-
analyser.disconnect()
|
|
253
|
-
gainNode.connect(analyser)
|
|
260
|
+
analyser.disconnect()
|
|
261
|
+
gainNode.connect(analyser)
|
|
254
262
|
}
|
|
255
|
-
gainNode.connect(audioContext.destination)
|
|
256
|
-
nextStartTime = audioContext.currentTime
|
|
257
|
-
setSpeaking(false)
|
|
258
|
-
}
|
|
263
|
+
gainNode.connect(audioContext.destination)
|
|
264
|
+
nextStartTime = audioContext.currentTime
|
|
265
|
+
setSpeaking(false)
|
|
266
|
+
}
|
|
259
267
|
const close = () => {
|
|
260
|
-
flush()
|
|
268
|
+
flush()
|
|
261
269
|
if (volumeTimer) {
|
|
262
|
-
clearInterval(volumeTimer)
|
|
263
|
-
volumeTimer = null
|
|
270
|
+
clearInterval(volumeTimer)
|
|
271
|
+
volumeTimer = null
|
|
264
272
|
}
|
|
265
|
-
if (audioContext && audioContext.state !==
|
|
266
|
-
void audioContext.close().catch(() => void 0)
|
|
273
|
+
if (audioContext && audioContext.state !== 'closed') {
|
|
274
|
+
void audioContext.close().catch(() => void 0)
|
|
267
275
|
}
|
|
268
|
-
audioContext = null
|
|
269
|
-
gainNode = null
|
|
270
|
-
analyser = null
|
|
271
|
-
}
|
|
276
|
+
audioContext = null
|
|
277
|
+
gainNode = null
|
|
278
|
+
analyser = null
|
|
279
|
+
}
|
|
272
280
|
const resume = async () => {
|
|
273
|
-
await ensureContext()
|
|
274
|
-
}
|
|
275
|
-
return { enqueue, flush, close, resume }
|
|
276
|
-
}
|
|
281
|
+
await ensureContext()
|
|
282
|
+
}
|
|
283
|
+
return { enqueue, flush, close, resume }
|
|
284
|
+
}
|
|
277
285
|
|
|
278
286
|
// src/ReconnectingWebSocket.ts
|
|
279
|
-
var READYSTATE_OPEN = 1
|
|
280
|
-
var READYSTATE_CLOSED = 3
|
|
287
|
+
var READYSTATE_OPEN = 1
|
|
288
|
+
var READYSTATE_CLOSED = 3
|
|
281
289
|
var createReconnectingWebSocket = (options, onEvent) => {
|
|
282
|
-
const maxRetries = options.maxRetries ?? 3
|
|
283
|
-
const initialBackoff = options.initialBackoffMs ?? 500
|
|
284
|
-
const maxBackoff = options.maxBackoffMs ?? 8e3
|
|
285
|
-
let ws = null
|
|
286
|
-
let intentionalClose = false
|
|
287
|
-
let retries = 0
|
|
288
|
-
let backoff = initialBackoff
|
|
289
|
-
let reconnectTimer = null
|
|
290
|
+
const maxRetries = options.maxRetries ?? 3
|
|
291
|
+
const initialBackoff = options.initialBackoffMs ?? 500
|
|
292
|
+
const maxBackoff = options.maxBackoffMs ?? 8e3
|
|
293
|
+
let ws = null
|
|
294
|
+
let intentionalClose = false
|
|
295
|
+
let retries = 0
|
|
296
|
+
let backoff = initialBackoff
|
|
297
|
+
let reconnectTimer = null
|
|
290
298
|
const openOnce = () => {
|
|
291
|
-
ws = options.wsFactory(options.url)
|
|
292
|
-
ws.binaryType =
|
|
299
|
+
ws = options.wsFactory(options.url)
|
|
300
|
+
ws.binaryType = 'arraybuffer'
|
|
293
301
|
ws.onopen = () => {
|
|
294
|
-
if (retries === 0) onEvent({ type:
|
|
295
|
-
else onEvent({ type:
|
|
296
|
-
retries = 0
|
|
297
|
-
backoff = initialBackoff
|
|
298
|
-
}
|
|
302
|
+
if (retries === 0) onEvent({ type: 'open' })
|
|
303
|
+
else onEvent({ type: 'reconnected' })
|
|
304
|
+
retries = 0
|
|
305
|
+
backoff = initialBackoff
|
|
306
|
+
}
|
|
299
307
|
ws.onmessage = (ev) => {
|
|
300
|
-
onEvent({ type:
|
|
301
|
-
}
|
|
308
|
+
onEvent({ type: 'message', data: ev.data })
|
|
309
|
+
}
|
|
302
310
|
ws.onerror = () => {
|
|
303
|
-
onEvent({ type:
|
|
304
|
-
}
|
|
311
|
+
onEvent({ type: 'error', error: new Error('WebSocket error') })
|
|
312
|
+
}
|
|
305
313
|
ws.onclose = (ev) => {
|
|
306
|
-
ws = null
|
|
307
|
-
const shouldRetry = !intentionalClose && retries < maxRetries
|
|
314
|
+
ws = null
|
|
315
|
+
const shouldRetry = !intentionalClose && retries < maxRetries
|
|
308
316
|
if (!shouldRetry) {
|
|
309
317
|
onEvent({
|
|
310
|
-
type:
|
|
318
|
+
type: 'close',
|
|
311
319
|
code: ev.code,
|
|
312
320
|
reason: ev.reason,
|
|
313
|
-
permanent: true
|
|
314
|
-
})
|
|
315
|
-
return
|
|
321
|
+
permanent: true,
|
|
322
|
+
})
|
|
323
|
+
return
|
|
316
324
|
}
|
|
317
325
|
onEvent({
|
|
318
|
-
type:
|
|
326
|
+
type: 'close',
|
|
319
327
|
code: ev.code,
|
|
320
328
|
reason: ev.reason,
|
|
321
|
-
permanent: false
|
|
322
|
-
})
|
|
323
|
-
retries
|
|
324
|
-
const delay = Math.min(backoff, maxBackoff)
|
|
325
|
-
backoff = Math.min(backoff * 2, maxBackoff)
|
|
326
|
-
reconnectTimer = setTimeout(openOnce, delay)
|
|
327
|
-
}
|
|
328
|
-
}
|
|
329
|
-
openOnce()
|
|
329
|
+
permanent: false,
|
|
330
|
+
})
|
|
331
|
+
retries++
|
|
332
|
+
const delay = Math.min(backoff, maxBackoff)
|
|
333
|
+
backoff = Math.min(backoff * 2, maxBackoff)
|
|
334
|
+
reconnectTimer = setTimeout(openOnce, delay)
|
|
335
|
+
}
|
|
336
|
+
}
|
|
337
|
+
openOnce()
|
|
330
338
|
return {
|
|
331
339
|
send: (data) => {
|
|
332
|
-
if (ws && ws.readyState === READYSTATE_OPEN) ws.send(data)
|
|
340
|
+
if (ws && ws.readyState === READYSTATE_OPEN) ws.send(data)
|
|
333
341
|
},
|
|
334
|
-
close: (code = 1e3, reason =
|
|
335
|
-
intentionalClose = true
|
|
342
|
+
close: (code = 1e3, reason = 'client-requested') => {
|
|
343
|
+
intentionalClose = true
|
|
336
344
|
if (reconnectTimer) {
|
|
337
|
-
clearTimeout(reconnectTimer)
|
|
338
|
-
reconnectTimer = null
|
|
345
|
+
clearTimeout(reconnectTimer)
|
|
346
|
+
reconnectTimer = null
|
|
339
347
|
}
|
|
340
348
|
try {
|
|
341
|
-
ws?.close(code, reason)
|
|
342
|
-
} catch {
|
|
343
|
-
}
|
|
349
|
+
ws?.close(code, reason)
|
|
350
|
+
} catch {}
|
|
344
351
|
},
|
|
345
|
-
readyState: () => ws?.readyState ?? READYSTATE_CLOSED
|
|
346
|
-
}
|
|
347
|
-
}
|
|
352
|
+
readyState: () => ws?.readyState ?? READYSTATE_CLOSED,
|
|
353
|
+
}
|
|
354
|
+
}
|
|
348
355
|
|
|
349
356
|
// src/protocol.ts
|
|
350
357
|
var createProtocolState = () => ({
|
|
351
|
-
state:
|
|
358
|
+
state: 'idle',
|
|
352
359
|
transcript: [],
|
|
353
360
|
agentBubbleId: null,
|
|
354
361
|
idCounter: 0,
|
|
355
|
-
endReason: null
|
|
356
|
-
})
|
|
362
|
+
endReason: null,
|
|
363
|
+
})
|
|
357
364
|
var mapEndReason = (raw) => {
|
|
358
|
-
if (raw ===
|
|
359
|
-
if (raw ===
|
|
360
|
-
if (raw ===
|
|
361
|
-
return
|
|
362
|
-
}
|
|
365
|
+
if (raw === 'agent_ended') return 'agent_ended'
|
|
366
|
+
if (raw === 'caller_hung_up') return 'user_hangup'
|
|
367
|
+
if (raw === 'silence_timeout' || raw === 'max_duration') return 'timeout'
|
|
368
|
+
return 'error'
|
|
369
|
+
}
|
|
363
370
|
function handleServerMessage(raw, state, cb) {
|
|
364
|
-
let msg
|
|
371
|
+
let msg
|
|
365
372
|
try {
|
|
366
|
-
msg = JSON.parse(raw)
|
|
373
|
+
msg = JSON.parse(raw)
|
|
367
374
|
} catch {
|
|
368
|
-
return
|
|
375
|
+
return
|
|
369
376
|
}
|
|
370
377
|
switch (msg.type) {
|
|
371
|
-
case
|
|
372
|
-
cb.onConnected()
|
|
373
|
-
setState(state,
|
|
374
|
-
return
|
|
375
|
-
case
|
|
376
|
-
const text = msg.text ??
|
|
377
|
-
if (!text) return
|
|
378
|
-
const isFinal = !!msg.isFinal
|
|
379
|
-
if (!isFinal) setState(state,
|
|
380
|
-
upsertUserPartial(state, text, isFinal)
|
|
381
|
-
cb.onTranscript(state.transcript)
|
|
382
|
-
return
|
|
383
|
-
}
|
|
384
|
-
case
|
|
385
|
-
const id = `m${state.idCounter++}
|
|
386
|
-
state.agentBubbleId = id
|
|
387
|
-
state.transcript = [...state.transcript, { id, role:
|
|
388
|
-
cb.onTranscript(state.transcript)
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
cb
|
|
409
|
-
return
|
|
410
|
-
|
|
411
|
-
|
|
378
|
+
case 'connected':
|
|
379
|
+
cb.onConnected()
|
|
380
|
+
setState(state, 'listening', cb)
|
|
381
|
+
return
|
|
382
|
+
case 'transcript': {
|
|
383
|
+
const text = msg.text ?? ''
|
|
384
|
+
if (!text) return
|
|
385
|
+
const isFinal = !!msg.isFinal
|
|
386
|
+
if (!isFinal) setState(state, 'user_speaking', cb)
|
|
387
|
+
upsertUserPartial(state, text, isFinal)
|
|
388
|
+
cb.onTranscript(state.transcript)
|
|
389
|
+
return
|
|
390
|
+
}
|
|
391
|
+
case 'agent_turn_start': {
|
|
392
|
+
const id = `m${state.idCounter++}`
|
|
393
|
+
state.agentBubbleId = id
|
|
394
|
+
state.transcript = [...state.transcript, { id, role: 'agent', text: '' }]
|
|
395
|
+
cb.onTranscript(state.transcript)
|
|
396
|
+
const seq = typeof msg.seq === 'number' ? msg.seq : void 0
|
|
397
|
+
cb.onAgentTurnStart(seq)
|
|
398
|
+
setState(state, 'agent_speaking', cb)
|
|
399
|
+
return
|
|
400
|
+
}
|
|
401
|
+
case 'agent_text': {
|
|
402
|
+
const delta = msg.text ?? ''
|
|
403
|
+
if (!delta || !state.agentBubbleId) return
|
|
404
|
+
const id = state.agentBubbleId
|
|
405
|
+
state.transcript = state.transcript.map((e) =>
|
|
406
|
+
e.id === id && e.role === 'agent' ? { ...e, text: e.text + delta } : e,
|
|
407
|
+
)
|
|
408
|
+
cb.onTranscript(state.transcript)
|
|
409
|
+
return
|
|
410
|
+
}
|
|
411
|
+
case 'agent_turn_end': {
|
|
412
|
+
state.agentBubbleId = null
|
|
413
|
+
const seq = typeof msg.seq === 'number' ? msg.seq : void 0
|
|
414
|
+
cb.onAgentTurnEnd(seq)
|
|
415
|
+
setState(state, 'listening', cb)
|
|
416
|
+
return
|
|
417
|
+
}
|
|
418
|
+
case 'interrupt':
|
|
419
|
+
cb.onInterrupt()
|
|
420
|
+
return
|
|
421
|
+
case 'agent_turn_abort': {
|
|
422
|
+
const committed = (msg.committedText ?? '').trim()
|
|
412
423
|
if (state.agentBubbleId) {
|
|
413
|
-
const id = state.agentBubbleId
|
|
424
|
+
const id = state.agentBubbleId
|
|
414
425
|
if (committed) {
|
|
415
|
-
state.transcript = state.transcript.map(
|
|
416
|
-
|
|
417
|
-
)
|
|
426
|
+
state.transcript = state.transcript.map((e) =>
|
|
427
|
+
e.id === id && e.role === 'agent' ? { ...e, text: committed, interrupted: true } : e,
|
|
428
|
+
)
|
|
418
429
|
} else {
|
|
419
|
-
state.transcript = state.transcript.filter((e) => e.id !== id)
|
|
430
|
+
state.transcript = state.transcript.filter((e) => e.id !== id)
|
|
420
431
|
}
|
|
421
|
-
cb.onTranscript(state.transcript)
|
|
432
|
+
cb.onTranscript(state.transcript)
|
|
422
433
|
}
|
|
423
|
-
state.agentBubbleId = null
|
|
424
|
-
return
|
|
434
|
+
state.agentBubbleId = null
|
|
435
|
+
return
|
|
425
436
|
}
|
|
426
|
-
case
|
|
437
|
+
case 'tool_call':
|
|
427
438
|
state.transcript = [
|
|
428
439
|
...state.transcript,
|
|
429
440
|
{
|
|
430
441
|
id: `m${state.idCounter++}`,
|
|
431
|
-
role:
|
|
432
|
-
text: `\u2192 ${String(msg.tool ??
|
|
433
|
-
}
|
|
434
|
-
]
|
|
435
|
-
cb.onTranscript(state.transcript)
|
|
436
|
-
return
|
|
437
|
-
case
|
|
442
|
+
role: 'tool',
|
|
443
|
+
text: `\u2192 ${String(msg.tool ?? '?')}(${msg.args ? JSON.stringify(msg.args) : ''})`,
|
|
444
|
+
},
|
|
445
|
+
]
|
|
446
|
+
cb.onTranscript(state.transcript)
|
|
447
|
+
return
|
|
448
|
+
case 'tool_result':
|
|
438
449
|
state.transcript = [
|
|
439
450
|
...state.transcript,
|
|
440
451
|
{
|
|
441
452
|
id: `m${state.idCounter++}`,
|
|
442
|
-
role:
|
|
443
|
-
text: `${msg.ok ?
|
|
444
|
-
}
|
|
445
|
-
]
|
|
446
|
-
cb.onTranscript(state.transcript)
|
|
447
|
-
return
|
|
448
|
-
case
|
|
449
|
-
const toolCallId = String(msg.toolCallId ??
|
|
450
|
-
const name = String(msg.name ??
|
|
451
|
-
const args = msg.args ?? {}
|
|
452
|
-
if (!toolCallId || !name) return
|
|
453
|
-
cb.onClientToolCall({ toolCallId, name, args })
|
|
454
|
-
return
|
|
455
|
-
}
|
|
456
|
-
case
|
|
457
|
-
const reasonRaw = String(msg.reason ??
|
|
458
|
-
const reason = mapEndReason(reasonRaw)
|
|
459
|
-
state.endReason = reason
|
|
453
|
+
role: 'tool',
|
|
454
|
+
text: `${msg.ok ? '\u2713' : '\u2717'} ${String(msg.tool ?? '?')}`,
|
|
455
|
+
},
|
|
456
|
+
]
|
|
457
|
+
cb.onTranscript(state.transcript)
|
|
458
|
+
return
|
|
459
|
+
case 'client_tool_call': {
|
|
460
|
+
const toolCallId = String(msg.toolCallId ?? '')
|
|
461
|
+
const name = String(msg.name ?? '')
|
|
462
|
+
const args = msg.args ?? {}
|
|
463
|
+
if (!toolCallId || !name) return
|
|
464
|
+
cb.onClientToolCall({ toolCallId, name, args })
|
|
465
|
+
return
|
|
466
|
+
}
|
|
467
|
+
case 'call_end': {
|
|
468
|
+
const reasonRaw = String(msg.reason ?? '')
|
|
469
|
+
const reason = mapEndReason(reasonRaw)
|
|
470
|
+
state.endReason = reason
|
|
460
471
|
state.transcript = [
|
|
461
472
|
...state.transcript,
|
|
462
473
|
{
|
|
463
474
|
id: `m${state.idCounter++}`,
|
|
464
|
-
role:
|
|
465
|
-
text: `call ended${reasonRaw ? ` (${reasonRaw})` :
|
|
466
|
-
}
|
|
467
|
-
]
|
|
468
|
-
cb.onTranscript(state.transcript)
|
|
469
|
-
cb.onCallEnd(reason)
|
|
470
|
-
return
|
|
475
|
+
role: 'system',
|
|
476
|
+
text: `call ended${reasonRaw ? ` (${reasonRaw})` : ''}`,
|
|
477
|
+
},
|
|
478
|
+
]
|
|
479
|
+
cb.onTranscript(state.transcript)
|
|
480
|
+
cb.onCallEnd(reason)
|
|
481
|
+
return
|
|
471
482
|
}
|
|
472
|
-
case
|
|
473
|
-
const code = msg.code ??
|
|
474
|
-
const message = msg.message ??
|
|
475
|
-
cb.onError({ code, message })
|
|
476
|
-
return
|
|
483
|
+
case 'error': {
|
|
484
|
+
const code = msg.code ?? 'server_error'
|
|
485
|
+
const message = msg.message ?? 'server error'
|
|
486
|
+
cb.onError({ code, message })
|
|
487
|
+
return
|
|
477
488
|
}
|
|
478
489
|
}
|
|
479
490
|
}
|
|
480
491
|
var setState = (state, next, cb) => {
|
|
481
|
-
if (state.state === next) return
|
|
482
|
-
cb.onState(next)
|
|
483
|
-
}
|
|
492
|
+
if (state.state === next) return
|
|
493
|
+
cb.onState(next)
|
|
494
|
+
}
|
|
484
495
|
var upsertUserPartial = (state, text, isFinal) => {
|
|
485
|
-
let idx = -1
|
|
496
|
+
let idx = -1
|
|
486
497
|
for (let i = state.transcript.length - 1; i >= 0; i--) {
|
|
487
|
-
const e = state.transcript[i]
|
|
488
|
-
if (e.role ===
|
|
489
|
-
idx = i
|
|
490
|
-
break
|
|
498
|
+
const e = state.transcript[i]
|
|
499
|
+
if (e.role === 'user' && e.committed === false) {
|
|
500
|
+
idx = i
|
|
501
|
+
break
|
|
491
502
|
}
|
|
492
503
|
}
|
|
493
504
|
if (idx === -1) {
|
|
494
505
|
state.transcript = [
|
|
495
506
|
...state.transcript,
|
|
496
|
-
{ id: `m${state.idCounter++}`, role:
|
|
497
|
-
]
|
|
498
|
-
return
|
|
499
|
-
}
|
|
500
|
-
const target = state.transcript[idx]
|
|
501
|
-
const next = [...state.transcript]
|
|
502
|
-
next[idx] = { ...target, text, committed: isFinal }
|
|
503
|
-
state.transcript = next
|
|
504
|
-
}
|
|
507
|
+
{ id: `m${state.idCounter++}`, role: 'user', text, committed: isFinal },
|
|
508
|
+
]
|
|
509
|
+
return
|
|
510
|
+
}
|
|
511
|
+
const target = state.transcript[idx]
|
|
512
|
+
const next = [...state.transcript]
|
|
513
|
+
next[idx] = { ...target, text, committed: isFinal }
|
|
514
|
+
state.transcript = next
|
|
515
|
+
}
|
|
505
516
|
function buildWsUrl(args) {
|
|
506
|
-
const base = new URL(args.apiBase)
|
|
507
|
-
const proto = base.protocol ===
|
|
508
|
-
const bargeQS = args.bargeIn === false ?
|
|
509
|
-
return `${proto}//${base.host}/v1/agents/${encodeURIComponent(args.agentId)}/call?token=${encodeURIComponent(args.token)}${bargeQS}
|
|
517
|
+
const base = new URL(args.apiBase)
|
|
518
|
+
const proto = base.protocol === 'https:' ? 'wss:' : 'ws:'
|
|
519
|
+
const bargeQS = args.bargeIn === false ? '&barge=off' : ''
|
|
520
|
+
return `${proto}//${base.host}/v1/agents/${encodeURIComponent(args.agentId)}/call?token=${encodeURIComponent(args.token)}${bargeQS}`
|
|
510
521
|
}
|
|
511
522
|
|
|
512
523
|
// src/clientTools.ts
|
|
513
|
-
var NAME_RE = /^[a-zA-Z_][a-zA-Z0-9_]
|
|
514
|
-
var MAX_TOOLS = 64
|
|
515
|
-
var MAX_USAGE = 500
|
|
516
|
-
var MAX_TIMEOUT_MS = 3e4
|
|
524
|
+
var NAME_RE = /^[a-zA-Z_][a-zA-Z0-9_]*$/
|
|
525
|
+
var MAX_TOOLS = 64
|
|
526
|
+
var MAX_USAGE = 500
|
|
527
|
+
var MAX_TIMEOUT_MS = 3e4
|
|
517
528
|
var validateClientToolMap = (tools) => {
|
|
518
|
-
if (tools === void 0) return
|
|
519
|
-
if (typeof tools !==
|
|
520
|
-
throw new Error(
|
|
529
|
+
if (tools === void 0) return
|
|
530
|
+
if (typeof tools !== 'object' || tools === null || Array.isArray(tools)) {
|
|
531
|
+
throw new Error('clientTools must be an object keyed by tool name')
|
|
521
532
|
}
|
|
522
|
-
const entries = Object.entries(tools)
|
|
533
|
+
const entries = Object.entries(tools)
|
|
523
534
|
if (entries.length > MAX_TOOLS) {
|
|
524
|
-
throw new Error(`clientTools may declare at most 64 tools (got ${entries.length})`)
|
|
535
|
+
throw new Error(`clientTools may declare at most 64 tools (got ${entries.length})`)
|
|
525
536
|
}
|
|
526
537
|
for (const [name, def] of entries) {
|
|
527
538
|
if (!NAME_RE.test(name)) {
|
|
528
539
|
throw new Error(
|
|
529
|
-
`clientTools["${name}"]: name must be a valid identifier (^[a-zA-Z_][a-zA-Z0-9_]*$)
|
|
530
|
-
)
|
|
540
|
+
`clientTools["${name}"]: name must be a valid identifier (^[a-zA-Z_][a-zA-Z0-9_]*$)`,
|
|
541
|
+
)
|
|
531
542
|
}
|
|
532
|
-
if (!def || typeof def !==
|
|
533
|
-
throw new Error(`clientTools["${name}"]: must be an object`)
|
|
543
|
+
if (!def || typeof def !== 'object') {
|
|
544
|
+
throw new Error(`clientTools["${name}"]: must be an object`)
|
|
534
545
|
}
|
|
535
|
-
if (typeof def.description !==
|
|
536
|
-
throw new Error(`clientTools["${name}"]: must have a description`)
|
|
546
|
+
if (typeof def.description !== 'string' || def.description.length === 0) {
|
|
547
|
+
throw new Error(`clientTools["${name}"]: must have a description`)
|
|
537
548
|
}
|
|
538
|
-
if (typeof def.handler !==
|
|
539
|
-
throw new Error(`clientTools["${name}"]: must have a handler function`)
|
|
549
|
+
if (typeof def.handler !== 'function') {
|
|
550
|
+
throw new Error(`clientTools["${name}"]: must have a handler function`)
|
|
540
551
|
}
|
|
541
552
|
if (def.usage !== void 0 && def.usage.length > MAX_USAGE) {
|
|
542
|
-
throw new Error(`clientTools["${name}"]: usage must be \u2264500 chars`)
|
|
553
|
+
throw new Error(`clientTools["${name}"]: usage must be \u2264500 chars`)
|
|
543
554
|
}
|
|
544
|
-
if (
|
|
545
|
-
|
|
555
|
+
if (
|
|
556
|
+
def.timeoutMs !== void 0 &&
|
|
557
|
+
(!Number.isFinite(def.timeoutMs) || def.timeoutMs <= 0 || def.timeoutMs > MAX_TIMEOUT_MS)
|
|
558
|
+
) {
|
|
559
|
+
throw new Error(`clientTools["${name}"]: timeoutMs must be in (0, 30000]`)
|
|
546
560
|
}
|
|
547
561
|
}
|
|
548
|
-
}
|
|
562
|
+
}
|
|
549
563
|
var buildRegisterFrame = (tools) => ({
|
|
550
|
-
type:
|
|
564
|
+
type: 'client_tools_register',
|
|
551
565
|
tools: Object.entries(tools).map(([name, def]) => ({
|
|
552
566
|
name,
|
|
553
567
|
description: def.description,
|
|
554
568
|
parameters: def.parameters,
|
|
555
|
-
...def.usage !== void 0 ? { usage: def.usage } : {},
|
|
556
|
-
...def.timeoutMs !== void 0 ? { timeoutMs: def.timeoutMs } : {}
|
|
557
|
-
}))
|
|
558
|
-
})
|
|
569
|
+
...(def.usage !== void 0 ? { usage: def.usage } : {}),
|
|
570
|
+
...(def.timeoutMs !== void 0 ? { timeoutMs: def.timeoutMs } : {}),
|
|
571
|
+
})),
|
|
572
|
+
})
|
|
559
573
|
var dispatchClientToolCall = (send, tools, frame) => {
|
|
560
574
|
const safeSend = (payload) => {
|
|
561
575
|
try {
|
|
562
|
-
send(payload)
|
|
563
|
-
} catch {
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
const tool = tools[frame.name];
|
|
576
|
+
send(payload)
|
|
577
|
+
} catch {}
|
|
578
|
+
}
|
|
579
|
+
const tool = tools[frame.name]
|
|
567
580
|
if (!tool) {
|
|
568
581
|
safeSend({
|
|
569
|
-
type:
|
|
582
|
+
type: 'client_tool_result',
|
|
570
583
|
toolCallId: frame.toolCallId,
|
|
571
|
-
error: `No handler for ${frame.name}
|
|
572
|
-
})
|
|
573
|
-
return
|
|
584
|
+
error: `No handler for ${frame.name}`,
|
|
585
|
+
})
|
|
586
|
+
return
|
|
574
587
|
}
|
|
575
588
|
void (async () => {
|
|
576
589
|
try {
|
|
577
|
-
const out = await tool.handler(frame.args)
|
|
590
|
+
const out = await tool.handler(frame.args)
|
|
578
591
|
safeSend({
|
|
579
|
-
type:
|
|
592
|
+
type: 'client_tool_result',
|
|
580
593
|
toolCallId: frame.toolCallId,
|
|
581
|
-
result: typeof out ===
|
|
582
|
-
})
|
|
594
|
+
result: typeof out === 'string' ? out : JSON.stringify(out),
|
|
595
|
+
})
|
|
583
596
|
} catch (err) {
|
|
584
597
|
safeSend({
|
|
585
|
-
type:
|
|
598
|
+
type: 'client_tool_result',
|
|
586
599
|
toolCallId: frame.toolCallId,
|
|
587
|
-
error: err instanceof Error ? err.message : String(err)
|
|
588
|
-
})
|
|
600
|
+
error: err instanceof Error ? err.message : String(err),
|
|
601
|
+
})
|
|
602
|
+
}
|
|
603
|
+
})()
|
|
604
|
+
}
|
|
605
|
+
|
|
606
|
+
// src/ClientMarksBuffer.ts
|
|
607
|
+
var createClientMarksBuffer = (args) => {
|
|
608
|
+
const now = args.now ?? (() => performance.now())
|
|
609
|
+
let pendingFirstOutboundAt = null
|
|
610
|
+
const inFlight = /* @__PURE__ */ new Map()
|
|
611
|
+
const tryEmit = (seq) => {
|
|
612
|
+
const slot = inFlight.get(seq)
|
|
613
|
+
if (!slot) return
|
|
614
|
+
if (!slot.ended) return
|
|
615
|
+
const marks = {}
|
|
616
|
+
if (slot.firstOutboundAt !== null && slot.firstAudibleAt !== null) {
|
|
617
|
+
marks.client_mic_to_first_audible_ms = slot.firstAudibleAt - slot.firstOutboundAt
|
|
618
|
+
}
|
|
619
|
+
args.send({
|
|
620
|
+
type: 'client_marks',
|
|
621
|
+
seq,
|
|
622
|
+
marks,
|
|
623
|
+
clientNow: Date.now(),
|
|
624
|
+
})
|
|
625
|
+
inFlight.delete(seq)
|
|
626
|
+
}
|
|
627
|
+
const markFirstOutboundAudio = () => {
|
|
628
|
+
if (pendingFirstOutboundAt !== null) return
|
|
629
|
+
pendingFirstOutboundAt = now()
|
|
630
|
+
}
|
|
631
|
+
const markFirstAudibleOutput = () => {
|
|
632
|
+
let target
|
|
633
|
+
for (const slot of inFlight.values()) {
|
|
634
|
+
if (!slot.ended) {
|
|
635
|
+
target = slot
|
|
636
|
+
}
|
|
589
637
|
}
|
|
590
|
-
|
|
591
|
-
|
|
638
|
+
if (!target) return
|
|
639
|
+
if (target.firstAudibleAt !== null) return
|
|
640
|
+
target.firstAudibleAt = now()
|
|
641
|
+
}
|
|
642
|
+
const onAgentTurnStart = (seq) => {
|
|
643
|
+
inFlight.set(seq, {
|
|
644
|
+
firstOutboundAt: pendingFirstOutboundAt,
|
|
645
|
+
firstAudibleAt: null,
|
|
646
|
+
ended: false,
|
|
647
|
+
})
|
|
648
|
+
pendingFirstOutboundAt = null
|
|
649
|
+
}
|
|
650
|
+
const onAgentTurnEnd = (seq) => {
|
|
651
|
+
const slot = inFlight.get(seq)
|
|
652
|
+
if (!slot) {
|
|
653
|
+
args.send({ type: 'client_marks', seq, marks: {}, clientNow: Date.now() })
|
|
654
|
+
return
|
|
655
|
+
}
|
|
656
|
+
slot.ended = true
|
|
657
|
+
tryEmit(seq)
|
|
658
|
+
}
|
|
659
|
+
const flush = () => {
|
|
660
|
+
for (const seq of [...inFlight.keys()]) {
|
|
661
|
+
const slot = inFlight.get(seq)
|
|
662
|
+
slot.ended = true
|
|
663
|
+
tryEmit(seq)
|
|
664
|
+
}
|
|
665
|
+
pendingFirstOutboundAt = null
|
|
666
|
+
}
|
|
667
|
+
return {
|
|
668
|
+
markFirstOutboundAudio,
|
|
669
|
+
markFirstAudibleOutput,
|
|
670
|
+
onAgentTurnStart,
|
|
671
|
+
onAgentTurnEnd,
|
|
672
|
+
flush,
|
|
673
|
+
}
|
|
674
|
+
}
|
|
592
675
|
|
|
593
676
|
// src/VoiceClient.ts
|
|
594
677
|
var BrowserVoiceClient = class {
|
|
595
678
|
constructor(args) {
|
|
596
|
-
this.rws = null
|
|
597
|
-
this.capture = null
|
|
598
|
-
this.playback = null
|
|
599
|
-
this.muted = false
|
|
600
|
-
this.inputVolume = 0
|
|
601
|
-
this.outputVolume = 0
|
|
602
|
-
this.startedAt = null
|
|
603
|
-
this.endedFired = false
|
|
604
|
-
this.lastError = null
|
|
679
|
+
this.rws = null
|
|
680
|
+
this.capture = null
|
|
681
|
+
this.playback = null
|
|
682
|
+
this.muted = false
|
|
683
|
+
this.inputVolume = 0
|
|
684
|
+
this.outputVolume = 0
|
|
685
|
+
this.startedAt = null
|
|
686
|
+
this.endedFired = false
|
|
687
|
+
this.lastError = null
|
|
605
688
|
this.end = () => {
|
|
606
|
-
this.teardown(
|
|
607
|
-
}
|
|
689
|
+
this.teardown('user_hangup')
|
|
690
|
+
}
|
|
608
691
|
this.mute = () => {
|
|
609
|
-
if (this.muted) return
|
|
610
|
-
this.muted = true
|
|
611
|
-
this.capture?.mute(true)
|
|
612
|
-
}
|
|
692
|
+
if (this.muted) return
|
|
693
|
+
this.muted = true
|
|
694
|
+
this.capture?.mute(true)
|
|
695
|
+
}
|
|
613
696
|
this.unmute = () => {
|
|
614
|
-
if (!this.muted) return
|
|
615
|
-
this.muted = false
|
|
616
|
-
this.capture?.mute(false)
|
|
617
|
-
}
|
|
697
|
+
if (!this.muted) return
|
|
698
|
+
this.muted = false
|
|
699
|
+
this.capture?.mute(false)
|
|
700
|
+
}
|
|
618
701
|
// ---------------------------------------------------------------
|
|
619
702
|
// Internal
|
|
620
703
|
// ---------------------------------------------------------------
|
|
621
704
|
this.sendClientToolsRegister = () => {
|
|
622
|
-
const frame = buildRegisterFrame(this.args.options.clientTools ?? {})
|
|
623
|
-
this.rws?.send(JSON.stringify(frame))
|
|
624
|
-
}
|
|
705
|
+
const frame = buildRegisterFrame(this.args.options.clientTools ?? {})
|
|
706
|
+
this.rws?.send(JSON.stringify(frame))
|
|
707
|
+
}
|
|
625
708
|
this.setState = (next) => {
|
|
626
|
-
if (this.proto.state === next) return
|
|
627
|
-
this.proto.state = next
|
|
628
|
-
this.args.options.onStateChange?.(next)
|
|
629
|
-
}
|
|
709
|
+
if (this.proto.state === next) return
|
|
710
|
+
this.proto.state = next
|
|
711
|
+
this.args.options.onStateChange?.(next)
|
|
712
|
+
}
|
|
630
713
|
this.emitError = (err) => {
|
|
631
|
-
this.lastError = err
|
|
632
|
-
this.args.options.onError?.(err)
|
|
633
|
-
}
|
|
714
|
+
this.lastError = err
|
|
715
|
+
this.args.options.onError?.(err)
|
|
716
|
+
}
|
|
634
717
|
this.handleSocketEvent = (ev) => {
|
|
635
718
|
switch (ev.type) {
|
|
636
|
-
case
|
|
637
|
-
void this.startCapture()
|
|
638
|
-
break
|
|
639
|
-
case
|
|
640
|
-
this.proto.transcript = []
|
|
641
|
-
this.proto.agentBubbleId = null
|
|
642
|
-
this.args.options.onTranscript?.(this.proto.transcript)
|
|
643
|
-
void this.startCapture()
|
|
644
|
-
this.setState(
|
|
645
|
-
break
|
|
646
|
-
case
|
|
647
|
-
if (typeof ev.data ===
|
|
719
|
+
case 'open':
|
|
720
|
+
void this.startCapture()
|
|
721
|
+
break
|
|
722
|
+
case 'reconnected':
|
|
723
|
+
this.proto.transcript = []
|
|
724
|
+
this.proto.agentBubbleId = null
|
|
725
|
+
this.args.options.onTranscript?.(this.proto.transcript)
|
|
726
|
+
void this.startCapture()
|
|
727
|
+
this.setState('listening')
|
|
728
|
+
break
|
|
729
|
+
case 'message':
|
|
730
|
+
if (typeof ev.data === 'string') {
|
|
648
731
|
handleServerMessage(ev.data, this.proto, {
|
|
649
732
|
onState: this.setState,
|
|
650
733
|
onTranscript: (entries) => this.args.options.onTranscript?.(entries),
|
|
651
734
|
onError: this.emitError,
|
|
652
735
|
onInterrupt: () => {
|
|
653
|
-
this.playback?.flush()
|
|
654
|
-
this.args.options.onInterrupt?.()
|
|
736
|
+
this.playback?.flush()
|
|
737
|
+
this.args.options.onInterrupt?.()
|
|
738
|
+
},
|
|
739
|
+
onAgentTurnStart: (seq) => {
|
|
740
|
+
if (typeof seq === 'number') this.marks.onAgentTurnStart(seq)
|
|
741
|
+
this.args.options.onAgentTurnStart?.()
|
|
742
|
+
},
|
|
743
|
+
onAgentTurnEnd: (seq) => {
|
|
744
|
+
if (typeof seq === 'number') this.marks.onAgentTurnEnd(seq)
|
|
655
745
|
},
|
|
656
|
-
onAgentTurnStart: () => this.args.options.onAgentTurnStart?.(),
|
|
657
746
|
onCallEnd: (reason) => this.teardown(reason),
|
|
658
747
|
onConnected: () => this.sendClientToolsRegister(),
|
|
659
|
-
onClientToolCall: (frame) =>
|
|
660
|
-
(
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
748
|
+
onClientToolCall: (frame) =>
|
|
749
|
+
dispatchClientToolCall(
|
|
750
|
+
(f) => this.rws?.send(JSON.stringify(f)),
|
|
751
|
+
this.args.options.clientTools ?? {},
|
|
752
|
+
frame,
|
|
753
|
+
),
|
|
754
|
+
})
|
|
665
755
|
} else {
|
|
666
|
-
this.
|
|
756
|
+
this.marks.markFirstAudibleOutput()
|
|
757
|
+
this.playback?.enqueue(ev.data)
|
|
667
758
|
}
|
|
668
|
-
break
|
|
669
|
-
case
|
|
759
|
+
break
|
|
760
|
+
case 'close':
|
|
670
761
|
if (ev.permanent) {
|
|
671
|
-
const reason = this.proto.endReason ?? (this.lastError ?
|
|
672
|
-
this.teardown(reason)
|
|
762
|
+
const reason = this.proto.endReason ?? (this.lastError ? 'error' : 'user_hangup')
|
|
763
|
+
this.teardown(reason)
|
|
673
764
|
}
|
|
674
|
-
break
|
|
675
|
-
case
|
|
676
|
-
this.emitError({ code:
|
|
677
|
-
break
|
|
765
|
+
break
|
|
766
|
+
case 'error':
|
|
767
|
+
this.emitError({ code: 'socket_error', message: ev.error.message })
|
|
768
|
+
break
|
|
678
769
|
}
|
|
679
|
-
}
|
|
770
|
+
}
|
|
680
771
|
this.startCapture = async () => {
|
|
681
|
-
if (this.capture?.isCapturing()) return
|
|
772
|
+
if (this.capture?.isCapturing()) return
|
|
682
773
|
this.capture = createAudioCapture({
|
|
683
774
|
onChunk: (pcm) => {
|
|
684
|
-
this.
|
|
775
|
+
this.marks.markFirstOutboundAudio()
|
|
776
|
+
this.rws?.send(pcm)
|
|
685
777
|
},
|
|
686
778
|
onVolume: (v) => {
|
|
687
|
-
this.inputVolume = v
|
|
688
|
-
this.args.options.onVolume?.({ input: v, output: this.outputVolume })
|
|
779
|
+
this.inputVolume = v
|
|
780
|
+
this.args.options.onVolume?.({ input: v, output: this.outputVolume })
|
|
689
781
|
},
|
|
690
782
|
onError: (err) => {
|
|
691
783
|
this.emitError({
|
|
692
|
-
code: err.name ===
|
|
693
|
-
message: err.message
|
|
694
|
-
})
|
|
695
|
-
}
|
|
696
|
-
})
|
|
697
|
-
if (this.muted) this.capture.mute(true)
|
|
784
|
+
code: err.name === 'NotAllowedError' ? 'mic_denied' : 'mic_start_failed',
|
|
785
|
+
message: err.message,
|
|
786
|
+
})
|
|
787
|
+
},
|
|
788
|
+
})
|
|
789
|
+
if (this.muted) this.capture.mute(true)
|
|
698
790
|
try {
|
|
699
|
-
await this.capture.start()
|
|
700
|
-
} catch {
|
|
701
|
-
|
|
702
|
-
};
|
|
791
|
+
await this.capture.start()
|
|
792
|
+
} catch {}
|
|
793
|
+
}
|
|
703
794
|
this.teardown = (reason) => {
|
|
704
|
-
this.capture?.stop();
|
|
705
|
-
this.capture = null;
|
|
706
|
-
this.playback?.close();
|
|
707
|
-
this.playback = null;
|
|
708
795
|
try {
|
|
709
|
-
this.
|
|
710
|
-
} catch {
|
|
711
|
-
|
|
712
|
-
this.
|
|
713
|
-
this.
|
|
714
|
-
this.
|
|
715
|
-
|
|
796
|
+
this.marks.flush()
|
|
797
|
+
} catch {}
|
|
798
|
+
this.capture?.stop()
|
|
799
|
+
this.capture = null
|
|
800
|
+
this.playback?.close()
|
|
801
|
+
this.playback = null
|
|
802
|
+
try {
|
|
803
|
+
this.rws?.close(1e3, reason)
|
|
804
|
+
} catch {}
|
|
805
|
+
this.rws = null
|
|
806
|
+
this.setState('ended')
|
|
807
|
+
this.fireEndOnce(reason)
|
|
808
|
+
}
|
|
716
809
|
this.fireEndOnce = (reason) => {
|
|
717
|
-
if (this.endedFired) return
|
|
718
|
-
this.endedFired = true
|
|
719
|
-
const startedAt = this.startedAt ?? Date.now()
|
|
810
|
+
if (this.endedFired) return
|
|
811
|
+
this.endedFired = true
|
|
812
|
+
const startedAt = this.startedAt ?? Date.now()
|
|
720
813
|
this.args.options.onEnd?.({
|
|
721
814
|
reason,
|
|
722
|
-
errorCode: reason ===
|
|
723
|
-
durationMs: Date.now() - startedAt
|
|
724
|
-
})
|
|
725
|
-
}
|
|
726
|
-
this.args = args
|
|
727
|
-
this.proto = createProtocolState()
|
|
728
|
-
validateClientToolMap(args.options.clientTools)
|
|
815
|
+
errorCode: reason === 'error' ? this.lastError?.code : void 0,
|
|
816
|
+
durationMs: Date.now() - startedAt,
|
|
817
|
+
})
|
|
818
|
+
}
|
|
819
|
+
this.args = args
|
|
820
|
+
this.proto = createProtocolState()
|
|
821
|
+
validateClientToolMap(args.options.clientTools)
|
|
822
|
+
this.marks = createClientMarksBuffer({
|
|
823
|
+
send: (frame) => {
|
|
824
|
+
try {
|
|
825
|
+
this.rws?.send(JSON.stringify(frame))
|
|
826
|
+
} catch {}
|
|
827
|
+
},
|
|
828
|
+
})
|
|
729
829
|
}
|
|
730
830
|
// ---------------------------------------------------------------
|
|
731
831
|
// Call interface
|
|
732
832
|
// ---------------------------------------------------------------
|
|
733
833
|
get state() {
|
|
734
|
-
return this.proto.state
|
|
834
|
+
return this.proto.state
|
|
735
835
|
}
|
|
736
836
|
get transcript() {
|
|
737
|
-
return this.proto.transcript.slice()
|
|
837
|
+
return this.proto.transcript.slice()
|
|
738
838
|
}
|
|
739
839
|
get isMuted() {
|
|
740
|
-
return this.muted
|
|
840
|
+
return this.muted
|
|
741
841
|
}
|
|
742
842
|
// ---------------------------------------------------------------
|
|
743
843
|
// Lifecycle — called by the factory immediately after construction.
|
|
@@ -745,84 +845,282 @@ var BrowserVoiceClient = class {
|
|
|
745
845
|
// failures arrive via `onError`.
|
|
746
846
|
// ---------------------------------------------------------------
|
|
747
847
|
async start() {
|
|
748
|
-
this.setState(
|
|
749
|
-
this.startedAt = Date.now()
|
|
848
|
+
this.setState('connecting')
|
|
849
|
+
this.startedAt = Date.now()
|
|
750
850
|
const url = buildWsUrl({
|
|
751
851
|
apiBase: this.args.config.apiBase,
|
|
752
852
|
agentId: this.args.options.agentId,
|
|
753
853
|
token: this.args.token,
|
|
754
|
-
bargeIn: this.args.options.bargeIn
|
|
755
|
-
})
|
|
854
|
+
bargeIn: this.args.options.bargeIn,
|
|
855
|
+
})
|
|
756
856
|
this.playback = createAudioPlayback({
|
|
757
857
|
onVolume: (v) => {
|
|
758
|
-
this.outputVolume = v
|
|
759
|
-
this.args.options.onVolume?.({ input: this.inputVolume, output: v })
|
|
760
|
-
}
|
|
761
|
-
})
|
|
858
|
+
this.outputVolume = v
|
|
859
|
+
this.args.options.onVolume?.({ input: this.inputVolume, output: v })
|
|
860
|
+
},
|
|
861
|
+
})
|
|
762
862
|
try {
|
|
763
|
-
await this.playback.resume()
|
|
764
|
-
} catch {
|
|
765
|
-
}
|
|
863
|
+
await this.playback.resume()
|
|
864
|
+
} catch {}
|
|
766
865
|
this.rws = createReconnectingWebSocket(
|
|
767
866
|
{
|
|
768
867
|
url,
|
|
769
868
|
wsFactory: this.args.wsFactory,
|
|
770
|
-
maxRetries: 3
|
|
869
|
+
maxRetries: 3,
|
|
870
|
+
},
|
|
871
|
+
(ev) => this.handleSocketEvent(ev),
|
|
872
|
+
)
|
|
873
|
+
}
|
|
874
|
+
}
|
|
875
|
+
|
|
876
|
+
// src/webrtc/createWebRtcCall.ts
|
|
877
|
+
async function createWebRtcCall(opts) {
|
|
878
|
+
validateClientToolMap(opts.clientTools)
|
|
879
|
+
const proto = createProtocolState()
|
|
880
|
+
let muted = false
|
|
881
|
+
let ended = false
|
|
882
|
+
const tools = opts.clientTools ?? {}
|
|
883
|
+
const sendControl = (frame) => {
|
|
884
|
+
if (dc?.readyState !== 'open') return
|
|
885
|
+
try {
|
|
886
|
+
dc.send(JSON.stringify(frame))
|
|
887
|
+
} catch {}
|
|
888
|
+
}
|
|
889
|
+
const fireState = (next) => {
|
|
890
|
+
if (proto.state === next) return
|
|
891
|
+
proto.state = next
|
|
892
|
+
opts.onStateChange?.(next)
|
|
893
|
+
}
|
|
894
|
+
const dispatch = (raw) => {
|
|
895
|
+
handleServerMessage(raw, proto, {
|
|
896
|
+
onState: fireState,
|
|
897
|
+
onTranscript: (entries) => opts.onTranscript?.(entries),
|
|
898
|
+
onError: (err) => opts.onError?.(err),
|
|
899
|
+
onInterrupt: () => opts.onInterrupt?.(),
|
|
900
|
+
onAgentTurnStart: () => opts.onAgentTurnStart?.(),
|
|
901
|
+
onAgentTurnEnd: () => {},
|
|
902
|
+
onCallEnd: () => teardown(),
|
|
903
|
+
onConnected: () => {
|
|
904
|
+
if (Object.keys(tools).length > 0) {
|
|
905
|
+
sendControl(buildRegisterFrame(tools))
|
|
906
|
+
}
|
|
907
|
+
},
|
|
908
|
+
onClientToolCall: (frame) => {
|
|
909
|
+
dispatchClientToolCall(sendControl, tools, frame)
|
|
771
910
|
},
|
|
772
|
-
|
|
773
|
-
);
|
|
911
|
+
})
|
|
774
912
|
}
|
|
775
|
-
|
|
913
|
+
fireState('connecting')
|
|
914
|
+
const pc = new RTCPeerConnection({
|
|
915
|
+
iceServers: [{ urls: 'stun:stun.l.google.com:19302' }],
|
|
916
|
+
})
|
|
917
|
+
const audioEl = document.createElement('audio')
|
|
918
|
+
audioEl.autoplay = true
|
|
919
|
+
audioEl.style.display = 'none'
|
|
920
|
+
document.body.appendChild(audioEl)
|
|
921
|
+
pc.ontrack = (event) => {
|
|
922
|
+
audioEl.srcObject = event.streams[0] ?? new MediaStream([event.track])
|
|
923
|
+
}
|
|
924
|
+
let mic
|
|
925
|
+
try {
|
|
926
|
+
mic = await navigator.mediaDevices.getUserMedia({ audio: true })
|
|
927
|
+
} catch (err) {
|
|
928
|
+
const code =
|
|
929
|
+
err instanceof DOMException && err.name === 'NotAllowedError'
|
|
930
|
+
? 'mic_denied'
|
|
931
|
+
: 'mic_start_failed'
|
|
932
|
+
opts.onError?.({
|
|
933
|
+
code,
|
|
934
|
+
message: err instanceof Error ? err.message : 'getUserMedia failed',
|
|
935
|
+
})
|
|
936
|
+
fireState('error')
|
|
937
|
+
pc.close()
|
|
938
|
+
audioEl.remove()
|
|
939
|
+
throw err
|
|
940
|
+
}
|
|
941
|
+
for (const track of mic.getAudioTracks()) pc.addTrack(track, mic)
|
|
942
|
+
const dc = pc.createDataChannel('control', { ordered: true })
|
|
943
|
+
dc.onmessage = (e) => {
|
|
944
|
+
if (typeof e.data === 'string') dispatch(e.data)
|
|
945
|
+
}
|
|
946
|
+
dc.onerror = () => {
|
|
947
|
+
opts.onError?.({ code: 'socket_error', message: 'control channel error' })
|
|
948
|
+
}
|
|
949
|
+
dc.onopen = () => {
|
|
950
|
+
if (Object.keys(tools).length > 0) {
|
|
951
|
+
sendControl(buildRegisterFrame(tools))
|
|
952
|
+
}
|
|
953
|
+
}
|
|
954
|
+
const gateway = opts.webrtcGatewayBase || ''
|
|
955
|
+
const offerUrl = gateway
|
|
956
|
+
? `${gateway}/webrtc/offer?token=${encodeURIComponent(opts.token)}`
|
|
957
|
+
: `${opts.apiBase}/v1/agents/${encodeURIComponent(opts.agentId)}/webrtc/offer?token=${encodeURIComponent(opts.token)}`
|
|
958
|
+
const iceUrl = gateway
|
|
959
|
+
? `${gateway}/webrtc/ice?token=${encodeURIComponent(opts.token)}`
|
|
960
|
+
: `${opts.apiBase}/v1/agents/${encodeURIComponent(opts.agentId)}/webrtc/ice?token=${encodeURIComponent(opts.token)}`
|
|
961
|
+
await pc.setLocalDescription(await pc.createOffer())
|
|
962
|
+
let callId
|
|
963
|
+
try {
|
|
964
|
+
const offerRes = await fetch(offerUrl, {
|
|
965
|
+
method: 'POST',
|
|
966
|
+
headers: { 'content-type': 'application/json' },
|
|
967
|
+
body: JSON.stringify({ sdp: pc.localDescription.sdp, type: 'offer', agentId: opts.agentId }),
|
|
968
|
+
})
|
|
969
|
+
if (!offerRes.ok) {
|
|
970
|
+
const code = offerRes.status === 401 ? 'unauthorized' : 'server_error'
|
|
971
|
+
opts.onError?.({ code, message: `signaling failed: HTTP ${offerRes.status}` })
|
|
972
|
+
fireState('error')
|
|
973
|
+
mic.getTracks().forEach((t) => t.stop())
|
|
974
|
+
pc.close()
|
|
975
|
+
audioEl.remove()
|
|
976
|
+
throw new Error(`webrtc offer failed: ${offerRes.status}`)
|
|
977
|
+
}
|
|
978
|
+
const body = await offerRes.json()
|
|
979
|
+
callId = body.callId
|
|
980
|
+
await pc.setRemoteDescription({ type: 'answer', sdp: body.sdp })
|
|
981
|
+
} catch (err) {
|
|
982
|
+
if (!ended) {
|
|
983
|
+
opts.onError?.({
|
|
984
|
+
code: 'network_unreachable',
|
|
985
|
+
message: err instanceof Error ? err.message : 'signaling failed',
|
|
986
|
+
})
|
|
987
|
+
fireState('error')
|
|
988
|
+
mic.getTracks().forEach((t) => t.stop())
|
|
989
|
+
pc.close()
|
|
990
|
+
audioEl.remove()
|
|
991
|
+
}
|
|
992
|
+
throw err
|
|
993
|
+
}
|
|
994
|
+
pc.onicecandidate = (e) => {
|
|
995
|
+
if (!e.candidate) return
|
|
996
|
+
void fetch(iceUrl, {
|
|
997
|
+
method: 'POST',
|
|
998
|
+
headers: { 'content-type': 'application/json' },
|
|
999
|
+
body: JSON.stringify({ callId, candidate: e.candidate }),
|
|
1000
|
+
}).catch(() => {})
|
|
1001
|
+
}
|
|
1002
|
+
pc.onconnectionstatechange = () => {
|
|
1003
|
+
const s = pc.connectionState
|
|
1004
|
+
if (s === 'connected') fireState('listening')
|
|
1005
|
+
if (s === 'failed' || s === 'disconnected') {
|
|
1006
|
+
opts.onError?.({ code: 'socket_error', message: `webrtc connection ${s}` })
|
|
1007
|
+
teardown()
|
|
1008
|
+
}
|
|
1009
|
+
if (s === 'closed' && !ended) teardown()
|
|
1010
|
+
}
|
|
1011
|
+
const teardown = () => {
|
|
1012
|
+
if (ended) return
|
|
1013
|
+
ended = true
|
|
1014
|
+
try {
|
|
1015
|
+
mic.getTracks().forEach((t) => t.stop())
|
|
1016
|
+
} catch {}
|
|
1017
|
+
try {
|
|
1018
|
+
pc.close()
|
|
1019
|
+
} catch {}
|
|
1020
|
+
try {
|
|
1021
|
+
audioEl.remove()
|
|
1022
|
+
} catch {}
|
|
1023
|
+
fireState('ended')
|
|
1024
|
+
opts.onEnd?.()
|
|
1025
|
+
}
|
|
1026
|
+
return {
|
|
1027
|
+
get state() {
|
|
1028
|
+
return proto.state
|
|
1029
|
+
},
|
|
1030
|
+
get transcript() {
|
|
1031
|
+
return proto.transcript.slice()
|
|
1032
|
+
},
|
|
1033
|
+
get isMuted() {
|
|
1034
|
+
return muted
|
|
1035
|
+
},
|
|
1036
|
+
end: () => teardown(),
|
|
1037
|
+
mute: () => {
|
|
1038
|
+
if (muted) return
|
|
1039
|
+
muted = true
|
|
1040
|
+
mic.getAudioTracks().forEach((t) => (t.enabled = false))
|
|
1041
|
+
},
|
|
1042
|
+
unmute: () => {
|
|
1043
|
+
if (!muted) return
|
|
1044
|
+
muted = false
|
|
1045
|
+
mic.getAudioTracks().forEach((t) => (t.enabled = true))
|
|
1046
|
+
},
|
|
1047
|
+
}
|
|
1048
|
+
}
|
|
776
1049
|
|
|
777
1050
|
// src/browser.ts
|
|
778
|
-
var browserWsFactory = (url) => new globalThis.WebSocket(url)
|
|
1051
|
+
var browserWsFactory = (url) => new globalThis.WebSocket(url)
|
|
779
1052
|
var BrowserVoiceFactory = class {
|
|
780
1053
|
constructor(config) {
|
|
781
1054
|
this.startCall = async (options) => {
|
|
782
1055
|
if (!options.agentId) {
|
|
783
|
-
throw new Error(
|
|
1056
|
+
throw new Error('startCall: agentId is required')
|
|
784
1057
|
}
|
|
785
|
-
const { context, metadata } = mergeStartCallContext(this.config, options)
|
|
1058
|
+
const { context, metadata } = mergeStartCallContext(this.config, options)
|
|
786
1059
|
const fetchArgs = {
|
|
787
1060
|
agentId: options.agentId,
|
|
788
1061
|
userId: options.userId,
|
|
789
1062
|
context,
|
|
790
|
-
metadata
|
|
791
|
-
}
|
|
792
|
-
let
|
|
1063
|
+
metadata,
|
|
1064
|
+
}
|
|
1065
|
+
let resolved
|
|
793
1066
|
if (options.token) {
|
|
794
|
-
|
|
1067
|
+
resolved = { token: options.token, transport: 'ws' }
|
|
795
1068
|
} else {
|
|
796
|
-
|
|
797
|
-
if (!
|
|
798
|
-
throw new Error(
|
|
1069
|
+
const r = await this.config.fetchToken(fetchArgs)
|
|
1070
|
+
if (!r) {
|
|
1071
|
+
throw new Error('configureVoiceClient.fetchToken returned empty token')
|
|
1072
|
+
}
|
|
1073
|
+
resolved = typeof r === 'string' ? { token: r, transport: 'ws' } : r
|
|
1074
|
+
if (!resolved.token) {
|
|
1075
|
+
throw new Error('configureVoiceClient.fetchToken returned an object without `token`')
|
|
799
1076
|
}
|
|
800
1077
|
}
|
|
1078
|
+
if (resolved.transport === 'webrtc') {
|
|
1079
|
+
return createWebRtcCall({
|
|
1080
|
+
agentId: options.agentId,
|
|
1081
|
+
apiBase: this.config.apiBase,
|
|
1082
|
+
token: resolved.token,
|
|
1083
|
+
webrtcGatewayBase: resolved.webrtcGatewayBase,
|
|
1084
|
+
onStateChange: options.onStateChange,
|
|
1085
|
+
onTranscript: options.onTranscript,
|
|
1086
|
+
onError: options.onError,
|
|
1087
|
+
// Synthesise a minimal CallEndEvent. WebRTC doesn't carry an end reason
|
|
1088
|
+
// from the server yet — use 'agent_ended' as placeholder. durationMs is
|
|
1089
|
+
// tracked at 0 until the followup lands (see spec Followups section).
|
|
1090
|
+
onEnd: options.onEnd
|
|
1091
|
+
? () => options.onEnd({ reason: 'agent_ended', durationMs: 0 })
|
|
1092
|
+
: void 0,
|
|
1093
|
+
onInterrupt: options.onInterrupt,
|
|
1094
|
+
onAgentTurnStart: options.onAgentTurnStart,
|
|
1095
|
+
clientTools: options.clientTools,
|
|
1096
|
+
})
|
|
1097
|
+
}
|
|
801
1098
|
const client = new BrowserVoiceClient({
|
|
802
1099
|
config: this.config,
|
|
803
1100
|
// Carry merged context/metadata through to startCall so server can
|
|
804
1101
|
// see what the SDK saw.
|
|
805
1102
|
options: { ...options, context, metadata },
|
|
806
|
-
token,
|
|
807
|
-
wsFactory: browserWsFactory
|
|
808
|
-
})
|
|
809
|
-
await client.start()
|
|
810
|
-
return client
|
|
811
|
-
}
|
|
812
|
-
this.config = config
|
|
813
|
-
}
|
|
814
|
-
}
|
|
1103
|
+
token: resolved.token,
|
|
1104
|
+
wsFactory: browserWsFactory,
|
|
1105
|
+
})
|
|
1106
|
+
await client.start()
|
|
1107
|
+
return client
|
|
1108
|
+
}
|
|
1109
|
+
this.config = config
|
|
1110
|
+
}
|
|
1111
|
+
}
|
|
815
1112
|
function configureVoiceClient(config) {
|
|
816
|
-
return new BrowserVoiceFactory(normalizeConfig(config))
|
|
1113
|
+
return new BrowserVoiceFactory(normalizeConfig(config))
|
|
817
1114
|
}
|
|
818
1115
|
// Annotate the CommonJS export names for ESM import in node:
|
|
819
|
-
0 &&
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
1116
|
+
0 &&
|
|
1117
|
+
(module.exports = {
|
|
1118
|
+
buildWsUrl,
|
|
1119
|
+
configureVoiceClient,
|
|
1120
|
+
createAudioCapture,
|
|
1121
|
+
createAudioPlayback,
|
|
1122
|
+
createProtocolState,
|
|
1123
|
+
createReconnectingWebSocket,
|
|
1124
|
+
handleServerMessage,
|
|
1125
|
+
})
|
|
1126
|
+
//# sourceMappingURL=browser.js.map
|