@craftedxp/voice-js 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/browser.js CHANGED
@@ -1,24 +1,26 @@
1
- "use strict";
2
- var __defProp = Object.defineProperty;
3
- var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
- var __getOwnPropNames = Object.getOwnPropertyNames;
5
- var __hasOwnProp = Object.prototype.hasOwnProperty;
1
+ 'use strict'
2
+ var __defProp = Object.defineProperty
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor
4
+ var __getOwnPropNames = Object.getOwnPropertyNames
5
+ var __hasOwnProp = Object.prototype.hasOwnProperty
6
6
  var __export = (target, all) => {
7
- for (var name in all)
8
- __defProp(target, name, { get: all[name], enumerable: true });
9
- };
7
+ for (var name in all) __defProp(target, name, { get: all[name], enumerable: true })
8
+ }
10
9
  var __copyProps = (to, from, except, desc) => {
11
- if (from && typeof from === "object" || typeof from === "function") {
10
+ if ((from && typeof from === 'object') || typeof from === 'function') {
12
11
  for (let key of __getOwnPropNames(from))
13
12
  if (!__hasOwnProp.call(to, key) && key !== except)
14
- __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
13
+ __defProp(to, key, {
14
+ get: () => from[key],
15
+ enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable,
16
+ })
15
17
  }
16
- return to;
17
- };
18
- var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
18
+ return to
19
+ }
20
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, '__esModule', { value: true }), mod)
19
21
 
20
22
  // src/browser.ts
21
- var browser_exports = {};
23
+ var browser_exports = {}
22
24
  __export(browser_exports, {
23
25
  buildWsUrl: () => buildWsUrl,
24
26
  configureVoiceClient: () => configureVoiceClient,
@@ -26,57 +28,64 @@ __export(browser_exports, {
26
28
  createAudioPlayback: () => createAudioPlayback,
27
29
  createProtocolState: () => createProtocolState,
28
30
  createReconnectingWebSocket: () => createReconnectingWebSocket,
29
- handleServerMessage: () => handleServerMessage
30
- });
31
- module.exports = __toCommonJS(browser_exports);
31
+ handleServerMessage: () => handleServerMessage,
32
+ })
33
+ module.exports = __toCommonJS(browser_exports)
32
34
 
33
35
  // src/config.ts
34
36
  function normalizeConfig(config) {
35
- if (!config) throw new Error("configureVoiceClient: config is required");
36
- if ("apiKey" in config) {
37
+ if (!config) throw new Error('configureVoiceClient: config is required')
38
+ if ('apiKey' in config) {
37
39
  throw new Error(
38
- "configureVoiceClient: `apiKey` is no longer supported. Embedding sk_ in JS code ships server-grade credentials to every client. Pass `fetchToken: async ({ agentId }) => { /* call YOUR backend mint */ }` instead \u2014 see the @craftedxp/voice-js README for the migration recipe."
39
- );
40
+ 'configureVoiceClient: `apiKey` is no longer supported. Embedding sk_ in JS code ships server-grade credentials to every client. Pass `fetchToken: async ({ agentId }) => { /* call YOUR backend mint */ }` instead \u2014 see the @craftedxp/voice-js README for the migration recipe.',
41
+ )
40
42
  }
41
43
  if (!config.apiBase) {
42
- throw new Error("configureVoiceClient: apiBase is required");
44
+ throw new Error('configureVoiceClient: apiBase is required')
43
45
  }
44
- if (typeof config.fetchToken !== "function") {
45
- throw new Error("configureVoiceClient: fetchToken must be a function");
46
+ if (typeof config.fetchToken !== 'function') {
47
+ throw new Error('configureVoiceClient: fetchToken must be a function')
46
48
  }
47
49
  return {
48
50
  ...config,
49
- apiBase: config.apiBase.replace(/\/+$/, "")
50
- };
51
+ apiBase: config.apiBase.replace(/\/+$/, ''),
52
+ }
51
53
  }
52
54
  function mergeStartCallContext(factory, call) {
53
- const context = factory.defaultContext || call.context ? { ...factory.defaultContext ?? {}, ...call.context ?? {} } : void 0;
54
- const metadata = factory.defaultMetadata || call.metadata ? { ...factory.defaultMetadata ?? {}, ...call.metadata ?? {} } : void 0;
55
- return { context, metadata };
55
+ const context =
56
+ factory.defaultContext || call.context
57
+ ? { ...(factory.defaultContext ?? {}), ...(call.context ?? {}) }
58
+ : void 0
59
+ const metadata =
60
+ factory.defaultMetadata || call.metadata
61
+ ? { ...(factory.defaultMetadata ?? {}), ...(call.metadata ?? {}) }
62
+ : void 0
63
+ return { context, metadata }
56
64
  }
57
65
 
58
66
  // src/worklets/mic-downsampler.worklet.js
59
- var mic_downsampler_worklet_default = "// AudioWorklet \u2014 runs off the main thread in the audio rendering graph.\n//\n// Input: Float32 samples at the AudioContext's native sampleRate (typically\n// 48000 Hz on desktop, 44100 Hz on some iOS devices).\n// Output: 16 kHz mono Int16 PCM, shipped to the main thread via\n// `port.postMessage(ArrayBuffer, [ArrayBuffer])` (transferred, not copied).\n//\n// Why AudioWorklet instead of ScriptProcessorNode: ScriptProcessorNode is\n// deprecated + main-thread-bound, so any JS jank produces audible audio\n// glitches. AudioWorklet's `process()` runs on the audio rendering thread\n// at the graph's block cadence (128 frames by default) and backpressures\n// via returning `true` / `false`.\n//\n// This file is loaded as text (see tsup.config.ts loader) and registered\n// at runtime via `audioWorklet.addModule(blobUrl)`.\n\nclass MicDownsampler extends AudioWorkletProcessor {\n constructor() {\n super()\n // Target sample rate for STT. Matches Deepgram Nova-3 + the platform's\n // server-side SAMPLE_RATE constant in AgentCallHandler.\n this.targetRate = 16000\n // Accumulator for the downsample. We collect incoming samples and emit\n // an Int16 chunk when we've accumulated ~1024 target-rate samples\n // (~64 ms at 16 kHz) \u2014 matches the mobile SDK's chunk size so both\n // platforms have the same server-side framing.\n this.outputFrames = 1024\n this.acc = []\n // Running index used for fractional resampling.\n this.readCursor = 0\n }\n\n // `inputs[0][0]` = first channel of first input. 128 Float32 samples per\n // call at the context's sampleRate. Return true = keep processing.\n process(inputs) {\n const input = inputs[0]\n if (!input || input.length === 0) return true\n const channel = input[0]\n if (!channel || channel.length === 0) return true\n\n const ctxRate = sampleRate // global inside AudioWorkletProcessor\n const ratio = ctxRate / this.targetRate\n\n // Simple linear-interp downsample. 
For 48000 \u2192 16000 that's 3:1, which\n // linear handles fine for voice. Anti-alias filtering would be\n // theoretically better but inaudible for speech.\n for (let i = 0; i < channel.length; i++) {\n this.acc.push(channel[i])\n }\n\n while (this.acc.length - this.readCursor >= ratio * this.outputFrames) {\n const out = new Int16Array(this.outputFrames)\n let readIdx = this.readCursor\n for (let i = 0; i < this.outputFrames; i++) {\n // Linear interp between floor(readIdx) and ceil(readIdx)\n const low = Math.floor(readIdx)\n const high = Math.min(low + 1, this.acc.length - 1)\n const frac = readIdx - low\n const sample = this.acc[low] * (1 - frac) + this.acc[high] * frac\n // Clip + convert to int16\n const clipped = Math.max(-1, Math.min(1, sample))\n out[i] = clipped < 0 ? clipped * 0x8000 : clipped * 0x7fff\n readIdx += ratio\n }\n // Transfer the ArrayBuffer (zero-copy) to the main thread.\n this.port.postMessage(out.buffer, [out.buffer])\n this.readCursor = readIdx\n }\n\n // Garbage-collect the consumed portion of `acc` every so often so it\n // doesn't grow without bound. Leave ~one chunk of headroom.\n if (this.readCursor > ratio * this.outputFrames) {\n this.acc = this.acc.slice(Math.floor(this.readCursor))\n this.readCursor -= Math.floor(this.readCursor)\n }\n\n return true\n }\n}\n\nregisterProcessor('mic-downsampler', MicDownsampler)\n";
67
+ var mic_downsampler_worklet_default =
68
+ "// AudioWorklet \u2014 runs off the main thread in the audio rendering graph.\n//\n// Input: Float32 samples at the AudioContext's native sampleRate (typically\n// 48000 Hz on desktop, 44100 Hz on some iOS devices).\n// Output: 16 kHz mono Int16 PCM, shipped to the main thread via\n// `port.postMessage(ArrayBuffer, [ArrayBuffer])` (transferred, not copied).\n//\n// Why AudioWorklet instead of ScriptProcessorNode: ScriptProcessorNode is\n// deprecated + main-thread-bound, so any JS jank produces audible audio\n// glitches. AudioWorklet's `process()` runs on the audio rendering thread\n// at the graph's block cadence (128 frames by default) and backpressures\n// via returning `true` / `false`.\n//\n// This file is loaded as text (see tsup.config.ts loader) and registered\n// at runtime via `audioWorklet.addModule(blobUrl)`.\n\nclass MicDownsampler extends AudioWorkletProcessor {\n constructor() {\n super()\n // Target sample rate for STT. Matches Deepgram Nova-3 + the platform's\n // server-side SAMPLE_RATE constant in AgentCallHandler.\n this.targetRate = 16000\n // Accumulator for the downsample. We collect incoming samples and emit\n // an Int16 chunk when we've accumulated ~1024 target-rate samples\n // (~64 ms at 16 kHz) \u2014 matches the mobile SDK's chunk size so both\n // platforms have the same server-side framing.\n this.outputFrames = 1024\n this.acc = []\n // Running index used for fractional resampling.\n this.readCursor = 0\n }\n\n // `inputs[0][0]` = first channel of first input. 128 Float32 samples per\n // call at the context's sampleRate. Return true = keep processing.\n process(inputs) {\n const input = inputs[0]\n if (!input || input.length === 0) return true\n const channel = input[0]\n if (!channel || channel.length === 0) return true\n\n const ctxRate = sampleRate // global inside AudioWorkletProcessor\n const ratio = ctxRate / this.targetRate\n\n // Simple linear-interp downsample. 
For 48000 \u2192 16000 that's 3:1, which\n // linear handles fine for voice. Anti-alias filtering would be\n // theoretically better but inaudible for speech.\n for (let i = 0; i < channel.length; i++) {\n this.acc.push(channel[i])\n }\n\n while (this.acc.length - this.readCursor >= ratio * this.outputFrames) {\n const out = new Int16Array(this.outputFrames)\n let readIdx = this.readCursor\n for (let i = 0; i < this.outputFrames; i++) {\n // Linear interp between floor(readIdx) and ceil(readIdx)\n const low = Math.floor(readIdx)\n const high = Math.min(low + 1, this.acc.length - 1)\n const frac = readIdx - low\n const sample = this.acc[low] * (1 - frac) + this.acc[high] * frac\n // Clip + convert to int16\n const clipped = Math.max(-1, Math.min(1, sample))\n out[i] = clipped < 0 ? clipped * 0x8000 : clipped * 0x7fff\n readIdx += ratio\n }\n // Transfer the ArrayBuffer (zero-copy) to the main thread.\n this.port.postMessage(out.buffer, [out.buffer])\n this.readCursor = readIdx\n }\n\n // Garbage-collect the consumed portion of `acc` every so often so it\n // doesn't grow without bound. Leave ~one chunk of headroom.\n if (this.readCursor > ratio * this.outputFrames) {\n this.acc = this.acc.slice(Math.floor(this.readCursor))\n this.readCursor -= Math.floor(this.readCursor)\n }\n\n return true\n }\n}\n\nregisterProcessor('mic-downsampler', MicDownsampler)\n"
60
69
 
61
70
  // src/AudioCapture.ts
62
- var VOLUME_INTERVAL_MS = 100;
71
+ var VOLUME_INTERVAL_MS = 100
63
72
  var createAudioCapture = (options) => {
64
- let audioContext = null;
65
- let mediaStream = null;
66
- let sourceNode = null;
67
- let workletNode = null;
68
- let analyser = null;
69
- let volumeTimer = null;
70
- let muted = false;
71
- let capturing = false;
73
+ let audioContext = null
74
+ let mediaStream = null
75
+ let sourceNode = null
76
+ let workletNode = null
77
+ let analyser = null
78
+ let volumeTimer = null
79
+ let muted = false
80
+ let capturing = false
72
81
  const computeRms = (buf) => {
73
- let sum = 0;
74
- for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i];
75
- const rms = Math.sqrt(sum / buf.length);
76
- return Math.min(1, rms * 1.8);
77
- };
82
+ let sum = 0
83
+ for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i]
84
+ const rms = Math.sqrt(sum / buf.length)
85
+ return Math.min(1, rms * 1.8)
86
+ }
78
87
  const start = async () => {
79
- if (capturing) return;
88
+ if (capturing) return
80
89
  try {
81
90
  mediaStream = await navigator.mediaDevices.getUserMedia({
82
91
  audio: {
@@ -87,657 +96,748 @@ var createAudioCapture = (options) => {
87
96
  echoCancellation: true,
88
97
  noiseSuppression: true,
89
98
  autoGainControl: true,
90
- channelCount: 1
91
- }
92
- });
93
- audioContext = new AudioContext();
94
- if (audioContext.state === "suspended") await audioContext.resume();
95
- const blob = new Blob([mic_downsampler_worklet_default], { type: "application/javascript" });
96
- const url = URL.createObjectURL(blob);
99
+ channelCount: 1,
100
+ },
101
+ })
102
+ audioContext = new AudioContext()
103
+ if (audioContext.state === 'suspended') await audioContext.resume()
104
+ const blob = new Blob([mic_downsampler_worklet_default], { type: 'application/javascript' })
105
+ const url = URL.createObjectURL(blob)
97
106
  try {
98
- await audioContext.audioWorklet.addModule(url);
107
+ await audioContext.audioWorklet.addModule(url)
99
108
  } finally {
100
- URL.revokeObjectURL(url);
109
+ URL.revokeObjectURL(url)
101
110
  }
102
- sourceNode = audioContext.createMediaStreamSource(mediaStream);
103
- workletNode = new AudioWorkletNode(audioContext, "mic-downsampler");
111
+ sourceNode = audioContext.createMediaStreamSource(mediaStream)
112
+ workletNode = new AudioWorkletNode(audioContext, 'mic-downsampler')
104
113
  workletNode.port.onmessage = (event) => {
105
- if (muted) return;
106
- options.onChunk(event.data);
107
- };
114
+ if (muted) return
115
+ options.onChunk(event.data)
116
+ }
108
117
  if (options.onVolume) {
109
- analyser = audioContext.createAnalyser();
110
- analyser.fftSize = 256;
111
- sourceNode.connect(analyser);
112
- const buf = new Float32Array(analyser.fftSize);
118
+ analyser = audioContext.createAnalyser()
119
+ analyser.fftSize = 256
120
+ sourceNode.connect(analyser)
121
+ const buf = new Float32Array(analyser.fftSize)
113
122
  volumeTimer = setInterval(() => {
114
- if (!analyser) return;
115
- analyser.getFloatTimeDomainData(buf);
116
- options.onVolume?.(computeRms(buf));
117
- }, VOLUME_INTERVAL_MS);
123
+ if (!analyser) return
124
+ analyser.getFloatTimeDomainData(buf)
125
+ options.onVolume?.(computeRms(buf))
126
+ }, VOLUME_INTERVAL_MS)
118
127
  }
119
- sourceNode.connect(workletNode);
120
- const sink = audioContext.createGain();
121
- sink.gain.value = 0;
122
- workletNode.connect(sink).connect(audioContext.destination);
123
- capturing = true;
128
+ sourceNode.connect(workletNode)
129
+ const sink = audioContext.createGain()
130
+ sink.gain.value = 0
131
+ workletNode.connect(sink).connect(audioContext.destination)
132
+ capturing = true
124
133
  } catch (err) {
125
- const wrapped = err instanceof Error ? err : new Error(typeof err === "string" ? err : "capture failed");
126
- options.onError?.(wrapped);
127
- throw wrapped;
134
+ const wrapped =
135
+ err instanceof Error ? err : new Error(typeof err === 'string' ? err : 'capture failed')
136
+ options.onError?.(wrapped)
137
+ throw wrapped
128
138
  }
129
- };
139
+ }
130
140
  const stop = () => {
131
- if (!capturing) return;
132
- capturing = false;
141
+ if (!capturing) return
142
+ capturing = false
133
143
  if (volumeTimer) {
134
- clearInterval(volumeTimer);
135
- volumeTimer = null;
144
+ clearInterval(volumeTimer)
145
+ volumeTimer = null
136
146
  }
137
147
  try {
138
- workletNode?.disconnect();
139
- analyser?.disconnect();
140
- sourceNode?.disconnect();
141
- } catch {
142
- }
143
- workletNode = null;
144
- analyser = null;
145
- sourceNode = null;
148
+ workletNode?.disconnect()
149
+ analyser?.disconnect()
150
+ sourceNode?.disconnect()
151
+ } catch {}
152
+ workletNode = null
153
+ analyser = null
154
+ sourceNode = null
146
155
  if (mediaStream) {
147
- for (const track of mediaStream.getTracks()) track.stop();
148
- mediaStream = null;
156
+ for (const track of mediaStream.getTracks()) track.stop()
157
+ mediaStream = null
149
158
  }
150
- if (audioContext && audioContext.state !== "closed") {
151
- void audioContext.close().catch(() => void 0);
159
+ if (audioContext && audioContext.state !== 'closed') {
160
+ void audioContext.close().catch(() => void 0)
152
161
  }
153
- audioContext = null;
154
- };
162
+ audioContext = null
163
+ }
155
164
  return {
156
165
  start,
157
166
  stop,
158
167
  mute: (v) => {
159
- muted = v;
168
+ muted = v
160
169
  },
161
- isCapturing: () => capturing
162
- };
163
- };
170
+ isCapturing: () => capturing,
171
+ }
172
+ }
164
173
 
165
174
  // src/AudioPlayback.ts
166
- var DEFAULT_SAMPLE_RATE = 16e3;
167
- var VOLUME_INTERVAL_MS2 = 100;
175
+ var DEFAULT_SAMPLE_RATE = 16e3
176
+ var VOLUME_INTERVAL_MS2 = 100
168
177
  var createAudioPlayback = (options = {}) => {
169
- const sampleRate = options.sampleRate ?? DEFAULT_SAMPLE_RATE;
170
- let audioContext = null;
171
- let gainNode = null;
172
- let analyser = null;
173
- let volumeTimer = null;
174
- let nextStartTime = 0;
175
- let scheduledNodes = [];
176
- let speaking = false;
178
+ const sampleRate = options.sampleRate ?? DEFAULT_SAMPLE_RATE
179
+ let audioContext = null
180
+ let gainNode = null
181
+ let analyser = null
182
+ let volumeTimer = null
183
+ let nextStartTime = 0
184
+ let scheduledNodes = []
185
+ let speaking = false
177
186
  const ensureContext = async () => {
178
187
  if (audioContext) {
179
- if (audioContext.state === "suspended") await audioContext.resume();
180
- return;
188
+ if (audioContext.state === 'suspended') await audioContext.resume()
189
+ return
181
190
  }
182
- audioContext = new AudioContext({ sampleRate });
183
- gainNode = audioContext.createGain();
191
+ audioContext = new AudioContext({ sampleRate })
192
+ gainNode = audioContext.createGain()
184
193
  if (options.onVolume) {
185
- analyser = audioContext.createAnalyser();
186
- analyser.fftSize = 256;
187
- gainNode.connect(analyser);
188
- const buf = new Float32Array(analyser.fftSize);
194
+ analyser = audioContext.createAnalyser()
195
+ analyser.fftSize = 256
196
+ gainNode.connect(analyser)
197
+ const buf = new Float32Array(analyser.fftSize)
189
198
  volumeTimer = setInterval(() => {
190
- if (!analyser) return;
191
- analyser.getFloatTimeDomainData(buf);
192
- let sum = 0;
193
- for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i];
194
- const rms = Math.sqrt(sum / buf.length);
195
- options.onVolume?.(Math.min(1, rms * 1.8));
196
- }, VOLUME_INTERVAL_MS2);
197
- }
198
- gainNode.connect(audioContext.destination);
199
- nextStartTime = audioContext.currentTime;
200
- };
199
+ if (!analyser) return
200
+ analyser.getFloatTimeDomainData(buf)
201
+ let sum = 0
202
+ for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i]
203
+ const rms = Math.sqrt(sum / buf.length)
204
+ options.onVolume?.(Math.min(1, rms * 1.8))
205
+ }, VOLUME_INTERVAL_MS2)
206
+ }
207
+ gainNode.connect(audioContext.destination)
208
+ nextStartTime = audioContext.currentTime
209
+ }
201
210
  const setSpeaking = (v) => {
202
- if (v === speaking) return;
203
- speaking = v;
204
- options.onSpeakingChange?.(v);
205
- };
211
+ if (v === speaking) return
212
+ speaking = v
213
+ options.onSpeakingChange?.(v)
214
+ }
206
215
  const pruneFinished = () => {
207
- const now = audioContext?.currentTime ?? 0;
216
+ const now = audioContext?.currentTime ?? 0
208
217
  scheduledNodes = scheduledNodes.filter((n) => {
209
- const node = n;
210
- return (node._endsAt ?? 0) > now;
211
- });
212
- if (scheduledNodes.length === 0) setSpeaking(false);
213
- };
218
+ const node = n
219
+ return (node._endsAt ?? 0) > now
220
+ })
221
+ if (scheduledNodes.length === 0) setSpeaking(false)
222
+ }
214
223
  const enqueue = (pcm) => {
215
224
  if (!audioContext) {
216
- void ensureContext().then(() => enqueue(pcm));
217
- return;
218
- }
219
- if (!audioContext || !gainNode) return;
220
- const int16 = new Int16Array(pcm);
221
- if (int16.length === 0) return;
222
- const audioBuffer = audioContext.createBuffer(1, int16.length, sampleRate);
223
- const float32 = audioBuffer.getChannelData(0);
225
+ void ensureContext().then(() => enqueue(pcm))
226
+ return
227
+ }
228
+ if (!audioContext || !gainNode) return
229
+ const int16 = new Int16Array(pcm)
230
+ if (int16.length === 0) return
231
+ const audioBuffer = audioContext.createBuffer(1, int16.length, sampleRate)
232
+ const float32 = audioBuffer.getChannelData(0)
224
233
  for (let i = 0; i < int16.length; i++) {
225
- float32[i] = int16[i] / 32768;
226
- }
227
- const node = audioContext.createBufferSource();
228
- node.buffer = audioBuffer;
229
- node.connect(gainNode);
230
- const now = audioContext.currentTime;
231
- const startAt = Math.max(now, nextStartTime);
232
- node.start(startAt);
233
- const duration = int16.length / sampleRate;
234
- node._endsAt = startAt + duration;
235
- nextStartTime = startAt + duration;
236
- scheduledNodes.push(node);
237
- setSpeaking(true);
238
- node.onended = () => pruneFinished();
239
- };
234
+ float32[i] = int16[i] / 32768
235
+ }
236
+ const node = audioContext.createBufferSource()
237
+ node.buffer = audioBuffer
238
+ node.connect(gainNode)
239
+ const now = audioContext.currentTime
240
+ const startAt = Math.max(now, nextStartTime)
241
+ node.start(startAt)
242
+ const duration = int16.length / sampleRate
243
+ node._endsAt = startAt + duration
244
+ nextStartTime = startAt + duration
245
+ scheduledNodes.push(node)
246
+ setSpeaking(true)
247
+ node.onended = () => pruneFinished()
248
+ }
240
249
  const flush = () => {
241
- if (!audioContext || !gainNode) return;
250
+ if (!audioContext || !gainNode) return
242
251
  for (const node of scheduledNodes) {
243
252
  try {
244
- node.stop();
245
- } catch {
246
- }
253
+ node.stop()
254
+ } catch {}
247
255
  }
248
- scheduledNodes = [];
249
- gainNode.disconnect();
250
- gainNode = audioContext.createGain();
256
+ scheduledNodes = []
257
+ gainNode.disconnect()
258
+ gainNode = audioContext.createGain()
251
259
  if (analyser) {
252
- analyser.disconnect();
253
- gainNode.connect(analyser);
260
+ analyser.disconnect()
261
+ gainNode.connect(analyser)
254
262
  }
255
- gainNode.connect(audioContext.destination);
256
- nextStartTime = audioContext.currentTime;
257
- setSpeaking(false);
258
- };
263
+ gainNode.connect(audioContext.destination)
264
+ nextStartTime = audioContext.currentTime
265
+ setSpeaking(false)
266
+ }
259
267
  const close = () => {
260
- flush();
268
+ flush()
261
269
  if (volumeTimer) {
262
- clearInterval(volumeTimer);
263
- volumeTimer = null;
270
+ clearInterval(volumeTimer)
271
+ volumeTimer = null
264
272
  }
265
- if (audioContext && audioContext.state !== "closed") {
266
- void audioContext.close().catch(() => void 0);
273
+ if (audioContext && audioContext.state !== 'closed') {
274
+ void audioContext.close().catch(() => void 0)
267
275
  }
268
- audioContext = null;
269
- gainNode = null;
270
- analyser = null;
271
- };
276
+ audioContext = null
277
+ gainNode = null
278
+ analyser = null
279
+ }
272
280
  const resume = async () => {
273
- await ensureContext();
274
- };
275
- return { enqueue, flush, close, resume };
276
- };
281
+ await ensureContext()
282
+ }
283
+ return { enqueue, flush, close, resume }
284
+ }
277
285
 
278
286
  // src/ReconnectingWebSocket.ts
279
- var READYSTATE_OPEN = 1;
280
- var READYSTATE_CLOSED = 3;
287
+ var READYSTATE_OPEN = 1
288
+ var READYSTATE_CLOSED = 3
281
289
  var createReconnectingWebSocket = (options, onEvent) => {
282
- const maxRetries = options.maxRetries ?? 3;
283
- const initialBackoff = options.initialBackoffMs ?? 500;
284
- const maxBackoff = options.maxBackoffMs ?? 8e3;
285
- let ws = null;
286
- let intentionalClose = false;
287
- let retries = 0;
288
- let backoff = initialBackoff;
289
- let reconnectTimer = null;
290
+ const maxRetries = options.maxRetries ?? 3
291
+ const initialBackoff = options.initialBackoffMs ?? 500
292
+ const maxBackoff = options.maxBackoffMs ?? 8e3
293
+ let ws = null
294
+ let intentionalClose = false
295
+ let retries = 0
296
+ let backoff = initialBackoff
297
+ let reconnectTimer = null
290
298
  const openOnce = () => {
291
- ws = options.wsFactory(options.url);
292
- ws.binaryType = "arraybuffer";
299
+ ws = options.wsFactory(options.url)
300
+ ws.binaryType = 'arraybuffer'
293
301
  ws.onopen = () => {
294
- if (retries === 0) onEvent({ type: "open" });
295
- else onEvent({ type: "reconnected" });
296
- retries = 0;
297
- backoff = initialBackoff;
298
- };
302
+ if (retries === 0) onEvent({ type: 'open' })
303
+ else onEvent({ type: 'reconnected' })
304
+ retries = 0
305
+ backoff = initialBackoff
306
+ }
299
307
  ws.onmessage = (ev) => {
300
- onEvent({ type: "message", data: ev.data });
301
- };
308
+ onEvent({ type: 'message', data: ev.data })
309
+ }
302
310
  ws.onerror = () => {
303
- onEvent({ type: "error", error: new Error("WebSocket error") });
304
- };
311
+ onEvent({ type: 'error', error: new Error('WebSocket error') })
312
+ }
305
313
  ws.onclose = (ev) => {
306
- ws = null;
307
- const shouldRetry = !intentionalClose && retries < maxRetries;
314
+ ws = null
315
+ const shouldRetry = !intentionalClose && retries < maxRetries
308
316
  if (!shouldRetry) {
309
317
  onEvent({
310
- type: "close",
318
+ type: 'close',
311
319
  code: ev.code,
312
320
  reason: ev.reason,
313
- permanent: true
314
- });
315
- return;
321
+ permanent: true,
322
+ })
323
+ return
316
324
  }
317
325
  onEvent({
318
- type: "close",
326
+ type: 'close',
319
327
  code: ev.code,
320
328
  reason: ev.reason,
321
- permanent: false
322
- });
323
- retries++;
324
- const delay = Math.min(backoff, maxBackoff);
325
- backoff = Math.min(backoff * 2, maxBackoff);
326
- reconnectTimer = setTimeout(openOnce, delay);
327
- };
328
- };
329
- openOnce();
329
+ permanent: false,
330
+ })
331
+ retries++
332
+ const delay = Math.min(backoff, maxBackoff)
333
+ backoff = Math.min(backoff * 2, maxBackoff)
334
+ reconnectTimer = setTimeout(openOnce, delay)
335
+ }
336
+ }
337
+ openOnce()
330
338
  return {
331
339
  send: (data) => {
332
- if (ws && ws.readyState === READYSTATE_OPEN) ws.send(data);
340
+ if (ws && ws.readyState === READYSTATE_OPEN) ws.send(data)
333
341
  },
334
- close: (code = 1e3, reason = "client-requested") => {
335
- intentionalClose = true;
342
+ close: (code = 1e3, reason = 'client-requested') => {
343
+ intentionalClose = true
336
344
  if (reconnectTimer) {
337
- clearTimeout(reconnectTimer);
338
- reconnectTimer = null;
345
+ clearTimeout(reconnectTimer)
346
+ reconnectTimer = null
339
347
  }
340
348
  try {
341
- ws?.close(code, reason);
342
- } catch {
343
- }
349
+ ws?.close(code, reason)
350
+ } catch {}
344
351
  },
345
- readyState: () => ws?.readyState ?? READYSTATE_CLOSED
346
- };
347
- };
352
+ readyState: () => ws?.readyState ?? READYSTATE_CLOSED,
353
+ }
354
+ }
348
355
 
349
356
  // src/protocol.ts
350
357
  var createProtocolState = () => ({
351
- state: "idle",
358
+ state: 'idle',
352
359
  transcript: [],
353
360
  agentBubbleId: null,
354
361
  idCounter: 0,
355
- endReason: null
356
- });
362
+ endReason: null,
363
+ })
357
364
  var mapEndReason = (raw) => {
358
- if (raw === "agent_ended") return "agent_ended";
359
- if (raw === "caller_hung_up") return "user_hangup";
360
- if (raw === "silence_timeout" || raw === "max_duration") return "timeout";
361
- return "error";
362
- };
365
+ if (raw === 'agent_ended') return 'agent_ended'
366
+ if (raw === 'caller_hung_up') return 'user_hangup'
367
+ if (raw === 'silence_timeout' || raw === 'max_duration') return 'timeout'
368
+ return 'error'
369
+ }
363
370
  function handleServerMessage(raw, state, cb) {
364
- let msg;
371
+ let msg
365
372
  try {
366
- msg = JSON.parse(raw);
373
+ msg = JSON.parse(raw)
367
374
  } catch {
368
- return;
375
+ return
369
376
  }
370
377
  switch (msg.type) {
371
- case "connected":
372
- cb.onConnected();
373
- setState(state, "listening", cb);
374
- return;
375
- case "transcript": {
376
- const text = msg.text ?? "";
377
- if (!text) return;
378
- const isFinal = !!msg.isFinal;
379
- if (!isFinal) setState(state, "user_speaking", cb);
380
- upsertUserPartial(state, text, isFinal);
381
- cb.onTranscript(state.transcript);
382
- return;
383
- }
384
- case "agent_turn_start": {
385
- const id = `m${state.idCounter++}`;
386
- state.agentBubbleId = id;
387
- state.transcript = [...state.transcript, { id, role: "agent", text: "" }];
388
- cb.onTranscript(state.transcript);
389
- cb.onAgentTurnStart();
390
- setState(state, "agent_speaking", cb);
391
- return;
392
- }
393
- case "agent_text": {
394
- const delta = msg.text ?? "";
395
- if (!delta || !state.agentBubbleId) return;
396
- const id = state.agentBubbleId;
397
- state.transcript = state.transcript.map(
398
- (e) => e.id === id && e.role === "agent" ? { ...e, text: e.text + delta } : e
399
- );
400
- cb.onTranscript(state.transcript);
401
- return;
402
- }
403
- case "agent_turn_end":
404
- state.agentBubbleId = null;
405
- setState(state, "listening", cb);
406
- return;
407
- case "interrupt":
408
- cb.onInterrupt();
409
- return;
410
- case "agent_turn_abort": {
411
- const committed = (msg.committedText ?? "").trim();
378
+ case 'connected':
379
+ cb.onConnected()
380
+ setState(state, 'listening', cb)
381
+ return
382
+ case 'transcript': {
383
+ const text = msg.text ?? ''
384
+ if (!text) return
385
+ const isFinal = !!msg.isFinal
386
+ if (!isFinal) setState(state, 'user_speaking', cb)
387
+ upsertUserPartial(state, text, isFinal)
388
+ cb.onTranscript(state.transcript)
389
+ return
390
+ }
391
+ case 'agent_turn_start': {
392
+ const id = `m${state.idCounter++}`
393
+ state.agentBubbleId = id
394
+ state.transcript = [...state.transcript, { id, role: 'agent', text: '' }]
395
+ cb.onTranscript(state.transcript)
396
+ const seq = typeof msg.seq === 'number' ? msg.seq : void 0
397
+ cb.onAgentTurnStart(seq)
398
+ setState(state, 'agent_speaking', cb)
399
+ return
400
+ }
401
+ case 'agent_text': {
402
+ const delta = msg.text ?? ''
403
+ if (!delta || !state.agentBubbleId) return
404
+ const id = state.agentBubbleId
405
+ state.transcript = state.transcript.map((e) =>
406
+ e.id === id && e.role === 'agent' ? { ...e, text: e.text + delta } : e,
407
+ )
408
+ cb.onTranscript(state.transcript)
409
+ return
410
+ }
411
+ case 'agent_turn_end': {
412
+ state.agentBubbleId = null
413
+ const seq = typeof msg.seq === 'number' ? msg.seq : void 0
414
+ cb.onAgentTurnEnd(seq)
415
+ setState(state, 'listening', cb)
416
+ return
417
+ }
418
+ case 'interrupt':
419
+ cb.onInterrupt()
420
+ return
421
+ case 'agent_turn_abort': {
422
+ const committed = (msg.committedText ?? '').trim()
412
423
  if (state.agentBubbleId) {
413
- const id = state.agentBubbleId;
424
+ const id = state.agentBubbleId
414
425
  if (committed) {
415
- state.transcript = state.transcript.map(
416
- (e) => e.id === id && e.role === "agent" ? { ...e, text: committed, interrupted: true } : e
417
- );
426
+ state.transcript = state.transcript.map((e) =>
427
+ e.id === id && e.role === 'agent' ? { ...e, text: committed, interrupted: true } : e,
428
+ )
418
429
  } else {
419
- state.transcript = state.transcript.filter((e) => e.id !== id);
430
+ state.transcript = state.transcript.filter((e) => e.id !== id)
420
431
  }
421
- cb.onTranscript(state.transcript);
432
+ cb.onTranscript(state.transcript)
422
433
  }
423
- state.agentBubbleId = null;
424
- return;
434
+ state.agentBubbleId = null
435
+ return
425
436
  }
426
- case "tool_call":
437
+ case 'tool_call':
427
438
  state.transcript = [
428
439
  ...state.transcript,
429
440
  {
430
441
  id: `m${state.idCounter++}`,
431
- role: "tool",
432
- text: `\u2192 ${String(msg.tool ?? "?")}(${msg.args ? JSON.stringify(msg.args) : ""})`
433
- }
434
- ];
435
- cb.onTranscript(state.transcript);
436
- return;
437
- case "tool_result":
442
+ role: 'tool',
443
+ text: `\u2192 ${String(msg.tool ?? '?')}(${msg.args ? JSON.stringify(msg.args) : ''})`,
444
+ },
445
+ ]
446
+ cb.onTranscript(state.transcript)
447
+ return
448
+ case 'tool_result':
438
449
  state.transcript = [
439
450
  ...state.transcript,
440
451
  {
441
452
  id: `m${state.idCounter++}`,
442
- role: "tool",
443
- text: `${msg.ok ? "\u2713" : "\u2717"} ${String(msg.tool ?? "?")}`
444
- }
445
- ];
446
- cb.onTranscript(state.transcript);
447
- return;
448
- case "client_tool_call": {
449
- const toolCallId = String(msg.toolCallId ?? "");
450
- const name = String(msg.name ?? "");
451
- const args = msg.args ?? {};
452
- if (!toolCallId || !name) return;
453
- cb.onClientToolCall({ toolCallId, name, args });
454
- return;
455
- }
456
- case "call_end": {
457
- const reasonRaw = String(msg.reason ?? "");
458
- const reason = mapEndReason(reasonRaw);
459
- state.endReason = reason;
453
+ role: 'tool',
454
+ text: `${msg.ok ? '\u2713' : '\u2717'} ${String(msg.tool ?? '?')}`,
455
+ },
456
+ ]
457
+ cb.onTranscript(state.transcript)
458
+ return
459
+ case 'client_tool_call': {
460
+ const toolCallId = String(msg.toolCallId ?? '')
461
+ const name = String(msg.name ?? '')
462
+ const args = msg.args ?? {}
463
+ if (!toolCallId || !name) return
464
+ cb.onClientToolCall({ toolCallId, name, args })
465
+ return
466
+ }
467
+ case 'call_end': {
468
+ const reasonRaw = String(msg.reason ?? '')
469
+ const reason = mapEndReason(reasonRaw)
470
+ state.endReason = reason
460
471
  state.transcript = [
461
472
  ...state.transcript,
462
473
  {
463
474
  id: `m${state.idCounter++}`,
464
- role: "system",
465
- text: `call ended${reasonRaw ? ` (${reasonRaw})` : ""}`
466
- }
467
- ];
468
- cb.onTranscript(state.transcript);
469
- cb.onCallEnd(reason);
470
- return;
475
+ role: 'system',
476
+ text: `call ended${reasonRaw ? ` (${reasonRaw})` : ''}`,
477
+ },
478
+ ]
479
+ cb.onTranscript(state.transcript)
480
+ cb.onCallEnd(reason)
481
+ return
471
482
  }
472
- case "error": {
473
- const code = msg.code ?? "server_error";
474
- const message = msg.message ?? "server error";
475
- cb.onError({ code, message });
476
- return;
483
+ case 'error': {
484
+ const code = msg.code ?? 'server_error'
485
+ const message = msg.message ?? 'server error'
486
+ cb.onError({ code, message })
487
+ return
477
488
  }
478
489
  }
479
490
  }
480
491
/**
 * Reports a protocol state transition to the callback sink.
 *
 * Nothing happens when `state.state` already equals `next`. This helper does
 * not write to `state`; it only invokes `cb.onState(next)`.
 *
 * @param {{ state: string }} state - Current protocol state holder.
 * @param {string} next - State being transitioned to.
 * @param {{ onState: (next: string) => void }} cb - Callback sink.
 */
var setState = (state, next, cb) => {
  const isTransition = state.state !== next
  if (isTransition) {
    cb.onState(next)
  }
}
484
495
/**
 * Inserts or updates the user's in-progress transcript entry.
 *
 * Searches from the end of `state.transcript` for the most recent entry with
 * `role === 'user'` and `committed === false`. When one exists its `text` and
 * `committed` fields are replaced; otherwise a new user entry is appended with
 * an id drawn from `state.idCounter`. `state.transcript` is always reassigned
 * to a new array rather than mutated in place.
 *
 * @param {{ transcript: object[], idCounter: number }} state
 * @param {string} text - Latest recognised text.
 * @param {boolean} isFinal - Stored as the entry's `committed` flag.
 */
var upsertUserPartial = (state, text, isFinal) => {
  const entries = state.transcript
  let pos = entries.length - 1
  while (pos >= 0) {
    const candidate = entries[pos]
    if (candidate.role === 'user' && candidate.committed === false) break
    pos -= 1
  }
  if (pos < 0) {
    const id = `m${state.idCounter++}`
    state.transcript = [...entries, { id, role: 'user', text, committed: isFinal }]
    return
  }
  const updated = { ...entries[pos], text, committed: isFinal }
  const copy = [...entries]
  copy[pos] = updated
  state.transcript = copy
}
505
516
/**
 * Builds the WebSocket URL for an agent call.
 *
 * Only the protocol and host of `args.apiBase` are used: `https:` maps to
 * `wss:`, anything else to `ws:`. The agent id and token are
 * percent-encoded with `encodeURIComponent`. `&barge=off` is appended only
 * when `args.bargeIn` is strictly `false`.
 *
 * @param {{ apiBase: string, agentId: string, token: string, bargeIn?: boolean }} args
 * @returns {string} The call endpoint URL.
 */
function buildWsUrl(args) {
  const { protocol, host } = new URL(args.apiBase)
  const scheme = protocol === 'https:' ? 'wss:' : 'ws:'
  const agent = encodeURIComponent(args.agentId)
  const token = encodeURIComponent(args.token)
  let url = `${scheme}//${host}/v1/agents/${agent}/call?token=${token}`
  if (args.bargeIn === false) {
    url += '&barge=off'
  }
  return url
}
511
522
 
512
523
  // src/clientTools.ts
513
// Limits enforced on the client tool map before it is registered.
var NAME_RE = /^[a-zA-Z_][a-zA-Z0-9_]*$/
var MAX_TOOLS = 64
var MAX_USAGE = 500
var MAX_TIMEOUT_MS = 3e4

/**
 * Validates the `clientTools` option.
 *
 * `undefined` is accepted (no client tools). Otherwise `tools` must be a
 * plain, non-array object whose keys are identifier-like tool names and whose
 * values are objects with a non-empty string `description` and a function
 * `handler`. Optional `usage` must be a string of at most `MAX_USAGE`
 * characters; optional `timeoutMs` must be a finite number in
 * `(0, MAX_TIMEOUT_MS]`.
 *
 * @param {Record<string, object> | undefined} tools
 * @throws {Error} With a message naming the offending tool and rule.
 */
var validateClientToolMap = (tools) => {
  if (tools === void 0) return
  if (typeof tools !== 'object' || tools === null || Array.isArray(tools)) {
    throw new Error('clientTools must be an object keyed by tool name')
  }
  const entries = Object.entries(tools)
  if (entries.length > MAX_TOOLS) {
    throw new Error(`clientTools may declare at most ${MAX_TOOLS} tools (got ${entries.length})`)
  }
  for (const [name, def] of entries) {
    if (!NAME_RE.test(name)) {
      throw new Error(
        `clientTools["${name}"]: name must be a valid identifier (^[a-zA-Z_][a-zA-Z0-9_]*$)`,
      )
    }
    if (!def || typeof def !== 'object') {
      throw new Error(`clientTools["${name}"]: must be an object`)
    }
    if (typeof def.description !== 'string' || def.description.length === 0) {
      throw new Error(`clientTools["${name}"]: must have a description`)
    }
    if (typeof def.handler !== 'function') {
      throw new Error(`clientTools["${name}"]: must have a handler function`)
    }
    if (def.usage !== void 0) {
      // Check the type first: reading `.length` on `null` would raise a raw
      // TypeError, and on a number it would silently skip the limit.
      if (typeof def.usage !== 'string') {
        throw new Error(`clientTools["${name}"]: usage must be a string`)
      }
      if (def.usage.length > MAX_USAGE) {
        throw new Error(`clientTools["${name}"]: usage must be \u2264${MAX_USAGE} chars`)
      }
    }
    if (
      def.timeoutMs !== void 0 &&
      (!Number.isFinite(def.timeoutMs) || def.timeoutMs <= 0 || def.timeoutMs > MAX_TIMEOUT_MS)
    ) {
      throw new Error(`clientTools["${name}"]: timeoutMs must be in (0, ${MAX_TIMEOUT_MS}]`)
    }
  }
}
549
563
/**
 * Builds the `client_tools_register` frame describing every client tool.
 *
 * Each declaration carries `name`, `description` and `parameters`; `usage`
 * and `timeoutMs` are included only when they are not `undefined`. Handlers
 * are not part of the frame.
 *
 * @param {Record<string, object>} tools - Tool definitions keyed by name.
 * @returns {{ type: 'client_tools_register', tools: object[] }}
 */
var buildRegisterFrame = (tools) => {
  const declarations = []
  for (const [name, def] of Object.entries(tools)) {
    const declaration = {
      name,
      description: def.description,
      parameters: def.parameters,
    }
    if (def.usage !== undefined) {
      declaration.usage = def.usage
    }
    if (def.timeoutMs !== undefined) {
      declaration.timeoutMs = def.timeoutMs
    }
    declarations.push(declaration)
  }
  return { type: 'client_tools_register', tools: declarations }
}
559
573
  var dispatchClientToolCall = (send, tools, frame) => {
560
574
  const safeSend = (payload) => {
561
575
  try {
562
- send(payload);
563
- } catch {
564
- }
565
- };
566
- const tool = tools[frame.name];
576
+ send(payload)
577
+ } catch {}
578
+ }
579
+ const tool = tools[frame.name]
567
580
  if (!tool) {
568
581
  safeSend({
569
- type: "client_tool_result",
582
+ type: 'client_tool_result',
570
583
  toolCallId: frame.toolCallId,
571
- error: `No handler for ${frame.name}`
572
- });
573
- return;
584
+ error: `No handler for ${frame.name}`,
585
+ })
586
+ return
574
587
  }
575
588
  void (async () => {
576
589
  try {
577
- const out = await tool.handler(frame.args);
590
+ const out = await tool.handler(frame.args)
578
591
  safeSend({
579
- type: "client_tool_result",
592
+ type: 'client_tool_result',
580
593
  toolCallId: frame.toolCallId,
581
- result: typeof out === "string" ? out : JSON.stringify(out)
582
- });
594
+ result: typeof out === 'string' ? out : JSON.stringify(out),
595
+ })
583
596
  } catch (err) {
584
597
  safeSend({
585
- type: "client_tool_result",
598
+ type: 'client_tool_result',
586
599
  toolCallId: frame.toolCallId,
587
- error: err instanceof Error ? err.message : String(err)
588
- });
600
+ error: err instanceof Error ? err.message : String(err),
601
+ })
589
602
  }
590
- })();
591
- };
603
+ })()
604
+ }
605
+
606
+ // src/ClientMarksBuffer.ts
607
/**
 * Creates a buffer that collects per-turn client timing marks and emits one
 * `client_marks` frame per agent turn.
 *
 * A turn is keyed by `seq`. The first outbound-audio timestamp recorded since
 * the previous turn start is attached to the next turn that starts; the first
 * audible-output timestamp is attached to the most recently started turn that
 * has not ended. When a turn ends, a frame is sent whose `marks` contains
 * `client_mic_to_first_audible_ms` only if both timestamps were captured.
 *
 * @param {{ send: (frame: object) => void, now?: () => number }} args
 *   `now` defaults to `performance.now()`.
 * @returns {{
 *   markFirstOutboundAudio: () => void,
 *   markFirstAudibleOutput: () => void,
 *   onAgentTurnStart: (seq: number) => void,
 *   onAgentTurnEnd: (seq: number) => void,
 *   flush: () => void,
 * }}
 */
var createClientMarksBuffer = (args) => {
  const clock = args.now ?? (() => performance.now())
  const turns = new Map()
  let outboundStamp = null

  // Sends the frame for an ended turn, then forgets the turn.
  const emitIfEnded = (seq) => {
    const turn = turns.get(seq)
    if (!turn || !turn.ended) return
    const marks = {}
    const hasBothStamps = turn.firstOutboundAt !== null && turn.firstAudibleAt !== null
    if (hasBothStamps) {
      marks.client_mic_to_first_audible_ms = turn.firstAudibleAt - turn.firstOutboundAt
    }
    args.send({ type: 'client_marks', seq, marks, clientNow: Date.now() })
    turns.delete(seq)
  }

  return {
    // Only the first call since the last turn start records a timestamp.
    markFirstOutboundAudio: () => {
      if (outboundStamp === null) {
        outboundStamp = clock()
      }
    },
    // Stamps the latest still-open turn, once.
    markFirstAudibleOutput: () => {
      const open = Array.from(turns.values()).filter((turn) => !turn.ended)
      const latest = open[open.length - 1]
      if (!latest) return
      if (latest.firstAudibleAt === null) {
        latest.firstAudibleAt = clock()
      }
    },
    onAgentTurnStart: (seq) => {
      const turn = { firstOutboundAt: outboundStamp, firstAudibleAt: null, ended: false }
      turns.set(seq, turn)
      outboundStamp = null
    },
    // An unknown seq still produces a frame, with empty marks.
    onAgentTurnEnd: (seq) => {
      const turn = turns.get(seq)
      if (turn) {
        turn.ended = true
        emitIfEnded(seq)
        return
      }
      args.send({ type: 'client_marks', seq, marks: {}, clientNow: Date.now() })
    },
    // Ends and emits every turn still being tracked.
    flush: () => {
      const pending = Array.from(turns.keys())
      for (const seq of pending) {
        turns.get(seq).ended = true
        emitIfEnded(seq)
      }
      outboundStamp = null
    },
  }
}
592
675
 
593
676
  // src/VoiceClient.ts
594
677
/**
 * WebSocket-backed voice call. Owns the reconnecting socket, microphone
 * capture, audio playback, protocol state and the client timing-marks buffer,
 * and forwards events to the callbacks supplied in `args.options`.
 */
var BrowserVoiceClient = class {
  /**
   * @param {{ config: object, options: object, token: string, wsFactory: Function }} args
   * @throws {Error} When `args.options.clientTools` fails validation.
   */
  constructor(args) {
    this.rws = null
    this.capture = null
    this.playback = null
    this.muted = false
    this.inputVolume = 0
    this.outputVolume = 0
    this.startedAt = null
    this.endedFired = false
    this.lastError = null

    // Hang up from the user's side.
    this.end = () => {
      this.teardown('user_hangup')
    }
    this.mute = () => {
      if (!this.muted) {
        this.muted = true
        this.capture?.mute(true)
      }
    }
    this.unmute = () => {
      if (this.muted) {
        this.muted = false
        this.capture?.mute(false)
      }
    }

    // ---------------------------------------------------------------
    // Internal
    // ---------------------------------------------------------------
    this.sendClientToolsRegister = () => {
      const registerFrame = buildRegisterFrame(this.args.options.clientTools ?? {})
      this.rws?.send(JSON.stringify(registerFrame))
    }
    // Records the new state and notifies `onStateChange`; no-op when unchanged.
    this.setState = (next) => {
      const unchanged = this.proto.state === next
      if (unchanged) return
      this.proto.state = next
      this.args.options.onStateChange?.(next)
    }
    // Remembers the error so teardown can report it, then notifies `onError`.
    this.emitError = (err) => {
      this.lastError = err
      this.args.options.onError?.(err)
    }

    // Callback sink handed to handleServerMessage for each text frame.
    const protocolCallbacks = () => ({
      onState: this.setState,
      onTranscript: (entries) => this.args.options.onTranscript?.(entries),
      onError: this.emitError,
      onInterrupt: () => {
        this.playback?.flush()
        this.args.options.onInterrupt?.()
      },
      onAgentTurnStart: (seq) => {
        if (typeof seq === 'number') this.marks.onAgentTurnStart(seq)
        this.args.options.onAgentTurnStart?.()
      },
      onAgentTurnEnd: (seq) => {
        if (typeof seq === 'number') this.marks.onAgentTurnEnd(seq)
      },
      onCallEnd: (reason) => this.teardown(reason),
      onConnected: () => this.sendClientToolsRegister(),
      onClientToolCall: (frame) =>
        dispatchClientToolCall(
          (f) => this.rws?.send(JSON.stringify(f)),
          this.args.options.clientTools ?? {},
          frame,
        ),
    })

    this.handleSocketEvent = (ev) => {
      const kind = ev.type
      if (kind === 'open') {
        void this.startCapture()
      } else if (kind === 'reconnected') {
        // The transcript is cleared on reconnect and the listener is told.
        this.proto.transcript = []
        this.proto.agentBubbleId = null
        this.args.options.onTranscript?.(this.proto.transcript)
        void this.startCapture()
        this.setState('listening')
      } else if (kind === 'message') {
        if (typeof ev.data === 'string') {
          handleServerMessage(ev.data, this.proto, protocolCallbacks())
        } else {
          // Non-string frames are queued for playback.
          this.marks.markFirstAudibleOutput()
          this.playback?.enqueue(ev.data)
        }
      } else if (kind === 'close') {
        if (ev.permanent) {
          const fallback = this.lastError ? 'error' : 'user_hangup'
          this.teardown(this.proto.endReason ?? fallback)
        }
      } else if (kind === 'error') {
        this.emitError({ code: 'socket_error', message: ev.error.message })
      }
    }

    this.startCapture = async () => {
      if (this.capture?.isCapturing()) return
      const capture = createAudioCapture({
        onChunk: (pcm) => {
          this.marks.markFirstOutboundAudio()
          this.rws?.send(pcm)
        },
        onVolume: (v) => {
          this.inputVolume = v
          this.args.options.onVolume?.({ input: v, output: this.outputVolume })
        },
        onError: (err) => {
          const code = err.name === 'NotAllowedError' ? 'mic_denied' : 'mic_start_failed'
          this.emitError({ code, message: err.message })
        },
      })
      this.capture = capture
      if (this.muted) capture.mute(true)
      try {
        await capture.start()
      } catch {
        // Ignored here. NOTE(review): presumably start failures also reach
        // the onError callback above — confirm against createAudioCapture.
      }
    }

    this.teardown = (reason) => {
      // Emit outstanding timing marks while the socket is still referenced.
      try {
        this.marks.flush()
      } catch {}
      this.capture?.stop()
      this.capture = null
      this.playback?.close()
      this.playback = null
      try {
        this.rws?.close(1e3, reason)
      } catch {}
      this.rws = null
      this.setState('ended')
      this.fireEndOnce(reason)
    }

    // Guarantees `onEnd` is delivered at most once per call.
    this.fireEndOnce = (reason) => {
      if (this.endedFired) return
      this.endedFired = true
      const startedAt = this.startedAt ?? Date.now()
      const errorCode = reason === 'error' ? this.lastError?.code : void 0
      this.args.options.onEnd?.({
        reason,
        errorCode,
        durationMs: Date.now() - startedAt,
      })
    }

    this.args = args
    this.proto = createProtocolState()
    validateClientToolMap(args.options.clientTools)
    this.marks = createClientMarksBuffer({
      send: (frame) => {
        try {
          this.rws?.send(JSON.stringify(frame))
        } catch {}
      },
    })
  }

  // ---------------------------------------------------------------
  // Call interface
  // ---------------------------------------------------------------
  /** Current protocol state. */
  get state() {
    return this.proto.state
  }

  /** Shallow copy of the transcript entries. */
  get transcript() {
    return this.proto.transcript.slice()
  }

  /** Whether the microphone is currently muted. */
  get isMuted() {
    return this.muted
  }

  // ---------------------------------------------------------------
  // Lifecycle — called by the factory immediately after construction;
  // failures arrive via `onError`.
  // ---------------------------------------------------------------
  /**
   * Moves to `connecting`, creates and resumes audio playback, then opens the
   * reconnecting WebSocket. A rejected `playback.resume()` is ignored.
   */
  async start() {
    this.setState('connecting')
    this.startedAt = Date.now()
    const { config, options, token, wsFactory } = this.args
    const url = buildWsUrl({
      apiBase: config.apiBase,
      agentId: options.agentId,
      token,
      bargeIn: options.bargeIn,
    })
    this.playback = createAudioPlayback({
      onVolume: (v) => {
        this.outputVolume = v
        this.args.options.onVolume?.({ input: this.inputVolume, output: v })
      },
    })
    try {
      await this.playback.resume()
    } catch {}
    const socketOptions = { url, wsFactory, maxRetries: 3 }
    this.rws = createReconnectingWebSocket(socketOptions, (ev) => this.handleSocketEvent(ev))
  }
}
875
+
876
+ // src/webrtc/createWebRtcCall.ts
877
/**
 * Starts a voice call over WebRTC.
 *
 * Creates an `RTCPeerConnection` with a hidden autoplay `<audio>` element for
 * the remote track, captures the microphone, opens an ordered `control` data
 * channel whose string messages are fed to `handleServerMessage`, and
 * performs HTTP signaling (offer POST, then trickled ICE candidate POSTs).
 *
 * Every setup failure is reported exactly once through `opts.onError`, moves
 * the state to `error`, releases the microphone / peer connection / audio
 * element, and rejects the returned promise with the underlying error.
 *
 * @param {object} opts
 * @param {string} opts.agentId
 * @param {string} opts.apiBase - Used when `webrtcGatewayBase` is not set.
 * @param {string} opts.token - Sent as the `token` query parameter.
 * @param {string} [opts.webrtcGatewayBase] - Alternative signaling base URL.
 * @param {Function} [opts.onStateChange]
 * @param {Function} [opts.onTranscript]
 * @param {Function} [opts.onError]
 * @param {Function} [opts.onEnd] - Called with no arguments on teardown.
 * @param {Function} [opts.onInterrupt]
 * @param {Function} [opts.onAgentTurnStart]
 * @returns {Promise<{ state: string, transcript: object[], isMuted: boolean,
 *   end: () => void, mute: () => void, unmute: () => void }>}
 */
async function createWebRtcCall(opts) {
  const proto = createProtocolState()
  let muted = false
  let ended = false
  let mic

  const fireState = (next) => {
    if (proto.state === next) return
    proto.state = next
    opts.onStateChange?.(next)
  }

  fireState('connecting')
  const pc = new RTCPeerConnection({
    iceServers: [{ urls: 'stun:stun.l.google.com:19302' }],
  })
  const audioEl = document.createElement('audio')
  audioEl.autoplay = true
  audioEl.style.display = 'none'
  document.body.appendChild(audioEl)

  // Stops the mic (if acquired), closes the connection, removes the element.
  const releaseResources = () => {
    try {
      mic?.getTracks().forEach((t) => t.stop())
    } catch {}
    try {
      pc.close()
    } catch {}
    try {
      audioEl.remove()
    } catch {}
  }

  // Shared setup-failure path: report, flag the state, free everything.
  const fail = (error) => {
    opts.onError?.(error)
    fireState('error')
    releaseResources()
  }

  // Declared before any handler that references it so no callback can hit
  // the binding before it is initialised.
  const teardown = () => {
    if (ended) return
    ended = true
    releaseResources()
    fireState('ended')
    opts.onEnd?.()
  }

  const dispatch = (raw) => {
    handleServerMessage(raw, proto, {
      onState: fireState,
      onTranscript: (entries) => opts.onTranscript?.(entries),
      onError: (err) => opts.onError?.(err),
      onInterrupt: () => opts.onInterrupt?.(),
      onAgentTurnStart: () => opts.onAgentTurnStart?.(),
      onAgentTurnEnd: () => {},
      onCallEnd: () => teardown(),
      onConnected: () => {},
      onClientToolCall: () => {},
    })
  }

  pc.ontrack = (event) => {
    audioEl.srcObject = event.streams[0] ?? new MediaStream([event.track])
  }

  try {
    mic = await navigator.mediaDevices.getUserMedia({ audio: true })
  } catch (err) {
    const denied = err instanceof DOMException && err.name === 'NotAllowedError'
    fail({
      code: denied ? 'mic_denied' : 'mic_start_failed',
      message: err instanceof Error ? err.message : 'getUserMedia failed',
    })
    throw err
  }
  for (const track of mic.getAudioTracks()) pc.addTrack(track, mic)

  const dc = pc.createDataChannel('control', { ordered: true })
  dc.onmessage = (e) => {
    if (typeof e.data === 'string') dispatch(e.data)
  }
  dc.onerror = () => {
    opts.onError?.({ code: 'socket_error', message: 'control channel error' })
  }

  const gateway = opts.webrtcGatewayBase || ''
  const tokenQS = `token=${encodeURIComponent(opts.token)}`
  const agentBase = `${opts.apiBase}/v1/agents/${encodeURIComponent(opts.agentId)}`
  const offerUrl = gateway ? `${gateway}/webrtc/offer?${tokenQS}` : `${agentBase}/webrtc/offer?${tokenQS}`
  const iceUrl = gateway ? `${gateway}/webrtc/ice?${tokenQS}` : `${agentBase}/webrtc/ice?${tokenQS}`

  // ICE gathering begins as soon as the local description is set, which is
  // before the server has answered with a callId. Candidates produced in that
  // window are queued and posted once signaling has completed, instead of
  // being lost because no handler was attached yet.
  let callId
  let signaled = false
  const queuedCandidates = []
  const postCandidate = (candidate) => {
    // Best effort: a failed candidate POST is not fatal to the call.
    void fetch(iceUrl, {
      method: 'POST',
      headers: { 'content-type': 'application/json' },
      body: JSON.stringify({ callId, candidate }),
    }).catch(() => {})
  }
  pc.onicecandidate = (e) => {
    if (!e.candidate) return
    if (signaled) postCandidate(e.candidate)
    else queuedCandidates.push(e.candidate)
  }

  try {
    await pc.setLocalDescription(await pc.createOffer())
  } catch (err) {
    // Without this the mic, connection and element would stay allocated.
    fail({
      code: 'socket_error',
      message: err instanceof Error ? err.message : 'failed to create offer',
    })
    throw err
  }

  // Set once an HTTP-level failure has been reported, so the catch below does
  // not report the same failure a second time as `network_unreachable`.
  let reported = false
  try {
    const offerRes = await fetch(offerUrl, {
      method: 'POST',
      headers: { 'content-type': 'application/json' },
      body: JSON.stringify({ sdp: pc.localDescription.sdp, type: 'offer', agentId: opts.agentId }),
    })
    if (!offerRes.ok) {
      reported = true
      fail({
        code: offerRes.status === 401 ? 'unauthorized' : 'server_error',
        message: `signaling failed: HTTP ${offerRes.status}`,
      })
      throw new Error(`webrtc offer failed: ${offerRes.status}`)
    }
    const body = await offerRes.json()
    callId = body.callId
    await pc.setRemoteDescription({ type: 'answer', sdp: body.sdp })
  } catch (err) {
    if (!reported && !ended) {
      fail({
        code: 'network_unreachable',
        message: err instanceof Error ? err.message : 'signaling failed',
      })
    }
    throw err
  }

  signaled = true
  for (const candidate of queuedCandidates.splice(0)) postCandidate(candidate)

  pc.onconnectionstatechange = () => {
    const s = pc.connectionState
    if (s === 'connected') fireState('listening')
    // NOTE(review): 'disconnected' ends the call immediately here; confirm
    // that is intended, since the browser may recover from that state.
    if (s === 'failed' || s === 'disconnected') {
      opts.onError?.({ code: 'socket_error', message: `webrtc connection ${s}` })
      teardown()
    }
    if (s === 'closed' && !ended) teardown()
  }

  const setMicEnabled = (enabled) => {
    mic.getAudioTracks().forEach((t) => {
      t.enabled = enabled
    })
  }

  return {
    get state() {
      return proto.state
    },
    get transcript() {
      return proto.transcript.slice()
    },
    get isMuted() {
      return muted
    },
    end: () => teardown(),
    mute: () => {
      if (muted) return
      muted = true
      setMicEnabled(false)
    },
    unmute: () => {
      if (!muted) return
      muted = false
      setMicEnabled(true)
    },
  }
}
776
1030
 
777
1031
  // src/browser.ts
778
/**
 * Socket factory that constructs the runtime's global `WebSocket`.
 *
 * @param {string} url - Address to connect to.
 * @returns {WebSocket} A newly constructed socket.
 */
var browserWsFactory = (url) => {
  const socket = new globalThis.WebSocket(url)
  return socket
}
779
1033
/**
 * Factory bound to a normalized client configuration; `startCall` resolves a
 * token and then opens the call over WebSocket or WebRTC.
 */
var BrowserVoiceFactory = class {
  /** @param {object} config - Configuration providing `apiBase` and `fetchToken`. */
  constructor(config) {
    // Turns `options.token` or the `fetchToken` result into
    // `{ token, transport, ... }`. A bare string means the WebSocket transport.
    const resolveToken = async (options, fetchArgs) => {
      if (options.token) {
        return { token: options.token, transport: 'ws' }
      }
      const fetched = await this.config.fetchToken(fetchArgs)
      if (!fetched) {
        throw new Error('configureVoiceClient.fetchToken returned empty token')
      }
      const resolved = typeof fetched === 'string' ? { token: fetched, transport: 'ws' } : fetched
      if (!resolved.token) {
        throw new Error('configureVoiceClient.fetchToken returned an object without `token`')
      }
      return resolved
    }

    /**
     * Starts a call for `options.agentId`.
     *
     * @param {object} options - Call options and event callbacks.
     * @returns {Promise<object>} The call handle.
     * @throws {Error} When `agentId` is missing or no usable token is obtained.
     */
    this.startCall = async (options) => {
      if (!options.agentId) {
        throw new Error('startCall: agentId is required')
      }
      const { context, metadata } = mergeStartCallContext(this.config, options)
      const resolved = await resolveToken(options, {
        agentId: options.agentId,
        userId: options.userId,
        context,
        metadata,
      })

      if (resolved.transport === 'webrtc') {
        // WebRTC doesn't carry an end reason from the server yet, so a
        // minimal end event is synthesised: 'agent_ended' is a placeholder
        // reason and durationMs is reported as 0.
        const onEnd = options.onEnd
          ? () => options.onEnd({ reason: 'agent_ended', durationMs: 0 })
          : void 0
        return createWebRtcCall({
          agentId: options.agentId,
          apiBase: this.config.apiBase,
          token: resolved.token,
          webrtcGatewayBase: resolved.webrtcGatewayBase,
          onStateChange: options.onStateChange,
          onTranscript: options.onTranscript,
          onError: options.onError,
          onEnd,
          onInterrupt: options.onInterrupt,
          onAgentTurnStart: options.onAgentTurnStart,
        })
      }

      // The merged context/metadata travel with the options handed to the
      // client, so the server can see what the SDK saw.
      const client = new BrowserVoiceClient({
        config: this.config,
        options: { ...options, context, metadata },
        token: resolved.token,
        wsFactory: browserWsFactory,
      })
      await client.start()
      return client
    }
    this.config = config
  }
}
815
1092
/**
 * Creates a voice-call factory from user-supplied configuration.
 *
 * The configuration is passed through `normalizeConfig` before being handed
 * to `BrowserVoiceFactory`.
 *
 * @param {object} config - Raw client configuration.
 * @returns {object} A `BrowserVoiceFactory` instance.
 */
function configureVoiceClient(config) {
  const normalized = normalizeConfig(config)
  const factory = new BrowserVoiceFactory(normalized)
  return factory
}
818
1095
  // Annotate the CommonJS export names for ESM import in node:
819
- 0 && (module.exports = {
820
- buildWsUrl,
821
- configureVoiceClient,
822
- createAudioCapture,
823
- createAudioPlayback,
824
- createProtocolState,
825
- createReconnectingWebSocket,
826
- handleServerMessage
827
- });
828
- //# sourceMappingURL=browser.js.map
1096
+ 0 &&
1097
+ (module.exports = {
1098
+ buildWsUrl,
1099
+ configureVoiceClient,
1100
+ createAudioCapture,
1101
+ createAudioPlayback,
1102
+ createProtocolState,
1103
+ createReconnectingWebSocket,
1104
+ handleServerMessage,
1105
+ })
1106
+ //# sourceMappingURL=browser.js.map