@craftedxp/voice-js 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/browser.js CHANGED
@@ -1,24 +1,26 @@
1
- "use strict";
2
- var __defProp = Object.defineProperty;
3
- var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
- var __getOwnPropNames = Object.getOwnPropertyNames;
5
- var __hasOwnProp = Object.prototype.hasOwnProperty;
1
+ 'use strict'
2
+ var __defProp = Object.defineProperty
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor
4
+ var __getOwnPropNames = Object.getOwnPropertyNames
5
+ var __hasOwnProp = Object.prototype.hasOwnProperty
6
6
  var __export = (target, all) => {
7
- for (var name in all)
8
- __defProp(target, name, { get: all[name], enumerable: true });
9
- };
7
+ for (var name in all) __defProp(target, name, { get: all[name], enumerable: true })
8
+ }
10
9
  var __copyProps = (to, from, except, desc) => {
11
- if (from && typeof from === "object" || typeof from === "function") {
10
+ if ((from && typeof from === 'object') || typeof from === 'function') {
12
11
  for (let key of __getOwnPropNames(from))
13
12
  if (!__hasOwnProp.call(to, key) && key !== except)
14
- __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
13
+ __defProp(to, key, {
14
+ get: () => from[key],
15
+ enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable,
16
+ })
15
17
  }
16
- return to;
17
- };
18
- var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
18
+ return to
19
+ }
20
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, '__esModule', { value: true }), mod)
19
21
 
20
22
  // src/browser.ts
21
- var browser_exports = {};
23
+ var browser_exports = {}
22
24
  __export(browser_exports, {
23
25
  buildWsUrl: () => buildWsUrl,
24
26
  configureVoiceClient: () => configureVoiceClient,
@@ -26,57 +28,64 @@ __export(browser_exports, {
26
28
  createAudioPlayback: () => createAudioPlayback,
27
29
  createProtocolState: () => createProtocolState,
28
30
  createReconnectingWebSocket: () => createReconnectingWebSocket,
29
- handleServerMessage: () => handleServerMessage
30
- });
31
- module.exports = __toCommonJS(browser_exports);
31
+ handleServerMessage: () => handleServerMessage,
32
+ })
33
+ module.exports = __toCommonJS(browser_exports)
32
34
 
33
35
  // src/config.ts
34
36
  function normalizeConfig(config) {
35
- if (!config) throw new Error("configureVoiceClient: config is required");
36
- if ("apiKey" in config) {
37
+ if (!config) throw new Error('configureVoiceClient: config is required')
38
+ if ('apiKey' in config) {
37
39
  throw new Error(
38
- "configureVoiceClient: `apiKey` is no longer supported. Embedding sk_ in JS code ships server-grade credentials to every client. Pass `fetchToken: async ({ agentId }) => { /* call YOUR backend mint */ }` instead \u2014 see the @craftedxp/voice-js README for the migration recipe."
39
- );
40
+ 'configureVoiceClient: `apiKey` is no longer supported. Embedding sk_ in JS code ships server-grade credentials to every client. Pass `fetchToken: async ({ agentId }) => { /* call YOUR backend mint */ }` instead \u2014 see the @craftedxp/voice-js README for the migration recipe.',
41
+ )
40
42
  }
41
43
  if (!config.apiBase) {
42
- throw new Error("configureVoiceClient: apiBase is required");
44
+ throw new Error('configureVoiceClient: apiBase is required')
43
45
  }
44
- if (typeof config.fetchToken !== "function") {
45
- throw new Error("configureVoiceClient: fetchToken must be a function");
46
+ if (typeof config.fetchToken !== 'function') {
47
+ throw new Error('configureVoiceClient: fetchToken must be a function')
46
48
  }
47
49
  return {
48
50
  ...config,
49
- apiBase: config.apiBase.replace(/\/+$/, "")
50
- };
51
+ apiBase: config.apiBase.replace(/\/+$/, ''),
52
+ }
51
53
  }
52
54
  function mergeStartCallContext(factory, call) {
53
- const context = factory.defaultContext || call.context ? { ...factory.defaultContext ?? {}, ...call.context ?? {} } : void 0;
54
- const metadata = factory.defaultMetadata || call.metadata ? { ...factory.defaultMetadata ?? {}, ...call.metadata ?? {} } : void 0;
55
- return { context, metadata };
55
+ const context =
56
+ factory.defaultContext || call.context
57
+ ? { ...(factory.defaultContext ?? {}), ...(call.context ?? {}) }
58
+ : void 0
59
+ const metadata =
60
+ factory.defaultMetadata || call.metadata
61
+ ? { ...(factory.defaultMetadata ?? {}), ...(call.metadata ?? {}) }
62
+ : void 0
63
+ return { context, metadata }
56
64
  }
57
65
 
58
66
  // src/worklets/mic-downsampler.worklet.js
59
- var mic_downsampler_worklet_default = "// AudioWorklet \u2014 runs off the main thread in the audio rendering graph.\n//\n// Input: Float32 samples at the AudioContext's native sampleRate (typically\n// 48000 Hz on desktop, 44100 Hz on some iOS devices).\n// Output: 16 kHz mono Int16 PCM, shipped to the main thread via\n// `port.postMessage(ArrayBuffer, [ArrayBuffer])` (transferred, not copied).\n//\n// Why AudioWorklet instead of ScriptProcessorNode: ScriptProcessorNode is\n// deprecated + main-thread-bound, so any JS jank produces audible audio\n// glitches. AudioWorklet's `process()` runs on the audio rendering thread\n// at the graph's block cadence (128 frames by default) and backpressures\n// via returning `true` / `false`.\n//\n// This file is loaded as text (see tsup.config.ts loader) and registered\n// at runtime via `audioWorklet.addModule(blobUrl)`.\n\nclass MicDownsampler extends AudioWorkletProcessor {\n constructor() {\n super()\n // Target sample rate for STT. Matches Deepgram Nova-3 + the platform's\n // server-side SAMPLE_RATE constant in AgentCallHandler.\n this.targetRate = 16000\n // Accumulator for the downsample. We collect incoming samples and emit\n // an Int16 chunk when we've accumulated ~1024 target-rate samples\n // (~64 ms at 16 kHz) \u2014 matches the mobile SDK's chunk size so both\n // platforms have the same server-side framing.\n this.outputFrames = 1024\n this.acc = []\n // Running index used for fractional resampling.\n this.readCursor = 0\n }\n\n // `inputs[0][0]` = first channel of first input. 128 Float32 samples per\n // call at the context's sampleRate. Return true = keep processing.\n process(inputs) {\n const input = inputs[0]\n if (!input || input.length === 0) return true\n const channel = input[0]\n if (!channel || channel.length === 0) return true\n\n const ctxRate = sampleRate // global inside AudioWorkletProcessor\n const ratio = ctxRate / this.targetRate\n\n // Simple linear-interp downsample. 
For 48000 \u2192 16000 that's 3:1, which\n // linear handles fine for voice. Anti-alias filtering would be\n // theoretically better but inaudible for speech.\n for (let i = 0; i < channel.length; i++) {\n this.acc.push(channel[i])\n }\n\n while (this.acc.length - this.readCursor >= ratio * this.outputFrames) {\n const out = new Int16Array(this.outputFrames)\n let readIdx = this.readCursor\n for (let i = 0; i < this.outputFrames; i++) {\n // Linear interp between floor(readIdx) and ceil(readIdx)\n const low = Math.floor(readIdx)\n const high = Math.min(low + 1, this.acc.length - 1)\n const frac = readIdx - low\n const sample = this.acc[low] * (1 - frac) + this.acc[high] * frac\n // Clip + convert to int16\n const clipped = Math.max(-1, Math.min(1, sample))\n out[i] = clipped < 0 ? clipped * 0x8000 : clipped * 0x7fff\n readIdx += ratio\n }\n // Transfer the ArrayBuffer (zero-copy) to the main thread.\n this.port.postMessage(out.buffer, [out.buffer])\n this.readCursor = readIdx\n }\n\n // Garbage-collect the consumed portion of `acc` every so often so it\n // doesn't grow without bound. Leave ~one chunk of headroom.\n if (this.readCursor > ratio * this.outputFrames) {\n this.acc = this.acc.slice(Math.floor(this.readCursor))\n this.readCursor -= Math.floor(this.readCursor)\n }\n\n return true\n }\n}\n\nregisterProcessor('mic-downsampler', MicDownsampler)\n";
67
+ var mic_downsampler_worklet_default =
68
+ "// AudioWorklet \u2014 runs off the main thread in the audio rendering graph.\n//\n// Input: Float32 samples at the AudioContext's native sampleRate (typically\n// 48000 Hz on desktop, 44100 Hz on some iOS devices).\n// Output: 16 kHz mono Int16 PCM, shipped to the main thread via\n// `port.postMessage(ArrayBuffer, [ArrayBuffer])` (transferred, not copied).\n//\n// Why AudioWorklet instead of ScriptProcessorNode: ScriptProcessorNode is\n// deprecated + main-thread-bound, so any JS jank produces audible audio\n// glitches. AudioWorklet's `process()` runs on the audio rendering thread\n// at the graph's block cadence (128 frames by default) and backpressures\n// via returning `true` / `false`.\n//\n// This file is loaded as text (see tsup.config.ts loader) and registered\n// at runtime via `audioWorklet.addModule(blobUrl)`.\n\nclass MicDownsampler extends AudioWorkletProcessor {\n constructor() {\n super()\n // Target sample rate for STT. Matches Deepgram Nova-3 + the platform's\n // server-side SAMPLE_RATE constant in AgentCallHandler.\n this.targetRate = 16000\n // Accumulator for the downsample. We collect incoming samples and emit\n // an Int16 chunk when we've accumulated ~1024 target-rate samples\n // (~64 ms at 16 kHz) \u2014 matches the mobile SDK's chunk size so both\n // platforms have the same server-side framing.\n this.outputFrames = 1024\n this.acc = []\n // Running index used for fractional resampling.\n this.readCursor = 0\n }\n\n // `inputs[0][0]` = first channel of first input. 128 Float32 samples per\n // call at the context's sampleRate. Return true = keep processing.\n process(inputs) {\n const input = inputs[0]\n if (!input || input.length === 0) return true\n const channel = input[0]\n if (!channel || channel.length === 0) return true\n\n const ctxRate = sampleRate // global inside AudioWorkletProcessor\n const ratio = ctxRate / this.targetRate\n\n // Simple linear-interp downsample. 
For 48000 \u2192 16000 that's 3:1, which\n // linear handles fine for voice. Anti-alias filtering would be\n // theoretically better but inaudible for speech.\n for (let i = 0; i < channel.length; i++) {\n this.acc.push(channel[i])\n }\n\n while (this.acc.length - this.readCursor >= ratio * this.outputFrames) {\n const out = new Int16Array(this.outputFrames)\n let readIdx = this.readCursor\n for (let i = 0; i < this.outputFrames; i++) {\n // Linear interp between floor(readIdx) and ceil(readIdx)\n const low = Math.floor(readIdx)\n const high = Math.min(low + 1, this.acc.length - 1)\n const frac = readIdx - low\n const sample = this.acc[low] * (1 - frac) + this.acc[high] * frac\n // Clip + convert to int16\n const clipped = Math.max(-1, Math.min(1, sample))\n out[i] = clipped < 0 ? clipped * 0x8000 : clipped * 0x7fff\n readIdx += ratio\n }\n // Transfer the ArrayBuffer (zero-copy) to the main thread.\n this.port.postMessage(out.buffer, [out.buffer])\n this.readCursor = readIdx\n }\n\n // Garbage-collect the consumed portion of `acc` every so often so it\n // doesn't grow without bound. Leave ~one chunk of headroom.\n if (this.readCursor > ratio * this.outputFrames) {\n this.acc = this.acc.slice(Math.floor(this.readCursor))\n this.readCursor -= Math.floor(this.readCursor)\n }\n\n return true\n }\n}\n\nregisterProcessor('mic-downsampler', MicDownsampler)\n"
60
69
 
61
70
  // src/AudioCapture.ts
62
- var VOLUME_INTERVAL_MS = 100;
71
+ var VOLUME_INTERVAL_MS = 100
63
72
  var createAudioCapture = (options) => {
64
- let audioContext = null;
65
- let mediaStream = null;
66
- let sourceNode = null;
67
- let workletNode = null;
68
- let analyser = null;
69
- let volumeTimer = null;
70
- let muted = false;
71
- let capturing = false;
73
+ let audioContext = null
74
+ let mediaStream = null
75
+ let sourceNode = null
76
+ let workletNode = null
77
+ let analyser = null
78
+ let volumeTimer = null
79
+ let muted = false
80
+ let capturing = false
72
81
  const computeRms = (buf) => {
73
- let sum = 0;
74
- for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i];
75
- const rms = Math.sqrt(sum / buf.length);
76
- return Math.min(1, rms * 1.8);
77
- };
82
+ let sum = 0
83
+ for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i]
84
+ const rms = Math.sqrt(sum / buf.length)
85
+ return Math.min(1, rms * 1.8)
86
+ }
78
87
  const start = async () => {
79
- if (capturing) return;
88
+ if (capturing) return
80
89
  try {
81
90
  mediaStream = await navigator.mediaDevices.getUserMedia({
82
91
  audio: {
@@ -87,658 +96,748 @@ var createAudioCapture = (options) => {
87
96
  echoCancellation: true,
88
97
  noiseSuppression: true,
89
98
  autoGainControl: true,
90
- channelCount: 1
91
- }
92
- });
93
- audioContext = new AudioContext();
94
- if (audioContext.state === "suspended") await audioContext.resume();
95
- const blob = new Blob([mic_downsampler_worklet_default], { type: "application/javascript" });
96
- const url = URL.createObjectURL(blob);
99
+ channelCount: 1,
100
+ },
101
+ })
102
+ audioContext = new AudioContext()
103
+ if (audioContext.state === 'suspended') await audioContext.resume()
104
+ const blob = new Blob([mic_downsampler_worklet_default], { type: 'application/javascript' })
105
+ const url = URL.createObjectURL(blob)
97
106
  try {
98
- await audioContext.audioWorklet.addModule(url);
107
+ await audioContext.audioWorklet.addModule(url)
99
108
  } finally {
100
- URL.revokeObjectURL(url);
109
+ URL.revokeObjectURL(url)
101
110
  }
102
- sourceNode = audioContext.createMediaStreamSource(mediaStream);
103
- workletNode = new AudioWorkletNode(audioContext, "mic-downsampler");
111
+ sourceNode = audioContext.createMediaStreamSource(mediaStream)
112
+ workletNode = new AudioWorkletNode(audioContext, 'mic-downsampler')
104
113
  workletNode.port.onmessage = (event) => {
105
- if (muted) return;
106
- options.onChunk(event.data);
107
- };
114
+ if (muted) return
115
+ options.onChunk(event.data)
116
+ }
108
117
  if (options.onVolume) {
109
- analyser = audioContext.createAnalyser();
110
- analyser.fftSize = 256;
111
- sourceNode.connect(analyser);
112
- const buf = new Float32Array(analyser.fftSize);
118
+ analyser = audioContext.createAnalyser()
119
+ analyser.fftSize = 256
120
+ sourceNode.connect(analyser)
121
+ const buf = new Float32Array(analyser.fftSize)
113
122
  volumeTimer = setInterval(() => {
114
- if (!analyser) return;
115
- analyser.getFloatTimeDomainData(buf);
116
- options.onVolume?.(computeRms(buf));
117
- }, VOLUME_INTERVAL_MS);
123
+ if (!analyser) return
124
+ analyser.getFloatTimeDomainData(buf)
125
+ options.onVolume?.(computeRms(buf))
126
+ }, VOLUME_INTERVAL_MS)
118
127
  }
119
- sourceNode.connect(workletNode);
120
- const sink = audioContext.createGain();
121
- sink.gain.value = 0;
122
- workletNode.connect(sink).connect(audioContext.destination);
123
- capturing = true;
128
+ sourceNode.connect(workletNode)
129
+ const sink = audioContext.createGain()
130
+ sink.gain.value = 0
131
+ workletNode.connect(sink).connect(audioContext.destination)
132
+ capturing = true
124
133
  } catch (err) {
125
- const wrapped = err instanceof Error ? err : new Error(typeof err === "string" ? err : "capture failed");
126
- options.onError?.(wrapped);
127
- throw wrapped;
134
+ const wrapped =
135
+ err instanceof Error ? err : new Error(typeof err === 'string' ? err : 'capture failed')
136
+ options.onError?.(wrapped)
137
+ throw wrapped
128
138
  }
129
- };
139
+ }
130
140
  const stop = () => {
131
- if (!capturing) return;
132
- capturing = false;
141
+ if (!capturing) return
142
+ capturing = false
133
143
  if (volumeTimer) {
134
- clearInterval(volumeTimer);
135
- volumeTimer = null;
144
+ clearInterval(volumeTimer)
145
+ volumeTimer = null
136
146
  }
137
147
  try {
138
- workletNode?.disconnect();
139
- analyser?.disconnect();
140
- sourceNode?.disconnect();
141
- } catch {
142
- }
143
- workletNode = null;
144
- analyser = null;
145
- sourceNode = null;
148
+ workletNode?.disconnect()
149
+ analyser?.disconnect()
150
+ sourceNode?.disconnect()
151
+ } catch {}
152
+ workletNode = null
153
+ analyser = null
154
+ sourceNode = null
146
155
  if (mediaStream) {
147
- for (const track of mediaStream.getTracks()) track.stop();
148
- mediaStream = null;
156
+ for (const track of mediaStream.getTracks()) track.stop()
157
+ mediaStream = null
149
158
  }
150
- if (audioContext && audioContext.state !== "closed") {
151
- void audioContext.close().catch(() => void 0);
159
+ if (audioContext && audioContext.state !== 'closed') {
160
+ void audioContext.close().catch(() => void 0)
152
161
  }
153
- audioContext = null;
154
- };
162
+ audioContext = null
163
+ }
155
164
  return {
156
165
  start,
157
166
  stop,
158
167
  mute: (v) => {
159
- muted = v;
168
+ muted = v
160
169
  },
161
- isCapturing: () => capturing
162
- };
163
- };
170
+ isCapturing: () => capturing,
171
+ }
172
+ }
164
173
 
165
174
  // src/AudioPlayback.ts
166
- var DEFAULT_SAMPLE_RATE = 16e3;
167
- var VOLUME_INTERVAL_MS2 = 100;
175
+ var DEFAULT_SAMPLE_RATE = 16e3
176
+ var VOLUME_INTERVAL_MS2 = 100
168
177
  var createAudioPlayback = (options = {}) => {
169
- const sampleRate = options.sampleRate ?? DEFAULT_SAMPLE_RATE;
170
- let audioContext = null;
171
- let gainNode = null;
172
- let analyser = null;
173
- let volumeTimer = null;
174
- let nextStartTime = 0;
175
- let scheduledNodes = [];
176
- let speaking = false;
178
+ const sampleRate = options.sampleRate ?? DEFAULT_SAMPLE_RATE
179
+ let audioContext = null
180
+ let gainNode = null
181
+ let analyser = null
182
+ let volumeTimer = null
183
+ let nextStartTime = 0
184
+ let scheduledNodes = []
185
+ let speaking = false
177
186
  const ensureContext = async () => {
178
187
  if (audioContext) {
179
- if (audioContext.state === "suspended") await audioContext.resume();
180
- return;
188
+ if (audioContext.state === 'suspended') await audioContext.resume()
189
+ return
181
190
  }
182
- audioContext = new AudioContext({ sampleRate });
183
- gainNode = audioContext.createGain();
191
+ audioContext = new AudioContext({ sampleRate })
192
+ gainNode = audioContext.createGain()
184
193
  if (options.onVolume) {
185
- analyser = audioContext.createAnalyser();
186
- analyser.fftSize = 256;
187
- gainNode.connect(analyser);
188
- const buf = new Float32Array(analyser.fftSize);
194
+ analyser = audioContext.createAnalyser()
195
+ analyser.fftSize = 256
196
+ gainNode.connect(analyser)
197
+ const buf = new Float32Array(analyser.fftSize)
189
198
  volumeTimer = setInterval(() => {
190
- if (!analyser) return;
191
- analyser.getFloatTimeDomainData(buf);
192
- let sum = 0;
193
- for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i];
194
- const rms = Math.sqrt(sum / buf.length);
195
- options.onVolume?.(Math.min(1, rms * 1.8));
196
- }, VOLUME_INTERVAL_MS2);
197
- }
198
- gainNode.connect(audioContext.destination);
199
- nextStartTime = audioContext.currentTime;
200
- };
199
+ if (!analyser) return
200
+ analyser.getFloatTimeDomainData(buf)
201
+ let sum = 0
202
+ for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i]
203
+ const rms = Math.sqrt(sum / buf.length)
204
+ options.onVolume?.(Math.min(1, rms * 1.8))
205
+ }, VOLUME_INTERVAL_MS2)
206
+ }
207
+ gainNode.connect(audioContext.destination)
208
+ nextStartTime = audioContext.currentTime
209
+ }
201
210
  const setSpeaking = (v) => {
202
- if (v === speaking) return;
203
- speaking = v;
204
- options.onSpeakingChange?.(v);
205
- };
211
+ if (v === speaking) return
212
+ speaking = v
213
+ options.onSpeakingChange?.(v)
214
+ }
206
215
  const pruneFinished = () => {
207
- const now = audioContext?.currentTime ?? 0;
216
+ const now = audioContext?.currentTime ?? 0
208
217
  scheduledNodes = scheduledNodes.filter((n) => {
209
- const node = n;
210
- return (node._endsAt ?? 0) > now;
211
- });
212
- if (scheduledNodes.length === 0) setSpeaking(false);
213
- };
218
+ const node = n
219
+ return (node._endsAt ?? 0) > now
220
+ })
221
+ if (scheduledNodes.length === 0) setSpeaking(false)
222
+ }
214
223
  const enqueue = (pcm) => {
215
224
  if (!audioContext) {
216
- void ensureContext().then(() => enqueue(pcm));
217
- return;
218
- }
219
- if (!audioContext || !gainNode) return;
220
- const int16 = new Int16Array(pcm);
221
- if (int16.length === 0) return;
222
- const audioBuffer = audioContext.createBuffer(1, int16.length, sampleRate);
223
- const float32 = audioBuffer.getChannelData(0);
225
+ void ensureContext().then(() => enqueue(pcm))
226
+ return
227
+ }
228
+ if (!audioContext || !gainNode) return
229
+ const int16 = new Int16Array(pcm)
230
+ if (int16.length === 0) return
231
+ const audioBuffer = audioContext.createBuffer(1, int16.length, sampleRate)
232
+ const float32 = audioBuffer.getChannelData(0)
224
233
  for (let i = 0; i < int16.length; i++) {
225
- float32[i] = int16[i] / 32768;
226
- }
227
- const node = audioContext.createBufferSource();
228
- node.buffer = audioBuffer;
229
- node.connect(gainNode);
230
- const now = audioContext.currentTime;
231
- const startAt = Math.max(now, nextStartTime);
232
- node.start(startAt);
233
- const duration = int16.length / sampleRate;
234
- node._endsAt = startAt + duration;
235
- nextStartTime = startAt + duration;
236
- scheduledNodes.push(node);
237
- setSpeaking(true);
238
- node.onended = () => pruneFinished();
239
- };
234
+ float32[i] = int16[i] / 32768
235
+ }
236
+ const node = audioContext.createBufferSource()
237
+ node.buffer = audioBuffer
238
+ node.connect(gainNode)
239
+ const now = audioContext.currentTime
240
+ const startAt = Math.max(now, nextStartTime)
241
+ node.start(startAt)
242
+ const duration = int16.length / sampleRate
243
+ node._endsAt = startAt + duration
244
+ nextStartTime = startAt + duration
245
+ scheduledNodes.push(node)
246
+ setSpeaking(true)
247
+ node.onended = () => pruneFinished()
248
+ }
240
249
  const flush = () => {
241
- if (!audioContext || !gainNode) return;
250
+ if (!audioContext || !gainNode) return
242
251
  for (const node of scheduledNodes) {
243
252
  try {
244
- node.stop();
245
- } catch {
246
- }
253
+ node.stop()
254
+ } catch {}
247
255
  }
248
- scheduledNodes = [];
249
- gainNode.disconnect();
250
- gainNode = audioContext.createGain();
256
+ scheduledNodes = []
257
+ gainNode.disconnect()
258
+ gainNode = audioContext.createGain()
251
259
  if (analyser) {
252
- analyser.disconnect();
253
- gainNode.connect(analyser);
260
+ analyser.disconnect()
261
+ gainNode.connect(analyser)
254
262
  }
255
- gainNode.connect(audioContext.destination);
256
- nextStartTime = audioContext.currentTime;
257
- setSpeaking(false);
258
- };
263
+ gainNode.connect(audioContext.destination)
264
+ nextStartTime = audioContext.currentTime
265
+ setSpeaking(false)
266
+ }
259
267
  const close = () => {
260
- flush();
268
+ flush()
261
269
  if (volumeTimer) {
262
- clearInterval(volumeTimer);
263
- volumeTimer = null;
270
+ clearInterval(volumeTimer)
271
+ volumeTimer = null
264
272
  }
265
- if (audioContext && audioContext.state !== "closed") {
266
- void audioContext.close().catch(() => void 0);
273
+ if (audioContext && audioContext.state !== 'closed') {
274
+ void audioContext.close().catch(() => void 0)
267
275
  }
268
- audioContext = null;
269
- gainNode = null;
270
- analyser = null;
271
- };
276
+ audioContext = null
277
+ gainNode = null
278
+ analyser = null
279
+ }
272
280
  const resume = async () => {
273
- await ensureContext();
274
- };
275
- return { enqueue, flush, close, resume };
276
- };
281
+ await ensureContext()
282
+ }
283
+ return { enqueue, flush, close, resume }
284
+ }
277
285
 
278
286
  // src/ReconnectingWebSocket.ts
279
- var READYSTATE_OPEN = 1;
280
- var READYSTATE_CLOSED = 3;
287
+ var READYSTATE_OPEN = 1
288
+ var READYSTATE_CLOSED = 3
281
289
  var createReconnectingWebSocket = (options, onEvent) => {
282
- const maxRetries = options.maxRetries ?? 3;
283
- const initialBackoff = options.initialBackoffMs ?? 500;
284
- const maxBackoff = options.maxBackoffMs ?? 8e3;
285
- let ws = null;
286
- let intentionalClose = false;
287
- let retries = 0;
288
- let backoff = initialBackoff;
289
- let reconnectTimer = null;
290
+ const maxRetries = options.maxRetries ?? 3
291
+ const initialBackoff = options.initialBackoffMs ?? 500
292
+ const maxBackoff = options.maxBackoffMs ?? 8e3
293
+ let ws = null
294
+ let intentionalClose = false
295
+ let retries = 0
296
+ let backoff = initialBackoff
297
+ let reconnectTimer = null
290
298
  const openOnce = () => {
291
- ws = options.wsFactory(options.url);
292
- ws.binaryType = "arraybuffer";
299
+ ws = options.wsFactory(options.url)
300
+ ws.binaryType = 'arraybuffer'
293
301
  ws.onopen = () => {
294
- if (retries === 0) onEvent({ type: "open" });
295
- else onEvent({ type: "reconnected" });
296
- retries = 0;
297
- backoff = initialBackoff;
298
- };
302
+ if (retries === 0) onEvent({ type: 'open' })
303
+ else onEvent({ type: 'reconnected' })
304
+ retries = 0
305
+ backoff = initialBackoff
306
+ }
299
307
  ws.onmessage = (ev) => {
300
- onEvent({ type: "message", data: ev.data });
301
- };
308
+ onEvent({ type: 'message', data: ev.data })
309
+ }
302
310
  ws.onerror = () => {
303
- onEvent({ type: "error", error: new Error("WebSocket error") });
304
- };
311
+ onEvent({ type: 'error', error: new Error('WebSocket error') })
312
+ }
305
313
  ws.onclose = (ev) => {
306
- ws = null;
307
- const shouldRetry = !intentionalClose && retries < maxRetries;
314
+ ws = null
315
+ const shouldRetry = !intentionalClose && retries < maxRetries
308
316
  if (!shouldRetry) {
309
317
  onEvent({
310
- type: "close",
318
+ type: 'close',
311
319
  code: ev.code,
312
320
  reason: ev.reason,
313
- permanent: true
314
- });
315
- return;
321
+ permanent: true,
322
+ })
323
+ return
316
324
  }
317
325
  onEvent({
318
- type: "close",
326
+ type: 'close',
319
327
  code: ev.code,
320
328
  reason: ev.reason,
321
- permanent: false
322
- });
323
- retries++;
324
- const delay = Math.min(backoff, maxBackoff);
325
- backoff = Math.min(backoff * 2, maxBackoff);
326
- reconnectTimer = setTimeout(openOnce, delay);
327
- };
328
- };
329
- openOnce();
329
+ permanent: false,
330
+ })
331
+ retries++
332
+ const delay = Math.min(backoff, maxBackoff)
333
+ backoff = Math.min(backoff * 2, maxBackoff)
334
+ reconnectTimer = setTimeout(openOnce, delay)
335
+ }
336
+ }
337
+ openOnce()
330
338
  return {
331
339
  send: (data) => {
332
- if (ws && ws.readyState === READYSTATE_OPEN) ws.send(data);
340
+ if (ws && ws.readyState === READYSTATE_OPEN) ws.send(data)
333
341
  },
334
- close: (code = 1e3, reason = "client-requested") => {
335
- intentionalClose = true;
342
+ close: (code = 1e3, reason = 'client-requested') => {
343
+ intentionalClose = true
336
344
  if (reconnectTimer) {
337
- clearTimeout(reconnectTimer);
338
- reconnectTimer = null;
345
+ clearTimeout(reconnectTimer)
346
+ reconnectTimer = null
339
347
  }
340
348
  try {
341
- ws?.close(code, reason);
342
- } catch {
343
- }
349
+ ws?.close(code, reason)
350
+ } catch {}
344
351
  },
345
- readyState: () => ws?.readyState ?? READYSTATE_CLOSED
346
- };
347
- };
352
+ readyState: () => ws?.readyState ?? READYSTATE_CLOSED,
353
+ }
354
+ }
348
355
 
349
356
  // src/protocol.ts
350
357
  var createProtocolState = () => ({
351
- state: "idle",
358
+ state: 'idle',
352
359
  transcript: [],
353
360
  agentBubbleId: null,
354
361
  idCounter: 0,
355
- endReason: null
356
- });
362
+ endReason: null,
363
+ })
357
364
  var mapEndReason = (raw) => {
358
- if (raw === "agent_ended") return "agent_ended";
359
- if (raw === "caller_hung_up") return "user_hangup";
360
- if (raw === "silence_timeout" || raw === "max_duration") return "timeout";
361
- return "error";
362
- };
365
+ if (raw === 'agent_ended') return 'agent_ended'
366
+ if (raw === 'caller_hung_up') return 'user_hangup'
367
+ if (raw === 'silence_timeout' || raw === 'max_duration') return 'timeout'
368
+ return 'error'
369
+ }
363
370
  function handleServerMessage(raw, state, cb) {
364
- let msg;
371
+ let msg
365
372
  try {
366
- msg = JSON.parse(raw);
373
+ msg = JSON.parse(raw)
367
374
  } catch {
368
- return;
375
+ return
369
376
  }
370
377
  switch (msg.type) {
371
- case "connected":
372
- cb.onConnected();
373
- setState(state, "listening", cb);
374
- return;
375
- case "transcript": {
376
- const text = msg.text ?? "";
377
- if (!text) return;
378
- const isFinal = !!msg.isFinal;
379
- if (!isFinal) setState(state, "user_speaking", cb);
380
- upsertUserPartial(state, text, isFinal);
381
- cb.onTranscript(state.transcript);
382
- return;
383
- }
384
- case "agent_turn_start": {
385
- const id = `m${state.idCounter++}`;
386
- state.agentBubbleId = id;
387
- state.transcript = [...state.transcript, { id, role: "agent", text: "" }];
388
- cb.onTranscript(state.transcript);
389
- cb.onAgentTurnStart();
390
- setState(state, "agent_speaking", cb);
391
- return;
392
- }
393
- case "agent_text": {
394
- const delta = msg.text ?? "";
395
- if (!delta || !state.agentBubbleId) return;
396
- const id = state.agentBubbleId;
397
- state.transcript = state.transcript.map(
398
- (e) => e.id === id && e.role === "agent" ? { ...e, text: e.text + delta } : e
399
- );
400
- cb.onTranscript(state.transcript);
401
- return;
402
- }
403
- case "agent_turn_end":
404
- state.agentBubbleId = null;
405
- setState(state, "listening", cb);
406
- return;
407
- case "interrupt":
408
- cb.onInterrupt();
409
- return;
410
- case "agent_turn_abort": {
411
- const committed = (msg.committedText ?? "").trim();
378
+ case 'connected':
379
+ cb.onConnected()
380
+ setState(state, 'listening', cb)
381
+ return
382
+ case 'transcript': {
383
+ const text = msg.text ?? ''
384
+ if (!text) return
385
+ const isFinal = !!msg.isFinal
386
+ if (!isFinal) setState(state, 'user_speaking', cb)
387
+ upsertUserPartial(state, text, isFinal)
388
+ cb.onTranscript(state.transcript)
389
+ return
390
+ }
391
+ case 'agent_turn_start': {
392
+ const id = `m${state.idCounter++}`
393
+ state.agentBubbleId = id
394
+ state.transcript = [...state.transcript, { id, role: 'agent', text: '' }]
395
+ cb.onTranscript(state.transcript)
396
+ const seq = typeof msg.seq === 'number' ? msg.seq : void 0
397
+ cb.onAgentTurnStart(seq)
398
+ setState(state, 'agent_speaking', cb)
399
+ return
400
+ }
401
+ case 'agent_text': {
402
+ const delta = msg.text ?? ''
403
+ if (!delta || !state.agentBubbleId) return
404
+ const id = state.agentBubbleId
405
+ state.transcript = state.transcript.map((e) =>
406
+ e.id === id && e.role === 'agent' ? { ...e, text: e.text + delta } : e,
407
+ )
408
+ cb.onTranscript(state.transcript)
409
+ return
410
+ }
411
+ case 'agent_turn_end': {
412
+ state.agentBubbleId = null
413
+ const seq = typeof msg.seq === 'number' ? msg.seq : void 0
414
+ cb.onAgentTurnEnd(seq)
415
+ setState(state, 'listening', cb)
416
+ return
417
+ }
418
+ case 'interrupt':
419
+ cb.onInterrupt()
420
+ return
421
+ case 'agent_turn_abort': {
422
+ const committed = (msg.committedText ?? '').trim()
412
423
  if (state.agentBubbleId) {
413
- const id = state.agentBubbleId;
424
+ const id = state.agentBubbleId
414
425
  if (committed) {
415
- state.transcript = state.transcript.map(
416
- (e) => e.id === id && e.role === "agent" ? { ...e, text: committed, interrupted: true } : e
417
- );
426
+ state.transcript = state.transcript.map((e) =>
427
+ e.id === id && e.role === 'agent' ? { ...e, text: committed, interrupted: true } : e,
428
+ )
418
429
  } else {
419
- state.transcript = state.transcript.filter((e) => e.id !== id);
430
+ state.transcript = state.transcript.filter((e) => e.id !== id)
420
431
  }
421
- cb.onTranscript(state.transcript);
432
+ cb.onTranscript(state.transcript)
422
433
  }
423
- state.agentBubbleId = null;
424
- return;
434
+ state.agentBubbleId = null
435
+ return
425
436
  }
426
- case "tool_call":
437
+ case 'tool_call':
427
438
  state.transcript = [
428
439
  ...state.transcript,
429
440
  {
430
441
  id: `m${state.idCounter++}`,
431
- role: "tool",
432
- text: `\u2192 ${String(msg.tool ?? "?")}(${msg.args ? JSON.stringify(msg.args) : ""})`
433
- }
434
- ];
435
- cb.onTranscript(state.transcript);
436
- return;
437
- case "tool_result":
442
+ role: 'tool',
443
+ text: `\u2192 ${String(msg.tool ?? '?')}(${msg.args ? JSON.stringify(msg.args) : ''})`,
444
+ },
445
+ ]
446
+ cb.onTranscript(state.transcript)
447
+ return
448
+ case 'tool_result':
438
449
  state.transcript = [
439
450
  ...state.transcript,
440
451
  {
441
452
  id: `m${state.idCounter++}`,
442
- role: "tool",
443
- text: `${msg.ok ? "\u2713" : "\u2717"} ${String(msg.tool ?? "?")}`
444
- }
445
- ];
446
- cb.onTranscript(state.transcript);
447
- return;
448
- case "client_tool_call": {
449
- const toolCallId = String(msg.toolCallId ?? "");
450
- const name = String(msg.name ?? "");
451
- const args = msg.args ?? {};
452
- if (!toolCallId || !name) return;
453
- cb.onClientToolCall({ toolCallId, name, args });
454
- return;
455
- }
456
- case "call_end": {
457
- const reasonRaw = String(msg.reason ?? "");
458
- const reason = mapEndReason(reasonRaw);
459
- state.endReason = reason;
453
+ role: 'tool',
454
+ text: `${msg.ok ? '\u2713' : '\u2717'} ${String(msg.tool ?? '?')}`,
455
+ },
456
+ ]
457
+ cb.onTranscript(state.transcript)
458
+ return
459
+ case 'client_tool_call': {
460
+ const toolCallId = String(msg.toolCallId ?? '')
461
+ const name = String(msg.name ?? '')
462
+ const args = msg.args ?? {}
463
+ if (!toolCallId || !name) return
464
+ cb.onClientToolCall({ toolCallId, name, args })
465
+ return
466
+ }
467
+ case 'call_end': {
468
+ const reasonRaw = String(msg.reason ?? '')
469
+ const reason = mapEndReason(reasonRaw)
470
+ state.endReason = reason
460
471
  state.transcript = [
461
472
  ...state.transcript,
462
473
  {
463
474
  id: `m${state.idCounter++}`,
464
- role: "system",
465
- text: `call ended${reasonRaw ? ` (${reasonRaw})` : ""}`
466
- }
467
- ];
468
- cb.onTranscript(state.transcript);
469
- cb.onCallEnd(reason);
470
- return;
475
+ role: 'system',
476
+ text: `call ended${reasonRaw ? ` (${reasonRaw})` : ''}`,
477
+ },
478
+ ]
479
+ cb.onTranscript(state.transcript)
480
+ cb.onCallEnd(reason)
481
+ return
471
482
  }
472
- case "error": {
473
- const code = msg.code ?? "server_error";
474
- const message = msg.message ?? "server error";
475
- cb.onError({ code, message });
476
- return;
483
+ case 'error': {
484
+ const code = msg.code ?? 'server_error'
485
+ const message = msg.message ?? 'server error'
486
+ cb.onError({ code, message })
487
+ return
477
488
  }
478
489
  }
479
490
  }
480
491
// Protocol-level state transition helper used by handleServerMessage.
// Deduplicates against the current snapshot, then hands the transition to
// the callback owner.
// NOTE(review): this helper does not write state.state itself — it relies on
// cb.onState (e.g. the client's own setState) to commit the transition. If a
// callback omits that write, the dedupe guard here never trips — confirm
// every onState implementation updates state.state.
var setState = (state, next, cb) => {
  const unchanged = state.state === next
  if (unchanged) return
  cb.onState(next)
}
485
495
// Insert or update the in-progress (uncommitted) user utterance bubble.
// Partial ASR results keep rewriting the same bubble; the final result flips
// `committed` to true so the next partial starts a fresh bubble.
var upsertUserPartial = (state, text, isFinal) => {
  // Most recent user entry that has not been committed yet, if any.
  const pendingIdx = state.transcript.findLastIndex(
    (entry) => entry.role === 'user' && entry.committed === false,
  )
  if (pendingIdx === -1) {
    // No pending bubble — append a new one.
    state.transcript = state.transcript.concat({
      id: `m${state.idCounter++}`,
      role: 'user',
      text,
      committed: isFinal,
    })
    return
  }
  // Replace the pending bubble immutably: consumers holding the previous
  // array reference must never observe in-place mutation.
  const updated = state.transcript.slice()
  updated[pendingIdx] = { ...state.transcript[pendingIdx], text, committed: isFinal }
  state.transcript = updated
}
506
516
// Derive the signaling WebSocket URL from the HTTP API base:
// https → wss, anything else → ws. Passing `bargeIn: false` opts the call
// out of barge-in server-side via the `barge=off` query flag.
function buildWsUrl(args) {
  const base = new URL(args.apiBase)
  const scheme = base.protocol === 'https:' ? 'wss:' : 'ws:'
  const agent = encodeURIComponent(args.agentId)
  const token = encodeURIComponent(args.token)
  const barge = args.bargeIn === false ? '&barge=off' : ''
  return `${scheme}//${base.host}/v1/agents/${agent}/call?token=${token}${barge}`
}
512
522
 
513
523
// src/clientTools.ts
var NAME_RE = /^[a-zA-Z_][a-zA-Z0-9_]*$/
var MAX_TOOLS = 64
var MAX_USAGE = 500
var MAX_TIMEOUT_MS = 3e4
// Validate the user-supplied clientTools map before registration.
// `undefined` means "no client tools" and is accepted; anything else must be
// a plain object keyed by identifier-style tool names. Throws with a
// descriptive message on the first violation found.
var validateClientToolMap = (tools) => {
  if (tools === void 0) return
  const isPlainObject = typeof tools === 'object' && tools !== null && !Array.isArray(tools)
  if (!isPlainObject) {
    throw new Error('clientTools must be an object keyed by tool name')
  }
  const entries = Object.entries(tools)
  if (entries.length > MAX_TOOLS) {
    throw new Error(`clientTools may declare at most 64 tools (got ${entries.length})`)
  }
  for (const [name, def] of entries) {
    if (!NAME_RE.test(name)) {
      throw new Error(
        `clientTools["${name}"]: name must be a valid identifier (^[a-zA-Z_][a-zA-Z0-9_]*$)`,
      )
    }
    if (!def || typeof def !== 'object') {
      throw new Error(`clientTools["${name}"]: must be an object`)
    }
    if (typeof def.description !== 'string' || def.description.length === 0) {
      throw new Error(`clientTools["${name}"]: must have a description`)
    }
    if (typeof def.handler !== 'function') {
      throw new Error(`clientTools["${name}"]: must have a handler function`)
    }
    if (def.usage !== void 0 && def.usage.length > MAX_USAGE) {
      throw new Error(`clientTools["${name}"]: usage must be \u2264500 chars`)
    }
    const badTimeout =
      def.timeoutMs !== void 0 &&
      (!Number.isFinite(def.timeoutMs) || def.timeoutMs <= 0 || def.timeoutMs > MAX_TIMEOUT_MS)
    if (badTimeout) {
      throw new Error(`clientTools["${name}"]: timeoutMs must be in (0, 30000]`)
    }
  }
}
550
563
// Build the `client_tools_register` frame announced to the server after
// connect. Optional fields (usage, timeoutMs) are included only when the
// tool definition declares them; the handler itself never leaves the client.
var buildRegisterFrame = (tools) => {
  const toolList = Object.entries(tools).map(([name, def]) => {
    const entry = { name, description: def.description, parameters: def.parameters }
    if (def.usage !== void 0) entry.usage = def.usage
    if (def.timeoutMs !== void 0) entry.timeoutMs = def.timeoutMs
    return entry
  })
  return { type: 'client_tools_register', tools: toolList }
}
560
573
// Execute a server-requested client tool call and report the outcome back as
// a `client_tool_result` frame. Send failures are swallowed on purpose: the
// socket may already be gone by the time the handler settles.
var dispatchClientToolCall = (send, tools, frame) => {
  const reply = (payload) => {
    try {
      send(payload)
    } catch {}
  }
  const tool = tools[frame.name]
  if (!tool) {
    reply({
      type: 'client_tool_result',
      toolCallId: frame.toolCallId,
      error: `No handler for ${frame.name}`,
    })
    return
  }
  // Fire-and-forget: the handler runs asynchronously and the result frame is
  // sent whenever it settles. Non-string results are JSON-encoded.
  void (async () => {
    try {
      const out = await tool.handler(frame.args)
      reply({
        type: 'client_tool_result',
        toolCallId: frame.toolCallId,
        result: typeof out === 'string' ? out : JSON.stringify(out),
      })
    } catch (err) {
      reply({
        type: 'client_tool_result',
        toolCallId: frame.toolCallId,
        error: err instanceof Error ? err.message : String(err),
      })
    }
  })()
}
605
+
606
// src/ClientMarksBuffer.ts
// Buffers client-side latency marks per agent turn and flushes each one to
// the server as a `client_marks` frame once that turn has ended.
var createClientMarksBuffer = (args) => {
  const now = args.now ?? (() => performance.now())
  // Timestamp of the first outbound mic chunk seen before the next agent
  // turn starts; claimed by that turn's slot in onAgentTurnStart.
  let pendingFirstOutboundAt = null
  const inFlight = /* @__PURE__ */ new Map()
  // Emit the frame for `seq` if its turn has ended; the latency mark is only
  // included when both endpoints of the interval were observed.
  const tryEmit = (seq) => {
    const slot = inFlight.get(seq)
    if (!slot || !slot.ended) return
    const marks = {}
    if (slot.firstOutboundAt !== null && slot.firstAudibleAt !== null) {
      marks.client_mic_to_first_audible_ms = slot.firstAudibleAt - slot.firstOutboundAt
    }
    args.send({ type: 'client_marks', seq, marks, clientNow: Date.now() })
    inFlight.delete(seq)
  }
  const markFirstOutboundAudio = () => {
    // Only the first chunk matters; later chunks are ignored.
    if (pendingFirstOutboundAt === null) pendingFirstOutboundAt = now()
  }
  const markFirstAudibleOutput = () => {
    // Attribute the first audible output to the most recently opened turn
    // that has not ended yet.
    let target
    for (const slot of inFlight.values()) {
      if (!slot.ended) target = slot
    }
    if (target === undefined || target.firstAudibleAt !== null) return
    target.firstAudibleAt = now()
  }
  const onAgentTurnStart = (seq) => {
    inFlight.set(seq, {
      firstOutboundAt: pendingFirstOutboundAt,
      firstAudibleAt: null,
      ended: false,
    })
    pendingFirstOutboundAt = null
  }
  const onAgentTurnEnd = (seq) => {
    const slot = inFlight.get(seq)
    if (slot === undefined) {
      // Unknown seq: still acknowledge with an empty marks frame.
      args.send({ type: 'client_marks', seq, marks: {}, clientNow: Date.now() })
      return
    }
    slot.ended = true
    tryEmit(seq)
  }
  // Force-emit every open slot (used on teardown).
  const flush = () => {
    for (const seq of [...inFlight.keys()]) {
      inFlight.get(seq).ended = true
      tryEmit(seq)
    }
    pendingFirstOutboundAt = null
  }
  return {
    markFirstOutboundAudio,
    markFirstAudibleOutput,
    onAgentTurnStart,
    onAgentTurnEnd,
    flush,
  }
}
593
675
 
594
676
  // src/VoiceClient.ts
595
677
  var BrowserVoiceClient = class {
596
678
  constructor(args) {
597
- this.rws = null;
598
- this.capture = null;
599
- this.playback = null;
600
- this.muted = false;
601
- this.inputVolume = 0;
602
- this.outputVolume = 0;
603
- this.startedAt = null;
604
- this.endedFired = false;
605
- this.lastError = null;
679
+ this.rws = null
680
+ this.capture = null
681
+ this.playback = null
682
+ this.muted = false
683
+ this.inputVolume = 0
684
+ this.outputVolume = 0
685
+ this.startedAt = null
686
+ this.endedFired = false
687
+ this.lastError = null
606
688
  this.end = () => {
607
- this.teardown("user_hangup");
608
- };
689
+ this.teardown('user_hangup')
690
+ }
609
691
  this.mute = () => {
610
- if (this.muted) return;
611
- this.muted = true;
612
- this.capture?.mute(true);
613
- };
692
+ if (this.muted) return
693
+ this.muted = true
694
+ this.capture?.mute(true)
695
+ }
614
696
  this.unmute = () => {
615
- if (!this.muted) return;
616
- this.muted = false;
617
- this.capture?.mute(false);
618
- };
697
+ if (!this.muted) return
698
+ this.muted = false
699
+ this.capture?.mute(false)
700
+ }
619
701
  // ---------------------------------------------------------------
620
702
  // Internal
621
703
  // ---------------------------------------------------------------
622
704
  this.sendClientToolsRegister = () => {
623
- const frame = buildRegisterFrame(this.args.options.clientTools ?? {});
624
- this.rws?.send(JSON.stringify(frame));
625
- };
705
+ const frame = buildRegisterFrame(this.args.options.clientTools ?? {})
706
+ this.rws?.send(JSON.stringify(frame))
707
+ }
626
708
  this.setState = (next) => {
627
- if (this.proto.state === next) return;
628
- this.proto.state = next;
629
- this.args.options.onStateChange?.(next);
630
- };
709
+ if (this.proto.state === next) return
710
+ this.proto.state = next
711
+ this.args.options.onStateChange?.(next)
712
+ }
631
713
  this.emitError = (err) => {
632
- this.lastError = err;
633
- this.args.options.onError?.(err);
634
- };
714
+ this.lastError = err
715
+ this.args.options.onError?.(err)
716
+ }
635
717
  this.handleSocketEvent = (ev) => {
636
718
  switch (ev.type) {
637
- case "open":
638
- void this.startCapture();
639
- break;
640
- case "reconnected":
641
- this.proto.transcript = [];
642
- this.proto.agentBubbleId = null;
643
- this.args.options.onTranscript?.(this.proto.transcript);
644
- void this.startCapture();
645
- this.setState("listening");
646
- break;
647
- case "message":
648
- if (typeof ev.data === "string") {
719
+ case 'open':
720
+ void this.startCapture()
721
+ break
722
+ case 'reconnected':
723
+ this.proto.transcript = []
724
+ this.proto.agentBubbleId = null
725
+ this.args.options.onTranscript?.(this.proto.transcript)
726
+ void this.startCapture()
727
+ this.setState('listening')
728
+ break
729
+ case 'message':
730
+ if (typeof ev.data === 'string') {
649
731
  handleServerMessage(ev.data, this.proto, {
650
732
  onState: this.setState,
651
733
  onTranscript: (entries) => this.args.options.onTranscript?.(entries),
652
734
  onError: this.emitError,
653
735
  onInterrupt: () => {
654
- this.playback?.flush();
655
- this.args.options.onInterrupt?.();
736
+ this.playback?.flush()
737
+ this.args.options.onInterrupt?.()
738
+ },
739
+ onAgentTurnStart: (seq) => {
740
+ if (typeof seq === 'number') this.marks.onAgentTurnStart(seq)
741
+ this.args.options.onAgentTurnStart?.()
742
+ },
743
+ onAgentTurnEnd: (seq) => {
744
+ if (typeof seq === 'number') this.marks.onAgentTurnEnd(seq)
656
745
  },
657
- onAgentTurnStart: () => this.args.options.onAgentTurnStart?.(),
658
746
  onCallEnd: (reason) => this.teardown(reason),
659
747
  onConnected: () => this.sendClientToolsRegister(),
660
- onClientToolCall: (frame) => dispatchClientToolCall(
661
- (f) => this.rws?.send(JSON.stringify(f)),
662
- this.args.options.clientTools ?? {},
663
- frame
664
- )
665
- });
748
+ onClientToolCall: (frame) =>
749
+ dispatchClientToolCall(
750
+ (f) => this.rws?.send(JSON.stringify(f)),
751
+ this.args.options.clientTools ?? {},
752
+ frame,
753
+ ),
754
+ })
666
755
  } else {
667
- this.playback?.enqueue(ev.data);
756
+ this.marks.markFirstAudibleOutput()
757
+ this.playback?.enqueue(ev.data)
668
758
  }
669
- break;
670
- case "close":
759
+ break
760
+ case 'close':
671
761
  if (ev.permanent) {
672
- const reason = this.proto.endReason ?? (this.lastError ? "error" : "user_hangup");
673
- this.teardown(reason);
762
+ const reason = this.proto.endReason ?? (this.lastError ? 'error' : 'user_hangup')
763
+ this.teardown(reason)
674
764
  }
675
- break;
676
- case "error":
677
- this.emitError({ code: "socket_error", message: ev.error.message });
678
- break;
765
+ break
766
+ case 'error':
767
+ this.emitError({ code: 'socket_error', message: ev.error.message })
768
+ break
679
769
  }
680
- };
770
+ }
681
771
  this.startCapture = async () => {
682
- if (this.capture?.isCapturing()) return;
772
+ if (this.capture?.isCapturing()) return
683
773
  this.capture = createAudioCapture({
684
774
  onChunk: (pcm) => {
685
- this.rws?.send(pcm);
775
+ this.marks.markFirstOutboundAudio()
776
+ this.rws?.send(pcm)
686
777
  },
687
778
  onVolume: (v) => {
688
- this.inputVolume = v;
689
- this.args.options.onVolume?.({ input: v, output: this.outputVolume });
779
+ this.inputVolume = v
780
+ this.args.options.onVolume?.({ input: v, output: this.outputVolume })
690
781
  },
691
782
  onError: (err) => {
692
783
  this.emitError({
693
- code: err.name === "NotAllowedError" ? "mic_denied" : "mic_start_failed",
694
- message: err.message
695
- });
696
- }
697
- });
698
- if (this.muted) this.capture.mute(true);
784
+ code: err.name === 'NotAllowedError' ? 'mic_denied' : 'mic_start_failed',
785
+ message: err.message,
786
+ })
787
+ },
788
+ })
789
+ if (this.muted) this.capture.mute(true)
699
790
  try {
700
- await this.capture.start();
701
- } catch {
702
- }
703
- };
791
+ await this.capture.start()
792
+ } catch {}
793
+ }
704
794
  this.teardown = (reason) => {
705
- this.capture?.stop();
706
- this.capture = null;
707
- this.playback?.close();
708
- this.playback = null;
709
795
  try {
710
- this.rws?.close(1e3, reason);
711
- } catch {
712
- }
713
- this.rws = null;
714
- this.setState("ended");
715
- this.fireEndOnce(reason);
716
- };
796
+ this.marks.flush()
797
+ } catch {}
798
+ this.capture?.stop()
799
+ this.capture = null
800
+ this.playback?.close()
801
+ this.playback = null
802
+ try {
803
+ this.rws?.close(1e3, reason)
804
+ } catch {}
805
+ this.rws = null
806
+ this.setState('ended')
807
+ this.fireEndOnce(reason)
808
+ }
717
809
  this.fireEndOnce = (reason) => {
718
- if (this.endedFired) return;
719
- this.endedFired = true;
720
- const startedAt = this.startedAt ?? Date.now();
810
+ if (this.endedFired) return
811
+ this.endedFired = true
812
+ const startedAt = this.startedAt ?? Date.now()
721
813
  this.args.options.onEnd?.({
722
814
  reason,
723
- errorCode: reason === "error" ? this.lastError?.code : void 0,
724
- durationMs: Date.now() - startedAt
725
- });
726
- };
727
- this.args = args;
728
- this.proto = createProtocolState();
729
- validateClientToolMap(args.options.clientTools);
815
+ errorCode: reason === 'error' ? this.lastError?.code : void 0,
816
+ durationMs: Date.now() - startedAt,
817
+ })
818
+ }
819
+ this.args = args
820
+ this.proto = createProtocolState()
821
+ validateClientToolMap(args.options.clientTools)
822
+ this.marks = createClientMarksBuffer({
823
+ send: (frame) => {
824
+ try {
825
+ this.rws?.send(JSON.stringify(frame))
826
+ } catch {}
827
+ },
828
+ })
730
829
  }
731
830
  // ---------------------------------------------------------------
732
831
  // Call interface
733
832
  // ---------------------------------------------------------------
734
833
  get state() {
735
- return this.proto.state;
834
+ return this.proto.state
736
835
  }
737
836
  get transcript() {
738
- return this.proto.transcript.slice();
837
+ return this.proto.transcript.slice()
739
838
  }
740
839
  get isMuted() {
741
- return this.muted;
840
+ return this.muted
742
841
  }
743
842
  // ---------------------------------------------------------------
744
843
  // Lifecycle — called by the factory immediately after construction.
@@ -746,84 +845,262 @@ var BrowserVoiceClient = class {
746
845
  // failures arrive via `onError`.
747
846
  // ---------------------------------------------------------------
748
847
  async start() {
749
- this.setState("connecting");
750
- this.startedAt = Date.now();
848
+ this.setState('connecting')
849
+ this.startedAt = Date.now()
751
850
  const url = buildWsUrl({
752
851
  apiBase: this.args.config.apiBase,
753
852
  agentId: this.args.options.agentId,
754
853
  token: this.args.token,
755
- bargeIn: this.args.options.bargeIn
756
- });
854
+ bargeIn: this.args.options.bargeIn,
855
+ })
757
856
  this.playback = createAudioPlayback({
758
857
  onVolume: (v) => {
759
- this.outputVolume = v;
760
- this.args.options.onVolume?.({ input: this.inputVolume, output: v });
761
- }
762
- });
858
+ this.outputVolume = v
859
+ this.args.options.onVolume?.({ input: this.inputVolume, output: v })
860
+ },
861
+ })
763
862
  try {
764
- await this.playback.resume();
765
- } catch {
766
- }
863
+ await this.playback.resume()
864
+ } catch {}
767
865
  this.rws = createReconnectingWebSocket(
768
866
  {
769
867
  url,
770
868
  wsFactory: this.args.wsFactory,
771
- maxRetries: 3
869
+ maxRetries: 3,
772
870
  },
773
- (ev) => this.handleSocketEvent(ev)
774
- );
871
+ (ev) => this.handleSocketEvent(ev),
872
+ )
873
+ }
874
+ }
875
+
876
// src/webrtc/createWebRtcCall.ts
// Establish a WebRTC call: mic audio upstream, agent audio downstream into a
// hidden <audio> element, and a "control" data channel carrying the same
// JSON protocol frames as the WebSocket transport.
async function createWebRtcCall(opts) {
  const proto = createProtocolState()
  let muted = false
  let ended = false
  const fireState = (next) => {
    if (proto.state === next) return
    proto.state = next
    opts.onStateChange?.(next)
  }
  // Route control-channel frames through the shared protocol reducer.
  const dispatch = (raw) => {
    handleServerMessage(raw, proto, {
      onState: fireState,
      onTranscript: (entries) => opts.onTranscript?.(entries),
      onError: (err) => opts.onError?.(err),
      onInterrupt: () => opts.onInterrupt?.(),
      onAgentTurnStart: () => opts.onAgentTurnStart?.(),
      onAgentTurnEnd: () => {},
      onCallEnd: () => teardown(),
      onConnected: () => {},
      onClientToolCall: () => {},
    })
  }
  fireState('connecting')
  const peer = new RTCPeerConnection({
    iceServers: [{ urls: 'stun:stun.l.google.com:19302' }],
  })
  // Hidden sink element for the remote audio track.
  const audioSink = document.createElement('audio')
  audioSink.autoplay = true
  audioSink.style.display = 'none'
  document.body.appendChild(audioSink)
  peer.ontrack = (event) => {
    audioSink.srcObject = event.streams[0] ?? new MediaStream([event.track])
  }
  let micStream
  try {
    micStream = await navigator.mediaDevices.getUserMedia({ audio: true })
  } catch (err) {
    const code =
      err instanceof DOMException && err.name === 'NotAllowedError'
        ? 'mic_denied'
        : 'mic_start_failed'
    opts.onError?.({
      code,
      message: err instanceof Error ? err.message : 'getUserMedia failed',
    })
    fireState('error')
    peer.close()
    audioSink.remove()
    throw err
  }
  for (const track of micStream.getAudioTracks()) peer.addTrack(track, micStream)
  const control = peer.createDataChannel('control', { ordered: true })
  control.onmessage = (e) => {
    if (typeof e.data === 'string') dispatch(e.data)
  }
  control.onerror = () => {
    opts.onError?.({ code: 'socket_error', message: 'control channel error' })
  }
  // Signaling endpoints: a dedicated gateway when configured, the main API
  // otherwise.
  const gateway = opts.webrtcGatewayBase || ''
  const offerUrl = gateway
    ? `${gateway}/webrtc/offer?token=${encodeURIComponent(opts.token)}`
    : `${opts.apiBase}/v1/agents/${encodeURIComponent(opts.agentId)}/webrtc/offer?token=${encodeURIComponent(opts.token)}`
  const iceUrl = gateway
    ? `${gateway}/webrtc/ice?token=${encodeURIComponent(opts.token)}`
    : `${opts.apiBase}/v1/agents/${encodeURIComponent(opts.agentId)}/webrtc/ice?token=${encodeURIComponent(opts.token)}`
  await peer.setLocalDescription(await peer.createOffer())
  let callId
  try {
    const offerResponse = await fetch(offerUrl, {
      method: 'POST',
      headers: { 'content-type': 'application/json' },
      body: JSON.stringify({ sdp: peer.localDescription.sdp, type: 'offer', agentId: opts.agentId }),
    })
    if (!offerResponse.ok) {
      const code = offerResponse.status === 401 ? 'unauthorized' : 'server_error'
      opts.onError?.({ code, message: `signaling failed: HTTP ${offerResponse.status}` })
      fireState('error')
      micStream.getTracks().forEach((t) => t.stop())
      peer.close()
      audioSink.remove()
      throw new Error(`webrtc offer failed: ${offerResponse.status}`)
    }
    const answer = await offerResponse.json()
    callId = answer.callId
    await peer.setRemoteDescription({ type: 'answer', sdp: answer.sdp })
  } catch (err) {
    if (!ended) {
      opts.onError?.({
        code: 'network_unreachable',
        message: err instanceof Error ? err.message : 'signaling failed',
      })
      fireState('error')
      micStream.getTracks().forEach((t) => t.stop())
      peer.close()
      audioSink.remove()
    }
    throw err
  }
  // Trickle local ICE candidates to the signaling endpoint; best-effort.
  peer.onicecandidate = (e) => {
    if (!e.candidate) return
    void fetch(iceUrl, {
      method: 'POST',
      headers: { 'content-type': 'application/json' },
      body: JSON.stringify({ callId, candidate: e.candidate }),
    }).catch(() => {})
  }
  peer.onconnectionstatechange = () => {
    const s = peer.connectionState
    if (s === 'connected') fireState('listening')
    if (s === 'failed' || s === 'disconnected') {
      opts.onError?.({ code: 'socket_error', message: `webrtc connection ${s}` })
      teardown()
    }
    if (s === 'closed' && !ended) teardown()
  }
  // Idempotent cleanup: stop mic, close the peer, remove the sink element.
  const teardown = () => {
    if (ended) return
    ended = true
    try {
      micStream.getTracks().forEach((t) => t.stop())
    } catch {}
    try {
      peer.close()
    } catch {}
    try {
      audioSink.remove()
    } catch {}
    fireState('ended')
    opts.onEnd?.()
  }
  return {
    get state() {
      return proto.state
    },
    get transcript() {
      return proto.transcript.slice()
    },
    get isMuted() {
      return muted
    },
    end: () => teardown(),
    mute: () => {
      if (muted) return
      muted = true
      micStream.getAudioTracks().forEach((t) => (t.enabled = false))
    },
    unmute: () => {
      if (!muted) return
      muted = false
      micStream.getAudioTracks().forEach((t) => (t.enabled = true))
    },
  }
}
777
1030
 
778
1031
// src/browser.ts
// Default WebSocket factory for browser environments; resolved from
// globalThis at call time so test harnesses can substitute their own.
var browserWsFactory = (url) => {
  const WS = globalThis.WebSocket
  return new WS(url)
}
780
1033
// Factory returned by configureVoiceClient. startCall resolves a call token
// (and the transport it implies), then starts either a WebSocket-backed
// BrowserVoiceClient or a WebRTC call.
var BrowserVoiceFactory = class {
  constructor(config) {
    // Resolve the token + transport. An explicit options.token always wins
    // and forces the WS transport; otherwise fetchToken may return either a
    // bare string (WS) or a { token, transport, ... } descriptor.
    const resolveTokenInfo = async (options, fetchArgs) => {
      if (options.token) {
        return { token: options.token, transport: 'ws' }
      }
      const r = await this.config.fetchToken(fetchArgs)
      if (!r) {
        throw new Error('configureVoiceClient.fetchToken returned empty token')
      }
      const resolved = typeof r === 'string' ? { token: r, transport: 'ws' } : r
      if (!resolved.token) {
        throw new Error('configureVoiceClient.fetchToken returned an object without `token`')
      }
      return resolved
    }
    this.startCall = async (options) => {
      if (!options.agentId) {
        throw new Error('startCall: agentId is required')
      }
      const { context, metadata } = mergeStartCallContext(this.config, options)
      const fetchArgs = {
        agentId: options.agentId,
        userId: options.userId,
        context,
        metadata,
      }
      const resolved = await resolveTokenInfo(options, fetchArgs)
      if (resolved.transport === 'webrtc') {
        return createWebRtcCall({
          agentId: options.agentId,
          apiBase: this.config.apiBase,
          token: resolved.token,
          webrtcGatewayBase: resolved.webrtcGatewayBase,
          onStateChange: options.onStateChange,
          onTranscript: options.onTranscript,
          onError: options.onError,
          // Synthesise a minimal CallEndEvent. WebRTC doesn't carry an end reason
          // from the server yet — use 'agent_ended' as placeholder. durationMs is
          // tracked at 0 until the followup lands (see spec Followups section).
          onEnd: options.onEnd
            ? () => options.onEnd({ reason: 'agent_ended', durationMs: 0 })
            : void 0,
          onInterrupt: options.onInterrupt,
          onAgentTurnStart: options.onAgentTurnStart,
        })
      }
      const client = new BrowserVoiceClient({
        config: this.config,
        // Carry merged context/metadata through to startCall so server can
        // see what the SDK saw.
        options: { ...options, context, metadata },
        token: resolved.token,
        wsFactory: browserWsFactory,
      })
      await client.start()
      return client
    }
    this.config = config
  }
}
816
1092
// Public entry point: normalize the caller-supplied config and return a
// factory whose startCall launches voice calls against it.
function configureVoiceClient(config) {
  const normalized = normalizeConfig(config)
  return new BrowserVoiceFactory(normalized)
}
819
1095
// Annotate the CommonJS export names for ESM import in node:
// (dead `0 && ...` expression kept byte-compatible — bundlers pattern-match
// this shape to recover named exports; it never executes at runtime)
0 &&
  (module.exports = {
    buildWsUrl,
    configureVoiceClient,
    createAudioCapture,
    createAudioPlayback,
    createProtocolState,
    createReconnectingWebSocket,
    handleServerMessage,
  })
//# sourceMappingURL=browser.js.map