@craftedxp/voice-js 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,725 @@
1
+ "use strict";
2
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
6
+ var __export = (target, all) => {
7
+ for (var name in all)
8
+ __defProp(target, name, { get: all[name], enumerable: true });
9
+ };
10
+ var __copyProps = (to, from, except, desc) => {
11
+ if (from && typeof from === "object" || typeof from === "function") {
12
+ for (let key of __getOwnPropNames(from))
13
+ if (!__hasOwnProp.call(to, key) && key !== except)
14
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
15
+ }
16
+ return to;
17
+ };
18
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
19
+
20
// src/browser.ts
// Public surface of the browser bundle. `__export` installs each name as a
// live getter on the namespace object; `__toCommonJS` then marks it as an
// ES-module interop object before it becomes `module.exports`.
var browser_exports = {};
__export(browser_exports, {
  buildWsUrl: () => buildWsUrl,
  configureVoiceClient: () => configureVoiceClient,
  createAudioCapture: () => createAudioCapture,
  createAudioPlayback: () => createAudioPlayback,
  createProtocolState: () => createProtocolState,
  createReconnectingWebSocket: () => createReconnectingWebSocket,
  handleServerMessage: () => handleServerMessage
});
module.exports = __toCommonJS(browser_exports);
32
+
33
// src/config.ts
/**
 * Validate a raw client config and return a normalized copy.
 * Rejects the removed `apiKey` option, requires `apiBase` plus a
 * `fetchToken` function, and strips trailing slashes from `apiBase`.
 * Throws an Error with a descriptive message for each invalid shape.
 */
function normalizeConfig(config) {
  if (!config) {
    throw new Error("configureVoiceClient: config is required");
  }
  if ("apiKey" in config) {
    throw new Error(
      "configureVoiceClient: `apiKey` is no longer supported. Embedding sk_ in JS code ships server-grade credentials to every client. Pass `fetchToken: async ({ agentId }) => { /* call YOUR backend mint */ }` instead \u2014 see the @craftedxp/voice-js README for the migration recipe."
    );
  }
  const { apiBase, fetchToken } = config;
  if (!apiBase) {
    throw new Error("configureVoiceClient: apiBase is required");
  }
  if (typeof fetchToken !== "function") {
    throw new Error("configureVoiceClient: fetchToken must be a function");
  }
  // Trailing slashes would produce "//" when paths are appended later.
  return { ...config, apiBase: apiBase.replace(/\/+$/, "") };
}
52
/**
 * Merge the factory-level defaults with per-call overrides.
 * For each of context/metadata: undefined when neither side supplies a
 * value, otherwise a shallow merge where the per-call value wins.
 */
function mergeStartCallContext(factory, call) {
  const mergePair = (base, override) =>
    base || override ? { ...base ?? {}, ...override ?? {} } : void 0;
  return {
    context: mergePair(factory.defaultContext, call.context),
    metadata: mergePair(factory.defaultMetadata, call.metadata)
  };
}
57
+
58
// src/worklets/mic-downsampler.worklet.js
// AudioWorklet source shipped as a string literal: createAudioCapture wraps
// it in a Blob URL and registers it via audioWorklet.addModule at runtime.
// The string IS runtime code — do not edit it here; it is inlined from the
// .worklet.js file at build time (see the comments embedded in the string).
var mic_downsampler_worklet_default = "// AudioWorklet \u2014 runs off the main thread in the audio rendering graph.\n//\n// Input: Float32 samples at the AudioContext's native sampleRate (typically\n// 48000 Hz on desktop, 44100 Hz on some iOS devices).\n// Output: 16 kHz mono Int16 PCM, shipped to the main thread via\n// `port.postMessage(ArrayBuffer, [ArrayBuffer])` (transferred, not copied).\n//\n// Why AudioWorklet instead of ScriptProcessorNode: ScriptProcessorNode is\n// deprecated + main-thread-bound, so any JS jank produces audible audio\n// glitches. AudioWorklet's `process()` runs on the audio rendering thread\n// at the graph's block cadence (128 frames by default) and backpressures\n// via returning `true` / `false`.\n//\n// This file is loaded as text (see tsup.config.ts loader) and registered\n// at runtime via `audioWorklet.addModule(blobUrl)`.\n\nclass MicDownsampler extends AudioWorkletProcessor {\n constructor() {\n super()\n // Target sample rate for STT. Matches Deepgram Nova-3 + the platform's\n // server-side SAMPLE_RATE constant in AgentCallHandler.\n this.targetRate = 16000\n // Accumulator for the downsample. We collect incoming samples and emit\n // an Int16 chunk when we've accumulated ~1024 target-rate samples\n // (~64 ms at 16 kHz) \u2014 matches the mobile SDK's chunk size so both\n // platforms have the same server-side framing.\n this.outputFrames = 1024\n this.acc = []\n // Running index used for fractional resampling.\n this.readCursor = 0\n }\n\n // `inputs[0][0]` = first channel of first input. 128 Float32 samples per\n // call at the context's sampleRate. Return true = keep processing.\n process(inputs) {\n const input = inputs[0]\n if (!input || input.length === 0) return true\n const channel = input[0]\n if (!channel || channel.length === 0) return true\n\n const ctxRate = sampleRate // global inside AudioWorkletProcessor\n const ratio = ctxRate / this.targetRate\n\n // Simple linear-interp downsample. For 48000 \u2192 16000 that's 3:1, which\n // linear handles fine for voice. Anti-alias filtering would be\n // theoretically better but inaudible for speech.\n for (let i = 0; i < channel.length; i++) {\n this.acc.push(channel[i])\n }\n\n while (this.acc.length - this.readCursor >= ratio * this.outputFrames) {\n const out = new Int16Array(this.outputFrames)\n let readIdx = this.readCursor\n for (let i = 0; i < this.outputFrames; i++) {\n // Linear interp between floor(readIdx) and ceil(readIdx)\n const low = Math.floor(readIdx)\n const high = Math.min(low + 1, this.acc.length - 1)\n const frac = readIdx - low\n const sample = this.acc[low] * (1 - frac) + this.acc[high] * frac\n // Clip + convert to int16\n const clipped = Math.max(-1, Math.min(1, sample))\n out[i] = clipped < 0 ? clipped * 0x8000 : clipped * 0x7fff\n readIdx += ratio\n }\n // Transfer the ArrayBuffer (zero-copy) to the main thread.\n this.port.postMessage(out.buffer, [out.buffer])\n this.readCursor = readIdx\n }\n\n // Garbage-collect the consumed portion of `acc` every so often so it\n // doesn't grow without bound. Leave ~one chunk of headroom.\n if (this.readCursor > ratio * this.outputFrames) {\n this.acc = this.acc.slice(Math.floor(this.readCursor))\n this.readCursor -= Math.floor(this.readCursor)\n }\n\n return true\n }\n}\n\nregisterProcessor('mic-downsampler', MicDownsampler)\n";
60
+
61
// src/AudioCapture.ts
// How often (ms) the analyser-based input level is sampled and reported.
var VOLUME_INTERVAL_MS = 100;
/**
 * Microphone capture pipeline: getUserMedia -> AudioWorklet downsampler ->
 * Int16 PCM chunks delivered via `options.onChunk(ArrayBuffer)`.
 * Optional `options.onVolume` receives a 0..1 RMS estimate every
 * VOLUME_INTERVAL_MS; `options.onError` receives wrapped failures.
 * Returns { start, stop, mute, isCapturing }.
 */
var createAudioCapture = (options) => {
  let audioContext = null;
  let mediaStream = null;
  let sourceNode = null;
  let workletNode = null;
  let analyser = null;
  let volumeTimer = null;
  let muted = false;
  let capturing = false;
  // RMS of a Float32 time-domain buffer, scaled (x1.8) and clamped to 0..1
  // so typical speech fills more of the meter range.
  const computeRms = (buf) => {
    let sum = 0;
    for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i];
    const rms = Math.sqrt(sum / buf.length);
    return Math.min(1, rms * 1.8);
  };
  // Acquire the mic, register the downsampler worklet from a Blob URL, and
  // wire source -> worklet -> zero-gain sink -> destination. Any failure is
  // wrapped, reported via onError, and rethrown.
  const start = async () => {
    if (capturing) return;
    try {
      mediaStream = await navigator.mediaDevices.getUserMedia({
        audio: {
          // Browser-side voice processing is requested ON here (AEC/NS/AGC
          // all true). NOTE(review): the previous comment claimed these
          // were disabled to ship a raw signal to server-side STT, which
          // contradicted the values below. Echo cancellation is typically
          // what keeps speaker-played TTS out of the mic signal — confirm
          // which setting is actually intended before changing either side.
          echoCancellation: true,
          noiseSuppression: true,
          autoGainControl: true,
          channelCount: 1
        }
      });
      audioContext = new AudioContext();
      if (audioContext.state === "suspended") await audioContext.resume();
      // The worklet ships as a string literal; register it through a
      // temporary Blob URL and release the URL as soon as it is loaded.
      const blob = new Blob([mic_downsampler_worklet_default], { type: "application/javascript" });
      const url = URL.createObjectURL(blob);
      try {
        await audioContext.audioWorklet.addModule(url);
      } finally {
        URL.revokeObjectURL(url);
      }
      sourceNode = audioContext.createMediaStreamSource(mediaStream);
      workletNode = new AudioWorkletNode(audioContext, "mic-downsampler");
      // PCM chunks arrive from the worklet's port; drop them while muted so
      // mute is a pure gate (the graph keeps running).
      workletNode.port.onmessage = (event) => {
        if (muted) return;
        options.onChunk(event.data);
      };
      if (options.onVolume) {
        analyser = audioContext.createAnalyser();
        analyser.fftSize = 256;
        sourceNode.connect(analyser);
        const buf = new Float32Array(analyser.fftSize);
        volumeTimer = setInterval(() => {
          if (!analyser) return;
          analyser.getFloatTimeDomainData(buf);
          options.onVolume?.(computeRms(buf));
        }, VOLUME_INTERVAL_MS);
      }
      sourceNode.connect(workletNode);
      // Route the worklet to the destination through a zero gain so nothing
      // is audible — presumably to keep the node active in the graph;
      // confirm before removing.
      const sink = audioContext.createGain();
      sink.gain.value = 0;
      workletNode.connect(sink).connect(audioContext.destination);
      capturing = true;
    } catch (err) {
      const wrapped = err instanceof Error ? err : new Error(typeof err === "string" ? err : "capture failed");
      options.onError?.(wrapped);
      throw wrapped;
    }
  };
  // Tear everything down: meter timer, graph nodes, media tracks, context.
  // No-op when not capturing; disconnect errors are ignored.
  const stop = () => {
    if (!capturing) return;
    capturing = false;
    if (volumeTimer) {
      clearInterval(volumeTimer);
      volumeTimer = null;
    }
    try {
      workletNode?.disconnect();
      analyser?.disconnect();
      sourceNode?.disconnect();
    } catch {
    }
    workletNode = null;
    analyser = null;
    sourceNode = null;
    if (mediaStream) {
      for (const track of mediaStream.getTracks()) track.stop();
      mediaStream = null;
    }
    if (audioContext && audioContext.state !== "closed") {
      void audioContext.close().catch(() => void 0);
    }
    audioContext = null;
  };
  return {
    start,
    stop,
    // Mute only gates chunk delivery; mic and audio graph keep running.
    mute: (v) => {
      muted = v;
    },
    isCapturing: () => capturing
  };
};
164
+
165
// src/AudioPlayback.ts
// Incoming PCM defaults to 16 kHz Int16 mono (matches the capture side's
// downsampler output) unless options.sampleRate overrides it.
var DEFAULT_SAMPLE_RATE = 16e3;
// Output-level reporting cadence (ms).
var VOLUME_INTERVAL_MS2 = 100;
/**
 * Gapless playback of streamed Int16 PCM chunks. Each enqueue(ArrayBuffer)
 * becomes an AudioBufferSourceNode scheduled back-to-back on a running
 * timeline cursor. flush() hard-stops everything queued (barge-in), close()
 * tears the AudioContext down, resume() pre-creates/resumes the context.
 * options.onSpeakingChange flips as audio becomes (un)scheduled;
 * options.onVolume reports a 0..1 RMS of the output.
 */
var createAudioPlayback = (options = {}) => {
  const sampleRate = options.sampleRate ?? DEFAULT_SAMPLE_RATE;
  let audioContext = null;
  let gainNode = null;
  let analyser = null;
  let volumeTimer = null;
  // Timeline cursor: context time at which the next chunk should start so
  // consecutive chunks play without gaps.
  let nextStartTime = 0;
  let scheduledNodes = [];
  let speaking = false;
  // Lazily create the AudioContext + output graph; resume it if suspended.
  const ensureContext = async () => {
    if (audioContext) {
      if (audioContext.state === "suspended") await audioContext.resume();
      return;
    }
    audioContext = new AudioContext({ sampleRate });
    gainNode = audioContext.createGain();
    if (options.onVolume) {
      analyser = audioContext.createAnalyser();
      analyser.fftSize = 256;
      gainNode.connect(analyser);
      const buf = new Float32Array(analyser.fftSize);
      volumeTimer = setInterval(() => {
        if (!analyser) return;
        analyser.getFloatTimeDomainData(buf);
        let sum = 0;
        for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i];
        const rms = Math.sqrt(sum / buf.length);
        // Same x1.8 scaling/clamp as the capture-side meter.
        options.onVolume?.(Math.min(1, rms * 1.8));
      }, VOLUME_INTERVAL_MS2);
    }
    gainNode.connect(audioContext.destination);
    nextStartTime = audioContext.currentTime;
  };
  // Notify onSpeakingChange only on actual transitions.
  const setSpeaking = (v) => {
    if (v === speaking) return;
    speaking = v;
    options.onSpeakingChange?.(v);
  };
  // Drop nodes whose scheduled end time has passed; when none remain, the
  // agent is no longer audibly speaking.
  const pruneFinished = () => {
    const now = audioContext?.currentTime ?? 0;
    scheduledNodes = scheduledNodes.filter((n) => {
      const node = n;
      return (node._endsAt ?? 0) > now;
    });
    if (scheduledNodes.length === 0) setSpeaking(false);
  };
  // Convert an Int16 PCM ArrayBuffer to an AudioBuffer and schedule it
  // immediately after whatever is already queued.
  const enqueue = (pcm) => {
    if (!audioContext) {
      // Context not built yet: build it, then retry this same chunk.
      // NOTE(review): several chunks arriving before the context resolves
      // each re-queue through their own .then() — ordering appears to rely
      // on promise-callback completion order; confirm it holds in practice.
      void ensureContext().then(() => enqueue(pcm));
      return;
    }
    if (!audioContext || !gainNode) return;
    const int16 = new Int16Array(pcm);
    if (int16.length === 0) return;
    const audioBuffer = audioContext.createBuffer(1, int16.length, sampleRate);
    const float32 = audioBuffer.getChannelData(0);
    for (let i = 0; i < int16.length; i++) {
      // Int16 -> Float32 in [-1, 1).
      float32[i] = int16[i] / 32768;
    }
    const node = audioContext.createBufferSource();
    node.buffer = audioBuffer;
    node.connect(gainNode);
    const now = audioContext.currentTime;
    const startAt = Math.max(now, nextStartTime);
    node.start(startAt);
    const duration = int16.length / sampleRate;
    // Stash the scheduled end on the node for pruneFinished.
    node._endsAt = startAt + duration;
    nextStartTime = startAt + duration;
    scheduledNodes.push(node);
    setSpeaking(true);
    node.onended = () => pruneFinished();
  };
  // Hard-stop all queued audio (used on barge-in): stop every source,
  // rebuild the gain node, reconnect analyser/destination, reset the cursor.
  const flush = () => {
    if (!audioContext || !gainNode) return;
    for (const node of scheduledNodes) {
      try {
        node.stop();
      } catch {
      }
    }
    scheduledNodes = [];
    gainNode.disconnect();
    gainNode = audioContext.createGain();
    if (analyser) {
      analyser.disconnect();
      gainNode.connect(analyser);
    }
    gainNode.connect(audioContext.destination);
    nextStartTime = audioContext.currentTime;
    setSpeaking(false);
  };
  // Flush, stop the meter timer, and close the AudioContext. A later
  // enqueue()/resume() rebuilds everything via ensureContext().
  const close = () => {
    flush();
    if (volumeTimer) {
      clearInterval(volumeTimer);
      volumeTimer = null;
    }
    if (audioContext && audioContext.state !== "closed") {
      void audioContext.close().catch(() => void 0);
    }
    audioContext = null;
    gainNode = null;
    analyser = null;
  };
  // Pre-build/resume the context — intended to be called from a user
  // gesture so browser autoplay policies allow output.
  const resume = async () => {
    await ensureContext();
  };
  return { enqueue, flush, close, resume };
};
277
+
278
// src/ReconnectingWebSocket.ts
var READYSTATE_OPEN = 1;
var READYSTATE_CLOSED = 3;
/**
 * Wrap a WebSocket (built by options.wsFactory) with bounded
 * exponential-backoff reconnection. Events are funneled to `onEvent`:
 * "open" (first connect), "reconnected", "message", "error", and "close"
 * with `permanent` telling the caller whether a retry is coming.
 * Returns { send, close, readyState }.
 */
var createReconnectingWebSocket = (options, onEvent) => {
  const maxRetries = options.maxRetries ?? 3;
  const initialBackoff = options.initialBackoffMs ?? 500;
  const maxBackoff = options.maxBackoffMs ?? 8e3;
  let socket = null;
  let closedByClient = false;
  let attempt = 0;
  let currentBackoff = initialBackoff;
  let pendingTimer = null;
  const connect = () => {
    socket = options.wsFactory(options.url);
    socket.binaryType = "arraybuffer";
    socket.onopen = () => {
      // attempt > 0 means this open followed at least one drop.
      onEvent(attempt === 0 ? { type: "open" } : { type: "reconnected" });
      attempt = 0;
      currentBackoff = initialBackoff;
    };
    socket.onmessage = (ev) => onEvent({ type: "message", data: ev.data });
    socket.onerror = () => onEvent({ type: "error", error: new Error("WebSocket error") });
    socket.onclose = (ev) => {
      socket = null;
      if (closedByClient || attempt >= maxRetries) {
        // Give up: either the client asked for this close or retries ran out.
        onEvent({ type: "close", code: ev.code, reason: ev.reason, permanent: true });
        return;
      }
      onEvent({ type: "close", code: ev.code, reason: ev.reason, permanent: false });
      attempt += 1;
      const delay = Math.min(currentBackoff, maxBackoff);
      currentBackoff = Math.min(currentBackoff * 2, maxBackoff);
      pendingTimer = setTimeout(connect, delay);
    };
  };
  connect();
  return {
    // Drop sends silently unless the socket is currently OPEN.
    send: (data) => {
      if (socket?.readyState === READYSTATE_OPEN) socket.send(data);
    },
    // Intentional shutdown: cancels any pending retry and closes the socket.
    close: (code = 1e3, reason = "client-requested") => {
      closedByClient = true;
      if (pendingTimer) {
        clearTimeout(pendingTimer);
        pendingTimer = null;
      }
      try {
        socket?.close(code, reason);
      } catch {
      }
    },
    readyState: () => socket?.readyState ?? READYSTATE_CLOSED
  };
};
348
+
349
// src/protocol.ts
/**
 * Fresh per-call protocol state: connection phase, transcript entries,
 * the id of the agent bubble currently receiving text deltas, a counter
 * for generating entry ids, and the server-reported end reason (if any).
 */
var createProtocolState = () => {
  return {
    state: "idle",
    transcript: [],
    agentBubbleId: null,
    idCounter: 0,
    endReason: null
  };
};
357
// Collapse the server's raw end-reason strings onto the SDK's public enum;
// anything unrecognized is reported as "error".
var mapEndReason = (raw) => {
  switch (raw) {
    case "agent_ended":
      return "agent_ended";
    case "caller_hung_up":
      return "user_hangup";
    case "silence_timeout":
    case "max_duration":
      return "timeout";
    default:
      return "error";
  }
};
363
/**
 * Reducer for server->client JSON control frames. Updates `state` (the
 * transcript array is replaced, never mutated in place) and notifies the
 * caller through `cb` (onState, onTranscript, onAgentTurnStart,
 * onInterrupt, onCallEnd, onError). Frames that fail to parse and message
 * types not listed below are silently ignored.
 */
function handleServerMessage(raw, state, cb) {
  let msg;
  try {
    msg = JSON.parse(raw);
  } catch {
    // Malformed frame: drop it rather than surface an error.
    return;
  }
  switch (msg.type) {
    case "connected":
      // Handshake complete; move to listening.
      setState(state, "listening", cb);
      return;
    case "transcript": {
      // STT partial/final of the user's speech; partials also flip the
      // state to user_speaking.
      const text = msg.text ?? "";
      if (!text) return;
      const isFinal = !!msg.isFinal;
      if (!isFinal) setState(state, "user_speaking", cb);
      upsertUserPartial(state, text, isFinal);
      cb.onTranscript(state.transcript);
      return;
    }
    case "agent_turn_start": {
      // Open an empty agent bubble that subsequent agent_text deltas
      // append into.
      const id = `m${state.idCounter++}`;
      state.agentBubbleId = id;
      state.transcript = [...state.transcript, { id, role: "agent", text: "" }];
      cb.onTranscript(state.transcript);
      cb.onAgentTurnStart();
      setState(state, "agent_speaking", cb);
      return;
    }
    case "agent_text": {
      // Streaming text delta for the currently open agent bubble; ignored
      // when no bubble is open.
      const delta = msg.text ?? "";
      if (!delta || !state.agentBubbleId) return;
      const id = state.agentBubbleId;
      state.transcript = state.transcript.map(
        (e) => e.id === id && e.role === "agent" ? { ...e, text: e.text + delta } : e
      );
      cb.onTranscript(state.transcript);
      return;
    }
    case "agent_turn_end":
      state.agentBubbleId = null;
      setState(state, "listening", cb);
      return;
    case "interrupt":
      // Barge-in: the caller is expected to flush queued playback.
      cb.onInterrupt();
      return;
    case "agent_turn_abort": {
      // Turn cut short: keep only the text the server says was actually
      // committed; drop the bubble entirely when nothing was.
      const committed = (msg.committedText ?? "").trim();
      if (state.agentBubbleId) {
        const id = state.agentBubbleId;
        if (committed) {
          state.transcript = state.transcript.map(
            (e) => e.id === id && e.role === "agent" ? { ...e, text: committed, interrupted: true } : e
          );
        } else {
          state.transcript = state.transcript.filter((e) => e.id !== id);
        }
        cb.onTranscript(state.transcript);
      }
      state.agentBubbleId = null;
      return;
    }
    case "tool_call":
      // Render the invocation as a tool-role transcript entry.
      state.transcript = [
        ...state.transcript,
        {
          id: `m${state.idCounter++}`,
          role: "tool",
          text: `\u2192 ${String(msg.tool ?? "?")}(${msg.args ? JSON.stringify(msg.args) : ""})`
        }
      ];
      cb.onTranscript(state.transcript);
      return;
    case "tool_result":
      // Check/cross mark depending on msg.ok.
      state.transcript = [
        ...state.transcript,
        {
          id: `m${state.idCounter++}`,
          role: "tool",
          text: `${msg.ok ? "\u2713" : "\u2717"} ${String(msg.tool ?? "?")}`
        }
      ];
      cb.onTranscript(state.transcript);
      return;
    case "call_end": {
      // Record the mapped end reason, append a system entry, then notify.
      const reasonRaw = String(msg.reason ?? "");
      const reason = mapEndReason(reasonRaw);
      state.endReason = reason;
      state.transcript = [
        ...state.transcript,
        {
          id: `m${state.idCounter++}`,
          role: "system",
          text: `call ended${reasonRaw ? ` (${reasonRaw})` : ""}`
        }
      ];
      cb.onTranscript(state.transcript);
      cb.onCallEnd(reason);
      return;
    }
    case "error": {
      const code = msg.code ?? "server_error";
      const message = msg.message ?? "server error";
      cb.onError({ code, message });
      return;
    }
  }
}
471
// Transition the protocol state, notifying cb.onState only when the value
// actually changes.
var setState = (state, next, cb) => {
  if (state.state !== next) {
    state.state = next;
    cb.onState(next);
  }
};
476
// Fold an STT partial/final into the transcript: update the newest
// uncommitted user bubble in place, or append a fresh one when none exists.
// The transcript array is replaced, never mutated.
var upsertUserPartial = (state, text, isFinal) => {
  const findPendingIndex = () => {
    for (let i = state.transcript.length - 1; i >= 0; i--) {
      const entry = state.transcript[i];
      if (entry.role === "user" && entry.committed === false) return i;
    }
    return -1;
  };
  const idx = findPendingIndex();
  if (idx === -1) {
    const fresh = { id: `m${state.idCounter++}`, role: "user", text, committed: isFinal };
    state.transcript = [...state.transcript, fresh];
    return;
  }
  const updated = { ...state.transcript[idx], text, committed: isFinal };
  state.transcript = state.transcript.map((e, i) => i === idx ? updated : e);
};
497
/**
 * Build the agent-call WebSocket URL from the configured HTTP(S) apiBase:
 * https -> wss, everything else -> ws. agentId and token are URI-encoded;
 * bargeIn === false appends `barge=off` (any other value leaves barge-in
 * at the server default). Note: any path on apiBase is intentionally not
 * carried over — only its host is used.
 */
function buildWsUrl(args) {
  const parsed = new URL(args.apiBase);
  const scheme = parsed.protocol === "https:" ? "wss:" : "ws:";
  const callPath = `/v1/agents/${encodeURIComponent(args.agentId)}/call`;
  let query = `token=${encodeURIComponent(args.token)}`;
  if (args.bargeIn === false) {
    query += "&barge=off";
  }
  return `${scheme}//${parsed.host}${callPath}?${query}`;
}
503
+
504
// src/VoiceClient.ts
/**
 * One active browser voice call: owns the reconnecting WebSocket, the mic
 * capture pipeline and the playback queue, and adapts server protocol
 * messages into the caller-facing option callbacks. Constructed by
 * BrowserVoiceFactory.startCall, which immediately awaits `start()`.
 */
var BrowserVoiceClient = class {
  constructor(args) {
    this.rws = null;
    this.capture = null;
    this.playback = null;
    this.muted = false;
    this.inputVolume = 0;
    this.outputVolume = 0;
    this.startedAt = null;
    this.endedFired = false;
    this.lastError = null;
    // Caller-initiated hangup.
    this.end = () => {
      this.teardown("user_hangup");
    };
    // Mute/unmute only gate chunk delivery (see AudioCapture's mute).
    this.mute = () => {
      if (this.muted) return;
      this.muted = true;
      this.capture?.mute(true);
    };
    this.unmute = () => {
      if (!this.muted) return;
      this.muted = false;
      this.capture?.mute(false);
    };
    // ---------------------------------------------------------------
    // Internal
    // ---------------------------------------------------------------
    // Local state setter: mirrors protocol.ts setState but reports through
    // options.onStateChange instead of a protocol cb object.
    this.setState = (next) => {
      if (this.proto.state === next) return;
      this.proto.state = next;
      this.args.options.onStateChange?.(next);
    };
    // Remember the last error (used to pick the end reason) and forward it.
    this.emitError = (err) => {
      this.lastError = err;
      this.args.options.onError?.(err);
    };
    // Fan-in for every ReconnectingWebSocket event.
    this.handleSocketEvent = (ev) => {
      switch (ev.type) {
        case "open":
          void this.startCapture();
          break;
        case "reconnected":
          // The local transcript is wiped on reconnect — presumably the
          // server treats a reconnect as a fresh call; confirm.
          this.proto.transcript = [];
          this.proto.agentBubbleId = null;
          this.args.options.onTranscript?.(this.proto.transcript);
          void this.startCapture();
          this.setState("listening");
          break;
        case "message":
          // Text frames carry protocol JSON; binary frames are audio for
          // the playback queue.
          if (typeof ev.data === "string") {
            handleServerMessage(ev.data, this.proto, {
              onState: this.setState,
              onTranscript: (entries) => this.args.options.onTranscript?.(entries),
              onError: this.emitError,
              onInterrupt: () => this.playback?.flush(),
              onAgentTurnStart: () => void 0,
              onCallEnd: (reason) => this.teardown(reason)
            });
          } else {
            this.playback?.enqueue(ev.data);
          }
          break;
        case "close":
          // Only a permanent close ends the call; transient drops are
          // handled by the socket's own retry loop.
          if (ev.permanent) {
            const reason = this.proto.endReason ?? (this.lastError ? "error" : "user_hangup");
            this.teardown(reason);
          }
          break;
        case "error":
          this.emitError({ code: "socket_error", message: ev.error.message });
          break;
      }
    };
    // Lazily build and start the mic pipeline. A start() failure is already
    // reported through onError (mapped to mic_denied/mic_start_failed), so
    // the rethrow is deliberately swallowed here.
    this.startCapture = async () => {
      if (this.capture?.isCapturing()) return;
      this.capture = createAudioCapture({
        onChunk: (pcm) => {
          this.rws?.send(pcm);
        },
        onVolume: (v) => {
          this.inputVolume = v;
          this.args.options.onVolume?.({ input: v, output: this.outputVolume });
        },
        onError: (err) => {
          this.emitError({
            code: err.name === "NotAllowedError" ? "mic_denied" : "mic_start_failed",
            message: err.message
          });
        }
      });
      if (this.muted) this.capture.mute(true);
      try {
        await this.capture.start();
      } catch {
      }
    };
    // Stop media, close playback and the socket, flip state to "ended" and
    // fire onEnd exactly once (guarded by endedFired).
    this.teardown = (reason) => {
      this.capture?.stop();
      this.capture = null;
      this.playback?.close();
      this.playback = null;
      try {
        this.rws?.close(1e3, reason);
      } catch {
      }
      this.rws = null;
      this.setState("ended");
      this.fireEndOnce(reason);
    };
    this.fireEndOnce = (reason) => {
      if (this.endedFired) return;
      this.endedFired = true;
      const startedAt = this.startedAt ?? Date.now();
      this.args.options.onEnd?.({
        reason,
        errorCode: reason === "error" ? this.lastError?.code : void 0,
        durationMs: Date.now() - startedAt
      });
    };
    this.args = args;
    this.proto = createProtocolState();
  }
  // ---------------------------------------------------------------
  // Call interface
  // ---------------------------------------------------------------
  get state() {
    return this.proto.state;
  }
  // Defensive copy so callers cannot mutate internal transcript state.
  get transcript() {
    return this.proto.transcript.slice();
  }
  get isMuted() {
    return this.muted;
  }
  // ---------------------------------------------------------------
  // Lifecycle — called by the factory immediately after construction.
  // Resolves once the WS is open and capture is starting; mid-call
  // failures arrive via `onError`.
  // ---------------------------------------------------------------
  async start() {
    this.setState("connecting");
    this.startedAt = Date.now();
    const url = buildWsUrl({
      apiBase: this.args.config.apiBase,
      agentId: this.args.options.agentId,
      token: this.args.token,
      bargeIn: this.args.options.bargeIn
    });
    this.playback = createAudioPlayback({
      onVolume: (v) => {
        this.outputVolume = v;
        this.args.options.onVolume?.({ input: this.inputVolume, output: v });
      }
    });
    try {
      // Prime the AudioContext while still inside the user gesture that
      // started the call; a failure is non-fatal because enqueue() rebuilds
      // the context on demand.
      await this.playback.resume();
    } catch {
    }
    this.rws = createReconnectingWebSocket(
      {
        url,
        wsFactory: this.args.wsFactory,
        maxRetries: 3
      },
      (ev) => this.handleSocketEvent(ev)
    );
  }
};
673
+
674
// src/browser.ts
// Default WebSocket factory: uses the environment's global WebSocket
// constructor (browsers; also present in recent Node versions).
var browserWsFactory = (url) => {
  return new globalThis.WebSocket(url);
};
676
/**
 * Factory returned by configureVoiceClient. Holds the normalized config
 * and exposes startCall, which mints (or accepts) a call token and then
 * constructs + starts a BrowserVoiceClient.
 */
var BrowserVoiceFactory = class {
  constructor(config) {
    // Resolve a token, build the client, and await its start(); resolves
    // with the live client. Throws when agentId is missing or fetchToken
    // yields an empty token.
    this.startCall = async (options) => {
      if (!options.agentId) {
        throw new Error("startCall: agentId is required");
      }
      const { context, metadata } = mergeStartCallContext(this.config, options);
      const fetchArgs = {
        agentId: options.agentId,
        userId: options.userId,
        context,
        metadata
      };
      let token;
      if (options.token) {
        // Caller supplied a pre-minted token; skip fetchToken entirely.
        token = options.token;
      } else {
        token = await this.config.fetchToken(fetchArgs);
        if (!token) {
          throw new Error("configureVoiceClient.fetchToken returned empty token");
        }
      }
      const client = new BrowserVoiceClient({
        config: this.config,
        // Carry merged context/metadata through to startCall so server can
        // see what the SDK saw.
        options: { ...options, context, metadata },
        token,
        wsFactory: browserWsFactory
      });
      await client.start();
      return client;
    };
    this.config = config;
  }
};
712
/**
 * Public entry point. Validates `config` (rejecting the removed `apiKey`
 * option, requiring `apiBase` and a `fetchToken` function) and returns a
 * factory whose `startCall(options)` opens a live voice call.
 */
function configureVoiceClient(config) {
  return new BrowserVoiceFactory(normalizeConfig(config));
}
715
// Annotate the CommonJS export names for ESM import in node:
// (Intentionally dead code — `0 && (...)` never executes; it exists solely
// so static analyzers can discover the named exports.)
0 && (module.exports = {
  buildWsUrl,
  configureVoiceClient,
  createAudioCapture,
  createAudioPlayback,
  createProtocolState,
  createReconnectingWebSocket,
  handleServerMessage
});
//# sourceMappingURL=browser.js.map