npm - @craftedxp/voice-js - Versions diffs - 0.3.2 → 0.4.0 - Mend

@craftedxp/voice-js 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

package/dist/browser.js CHANGED Viewed

@@ -1,24 +1,26 @@
-"use strict";
-var __defProp = Object.defineProperty;
-var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
-var __getOwnPropNames = Object.getOwnPropertyNames;
-var __hasOwnProp = Object.prototype.hasOwnProperty;
+'use strict'
+var __defProp = Object.defineProperty
+var __getOwnPropDesc = Object.getOwnPropertyDescriptor
+var __getOwnPropNames = Object.getOwnPropertyNames
+var __hasOwnProp = Object.prototype.hasOwnProperty
 var __export = (target, all) => {
-  for (var name in all)
-    __defProp(target, name, { get: all[name], enumerable: true });
-};
+  for (var name in all) __defProp(target, name, { get: all[name], enumerable: true })
+}
 var __copyProps = (to, from, except, desc) => {
-  if (from && typeof from === "object" || typeof from === "function") {
+  if ((from && typeof from === 'object') || typeof from === 'function') {
     for (let key of __getOwnPropNames(from))
       if (!__hasOwnProp.call(to, key) && key !== except)
-        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
+        __defProp(to, key, {
+          get: () => from[key],
+          enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable,
+        })
   }
-  return to;
-};
-var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
+  return to
+}
+var __toCommonJS = (mod) => __copyProps(__defProp({}, '__esModule', { value: true }), mod)
 // src/browser.ts
-var browser_exports = {};
+var browser_exports = {}
 __export(browser_exports, {
   buildWsUrl: () => buildWsUrl,
   configureVoiceClient: () => configureVoiceClient,
@@ -26,57 +28,64 @@ __export(browser_exports, {
   createAudioPlayback: () => createAudioPlayback,
   createProtocolState: () => createProtocolState,
   createReconnectingWebSocket: () => createReconnectingWebSocket,
-  handleServerMessage: () => handleServerMessage
-});
-module.exports = __toCommonJS(browser_exports);
+  handleServerMessage: () => handleServerMessage,
+})
+module.exports = __toCommonJS(browser_exports)
 // src/config.ts
 function normalizeConfig(config) {
-  if (!config) throw new Error("configureVoiceClient: config is required");
-  if ("apiKey" in config) {
+  if (!config) throw new Error('configureVoiceClient: config is required')
+  if ('apiKey' in config) {
     throw new Error(
-      "configureVoiceClient: `apiKey` is no longer supported. Embedding sk_ in JS code ships server-grade credentials to every client. Pass `fetchToken: async ({ agentId }) => { /* call YOUR backend mint */ }` instead \u2014 see the @craftedxp/voice-js README for the migration recipe."
-    );
+      'configureVoiceClient: `apiKey` is no longer supported. Embedding sk_ in JS code ships server-grade credentials to every client. Pass `fetchToken: async ({ agentId }) => { /* call YOUR backend mint */ }` instead \u2014 see the @craftedxp/voice-js README for the migration recipe.',
+    )
   }
   if (!config.apiBase) {
-    throw new Error("configureVoiceClient: apiBase is required");
+    throw new Error('configureVoiceClient: apiBase is required')
   }
-  if (typeof config.fetchToken !== "function") {
-    throw new Error("configureVoiceClient: fetchToken must be a function");
+  if (typeof config.fetchToken !== 'function') {
+    throw new Error('configureVoiceClient: fetchToken must be a function')
   }
   return {
     ...config,
-    apiBase: config.apiBase.replace(/\/+$/, "")
-  };
+    apiBase: config.apiBase.replace(/\/+$/, ''),
+  }
 }
 function mergeStartCallContext(factory, call) {
-  const context = factory.defaultContext || call.context ? { ...factory.defaultContext ?? {}, ...call.context ?? {} } : void 0;
-  const metadata = factory.defaultMetadata || call.metadata ? { ...factory.defaultMetadata ?? {}, ...call.metadata ?? {} } : void 0;
-  return { context, metadata };
+  const context =
+    factory.defaultContext || call.context
+      ? { ...(factory.defaultContext ?? {}), ...(call.context ?? {}) }
+      : void 0
+  const metadata =
+    factory.defaultMetadata || call.metadata
+      ? { ...(factory.defaultMetadata ?? {}), ...(call.metadata ?? {}) }
+      : void 0
+  return { context, metadata }
 }
 // src/worklets/mic-downsampler.worklet.js
-var mic_downsampler_worklet_default = "// AudioWorklet \u2014 runs off the main thread in the audio rendering graph.\n//\n// Input: Float32 samples at the AudioContext's native sampleRate (typically\n// 48000 Hz on desktop, 44100 Hz on some iOS devices).\n// Output: 16 kHz mono Int16 PCM, shipped to the main thread via\n// `port.postMessage(ArrayBuffer, [ArrayBuffer])` (transferred, not copied).\n//\n// Why AudioWorklet instead of ScriptProcessorNode: ScriptProcessorNode is\n// deprecated + main-thread-bound, so any JS jank produces audible audio\n// glitches. AudioWorklet's `process()` runs on the audio rendering thread\n// at the graph's block cadence (128 frames by default) and backpressures\n// via returning `true` / `false`.\n//\n// This file is loaded as text (see tsup.config.ts loader) and registered\n// at runtime via `audioWorklet.addModule(blobUrl)`.\n\nclass MicDownsampler extends AudioWorkletProcessor {\n  constructor() {\n    super()\n    // Target sample rate for STT. Matches Deepgram Nova-3 + the platform's\n    // server-side SAMPLE_RATE constant in AgentCallHandler.\n    this.targetRate = 16000\n    // Accumulator for the downsample. We collect incoming samples and emit\n    // an Int16 chunk when we've accumulated ~1024 target-rate samples\n    // (~64 ms at 16 kHz) \u2014 matches the mobile SDK's chunk size so both\n    // platforms have the same server-side framing.\n    this.outputFrames = 1024\n    this.acc = []\n    // Running index used for fractional resampling.\n    this.readCursor = 0\n  }\n\n  // `inputs[0][0]` = first channel of first input. 128 Float32 samples per\n  // call at the context's sampleRate. Return true = keep processing.\n  process(inputs) {\n    const input = inputs[0]\n    if (!input || input.length === 0) return true\n    const channel = input[0]\n    if (!channel || channel.length === 0) return true\n\n    const ctxRate = sampleRate // global inside AudioWorkletProcessor\n    const ratio = ctxRate / this.targetRate\n\n    // Simple linear-interp downsample. For 48000 \u2192 16000 that's 3:1, which\n    // linear handles fine for voice. Anti-alias filtering would be\n    // theoretically better but inaudible for speech.\n    for (let i = 0; i < channel.length; i++) {\n      this.acc.push(channel[i])\n    }\n\n    while (this.acc.length - this.readCursor >= ratio * this.outputFrames) {\n      const out = new Int16Array(this.outputFrames)\n      let readIdx = this.readCursor\n      for (let i = 0; i < this.outputFrames; i++) {\n        // Linear interp between floor(readIdx) and ceil(readIdx)\n        const low = Math.floor(readIdx)\n        const high = Math.min(low + 1, this.acc.length - 1)\n        const frac = readIdx - low\n        const sample = this.acc[low] * (1 - frac) + this.acc[high] * frac\n        // Clip + convert to int16\n        const clipped = Math.max(-1, Math.min(1, sample))\n        out[i] = clipped < 0 ? clipped * 0x8000 : clipped * 0x7fff\n        readIdx += ratio\n      }\n      // Transfer the ArrayBuffer (zero-copy) to the main thread.\n      this.port.postMessage(out.buffer, [out.buffer])\n      this.readCursor = readIdx\n    }\n\n    // Garbage-collect the consumed portion of `acc` every so often so it\n    // doesn't grow without bound. Leave ~one chunk of headroom.\n    if (this.readCursor > ratio * this.outputFrames) {\n      this.acc = this.acc.slice(Math.floor(this.readCursor))\n      this.readCursor -= Math.floor(this.readCursor)\n    }\n\n    return true\n  }\n}\n\nregisterProcessor('mic-downsampler', MicDownsampler)\n";
+var mic_downsampler_worklet_default =
+  "// AudioWorklet \u2014 runs off the main thread in the audio rendering graph.\n//\n// Input: Float32 samples at the AudioContext's native sampleRate (typically\n// 48000 Hz on desktop, 44100 Hz on some iOS devices).\n// Output: 16 kHz mono Int16 PCM, shipped to the main thread via\n// `port.postMessage(ArrayBuffer, [ArrayBuffer])` (transferred, not copied).\n//\n// Why AudioWorklet instead of ScriptProcessorNode: ScriptProcessorNode is\n// deprecated + main-thread-bound, so any JS jank produces audible audio\n// glitches. AudioWorklet's `process()` runs on the audio rendering thread\n// at the graph's block cadence (128 frames by default) and backpressures\n// via returning `true` / `false`.\n//\n// This file is loaded as text (see tsup.config.ts loader) and registered\n// at runtime via `audioWorklet.addModule(blobUrl)`.\n\nclass MicDownsampler extends AudioWorkletProcessor {\n  constructor() {\n    super()\n    // Target sample rate for STT. Matches Deepgram Nova-3 + the platform's\n    // server-side SAMPLE_RATE constant in AgentCallHandler.\n    this.targetRate = 16000\n    // Accumulator for the downsample. We collect incoming samples and emit\n    // an Int16 chunk when we've accumulated ~1024 target-rate samples\n    // (~64 ms at 16 kHz) \u2014 matches the mobile SDK's chunk size so both\n    // platforms have the same server-side framing.\n    this.outputFrames = 1024\n    this.acc = []\n    // Running index used for fractional resampling.\n    this.readCursor = 0\n  }\n\n  // `inputs[0][0]` = first channel of first input. 128 Float32 samples per\n  // call at the context's sampleRate. Return true = keep processing.\n  process(inputs) {\n    const input = inputs[0]\n    if (!input || input.length === 0) return true\n    const channel = input[0]\n    if (!channel || channel.length === 0) return true\n\n    const ctxRate = sampleRate // global inside AudioWorkletProcessor\n    const ratio = ctxRate / this.targetRate\n\n    // Simple linear-interp downsample. For 48000 \u2192 16000 that's 3:1, which\n    // linear handles fine for voice. Anti-alias filtering would be\n    // theoretically better but inaudible for speech.\n    for (let i = 0; i < channel.length; i++) {\n      this.acc.push(channel[i])\n    }\n\n    while (this.acc.length - this.readCursor >= ratio * this.outputFrames) {\n      const out = new Int16Array(this.outputFrames)\n      let readIdx = this.readCursor\n      for (let i = 0; i < this.outputFrames; i++) {\n        // Linear interp between floor(readIdx) and ceil(readIdx)\n        const low = Math.floor(readIdx)\n        const high = Math.min(low + 1, this.acc.length - 1)\n        const frac = readIdx - low\n        const sample = this.acc[low] * (1 - frac) + this.acc[high] * frac\n        // Clip + convert to int16\n        const clipped = Math.max(-1, Math.min(1, sample))\n        out[i] = clipped < 0 ? clipped * 0x8000 : clipped * 0x7fff\n        readIdx += ratio\n      }\n      // Transfer the ArrayBuffer (zero-copy) to the main thread.\n      this.port.postMessage(out.buffer, [out.buffer])\n      this.readCursor = readIdx\n    }\n\n    // Garbage-collect the consumed portion of `acc` every so often so it\n    // doesn't grow without bound. Leave ~one chunk of headroom.\n    if (this.readCursor > ratio * this.outputFrames) {\n      this.acc = this.acc.slice(Math.floor(this.readCursor))\n      this.readCursor -= Math.floor(this.readCursor)\n    }\n\n    return true\n  }\n}\n\nregisterProcessor('mic-downsampler', MicDownsampler)\n"
 // src/AudioCapture.ts
-var VOLUME_INTERVAL_MS = 100;
+var VOLUME_INTERVAL_MS = 100
 var createAudioCapture = (options) => {
-  let audioContext = null;
-  let mediaStream = null;
-  let sourceNode = null;
-  let workletNode = null;
-  let analyser = null;
-  let volumeTimer = null;
-  let muted = false;
-  let capturing = false;
+  let audioContext = null
+  let mediaStream = null
+  let sourceNode = null
+  let workletNode = null
+  let analyser = null
+  let volumeTimer = null
+  let muted = false
+  let capturing = false
   const computeRms = (buf) => {
-    let sum = 0;
-    for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i];
-    const rms = Math.sqrt(sum / buf.length);
-    return Math.min(1, rms * 1.8);
-  };
+    let sum = 0
+    for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i]
+    const rms = Math.sqrt(sum / buf.length)
+    return Math.min(1, rms * 1.8)
+  }
   const start = async () => {
-    if (capturing) return;
+    if (capturing) return
     try {
       mediaStream = await navigator.mediaDevices.getUserMedia({
         audio: {
@@ -87,657 +96,748 @@ var createAudioCapture = (options) => {
           echoCancellation: true,
           noiseSuppression: true,
           autoGainControl: true,
-          channelCount: 1
-        }
-      });
-      audioContext = new AudioContext();
-      if (audioContext.state === "suspended") await audioContext.resume();
-      const blob = new Blob([mic_downsampler_worklet_default], { type: "application/javascript" });
-      const url = URL.createObjectURL(blob);
+          channelCount: 1,
+        },
+      })
+      audioContext = new AudioContext()
+      if (audioContext.state === 'suspended') await audioContext.resume()
+      const blob = new Blob([mic_downsampler_worklet_default], { type: 'application/javascript' })
+      const url = URL.createObjectURL(blob)
       try {
-        await audioContext.audioWorklet.addModule(url);
+        await audioContext.audioWorklet.addModule(url)
       } finally {
-        URL.revokeObjectURL(url);
+        URL.revokeObjectURL(url)
       }
-      sourceNode = audioContext.createMediaStreamSource(mediaStream);
-      workletNode = new AudioWorkletNode(audioContext, "mic-downsampler");
+      sourceNode = audioContext.createMediaStreamSource(mediaStream)
+      workletNode = new AudioWorkletNode(audioContext, 'mic-downsampler')
       workletNode.port.onmessage = (event) => {
-        if (muted) return;
-        options.onChunk(event.data);
-      };
+        if (muted) return
+        options.onChunk(event.data)
+      }
       if (options.onVolume) {
-        analyser = audioContext.createAnalyser();
-        analyser.fftSize = 256;
-        sourceNode.connect(analyser);
-        const buf = new Float32Array(analyser.fftSize);
+        analyser = audioContext.createAnalyser()
+        analyser.fftSize = 256
+        sourceNode.connect(analyser)
+        const buf = new Float32Array(analyser.fftSize)
         volumeTimer = setInterval(() => {
-          if (!analyser) return;
-          analyser.getFloatTimeDomainData(buf);
-          options.onVolume?.(computeRms(buf));
-        }, VOLUME_INTERVAL_MS);
+          if (!analyser) return
+          analyser.getFloatTimeDomainData(buf)
+          options.onVolume?.(computeRms(buf))
+        }, VOLUME_INTERVAL_MS)
       }
-      sourceNode.connect(workletNode);
-      const sink = audioContext.createGain();
-      sink.gain.value = 0;
-      workletNode.connect(sink).connect(audioContext.destination);
-      capturing = true;
+      sourceNode.connect(workletNode)
+      const sink = audioContext.createGain()
+      sink.gain.value = 0
+      workletNode.connect(sink).connect(audioContext.destination)
+      capturing = true
     } catch (err) {
-      const wrapped = err instanceof Error ? err : new Error(typeof err === "string" ? err : "capture failed");
-      options.onError?.(wrapped);
-      throw wrapped;
+      const wrapped =
+        err instanceof Error ? err : new Error(typeof err === 'string' ? err : 'capture failed')
+      options.onError?.(wrapped)
+      throw wrapped
     }
-  };
+  }
   const stop = () => {
-    if (!capturing) return;
-    capturing = false;
+    if (!capturing) return
+    capturing = false
     if (volumeTimer) {
-      clearInterval(volumeTimer);
-      volumeTimer = null;
+      clearInterval(volumeTimer)
+      volumeTimer = null
     }
     try {
-      workletNode?.disconnect();
-      analyser?.disconnect();
-      sourceNode?.disconnect();
-    } catch {
-    }
-    workletNode = null;
-    analyser = null;
-    sourceNode = null;
+      workletNode?.disconnect()
+      analyser?.disconnect()
+      sourceNode?.disconnect()
+    } catch {}
+    workletNode = null
+    analyser = null
+    sourceNode = null
     if (mediaStream) {
-      for (const track of mediaStream.getTracks()) track.stop();
-      mediaStream = null;
+      for (const track of mediaStream.getTracks()) track.stop()
+      mediaStream = null
     }
-    if (audioContext && audioContext.state !== "closed") {
-      void audioContext.close().catch(() => void 0);
+    if (audioContext && audioContext.state !== 'closed') {
+      void audioContext.close().catch(() => void 0)
     }
-    audioContext = null;
-  };
+    audioContext = null
+  }
   return {
     start,
     stop,
     mute: (v) => {
-      muted = v;
+      muted = v
     },
-    isCapturing: () => capturing
-  };
-};
+    isCapturing: () => capturing,
+  }
+}
 // src/AudioPlayback.ts
-var DEFAULT_SAMPLE_RATE = 16e3;
-var VOLUME_INTERVAL_MS2 = 100;
+var DEFAULT_SAMPLE_RATE = 16e3
+var VOLUME_INTERVAL_MS2 = 100
 var createAudioPlayback = (options = {}) => {
-  const sampleRate = options.sampleRate ?? DEFAULT_SAMPLE_RATE;
-  let audioContext = null;
-  let gainNode = null;
-  let analyser = null;
-  let volumeTimer = null;
-  let nextStartTime = 0;
-  let scheduledNodes = [];
-  let speaking = false;
+  const sampleRate = options.sampleRate ?? DEFAULT_SAMPLE_RATE
+  let audioContext = null
+  let gainNode = null
+  let analyser = null
+  let volumeTimer = null
+  let nextStartTime = 0
+  let scheduledNodes = []
+  let speaking = false
   const ensureContext = async () => {
     if (audioContext) {
-      if (audioContext.state === "suspended") await audioContext.resume();
-      return;
+      if (audioContext.state === 'suspended') await audioContext.resume()
+      return
     }
-    audioContext = new AudioContext({ sampleRate });
-    gainNode = audioContext.createGain();
+    audioContext = new AudioContext({ sampleRate })
+    gainNode = audioContext.createGain()
     if (options.onVolume) {
-      analyser = audioContext.createAnalyser();
-      analyser.fftSize = 256;
-      gainNode.connect(analyser);
-      const buf = new Float32Array(analyser.fftSize);
+      analyser = audioContext.createAnalyser()
+      analyser.fftSize = 256
+      gainNode.connect(analyser)
+      const buf = new Float32Array(analyser.fftSize)
       volumeTimer = setInterval(() => {
-        if (!analyser) return;
-        analyser.getFloatTimeDomainData(buf);
-        let sum = 0;
-        for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i];
-        const rms = Math.sqrt(sum / buf.length);
-        options.onVolume?.(Math.min(1, rms * 1.8));
-      }, VOLUME_INTERVAL_MS2);
-    }
-    gainNode.connect(audioContext.destination);
-    nextStartTime = audioContext.currentTime;
-  };
+        if (!analyser) return
+        analyser.getFloatTimeDomainData(buf)
+        let sum = 0
+        for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i]
+        const rms = Math.sqrt(sum / buf.length)
+        options.onVolume?.(Math.min(1, rms * 1.8))
+      }, VOLUME_INTERVAL_MS2)
+    }
+    gainNode.connect(audioContext.destination)
+    nextStartTime = audioContext.currentTime
+  }
   const setSpeaking = (v) => {
-    if (v === speaking) return;
-    speaking = v;
-    options.onSpeakingChange?.(v);
-  };
+    if (v === speaking) return
+    speaking = v
+    options.onSpeakingChange?.(v)
+  }
   const pruneFinished = () => {
-    const now = audioContext?.currentTime ?? 0;
+    const now = audioContext?.currentTime ?? 0
     scheduledNodes = scheduledNodes.filter((n) => {
-      const node = n;
-      return (node._endsAt ?? 0) > now;
-    });
-    if (scheduledNodes.length === 0) setSpeaking(false);
-  };
+      const node = n
+      return (node._endsAt ?? 0) > now
+    })
+    if (scheduledNodes.length === 0) setSpeaking(false)
+  }
   const enqueue = (pcm) => {
     if (!audioContext) {
-      void ensureContext().then(() => enqueue(pcm));
-      return;
-    }
-    if (!audioContext || !gainNode) return;
-    const int16 = new Int16Array(pcm);
-    if (int16.length === 0) return;
-    const audioBuffer = audioContext.createBuffer(1, int16.length, sampleRate);
-    const float32 = audioBuffer.getChannelData(0);
+      void ensureContext().then(() => enqueue(pcm))
+      return
+    }
+    if (!audioContext || !gainNode) return
+    const int16 = new Int16Array(pcm)
+    if (int16.length === 0) return
+    const audioBuffer = audioContext.createBuffer(1, int16.length, sampleRate)
+    const float32 = audioBuffer.getChannelData(0)
     for (let i = 0; i < int16.length; i++) {
-      float32[i] = int16[i] / 32768;
-    }
-    const node = audioContext.createBufferSource();
-    node.buffer = audioBuffer;
-    node.connect(gainNode);
-    const now = audioContext.currentTime;
-    const startAt = Math.max(now, nextStartTime);
-    node.start(startAt);
-    const duration = int16.length / sampleRate;
-    node._endsAt = startAt + duration;
-    nextStartTime = startAt + duration;
-    scheduledNodes.push(node);
-    setSpeaking(true);
-    node.onended = () => pruneFinished();
-  };
+      float32[i] = int16[i] / 32768
+    }
+    const node = audioContext.createBufferSource()
+    node.buffer = audioBuffer
+    node.connect(gainNode)
+    const now = audioContext.currentTime
+    const startAt = Math.max(now, nextStartTime)
+    node.start(startAt)
+    const duration = int16.length / sampleRate
+    node._endsAt = startAt + duration
+    nextStartTime = startAt + duration
+    scheduledNodes.push(node)
+    setSpeaking(true)
+    node.onended = () => pruneFinished()
+  }
   const flush = () => {
-    if (!audioContext || !gainNode) return;
+    if (!audioContext || !gainNode) return
     for (const node of scheduledNodes) {
       try {
-        node.stop();
-      } catch {
-      }
+        node.stop()
+      } catch {}
     }
-    scheduledNodes = [];
-    gainNode.disconnect();
-    gainNode = audioContext.createGain();
+    scheduledNodes = []
+    gainNode.disconnect()
+    gainNode = audioContext.createGain()
     if (analyser) {
-      analyser.disconnect();
-      gainNode.connect(analyser);
+      analyser.disconnect()
+      gainNode.connect(analyser)
     }
-    gainNode.connect(audioContext.destination);
-    nextStartTime = audioContext.currentTime;
-    setSpeaking(false);
-  };
+    gainNode.connect(audioContext.destination)
+    nextStartTime = audioContext.currentTime
+    setSpeaking(false)
+  }
   const close = () => {
-    flush();
+    flush()
     if (volumeTimer) {
-      clearInterval(volumeTimer);
-      volumeTimer = null;
+      clearInterval(volumeTimer)
+      volumeTimer = null
     }
-    if (audioContext && audioContext.state !== "closed") {
-      void audioContext.close().catch(() => void 0);
+    if (audioContext && audioContext.state !== 'closed') {
+      void audioContext.close().catch(() => void 0)
     }
-    audioContext = null;
-    gainNode = null;
-    analyser = null;
-  };
+    audioContext = null
+    gainNode = null
+    analyser = null
+  }
   const resume = async () => {
-    await ensureContext();
-  };
-  return { enqueue, flush, close, resume };
-};
+    await ensureContext()
+  }
+  return { enqueue, flush, close, resume }
+}
 // src/ReconnectingWebSocket.ts
-var READYSTATE_OPEN = 1;
-var READYSTATE_CLOSED = 3;
+var READYSTATE_OPEN = 1
+var READYSTATE_CLOSED = 3
 var createReconnectingWebSocket = (options, onEvent) => {
-  const maxRetries = options.maxRetries ?? 3;
-  const initialBackoff = options.initialBackoffMs ?? 500;
-  const maxBackoff = options.maxBackoffMs ?? 8e3;
-  let ws = null;
-  let intentionalClose = false;
-  let retries = 0;
-  let backoff = initialBackoff;
-  let reconnectTimer = null;
+  const maxRetries = options.maxRetries ?? 3
+  const initialBackoff = options.initialBackoffMs ?? 500
+  const maxBackoff = options.maxBackoffMs ?? 8e3
+  let ws = null
+  let intentionalClose = false
+  let retries = 0
+  let backoff = initialBackoff
+  let reconnectTimer = null
   const openOnce = () => {
-    ws = options.wsFactory(options.url);
-    ws.binaryType = "arraybuffer";
+    ws = options.wsFactory(options.url)
+    ws.binaryType = 'arraybuffer'
     ws.onopen = () => {
-      if (retries === 0) onEvent({ type: "open" });
-      else onEvent({ type: "reconnected" });
-      retries = 0;
-      backoff = initialBackoff;
-    };
+      if (retries === 0) onEvent({ type: 'open' })
+      else onEvent({ type: 'reconnected' })
+      retries = 0
+      backoff = initialBackoff
+    }
     ws.onmessage = (ev) => {
-      onEvent({ type: "message", data: ev.data });
-    };
+      onEvent({ type: 'message', data: ev.data })
+    }
     ws.onerror = () => {
-      onEvent({ type: "error", error: new Error("WebSocket error") });
-    };
+      onEvent({ type: 'error', error: new Error('WebSocket error') })
+    }
     ws.onclose = (ev) => {
-      ws = null;
-      const shouldRetry = !intentionalClose && retries < maxRetries;
+      ws = null
+      const shouldRetry = !intentionalClose && retries < maxRetries
       if (!shouldRetry) {
         onEvent({
-          type: "close",
+          type: 'close',
           code: ev.code,
           reason: ev.reason,
-          permanent: true
-        });
-        return;
+          permanent: true,
+        })
+        return
       }
       onEvent({
-        type: "close",
+        type: 'close',
         code: ev.code,
         reason: ev.reason,
-        permanent: false
-      });
-      retries++;
-      const delay = Math.min(backoff, maxBackoff);
-      backoff = Math.min(backoff * 2, maxBackoff);
-      reconnectTimer = setTimeout(openOnce, delay);
-    };
-  };
-  openOnce();
+        permanent: false,
+      })
+      retries++
+      const delay = Math.min(backoff, maxBackoff)
+      backoff = Math.min(backoff * 2, maxBackoff)
+      reconnectTimer = setTimeout(openOnce, delay)
+    }
+  }
+  openOnce()
   return {
     send: (data) => {
-      if (ws && ws.readyState === READYSTATE_OPEN) ws.send(data);
+      if (ws && ws.readyState === READYSTATE_OPEN) ws.send(data)
     },
-    close: (code = 1e3, reason = "client-requested") => {
-      intentionalClose = true;
+    close: (code = 1e3, reason = 'client-requested') => {
+      intentionalClose = true
       if (reconnectTimer) {
-        clearTimeout(reconnectTimer);
-        reconnectTimer = null;
+        clearTimeout(reconnectTimer)
+        reconnectTimer = null
       }
       try {
-        ws?.close(code, reason);
-      } catch {
-      }
+        ws?.close(code, reason)
+      } catch {}
     },
-    readyState: () => ws?.readyState ?? READYSTATE_CLOSED
-  };
-};
+    readyState: () => ws?.readyState ?? READYSTATE_CLOSED,
+  }
+}
 // src/protocol.ts
 var createProtocolState = () => ({
-  state: "idle",
+  state: 'idle',
   transcript: [],
   agentBubbleId: null,
   idCounter: 0,
-  endReason: null
-});
+  endReason: null,
+})
 var mapEndReason = (raw) => {
-  if (raw === "agent_ended") return "agent_ended";
-  if (raw === "caller_hung_up") return "user_hangup";
-  if (raw === "silence_timeout" || raw === "max_duration") return "timeout";
-  return "error";
-};
+  if (raw === 'agent_ended') return 'agent_ended'
+  if (raw === 'caller_hung_up') return 'user_hangup'
+  if (raw === 'silence_timeout' || raw === 'max_duration') return 'timeout'
+  return 'error'
+}
 function handleServerMessage(raw, state, cb) {
-  let msg;
+  let msg
   try {
-    msg = JSON.parse(raw);
+    msg = JSON.parse(raw)
   } catch {
-    return;
+    return
   }
   switch (msg.type) {
-    case "connected":
-      cb.onConnected();
-      setState(state, "listening", cb);
-      return;
-    case "transcript": {
-      const text = msg.text ?? "";
-      if (!text) return;
-      const isFinal = !!msg.isFinal;
-      if (!isFinal) setState(state, "user_speaking", cb);
-      upsertUserPartial(state, text, isFinal);
-      cb.onTranscript(state.transcript);
-      return;
-    }
-    case "agent_turn_start": {
-      const id = `m${state.idCounter++}`;
-      state.agentBubbleId = id;
-      state.transcript = [...state.transcript, { id, role: "agent", text: "" }];
-      cb.onTranscript(state.transcript);
-      cb.onAgentTurnStart();
-      setState(state, "agent_speaking", cb);
-      return;
-    }
-    case "agent_text": {
-      const delta = msg.text ?? "";
-      if (!delta || !state.agentBubbleId) return;
-      const id = state.agentBubbleId;
-      state.transcript = state.transcript.map(
-        (e) => e.id === id && e.role === "agent" ? { ...e, text: e.text + delta } : e
-      );
-      cb.onTranscript(state.transcript);
-      return;
-    }
-    case "agent_turn_end":
-      state.agentBubbleId = null;
-      setState(state, "listening", cb);
-      return;
-    case "interrupt":
-      cb.onInterrupt();
-      return;
-    case "agent_turn_abort": {
-      const committed = (msg.committedText ?? "").trim();
+    case 'connected':
+      cb.onConnected()
+      setState(state, 'listening', cb)
+      return
+    case 'transcript': {
+      const text = msg.text ?? ''
+      if (!text) return
+      const isFinal = !!msg.isFinal
+      if (!isFinal) setState(state, 'user_speaking', cb)
+      upsertUserPartial(state, text, isFinal)
+      cb.onTranscript(state.transcript)
+      return
+    }
+    case 'agent_turn_start': {
+      const id = `m${state.idCounter++}`
+      state.agentBubbleId = id
+      state.transcript = [...state.transcript, { id, role: 'agent', text: '' }]
+      cb.onTranscript(state.transcript)
+      const seq = typeof msg.seq === 'number' ? msg.seq : void 0
+      cb.onAgentTurnStart(seq)
+      setState(state, 'agent_speaking', cb)
+      return
+    }
+    case 'agent_text': {
+      const delta = msg.text ?? ''
+      if (!delta || !state.agentBubbleId) return
+      const id = state.agentBubbleId
+      state.transcript = state.transcript.map((e) =>
+        e.id === id && e.role === 'agent' ? { ...e, text: e.text + delta } : e,
+      )
+      cb.onTranscript(state.transcript)
+      return
+    }
+    case 'agent_turn_end': {
+      state.agentBubbleId = null
+      const seq = typeof msg.seq === 'number' ? msg.seq : void 0
+      cb.onAgentTurnEnd(seq)
+      setState(state, 'listening', cb)
+      return
+    }
+    case 'interrupt':
+      cb.onInterrupt()
+      return
+    case 'agent_turn_abort': {
+      const committed = (msg.committedText ?? '').trim()
       if (state.agentBubbleId) {
-        const id = state.agentBubbleId;
+        const id = state.agentBubbleId
         if (committed) {
-          state.transcript = state.transcript.map(
-            (e) => e.id === id && e.role === "agent" ? { ...e, text: committed, interrupted: true } : e
-          );
+          state.transcript = state.transcript.map((e) =>
+            e.id === id && e.role === 'agent' ? { ...e, text: committed, interrupted: true } : e,
+          )
         } else {
-          state.transcript = state.transcript.filter((e) => e.id !== id);
+          state.transcript = state.transcript.filter((e) => e.id !== id)
         }
-        cb.onTranscript(state.transcript);
+        cb.onTranscript(state.transcript)
       }
-      state.agentBubbleId = null;
-      return;
+      state.agentBubbleId = null
+      return
     }
-    case "tool_call":
+    case 'tool_call':
       state.transcript = [
         ...state.transcript,
         {
           id: `m${state.idCounter++}`,
-          role: "tool",
-          text: `\u2192 ${String(msg.tool ?? "?")}(${msg.args ? JSON.stringify(msg.args) : ""})`
-        }
-      ];
-      cb.onTranscript(state.transcript);
-      return;
-    case "tool_result":
+          role: 'tool',
+          text: `\u2192 ${String(msg.tool ?? '?')}(${msg.args ? JSON.stringify(msg.args) : ''})`,
+        },
+      ]
+      cb.onTranscript(state.transcript)
+      return
+    case 'tool_result':
       state.transcript = [
         ...state.transcript,
         {
           id: `m${state.idCounter++}`,
-          role: "tool",
-          text: `${msg.ok ? "\u2713" : "\u2717"} ${String(msg.tool ?? "?")}`
-        }
-      ];
-      cb.onTranscript(state.transcript);
-      return;
-    case "client_tool_call": {
-      const toolCallId = String(msg.toolCallId ?? "");
-      const name = String(msg.name ?? "");
-      const args = msg.args ?? {};
-      if (!toolCallId || !name) return;
-      cb.onClientToolCall({ toolCallId, name, args });
-      return;
-    }
-    case "call_end": {
-      const reasonRaw = String(msg.reason ?? "");
-      const reason = mapEndReason(reasonRaw);
-      state.endReason = reason;
+          role: 'tool',
+          text: `${msg.ok ? '\u2713' : '\u2717'} ${String(msg.tool ?? '?')}`,
+        },
+      ]
+      cb.onTranscript(state.transcript)
+      return
+    case 'client_tool_call': {
+      const toolCallId = String(msg.toolCallId ?? '')
+      const name = String(msg.name ?? '')
+      const args = msg.args ?? {}
+      if (!toolCallId || !name) return
+      cb.onClientToolCall({ toolCallId, name, args })
+      return
+    }
+    case 'call_end': {
+      const reasonRaw = String(msg.reason ?? '')
+      const reason = mapEndReason(reasonRaw)
+      state.endReason = reason
       state.transcript = [
         ...state.transcript,
         {
           id: `m${state.idCounter++}`,
-          role: "system",
-          text: `call ended${reasonRaw ? ` (${reasonRaw})` : ""}`
-        }
-      ];
-      cb.onTranscript(state.transcript);
-      cb.onCallEnd(reason);
-      return;
+          role: 'system',
+          text: `call ended${reasonRaw ? ` (${reasonRaw})` : ''}`,
+        },
+      ]
+      cb.onTranscript(state.transcript)
+      cb.onCallEnd(reason)
+      return
     }
-    case "error": {
-      const code = msg.code ?? "server_error";
-      const message = msg.message ?? "server error";
-      cb.onError({ code, message });
-      return;
+    case 'error': {
+      const code = msg.code ?? 'server_error'
+      const message = msg.message ?? 'server error'
+      cb.onError({ code, message })
+      return
     }
   }
 }
 var setState = (state, next, cb) => {
-  if (state.state === next) return;
-  cb.onState(next);
-};
+  if (state.state === next) return
+  cb.onState(next)
+}
 var upsertUserPartial = (state, text, isFinal) => {
-  let idx = -1;
+  let idx = -1
   for (let i = state.transcript.length - 1; i >= 0; i--) {
-    const e = state.transcript[i];
-    if (e.role === "user" && e.committed === false) {
-      idx = i;
-      break;
+    const e = state.transcript[i]
+    if (e.role === 'user' && e.committed === false) {
+      idx = i
+      break
     }
   }
   if (idx === -1) {
     state.transcript = [
       ...state.transcript,
-      { id: `m${state.idCounter++}`, role: "user", text, committed: isFinal }
-    ];
-    return;
-  }
-  const target = state.transcript[idx];
-  const next = [...state.transcript];
-  next[idx] = { ...target, text, committed: isFinal };
-  state.transcript = next;
-};
+      { id: `m${state.idCounter++}`, role: 'user', text, committed: isFinal },
+    ]
+    return
+  }
+  const target = state.transcript[idx]
+  const next = [...state.transcript]
+  next[idx] = { ...target, text, committed: isFinal }
+  state.transcript = next
+}
 function buildWsUrl(args) {
-  const base = new URL(args.apiBase);
-  const proto = base.protocol === "https:" ? "wss:" : "ws:";
-  const bargeQS = args.bargeIn === false ? "&barge=off" : "";
-  return `${proto}//${base.host}/v1/agents/${encodeURIComponent(args.agentId)}/call?token=${encodeURIComponent(args.token)}${bargeQS}`;
+  const base = new URL(args.apiBase)
+  const proto = base.protocol === 'https:' ? 'wss:' : 'ws:'
+  const bargeQS = args.bargeIn === false ? '&barge=off' : ''
+  return `${proto}//${base.host}/v1/agents/${encodeURIComponent(args.agentId)}/call?token=${encodeURIComponent(args.token)}${bargeQS}`
 }
 // src/clientTools.ts
-var NAME_RE = /^[a-zA-Z_][a-zA-Z0-9_]*$/;
-var MAX_TOOLS = 64;
-var MAX_USAGE = 500;
-var MAX_TIMEOUT_MS = 3e4;
+var NAME_RE = /^[a-zA-Z_][a-zA-Z0-9_]*$/
+var MAX_TOOLS = 64
+var MAX_USAGE = 500
+var MAX_TIMEOUT_MS = 3e4
 var validateClientToolMap = (tools) => {
-  if (tools === void 0) return;
-  if (typeof tools !== "object" || tools === null || Array.isArray(tools)) {
-    throw new Error("clientTools must be an object keyed by tool name");
+  if (tools === void 0) return
+  if (typeof tools !== 'object' || tools === null || Array.isArray(tools)) {
+    throw new Error('clientTools must be an object keyed by tool name')
   }
-  const entries = Object.entries(tools);
+  const entries = Object.entries(tools)
   if (entries.length > MAX_TOOLS) {
-    throw new Error(`clientTools may declare at most 64 tools (got ${entries.length})`);
+    throw new Error(`clientTools may declare at most 64 tools (got ${entries.length})`)
   }
   for (const [name, def] of entries) {
     if (!NAME_RE.test(name)) {
       throw new Error(
-        `clientTools["${name}"]: name must be a valid identifier (^[a-zA-Z_][a-zA-Z0-9_]*$)`
-      );
+        `clientTools["${name}"]: name must be a valid identifier (^[a-zA-Z_][a-zA-Z0-9_]*$)`,
+      )
     }
-    if (!def || typeof def !== "object") {
-      throw new Error(`clientTools["${name}"]: must be an object`);
+    if (!def || typeof def !== 'object') {
+      throw new Error(`clientTools["${name}"]: must be an object`)
     }
-    if (typeof def.description !== "string" || def.description.length === 0) {
-      throw new Error(`clientTools["${name}"]: must have a description`);
+    if (typeof def.description !== 'string' || def.description.length === 0) {
+      throw new Error(`clientTools["${name}"]: must have a description`)
     }
-    if (typeof def.handler !== "function") {
-      throw new Error(`clientTools["${name}"]: must have a handler function`);
+    if (typeof def.handler !== 'function') {
+      throw new Error(`clientTools["${name}"]: must have a handler function`)
     }
     if (def.usage !== void 0 && def.usage.length > MAX_USAGE) {
-      throw new Error(`clientTools["${name}"]: usage must be \u2264500 chars`);
+      throw new Error(`clientTools["${name}"]: usage must be \u2264500 chars`)
     }
-    if (def.timeoutMs !== void 0 && (!Number.isFinite(def.timeoutMs) || def.timeoutMs <= 0 || def.timeoutMs > MAX_TIMEOUT_MS)) {
-      throw new Error(`clientTools["${name}"]: timeoutMs must be in (0, 30000]`);
+    if (
+      def.timeoutMs !== void 0 &&
+      (!Number.isFinite(def.timeoutMs) || def.timeoutMs <= 0 || def.timeoutMs > MAX_TIMEOUT_MS)
+    ) {
+      throw new Error(`clientTools["${name}"]: timeoutMs must be in (0, 30000]`)
     }
   }
-};
+}
 var buildRegisterFrame = (tools) => ({
-  type: "client_tools_register",
+  type: 'client_tools_register',
   tools: Object.entries(tools).map(([name, def]) => ({
     name,
     description: def.description,
     parameters: def.parameters,
-    ...def.usage !== void 0 ? { usage: def.usage } : {},
-    ...def.timeoutMs !== void 0 ? { timeoutMs: def.timeoutMs } : {}
-  }))
-});
+    ...(def.usage !== void 0 ? { usage: def.usage } : {}),
+    ...(def.timeoutMs !== void 0 ? { timeoutMs: def.timeoutMs } : {}),
+  })),
+})
 var dispatchClientToolCall = (send, tools, frame) => {
   const safeSend = (payload) => {
     try {
-      send(payload);
-    } catch {
-    }
-  };
-  const tool = tools[frame.name];
+      send(payload)
+    } catch {}
+  }
+  const tool = tools[frame.name]
   if (!tool) {
     safeSend({
-      type: "client_tool_result",
+      type: 'client_tool_result',
       toolCallId: frame.toolCallId,
-      error: `No handler for ${frame.name}`
-    });
-    return;
+      error: `No handler for ${frame.name}`,
+    })
+    return
   }
   void (async () => {
     try {
-      const out = await tool.handler(frame.args);
+      const out = await tool.handler(frame.args)
       safeSend({
-        type: "client_tool_result",
+        type: 'client_tool_result',
         toolCallId: frame.toolCallId,
-        result: typeof out === "string" ? out : JSON.stringify(out)
-      });
+        result: typeof out === 'string' ? out : JSON.stringify(out),
+      })
     } catch (err) {
       safeSend({
-        type: "client_tool_result",
+        type: 'client_tool_result',
         toolCallId: frame.toolCallId,
-        error: err instanceof Error ? err.message : String(err)
-      });
+        error: err instanceof Error ? err.message : String(err),
+      })
     }
-  })();
-};
+  })()
+}
+// src/ClientMarksBuffer.ts
+var createClientMarksBuffer = (args) => {
+  const now = args.now ?? (() => performance.now())
+  let pendingFirstOutboundAt = null
+  const inFlight = /* @__PURE__ */ new Map()
+  const tryEmit = (seq) => {
+    const slot = inFlight.get(seq)
+    if (!slot) return
+    if (!slot.ended) return
+    const marks = {}
+    if (slot.firstOutboundAt !== null && slot.firstAudibleAt !== null) {
+      marks.client_mic_to_first_audible_ms = slot.firstAudibleAt - slot.firstOutboundAt
+    }
+    args.send({
+      type: 'client_marks',
+      seq,
+      marks,
+      clientNow: Date.now(),
+    })
+    inFlight.delete(seq)
+  }
+  const markFirstOutboundAudio = () => {
+    if (pendingFirstOutboundAt !== null) return
+    pendingFirstOutboundAt = now()
+  }
+  const markFirstAudibleOutput = () => {
+    let target
+    for (const slot of inFlight.values()) {
+      if (!slot.ended) {
+        target = slot
+      }
+    }
+    if (!target) return
+    if (target.firstAudibleAt !== null) return
+    target.firstAudibleAt = now()
+  }
+  const onAgentTurnStart = (seq) => {
+    inFlight.set(seq, {
+      firstOutboundAt: pendingFirstOutboundAt,
+      firstAudibleAt: null,
+      ended: false,
+    })
+    pendingFirstOutboundAt = null
+  }
+  const onAgentTurnEnd = (seq) => {
+    const slot = inFlight.get(seq)
+    if (!slot) {
+      args.send({ type: 'client_marks', seq, marks: {}, clientNow: Date.now() })
+      return
+    }
+    slot.ended = true
+    tryEmit(seq)
+  }
+  const flush = () => {
+    for (const seq of [...inFlight.keys()]) {
+      const slot = inFlight.get(seq)
+      slot.ended = true
+      tryEmit(seq)
+    }
+    pendingFirstOutboundAt = null
+  }
+  return {
+    markFirstOutboundAudio,
+    markFirstAudibleOutput,
+    onAgentTurnStart,
+    onAgentTurnEnd,
+    flush,
+  }
+}
 // src/VoiceClient.ts
 var BrowserVoiceClient = class {
   constructor(args) {
-    this.rws = null;
-    this.capture = null;
-    this.playback = null;
-    this.muted = false;
-    this.inputVolume = 0;
-    this.outputVolume = 0;
-    this.startedAt = null;
-    this.endedFired = false;
-    this.lastError = null;
+    this.rws = null
+    this.capture = null
+    this.playback = null
+    this.muted = false
+    this.inputVolume = 0
+    this.outputVolume = 0
+    this.startedAt = null
+    this.endedFired = false
+    this.lastError = null
     this.end = () => {
-      this.teardown("user_hangup");
-    };
+      this.teardown('user_hangup')
+    }
     this.mute = () => {
-      if (this.muted) return;
-      this.muted = true;
-      this.capture?.mute(true);
-    };
+      if (this.muted) return
+      this.muted = true
+      this.capture?.mute(true)
+    }
     this.unmute = () => {
-      if (!this.muted) return;
-      this.muted = false;
-      this.capture?.mute(false);
-    };
+      if (!this.muted) return
+      this.muted = false
+      this.capture?.mute(false)
+    }
     // ---------------------------------------------------------------
     // Internal
     // ---------------------------------------------------------------
     this.sendClientToolsRegister = () => {
-      const frame = buildRegisterFrame(this.args.options.clientTools ?? {});
-      this.rws?.send(JSON.stringify(frame));
-    };
+      const frame = buildRegisterFrame(this.args.options.clientTools ?? {})
+      this.rws?.send(JSON.stringify(frame))
+    }
     this.setState = (next) => {
-      if (this.proto.state === next) return;
-      this.proto.state = next;
-      this.args.options.onStateChange?.(next);
-    };
+      if (this.proto.state === next) return
+      this.proto.state = next
+      this.args.options.onStateChange?.(next)
+    }
     this.emitError = (err) => {
-      this.lastError = err;
-      this.args.options.onError?.(err);
-    };
+      this.lastError = err
+      this.args.options.onError?.(err)
+    }
     this.handleSocketEvent = (ev) => {
       switch (ev.type) {
-        case "open":
-          void this.startCapture();
-          break;
-        case "reconnected":
-          this.proto.transcript = [];
-          this.proto.agentBubbleId = null;
-          this.args.options.onTranscript?.(this.proto.transcript);
-          void this.startCapture();
-          this.setState("listening");
-          break;
-        case "message":
-          if (typeof ev.data === "string") {
+        case 'open':
+          void this.startCapture()
+          break
+        case 'reconnected':
+          this.proto.transcript = []
+          this.proto.agentBubbleId = null
+          this.args.options.onTranscript?.(this.proto.transcript)
+          void this.startCapture()
+          this.setState('listening')
+          break
+        case 'message':
+          if (typeof ev.data === 'string') {
             handleServerMessage(ev.data, this.proto, {
               onState: this.setState,
               onTranscript: (entries) => this.args.options.onTranscript?.(entries),
               onError: this.emitError,
               onInterrupt: () => {
-                this.playback?.flush();
-                this.args.options.onInterrupt?.();
+                this.playback?.flush()
+                this.args.options.onInterrupt?.()
+              },
+              onAgentTurnStart: (seq) => {
+                if (typeof seq === 'number') this.marks.onAgentTurnStart(seq)
+                this.args.options.onAgentTurnStart?.()
+              },
+              onAgentTurnEnd: (seq) => {
+                if (typeof seq === 'number') this.marks.onAgentTurnEnd(seq)
               },
-              onAgentTurnStart: () => this.args.options.onAgentTurnStart?.(),
               onCallEnd: (reason) => this.teardown(reason),
               onConnected: () => this.sendClientToolsRegister(),
-              onClientToolCall: (frame) => dispatchClientToolCall(
-                (f) => this.rws?.send(JSON.stringify(f)),
-                this.args.options.clientTools ?? {},
-                frame
-              )
-            });
+              onClientToolCall: (frame) =>
+                dispatchClientToolCall(
+                  (f) => this.rws?.send(JSON.stringify(f)),
+                  this.args.options.clientTools ?? {},
+                  frame,
+                ),
+            })
           } else {
-            this.playback?.enqueue(ev.data);
+            this.marks.markFirstAudibleOutput()
+            this.playback?.enqueue(ev.data)
           }
-          break;
-        case "close":
+          break
+        case 'close':
           if (ev.permanent) {
-            const reason = this.proto.endReason ?? (this.lastError ? "error" : "user_hangup");
-            this.teardown(reason);
+            const reason = this.proto.endReason ?? (this.lastError ? 'error' : 'user_hangup')
+            this.teardown(reason)
           }
-          break;
-        case "error":
-          this.emitError({ code: "socket_error", message: ev.error.message });
-          break;
+          break
+        case 'error':
+          this.emitError({ code: 'socket_error', message: ev.error.message })
+          break
       }
-    };
+    }
     this.startCapture = async () => {
-      if (this.capture?.isCapturing()) return;
+      if (this.capture?.isCapturing()) return
       this.capture = createAudioCapture({
         onChunk: (pcm) => {
-          this.rws?.send(pcm);
+          this.marks.markFirstOutboundAudio()
+          this.rws?.send(pcm)
         },
         onVolume: (v) => {
-          this.inputVolume = v;
-          this.args.options.onVolume?.({ input: v, output: this.outputVolume });
+          this.inputVolume = v
+          this.args.options.onVolume?.({ input: v, output: this.outputVolume })
         },
         onError: (err) => {
           this.emitError({
-            code: err.name === "NotAllowedError" ? "mic_denied" : "mic_start_failed",
-            message: err.message
-          });
-        }
-      });
-      if (this.muted) this.capture.mute(true);
+            code: err.name === 'NotAllowedError' ? 'mic_denied' : 'mic_start_failed',
+            message: err.message,
+          })
+        },
+      })
+      if (this.muted) this.capture.mute(true)
       try {
-        await this.capture.start();
-      } catch {
-      }
-    };
+        await this.capture.start()
+      } catch {}
+    }
     this.teardown = (reason) => {
-      this.capture?.stop();
-      this.capture = null;
-      this.playback?.close();
-      this.playback = null;
       try {
-        this.rws?.close(1e3, reason);
-      } catch {
-      }
-      this.rws = null;
-      this.setState("ended");
-      this.fireEndOnce(reason);
-    };
+        this.marks.flush()
+      } catch {}
+      this.capture?.stop()
+      this.capture = null
+      this.playback?.close()
+      this.playback = null
+      try {
+        this.rws?.close(1e3, reason)
+      } catch {}
+      this.rws = null
+      this.setState('ended')
+      this.fireEndOnce(reason)
+    }
     this.fireEndOnce = (reason) => {
-      if (this.endedFired) return;
-      this.endedFired = true;
-      const startedAt = this.startedAt ?? Date.now();
+      if (this.endedFired) return
+      this.endedFired = true
+      const startedAt = this.startedAt ?? Date.now()
       this.args.options.onEnd?.({
         reason,
-        errorCode: reason === "error" ? this.lastError?.code : void 0,
-        durationMs: Date.now() - startedAt
-      });
-    };
-    this.args = args;
-    this.proto = createProtocolState();
-    validateClientToolMap(args.options.clientTools);
+        errorCode: reason === 'error' ? this.lastError?.code : void 0,
+        durationMs: Date.now() - startedAt,
+      })
+    }
+    this.args = args
+    this.proto = createProtocolState()
+    validateClientToolMap(args.options.clientTools)
+    this.marks = createClientMarksBuffer({
+      send: (frame) => {
+        try {
+          this.rws?.send(JSON.stringify(frame))
+        } catch {}
+      },
+    })
   }
   // ---------------------------------------------------------------
   // Call interface
   // ---------------------------------------------------------------
   get state() {
-    return this.proto.state;
+    return this.proto.state
   }
   get transcript() {
-    return this.proto.transcript.slice();
+    return this.proto.transcript.slice()
   }
   get isMuted() {
-    return this.muted;
+    return this.muted
   }
   // ---------------------------------------------------------------
   // Lifecycle — called by the factory immediately after construction.
@@ -745,84 +845,262 @@ var BrowserVoiceClient = class {
   // failures arrive via `onError`.
   // ---------------------------------------------------------------
   async start() {
-    this.setState("connecting");
-    this.startedAt = Date.now();
+    this.setState('connecting')
+    this.startedAt = Date.now()
     const url = buildWsUrl({
       apiBase: this.args.config.apiBase,
       agentId: this.args.options.agentId,
       token: this.args.token,
-      bargeIn: this.args.options.bargeIn
-    });
+      bargeIn: this.args.options.bargeIn,
+    })
     this.playback = createAudioPlayback({
       onVolume: (v) => {
-        this.outputVolume = v;
-        this.args.options.onVolume?.({ input: this.inputVolume, output: v });
-      }
-    });
+        this.outputVolume = v
+        this.args.options.onVolume?.({ input: this.inputVolume, output: v })
+      },
+    })
     try {
-      await this.playback.resume();
-    } catch {
-    }
+      await this.playback.resume()
+    } catch {}
     this.rws = createReconnectingWebSocket(
       {
         url,
         wsFactory: this.args.wsFactory,
-        maxRetries: 3
+        maxRetries: 3,
       },
-      (ev) => this.handleSocketEvent(ev)
-    );
+      (ev) => this.handleSocketEvent(ev),
+    )
+  }
+}
+// src/webrtc/createWebRtcCall.ts
+async function createWebRtcCall(opts) {
+  const proto = createProtocolState()
+  let muted = false
+  let ended = false
+  const fireState = (next) => {
+    if (proto.state === next) return
+    proto.state = next
+    opts.onStateChange?.(next)
+  }
+  const dispatch = (raw) => {
+    handleServerMessage(raw, proto, {
+      onState: fireState,
+      onTranscript: (entries) => opts.onTranscript?.(entries),
+      onError: (err) => opts.onError?.(err),
+      onInterrupt: () => opts.onInterrupt?.(),
+      onAgentTurnStart: () => opts.onAgentTurnStart?.(),
+      onAgentTurnEnd: () => {},
+      onCallEnd: () => teardown(),
+      onConnected: () => {},
+      onClientToolCall: () => {},
+    })
   }
-};
+  fireState('connecting')
+  const pc = new RTCPeerConnection({
+    iceServers: [{ urls: 'stun:stun.l.google.com:19302' }],
+  })
+  const audioEl = document.createElement('audio')
+  audioEl.autoplay = true
+  audioEl.style.display = 'none'
+  document.body.appendChild(audioEl)
+  pc.ontrack = (event) => {
+    audioEl.srcObject = event.streams[0] ?? new MediaStream([event.track])
+  }
+  let mic
+  try {
+    mic = await navigator.mediaDevices.getUserMedia({ audio: true })
+  } catch (err) {
+    const code =
+      err instanceof DOMException && err.name === 'NotAllowedError'
+        ? 'mic_denied'
+        : 'mic_start_failed'
+    opts.onError?.({
+      code,
+      message: err instanceof Error ? err.message : 'getUserMedia failed',
+    })
+    fireState('error')
+    pc.close()
+    audioEl.remove()
+    throw err
+  }
+  for (const track of mic.getAudioTracks()) pc.addTrack(track, mic)
+  const dc = pc.createDataChannel('control', { ordered: true })
+  dc.onmessage = (e) => {
+    if (typeof e.data === 'string') dispatch(e.data)
+  }
+  dc.onerror = () => {
+    opts.onError?.({ code: 'socket_error', message: 'control channel error' })
+  }
+  const gateway = opts.webrtcGatewayBase || ''
+  const offerUrl = gateway
+    ? `${gateway}/webrtc/offer?token=${encodeURIComponent(opts.token)}`
+    : `${opts.apiBase}/v1/agents/${encodeURIComponent(opts.agentId)}/webrtc/offer?token=${encodeURIComponent(opts.token)}`
+  const iceUrl = gateway
+    ? `${gateway}/webrtc/ice?token=${encodeURIComponent(opts.token)}`
+    : `${opts.apiBase}/v1/agents/${encodeURIComponent(opts.agentId)}/webrtc/ice?token=${encodeURIComponent(opts.token)}`
+  await pc.setLocalDescription(await pc.createOffer())
+  let callId
+  try {
+    const offerRes = await fetch(offerUrl, {
+      method: 'POST',
+      headers: { 'content-type': 'application/json' },
+      body: JSON.stringify({ sdp: pc.localDescription.sdp, type: 'offer', agentId: opts.agentId }),
+    })
+    if (!offerRes.ok) {
+      const code = offerRes.status === 401 ? 'unauthorized' : 'server_error'
+      opts.onError?.({ code, message: `signaling failed: HTTP ${offerRes.status}` })
+      fireState('error')
+      mic.getTracks().forEach((t) => t.stop())
+      pc.close()
+      audioEl.remove()
+      throw new Error(`webrtc offer failed: ${offerRes.status}`)
+    }
+    const body = await offerRes.json()
+    callId = body.callId
+    await pc.setRemoteDescription({ type: 'answer', sdp: body.sdp })
+  } catch (err) {
+    if (!ended) {
+      opts.onError?.({
+        code: 'network_unreachable',
+        message: err instanceof Error ? err.message : 'signaling failed',
+      })
+      fireState('error')
+      mic.getTracks().forEach((t) => t.stop())
+      pc.close()
+      audioEl.remove()
+    }
+    throw err
+  }
+  pc.onicecandidate = (e) => {
+    if (!e.candidate) return
+    void fetch(iceUrl, {
+      method: 'POST',
+      headers: { 'content-type': 'application/json' },
+      body: JSON.stringify({ callId, candidate: e.candidate }),
+    }).catch(() => {})
+  }
+  pc.onconnectionstatechange = () => {
+    const s = pc.connectionState
+    if (s === 'connected') fireState('listening')
+    if (s === 'failed' || s === 'disconnected') {
+      opts.onError?.({ code: 'socket_error', message: `webrtc connection ${s}` })
+      teardown()
+    }
+    if (s === 'closed' && !ended) teardown()
+  }
+  const teardown = () => {
+    if (ended) return
+    ended = true
+    try {
+      mic.getTracks().forEach((t) => t.stop())
+    } catch {}
+    try {
+      pc.close()
+    } catch {}
+    try {
+      audioEl.remove()
+    } catch {}
+    fireState('ended')
+    opts.onEnd?.()
+  }
+  return {
+    get state() {
+      return proto.state
+    },
+    get transcript() {
+      return proto.transcript.slice()
+    },
+    get isMuted() {
+      return muted
+    },
+    end: () => teardown(),
+    mute: () => {
+      if (muted) return
+      muted = true
+      mic.getAudioTracks().forEach((t) => (t.enabled = false))
+    },
+    unmute: () => {
+      if (!muted) return
+      muted = false
+      mic.getAudioTracks().forEach((t) => (t.enabled = true))
+    },
+  }
+}
 // src/browser.ts
-var browserWsFactory = (url) => new globalThis.WebSocket(url);
+var browserWsFactory = (url) => new globalThis.WebSocket(url)
 var BrowserVoiceFactory = class {
   constructor(config) {
     this.startCall = async (options) => {
       if (!options.agentId) {
-        throw new Error("startCall: agentId is required");
+        throw new Error('startCall: agentId is required')
       }
-      const { context, metadata } = mergeStartCallContext(this.config, options);
+      const { context, metadata } = mergeStartCallContext(this.config, options)
       const fetchArgs = {
         agentId: options.agentId,
         userId: options.userId,
         context,
-        metadata
-      };
-      let token;
+        metadata,
+      }
+      let resolved
       if (options.token) {
-        token = options.token;
+        resolved = { token: options.token, transport: 'ws' }
       } else {
-        token = await this.config.fetchToken(fetchArgs);
-        if (!token) {
-          throw new Error("configureVoiceClient.fetchToken returned empty token");
+        const r = await this.config.fetchToken(fetchArgs)
+        if (!r) {
+          throw new Error('configureVoiceClient.fetchToken returned empty token')
         }
+        resolved = typeof r === 'string' ? { token: r, transport: 'ws' } : r
+        if (!resolved.token) {
+          throw new Error('configureVoiceClient.fetchToken returned an object without `token`')
+        }
+      }
+      if (resolved.transport === 'webrtc') {
+        return createWebRtcCall({
+          agentId: options.agentId,
+          apiBase: this.config.apiBase,
+          token: resolved.token,
+          webrtcGatewayBase: resolved.webrtcGatewayBase,
+          onStateChange: options.onStateChange,
+          onTranscript: options.onTranscript,
+          onError: options.onError,
+          // Synthesise a minimal CallEndEvent. WebRTC doesn't carry an end reason
+          // from the server yet — use 'agent_ended' as placeholder. durationMs is
+          // tracked at 0 until the followup lands (see spec Followups section).
+          onEnd: options.onEnd
+            ? () => options.onEnd({ reason: 'agent_ended', durationMs: 0 })
+            : void 0,
+          onInterrupt: options.onInterrupt,
+          onAgentTurnStart: options.onAgentTurnStart,
+        })
       }
       const client = new BrowserVoiceClient({
         config: this.config,
         // Carry merged context/metadata through to startCall so server can
         // see what the SDK saw.
         options: { ...options, context, metadata },
-        token,
-        wsFactory: browserWsFactory
-      });
-      await client.start();
-      return client;
-    };
-    this.config = config;
-  }
-};
+        token: resolved.token,
+        wsFactory: browserWsFactory,
+      })
+      await client.start()
+      return client
+    }
+    this.config = config
+  }
+}
 function configureVoiceClient(config) {
-  return new BrowserVoiceFactory(normalizeConfig(config));
+  return new BrowserVoiceFactory(normalizeConfig(config))
 }
 // Annotate the CommonJS export names for ESM import in node:
-0 && (module.exports = {
-  buildWsUrl,
-  configureVoiceClient,
-  createAudioCapture,
-  createAudioPlayback,
-  createProtocolState,
-  createReconnectingWebSocket,
-  handleServerMessage
-});
-//# sourceMappingURL=browser.js.map
+0 &&
+  (module.exports = {
+    buildWsUrl,
+    configureVoiceClient,
+    createAudioCapture,
+    createAudioPlayback,
+    createProtocolState,
+    createReconnectingWebSocket,
+    handleServerMessage,
+  })
+//# sourceMappingURL=browser.js.map