npm - @oshara/voice-sdk - Versions diffs - 0.1.0 - Mend

@oshara/voice-sdk 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61) hide show

package/README.md +198 -0
package/dist/appearance-CNWT8x1G.cjs +2 -0
package/dist/appearance-CNWT8x1G.cjs.map +1 -0
package/dist/appearance-i6QBkpCk.js +650 -0
package/dist/appearance-i6QBkpCk.js.map +1 -0
package/dist/consent-CK9VXNPa.js +54 -0
package/dist/consent-CK9VXNPa.js.map +1 -0
package/dist/consent-D7QNSkQD.cjs +2 -0
package/dist/consent-D7QNSkQD.cjs.map +1 -0
package/dist/core/analytics.d.ts +30 -0
package/dist/core/appearance.d.ts +113 -0
package/dist/core/audioSettings.d.ts +69 -0
package/dist/core/consent.d.ts +17 -0
package/dist/core/createVoiceAgent.d.ts +79 -0
package/dist/core/events.d.ts +103 -0
package/dist/core/formController.d.ts +28 -0
package/dist/core/forms.d.ts +235 -0
package/dist/core/index.d.ts +29 -0
package/dist/core/prevContext.d.ts +26 -0
package/dist/core/transport.d.ts +30 -0
package/dist/core/types.d.ts +49 -0
package/dist/core/voice.d.ts +79 -0
package/dist/createVoiceAgent-BM3HODS6.js +1058 -0
package/dist/createVoiceAgent-BM3HODS6.js.map +1 -0
package/dist/createVoiceAgent-CJWxWzz6.cjs +4 -0
package/dist/createVoiceAgent-CJWxWzz6.cjs.map +1 -0
package/dist/index.cjs +2 -0
package/dist/index.cjs.map +1 -0
package/dist/index.js +44 -0
package/dist/index.js.map +1 -0
package/dist/react/index.d.ts +60 -0
package/dist/react.cjs +2 -0
package/dist/react.cjs.map +1 -0
package/dist/react.js +115 -0
package/dist/react.js.map +1 -0
package/dist/styles.css +1838 -0
package/dist/ui/index.d.ts +21 -0
package/dist/ui/ui.d.ts +165 -0
package/dist/ui.cjs +284 -0
package/dist/ui.cjs.map +1 -0
package/dist/ui.js +1153 -0
package/dist/ui.js.map +1 -0
package/package.json +67 -0
package/src/core/analytics.ts +111 -0
package/src/core/appearance.ts +464 -0
package/src/core/audioSettings.ts +180 -0
package/src/core/consent.ts +78 -0
package/src/core/createVoiceAgent.ts +280 -0
package/src/core/events.ts +120 -0
package/src/core/formController.ts +317 -0
package/src/core/forms.ts +861 -0
package/src/core/index.ts +121 -0
package/src/core/prevContext.ts +153 -0
package/src/core/transport.ts +118 -0
package/src/core/types.ts +66 -0
package/src/core/voice.ts +1179 -0
package/src/react/index.ts +238 -0
package/src/ui/index.ts +507 -0
package/src/ui/styles.css +1838 -0
package/src/ui/ui.ts +1672 -0
package/src/vite-env.d.ts +10 -0

package/src/core/voice.ts ADDED Viewed

@@ -0,0 +1,1179 @@
+import {
+  AudioPresets,
+  ConnectionState,
+  LocalAudioTrack,
+  RemoteAudioTrack,
+  RemoteParticipant,
+  RemoteTrack,
+  RemoteTrackPublication,
+  Room,
+  RoomEvent,
+  Track,
+  TranscriptionSegment,
+} from "livekit-client";
+import {
+  AudioPrefs,
+  loadAudioPrefs,
+  NoiseFilterEngine,
+  saveAudioPrefs,
+} from "./audioSettings";
+import type { AppearanceConfig } from "./appearance";
+import type { Emit } from "./events";
+import {
+  DEFAULT_DEEPFILTER_MODULE_URL,
+  type DeepFilterUrls,
+  type OrbState,
+  type SessionInit,
+} from "./types";
+import {
+  formatPrevContextForAgent,
+  loadPrevContext,
+  PrevTurn,
+  savePrevContext,
+} from "./prevContext";
+export type { SessionInit } from "./types";
+/**
+ * Snapshot reported back to the UI after the mic publishes or when any
+ * audio setting changes. The `applied*` flags are read from the live
+ * MediaStreamTrack via getSettings(), so they reflect what the browser
+ * actually honored — which may differ from what we requested.
+ *
+ * This is the shape returned by `VoiceController.getAudioState()` and
+ * resolved by `VoiceController.updateAudioSettings()`.
+ *
+ * - `prefs` — the requested preferences (`AudioPrefs`).
+ * - `applied` — the per-setting values; every field may be `undefined`
+ *   (presumably when the browser does not report that setting — TODO
+ *   confirm against the code that fills this object).
+ */
+export interface AudioStateSnapshot {
+  prefs: AudioPrefs;
+  applied: {
+    echoCancellation: boolean | undefined;
+    noiseSuppression: boolean | undefined;
+    autoGainControl: boolean | undefined;
+    voiceIsolation: boolean | undefined;
+    sampleRate: number | undefined;
+    channelCount: number | undefined;
+    deviceId: string | undefined;
+  };
+  /**
+   * Effective state of the deep-learning NS engine:
+   * - `engine` — what the user picked (off / krisp / deepfilter).
+   * - `status` — what actually happened: "active" if the processor is
+   *   attached, "unsupported" if the chosen engine isn't available in
+   *   this browser, "failed" if attach errored, "off" if engine === "off".
+   *
+   * createVoiceController initialises the tracked status to "off" and its
+   * reset() puts it back to "off" when a call is torn down.
+   */
+  noiseFilter: {
+    engine: NoiseFilterEngine;
+    status: "active" | "off" | "unsupported" | "failed";
+  };
+}
+export interface AudioStats { // Value resolved by VoiceController.getAudioStats().
+  /** Local outbound audio level (0-1) from RTCStats. */
+  outboundAudioLevel: number;
+  /** Remote inbound audio level (0-1) from RTCStats. */
+  inboundAudioLevel: number;
+  /** Packets lost on inbound (agent → user) stream. */
+  packetsLost: number;
+  /** Inbound jitter in ms. */
+  jitter: number;
+  /** Round-trip time in ms (peer connection). */
+  roundTripTime: number;
+}
+export interface VoiceController { // Handle returned by createVoiceController().
+  start: () => Promise<void>; // Begin a call; see the `start` closure in createVoiceController.
+  end: () => Promise<void>; // Tear the call down; see the `end` closure in createVoiceController.
+  toggleMute: () => Promise<boolean>; // Resolves to the muted state after the toggle.
+  isActive: () => boolean; // NOTE(review): implementation not visible here — presumably "a room exists".
+  /** Returns the session_id for the active session, or null if no session is running. */
+  sessionId: () => string | null;
+  /**
+   * Publish a JSON data message back to the agent (used after form submit).
+   * The `publishData` closure in createVoiceController JSON-stringifies
+   * `payload`, sends it reliably on `topic`, does nothing when there is no
+   * room, and logs (rather than rethrows) publish failures.
+   */
+  publishData: (payload: unknown, topic: string) => Promise<void>;
+  /** Apply a partial update to the audio preferences (live, no reconnect). */
+  updateAudioSettings: (delta: Partial<AudioPrefs>) => Promise<AudioStateSnapshot>;
+  /** Read the current audio state (preferences + actually-applied values). */
+  getAudioState: () => AudioStateSnapshot;
+  /** Poll a snapshot of audio RTC stats (returns null if no call). */
+  getAudioStats: () => Promise<AudioStats | null>;
+}
+export interface VoiceControllerOptions { // Argument of createVoiceController().
+  /**
+   * Mint a LiveKit session (POST /api/agents/agent-session/).
+   * Awaited by start(); the result supplies `session_id`, `livekit_url`
+   * and `token`. A rejection is reported through "error" / "connection"
+   * events instead of being rethrown.
+   */
+  fetchSession: () => Promise<SessionInit>;
+  /**
+   * AICharacter slug — namespaces persisted prefs / context.
+   * Passed to loadAudioPrefs / saveAudioPrefs and loadPrevContext /
+   * savePrevContext.
+   */
+  agentSlug: string;
+  /**
+   * Read the live appearance config (labels, max_call_seconds).
+   * Called each time a label or the call limit is needed, not cached.
+   */
+  getAppearance: () => AppearanceConfig;
+  /** Typed event emitter — replaces all the old direct UI calls. */
+  emit: Emit;
+  /**
+   * Per-instance DeepFilterNet3 asset overrides.
+   * Defaults to `{}`. Each URL is trimmed; blank values are ignored, and a
+   * blank `moduleUrl` falls back to DEFAULT_DEEPFILTER_MODULE_URL.
+   */
+  deepFilter?: DeepFilterUrls;
+  /**
+   * Pre-resolved audio prefs to start from. Defaults to loadAudioPrefs(slug).
+   * The object is shallow-copied, so the caller's instance is not mutated
+   * by reassignments of the controller's own prefs object.
+   */
+  initialPrefs?: AudioPrefs;
+  /**
+   * Persist pref changes to localStorage (default true).
+   * When false, saveAudioPrefs is never called by the controller.
+   */
+  persistPrefs?: boolean;
+}
+export function createVoiceController(
+  opts: VoiceControllerOptions,
+): VoiceController {
+  const {
+    fetchSession,
+    agentSlug,
+    getAppearance,
+    emit,
+    deepFilter = {},
+    persistPrefs = true,
+  } = opts;
+  const deepFilterModuleUrl =
+    (deepFilter.moduleUrl && deepFilter.moduleUrl.trim()) ||
+    DEFAULT_DEEPFILTER_MODULE_URL;
+  const deepFilterCdnUrl =
+    deepFilter.cdnUrl && deepFilter.cdnUrl.trim() ? deepFilter.cdnUrl.trim() : undefined;
+  const deepFilterWasmUrl =
+    deepFilter.wasmUrl && deepFilter.wasmUrl.trim() ? deepFilter.wasmUrl.trim() : undefined;
+  const deepFilterOnnxUrl =
+    deepFilter.onnxUrl && deepFilter.onnxUrl.trim() ? deepFilter.onnxUrl.trim() : undefined;
+  let room: Room | null = null;
+  let currentSessionId: string | null = null;
+  let audioEl: HTMLAudioElement | null = null;
+  let muted = false;
+  let duckingMuted = false;
+  let duckTimerId: number | null = null;
+  let unduckTimerId: number | null = null;
+  // Duck-in delay: long enough that a quick user barge-in / reply isn't
+  // chopped before the agent-side VAD locks on, short enough that the
+  // mic-vs-speaker echo path is closed before too much leaks back. 80ms
+  // (an earlier value) made preemptive generation fire late because the
+  // very start of user replies was getting muted before VAD confirmed
+  // speech. 200ms keeps the agent's turn detector happy. We unmute
+  // immediately when the agent stops speaking — any post-agent grace
+  // just delays the user's next turn reaching VAD.
+  const DUCK_DELAY_MS = 200;
+  let prefs: AudioPrefs = opts.initialPrefs
+    ? { ...opts.initialPrefs }
+    : loadAudioPrefs(agentSlug);
+  const persistPrefsToStorage = () => {
+    if (persistPrefs) saveAudioPrefs(agentSlug, prefs);
+  };
+  let noiseFilterStatus: AudioStateSnapshot["noiseFilter"]["status"] = "off";
+  // ── Orb-state reconciliation ──────────────────────────────────────
+  // The orb is driven by three independent signals: agent audio + user audio
+  // (RoomEvent.ActiveSpeakersChanged) and an explicit "thinking" status
+  // (voice.agent_status data events, with lk.agent.state as a fallback).
+  // Real audio is authoritative; the inferred "no speakers → idle" must not
+  // stomp a live thinking state. Priority: speaking > thinking > listening >
+  // idle.
+  let agentSpeaking = false;
+  let userSpeaking = false;
+  let agentThinking = false;
+  let lastAgentStatusLabel = "";
+  // Debounce the thinking → idle clear so a very fast tool doesn't flash the
+  // status line on then instantly off.
+  let thinkingClearTimer: number | null = null;
+  const THINKING_CLEAR_DELAY_MS = 150;
+  /**
+   * Collapse the three activity flags into a single orb state and emit it
+   * as a "state" event. Precedence: speaking > thinking > listening > idle.
+   * The status label is only forwarded while thinking; an empty label is
+   * reported as null.
+   */
+  const reconcileOrb = () => {
+    const orb: OrbState = agentSpeaking
+      ? "speaking"
+      : agentThinking
+        ? "thinking"
+        : userSpeaking
+          ? "listening"
+          : "idle";
+    const statusLabel =
+      orb === "thinking" && lastAgentStatusLabel ? lastAgentStatusLabel : null;
+    emit("state", { orb, statusLabel });
+  };
+  /** Drop the pending debounced thinking → idle transition, if one is scheduled. */
+  const cancelThinkingClear = () => {
+    if (thinkingClearTimer === null) return;
+    window.clearTimeout(thinkingClearTimer);
+    thinkingClearTimer = null;
+  };
+  /**
+   * Switch the orb inputs to "thinking".
+   *
+   * @param label Contextual status text to show; an empty string means
+   *   "thinking, but nothing specific to display".
+   */
+  const setThinking = (label: string) => {
+    // A fresh thinking signal supersedes any exit that was still debouncing.
+    cancelThinkingClear();
+    lastAgentStatusLabel = label ? label : "";
+    agentThinking = true;
+    reconcileOrb();
+  };
+  /**
+   * Schedule the exit from the thinking state after THINKING_CLEAR_DELAY_MS
+   * so a very short tool call does not flash the status line. Does nothing
+   * when we are not thinking or an exit is already scheduled.
+   */
+  const clearThinking = () => {
+    const exitAlreadyScheduled = thinkingClearTimer !== null;
+    if (exitAlreadyScheduled || !agentThinking) return;
+    const leaveThinking = () => {
+      thinkingClearTimer = null;
+      agentThinking = false;
+      lastAgentStatusLabel = "";
+      reconcileOrb();
+    };
+    thinkingClearTimer = window.setTimeout(leaveThinking, THINKING_CLEAR_DELAY_MS);
+  };
+  /** Live handle on the deepfilter processor so we can tweak strength without re-attaching. */
+  let deepFilterProcessor: { setSuppressionLevel?: (n: number) => void } | null = null;
+  let callDeadline: number | null = null;
+  let callTickId: number | null = null;
+  /** Final transcript turns captured during the current call, in arrival order. */
+  const turns = new Map<string, PrevTurn>();
+  let prevContextSent = false;
+  let unloadHandler: (() => void) | null = null;
+  /** Most-recent agent-generated session summary received over data channel. */
+  let latestSummary = "";
+  let pendingSummaryResolver: ((summary: string) => void) | null = null;
+  /**
+   * Save the captured turns (and a summary) through savePrevContext.
+   *
+   * Nothing is written when there are no turns and the `summary` argument
+   * is missing or blank. Note that this emptiness check looks only at the
+   * argument, not at `latestSummary`, which is used purely as the stored
+   * fallback when `summary` is null/undefined.
+   *
+   * @param summary Summary to store; falls back to `latestSummary`.
+   */
+  const persistTurns = (summary?: string) => {
+    const hasExplicitSummary = Boolean(summary && summary.trim());
+    if (turns.size === 0 && !hasExplicitSummary) return;
+    const capturedTurns = Array.from(turns.values());
+    const summaryToStore = summary ?? latestSummary;
+    savePrevContext(agentSlug, capturedTurns, summaryToStore);
+  };
+  /**
+   * Track one transcription segment in `turns`, keyed by `role:segmentId`.
+   *
+   * A final segment always creates or overwrites its entry. An interim
+   * segment only refreshes an entry that already exists; it never creates
+   * one, so a segment that has not produced a final yet is not stored.
+   * Blank text is ignored.
+   */
+  const recordTranscriptSegment = (
+    role: "user" | "agent",
+    segmentId: string,
+    text: string,
+    isFinal: boolean,
+  ) => {
+    const cleaned = (text || "").trim();
+    if (cleaned === "") return;
+    const key = `${role}:${segmentId}`;
+    if (!isFinal && !turns.has(key)) return;
+    turns.set(key, { role, text: cleaned });
+  };
+  /**
+   * Stop the call-length countdown: forget the deadline, cancel the tick
+   * interval if it is running, and tell listeners there is no timer
+   * ("call:timer" with remainingMs null). Emits even when nothing was running.
+   */
+  const clearCallTimeout = () => {
+    const runningTick = callTickId;
+    callDeadline = null;
+    callTickId = null;
+    if (runningTick !== null) window.clearInterval(runningTick);
+    emit("call:timer", { remainingMs: null });
+  };
+  /**
+   * Return all per-call state to its resting values and notify the UI:
+   * idle orb, start-only controls, unmuted. Also cancels the ducking and
+   * call-limit timers, removes the remote audio element from the DOM and
+   * clears the noise-filter bookkeeping.
+   */
+  const reset = () => {
+    // Orb inputs.
+    cancelThinkingClear();
+    agentSpeaking = false;
+    userSpeaking = false;
+    agentThinking = false;
+    lastAgentStatusLabel = "";
+
+    // UI notifications, in the same order the UI has always received them.
+    emit("state", { orb: "idle", statusLabel: null });
+    emit("controls", { canStart: true, canMute: false, canEnd: false });
+    emit("mute", { muted: false });
+
+    // Mute / ducking state and its timers.
+    muted = false;
+    duckingMuted = false;
+    if (duckTimerId !== null) window.clearTimeout(duckTimerId);
+    duckTimerId = null;
+    if (unduckTimerId !== null) window.clearTimeout(unduckTimerId);
+    unduckTimerId = null;
+
+    clearCallTimeout();
+
+    // Remote audio output.
+    audioEl?.remove();
+    audioEl = null;
+
+    // Noise filter bookkeeping.
+    noiseFilterStatus = "off";
+    deepFilterProcessor = null;
+  };
+  /**
+   * Ask the agent for a session summary and wait briefly for the answer.
+   *
+   * Publishes a `request_summary` message on topic "voice.request_summary"
+   * and then waits up to 2.5 s for the "voice.session_summary" data handler
+   * to resolve `pendingSummaryResolver`.
+   *
+   * @param target Room to publish the request on.
+   * @returns The freshly received summary, or `latestSummary` when the
+   *   publish fails, the wait times out, or the answer is empty.
+   */
+  const requestSessionSummary = async (target: Room): Promise<string> => {
+    const TIMEOUT_MS = 2500;
+    const waiter = new Promise<string>((resolve) => {
+      pendingSummaryResolver = resolve;
+    });
+    // The executor above ran synchronously, so this is the resolver we just
+    // installed. Remember it so cleanup never clears a resolver that a later,
+    // overlapping request has put in its place.
+    const ownResolver = pendingSummaryResolver;
+    const releaseResolver = () => {
+      if (pendingSummaryResolver === ownResolver) pendingSummaryResolver = null;
+    };
+    try {
+      await target.localParticipant.publishData(
+        new TextEncoder().encode(JSON.stringify({ type: "request_summary" })),
+        { reliable: true, topic: "voice.request_summary" },
+      );
+    } catch (err) {
+      // eslint-disable-next-line no-console
+      console.warn("[voice-agent] Failed to publish summary request:", err);
+      releaseResolver();
+      return latestSummary;
+    }
+    let timeoutId: number | undefined;
+    const timeout = new Promise<string>((resolve) => {
+      timeoutId = window.setTimeout(() => resolve(""), TIMEOUT_MS);
+    });
+    try {
+      const result = await Promise.race([waiter, timeout]);
+      return result || latestSummary;
+    } finally {
+      // Cancel the timer when the agent answered first, so it does not
+      // linger for the rest of the timeout window.
+      window.clearTimeout(timeoutId);
+      releaseResolver();
+    }
+  };
+  /** Teardown currently in progress, if any; overlapping end() calls share it. */
+  let endInFlight: Promise<void> | null = null;
+  /**
+   * Tear the current call down.
+   *
+   * Steps: stop the call timer, remove the unload listeners, optionally ask
+   * the agent for a summary, disconnect the room, then (always) clear the
+   * room/session references, persist the turns, reset UI state and emit
+   * "connection" / "disconnected".
+   *
+   * The function is safe to call while a teardown is already running:
+   * `room.disconnect()` fires RoomEvent.Disconnected, whose handler calls
+   * end() again while `room` is still set. Without the shared promise that
+   * second call would request another summary, disconnect again and emit
+   * the reset / "disconnected" events twice.
+   *
+   * @param skipSummary When true, do not ask the agent for a summary (used
+   *   when the agent itself is shutting down). Ignored by calls that join a
+   *   teardown already in progress.
+   * @returns A promise that settles when the teardown has finished.
+   */
+  const end = (skipSummary = false): Promise<void> => {
+    if (endInFlight) return endInFlight;
+    const teardown = async (): Promise<void> => {
+      clearCallTimeout();
+      if (unloadHandler) {
+        window.removeEventListener("beforeunload", unloadHandler);
+        window.removeEventListener("pagehide", unloadHandler);
+        unloadHandler = null;
+      }
+      let summary = latestSummary;
+      if (room && !skipSummary) {
+        emit("call:status", { status: "Saving…" });
+        try {
+          summary = await requestSessionSummary(room);
+        } catch {
+          summary = latestSummary;
+        }
+      }
+      try {
+        if (room) {
+          await room.disconnect();
+        }
+      } finally {
+        room = null;
+        currentSessionId = null;
+        try { persistTurns(summary); } catch { /* ignore storage errors */ }
+        reset();
+        emit("connection", { phase: "disconnected" });
+      }
+    };
+    const inFlight: Promise<void> = teardown().finally(() => {
+      // Only release the slot if it still belongs to this teardown.
+      if (endInFlight === inFlight) endInFlight = null;
+    });
+    endInFlight = inFlight;
+    return inFlight;
+  };
+  /**
+   * Send the stored context from the previous call to the agent, once per
+   * call, on topic "voice.previous_context".
+   *
+   * `prevContextSent` is set when the message was published or when there is
+   * nothing to send (no record, or the record formats to empty text). It is
+   * left unset when anything fails, so a later trigger may try again.
+   *
+   * Building the payload happens inside the `try` because
+   * `Date.prototype.toISOString()` throws a RangeError for an invalid date.
+   * Callers invoke this function with `void`, so an exception escaping here
+   * would surface as an unhandled promise rejection.
+   *
+   * @param target Room to publish on.
+   */
+  const publishPreviousContext = async (target: Room): Promise<void> => {
+    if (prevContextSent) return;
+    try {
+      const record = loadPrevContext(agentSlug);
+      if (!record) {
+        prevContextSent = true;
+        return;
+      }
+      const text = formatPrevContextForAgent(record);
+      if (!text) {
+        prevContextSent = true;
+        return;
+      }
+      const payload = JSON.stringify({
+        type: "previous_context",
+        text,
+        saved_at: new Date(record.savedAt).toISOString(),
+      });
+      await target.localParticipant.publishData(
+        new TextEncoder().encode(payload),
+        {
+          reliable: true,
+          topic: "voice.previous_context",
+        },
+      );
+      prevContextSent = true;
+    } catch (err) {
+      // eslint-disable-next-line no-console
+      console.warn("[voice-agent] Failed to publish previous context:", err);
+    }
+  };
+  /**
+   * Arm the maximum-call-length countdown from the appearance config's
+   * `max_call_seconds`. A non-finite or non-positive limit means "no limit"
+   * and leaves the timer cleared. While armed, "call:timer" is emitted every
+   * 500 ms; when the deadline passes the call is ended.
+   */
+  const startCallTimeout = () => {
+    clearCallTimeout();
+    const limitSeconds = getAppearance().max_call_seconds;
+    const hasLimit = Number.isFinite(limitSeconds) && limitSeconds > 0;
+    if (!hasLimit) return;
+    const budgetMs = limitSeconds * 1000;
+    callDeadline = Date.now() + budgetMs;
+    emit("call:timer", { remainingMs: budgetMs });
+    const tick = () => {
+      if (callDeadline === null) return;
+      const remainingMs = callDeadline - Date.now();
+      if (remainingMs > 0) {
+        emit("call:timer", { remainingMs });
+        return;
+      }
+      // Deadline reached: show zero, stop ticking, then hang up.
+      emit("call:timer", { remainingMs: 0 });
+      clearCallTimeout();
+      emit("call:status", { status: getAppearance().labels.call_ended });
+      void end();
+    };
+    callTickId = window.setInterval(tick, 500);
+  };
+  /**
+   * Start a call.
+   *
+   * Sequence: reset per-call state and notify the UI → mint a session via
+   * `fetchSession` → create the Room and register all event handlers →
+   * connect, publish the microphone, apply the saved device and noise
+   * filter → arm the call timer and the tab-close persistence hooks.
+   *
+   * Failures are never rethrown. A session failure emits "error" (scope
+   * "session") plus "connection" / "failed"; a connect failure emits "error"
+   * (scope "connect") and runs the normal teardown.
+   *
+   * Calling start() while a room already exists is a no-op. Every handler
+   * reads the shared `room` variable, so creating a second Room would
+   * orphan the live one without ever disconnecting it.
+   */
+  const start = async () => {
+    if (room) return;
+    emit("controls", { canStart: false, canMute: false, canEnd: false });
+    emit("transcript:clear", {});
+    turns.clear();
+    prevContextSent = false;
+    latestSummary = "";
+    pendingSummaryResolver = null;
+    emit("connection", { phase: "connecting" });
+    emit("state", { orb: "connecting", statusLabel: null });
+    emit("call:status", { status: getAppearance().labels.connecting });
+    let init: SessionInit;
+    try {
+      init = await fetchSession();
+      currentSessionId = init.session_id;
+    } catch (e) {
+      // Anything can be thrown; normalise so `.message` is always defined.
+      const error = e instanceof Error ? e : new Error(String(e));
+      emit("call:status", { status: `Error: ${error.message}` });
+      emit("error", { scope: "session", error });
+      emit("controls", { canStart: true, canMute: false, canEnd: false });
+      emit("connection", { phase: "failed", error: error.message });
+      return;
+    }
+    room = new Room({
+      adaptiveStream: true,
+      dynacast: true,
+      webAudioMix: false,
+    });
+    room.on(RoomEvent.ConnectionStateChanged, (state: ConnectionState) => {
+      if (state === ConnectionState.Connected) {
+        emit("call:status", { status: "Connected" });
+        emit("connection", { phase: "connected" });
+      }
+      if (state === ConnectionState.Disconnected) {
+        emit("call:status", { status: "Disconnected" });
+      }
+    });
+    // Surface autoplay blocks so the user can tap to start audio. Browsers
+    // refuse to autoplay audio without a prior user gesture; in our flow
+    // the FAB tap satisfies that, but not always (e.g. iframe contexts).
+    room.on(RoomEvent.AudioPlaybackStatusChanged, () => {
+      if (!room?.canPlaybackAudio) {
+        // eslint-disable-next-line no-console
+        console.warn("[voice-agent] Audio playback blocked by browser — call room.startAudio() after a user gesture");
+      }
+    });
+    room.on(
+      RoomEvent.TrackSubscribed,
+      (track: RemoteTrack, _pub: RemoteTrackPublication, _p: RemoteParticipant) => {
+        if (track.kind === Track.Kind.Audio) {
+          const remoteAudio = track as RemoteAudioTrack;
+          // attach() hands back a new element each time. Remove the one from
+          // an earlier subscription first, otherwise it stays in the DOM
+          // forever (reset() only removes the element `audioEl` points at).
+          audioEl?.remove();
+          audioEl = remoteAudio.attach() as HTMLAudioElement;
+          audioEl.autoplay = true;
+          audioEl.setAttribute("playsinline", "");
+          audioEl.style.display = "none";
+          audioEl.volume = Math.max(0, Math.min(1, prefs.outputVolume / 100));
+          document.body.appendChild(audioEl);
+          // Pin the saved output device, if any. Silently no-ops on
+          // browsers without setSinkId support (Safari/Firefox).
+          void applySinkId(audioEl, prefs.speakerDeviceId);
+        }
+      },
+    );
+    room.on(RoomEvent.ActiveSpeakersChanged, (speakers) => {
+      const localId = room?.localParticipant.identity;
+      agentSpeaking = speakers.some((s) => s.identity !== localId);
+      userSpeaking = speakers.some((s) => s.identity === localId);
+      // Real audio is authoritative over an inferred thinking state — clear
+      // it immediately (no debounce) when either side actually speaks.
+      if (agentSpeaking || userSpeaking) {
+        cancelThinkingClear();
+        agentThinking = false;
+        lastAgentStatusLabel = "";
+      }
+      reconcileOrb();
+      applyDucking(agentSpeaking);
+    });
+    room.on(
+      RoomEvent.TranscriptionReceived,
+      (segments: TranscriptionSegment[], participant) => {
+        const isSelf = participant?.identity === room?.localParticipant.identity;
+        const role: "user" | "agent" = isSelf ? "user" : "agent";
+        for (const seg of segments) {
+          emit("transcript", {
+            role,
+            segmentId: seg.id,
+            text: seg.text,
+            isFinal: seg.final,
+          });
+          recordTranscriptSegment(role, seg.id, seg.text, seg.final);
+        }
+      },
+    );
+    // Once the agent participant joins we have a real peer to receive
+    // data on `voice.previous_context`. We give the agent a brief moment
+    // to register its data handler (it runs after ctx.connect in main.py
+    // but before AgentSession.start in session_agent.py) before sending.
+    room.on(RoomEvent.ParticipantConnected, (participant: RemoteParticipant) => {
+      if (!room || prevContextSent) return;
+      window.setTimeout(() => {
+        if (!room || prevContextSent) return;
+        void publishPreviousContext(room);
+      }, 750);
+      // Acknowledge the unused identity in dev/typecheck without renaming.
+      void participant;
+    });
+    // Agent-generated session summary: cache it and wake a pending
+    // requestSessionSummary() waiter. Accepts either JSON `{ text }` or a
+    // plain-text payload.
+    room.on(
+      RoomEvent.DataReceived,
+      (
+        payload: Uint8Array,
+        _participant?: RemoteParticipant,
+        _kind?: unknown,
+        topic?: string,
+      ) => {
+        if (topic !== "voice.session_summary") return;
+        try {
+          const decoded = new TextDecoder().decode(payload);
+          if (!decoded) return;
+          let parsed: unknown;
+          try {
+            parsed = JSON.parse(decoded);
+          } catch {
+            parsed = null;
+          }
+          const text =
+            parsed && typeof parsed === "object" &&
+            typeof (parsed as { text?: unknown }).text === "string"
+              ? ((parsed as { text: string }).text || "").trim()
+              : decoded.trim();
+          if (!text) return;
+          latestSummary = text;
+          if (pendingSummaryResolver) {
+            const resolve = pendingSummaryResolver;
+            pendingSummaryResolver = null;
+            resolve(text);
+          }
+        } catch (err) {
+          // eslint-disable-next-line no-console
+          console.warn("[voice-agent] Failed to handle session summary:", err);
+        }
+      },
+    );
+    room.on(
+      RoomEvent.DataReceived,
+      (_payload: Uint8Array, _participant?: RemoteParticipant, _kind?: unknown, topic?: string) => {
+        if (topic !== "voice.end_call") return;
+        void end(true); // agent is shutting down — skip summary request
+      },
+    );
+    // Contextual processing status (source of truth) — drives the thinking
+    // orb and the status line.
+    room.on(
+      RoomEvent.DataReceived,
+      (payload, _participant?, _kind?, topic?) => {
+        if (topic !== "voice.agent_status") return;
+        try {
+          const decoded = new TextDecoder().decode(payload);
+          const parsed = decoded ? JSON.parse(decoded) : null;
+          if (!parsed || typeof parsed !== "object") return;
+          const state = String((parsed as { state?: unknown }).state || "");
+          const label = String((parsed as { label?: unknown }).label || "");
+          if (state === "thinking") setThinking(label);
+          else clearThinking(); // idle / speaking → leave thinking
+        } catch (err) {
+          // eslint-disable-next-line no-console
+          console.debug("[voice-agent] bad agent_status payload", err);
+        }
+      },
+    );
+    // Free secondary signal: the SDK publishes lk.agent.state
+    // (connecting/thinking/listening/speaking) on the agent participant's
+    // attributes. Used as a fallback when a voice.agent_status event is
+    // missed (e.g. an older backend that doesn't emit them).
+    room.on(
+      RoomEvent.ParticipantAttributesChanged,
+      (changed: Record<string, string>, participant) => {
+        if (participant?.identity === room?.localParticipant.identity) return;
+        const state = changed?.["lk.agent.state"];
+        if (typeof state !== "string") return;
+        if (state === "thinking") {
+          // Don't clobber a richer contextual label already in flight.
+          if (!agentThinking) setThinking("");
+        } else if (state === "speaking") {
+          cancelThinkingClear();
+          agentThinking = false;
+          lastAgentStatusLabel = "";
+          reconcileOrb();
+        } else if (state === "listening" || state === "idle") {
+          clearThinking();
+        }
+      },
+    );
+    // Forward every JSON data message to listeners as a generic "data"
+    // event; non-JSON payloads are ignored.
+    room.on(RoomEvent.DataReceived, (payload, _participant, _kind, topic) => {
+      try {
+        const decoded = new TextDecoder().decode(payload);
+        const data = decoded ? JSON.parse(decoded) : null;
+        emit("data", { data, topic });
+      } catch (err) {
+        // eslint-disable-next-line no-console
+        console.debug("[voice-agent] non-JSON data message ignored", err);
+      }
+    });
+    room.on(RoomEvent.Disconnected, () => {
+      void end();
+    });
+    // Backstop for agent-driven end-of-call. The agent signals end via a
+    // `voice.end_call` data message, but reliable data can race the agent's
+    // own session teardown. When the agent participant leaves the room
+    // (session.aclose on the server), tear the call down here too — this is
+    // a signaling event, so it can't be dropped like a data packet.
+    room.on(
+      RoomEvent.ParticipantDisconnected,
+      (_participant: RemoteParticipant) => {
+        if (!room) return;
+        // In this 1:1 widget↔agent topology any remote leaving means the
+        // agent is gone; if no remotes remain, end the call.
+        if (room.remoteParticipants.size === 0) {
+          void end(true);
+        }
+      },
+    );
+    try {
+      await room.connect(init.livekit_url, init.token);
+      await room.localParticipant.setMicrophoneEnabled(
+        true,
+        buildCaptureOptions(prefs),
+        {
+          // Opus "speech" preset — narrower band optimized for voice.
+          audioPreset: AudioPresets.speech,
+          // DTX intentionally OFF: it skips frames during silence, which
+          // disrupts the steady cadence that the agent's contextual turn
+          // detector + Silero VAD rely on to time end-of-turn. With DTX
+          // on, preemptive LLM generation fires noticeably later.
+          dtx: false,
+          // RED intentionally OFF: redundancy adds decode jitter without
+          // a clear win on the typical low-loss WiFi/wired path. Re-enable
+          // if you observe audible packet-loss artifacts.
+          red: false,
+        },
+      );
+      // Honor saved mic device if the user picked one in a prior call.
+      if (prefs.micDeviceId) {
+        try {
+          await room.switchActiveDevice("audioinput", prefs.micDeviceId);
+        } catch (err) {
+          // eslint-disable-next-line no-console
+          console.warn("[voice-agent] saved mic device unavailable:", err);
+        }
+      }
+      await applyNoiseFilter(prefs.noiseFilter);
+      emitAudioState();
+      emit("state", { orb: "listening", statusLabel: null });
+      emit("call:status", { status: getAppearance().labels.listening });
+      // NOTE(review): canStart is reported as true while a call is live —
+      // confirm this is what the UI expects.
+      emit("controls", { canStart: true, canMute: true, canEnd: true });
+      startCallTimeout();
+      // The ParticipantConnected event only fires for participants that
+      // join *after* we subscribe. If the agent worker is already in the
+      // room when we connect, fire the publish path anyway after the same
+      // grace period.
+      if (room.remoteParticipants.size > 0) {
+        window.setTimeout(() => {
+          if (!room || prevContextSent) return;
+          void publishPreviousContext(room);
+        }, 750);
+      }
+      // Persist whatever we have if the user closes the tab mid-call.
+      unloadHandler = () => persistTurns();
+      window.addEventListener("beforeunload", unloadHandler);
+      window.addEventListener("pagehide", unloadHandler);
+    } catch (e) {
+      const error = e instanceof Error ? e : new Error(String(e));
+      emit("call:status", { status: `Connect failed: ${error.message}` });
+      emit("error", { scope: "connect", error });
+      await end();
+    }
+  };
+  /**
+   * Half-duplex ducking. While the agent is speaking we mute the user's
+   * mic so the speaker's audio can't loop back into the published track
+   * (causing audible echo + STT confusion). We use
+   * `LocalAudioTrack.mute()` / `.unmute()` — WebRTC-level mute that flips
+   * the sender's enabled flag without unpublishing the track, so it
+   * returns audio orders of magnitude faster than re-running
+   * `setMicrophoneEnabled(true)`.
+   *
+   * Disabled entirely when `prefs.headphonesMode` is on — the user has no
+   * acoustic echo path through speakers, so cutting them off mid-thought
+   * just chops their barge-in. Also disabled while the user has muted
+   * themselves or when there is no room.
+   *
+   * @param agentIsSpeaking Whether a remote participant is currently an
+   *   active speaker. Named differently from the closure-level
+   *   `agentSpeaking` flag so the two are not confused.
+   */
+  const applyDucking = (agentIsSpeaking: boolean) => {
+    if (!room || muted || prefs.headphonesMode) {
+      cancelDuckTimers();
+      // If we'd previously muted via ducking but the user just enabled
+      // headphones mode, undo it. Without a room there is no track to
+      // unmute, so only the flag is cleared.
+      if (duckingMuted) {
+        duckingMuted = false;
+        if (room) void getLocalAudioTrack(room)?.unmute();
+      }
+      return;
+    }
+    if (agentIsSpeaking) {
+      if (unduckTimerId !== null) {
+        window.clearTimeout(unduckTimerId);
+        unduckTimerId = null;
+      }
+      // Already ducked, or a duck is already scheduled.
+      if (duckingMuted || duckTimerId !== null) return;
+      duckTimerId = window.setTimeout(() => {
+        duckTimerId = null;
+        // Conditions may have changed while the delay was running.
+        if (!room || muted || duckingMuted || prefs.headphonesMode) return;
+        const track = getLocalAudioTrack(room);
+        if (!track) return;
+        duckingMuted = true;
+        void track.mute();
+      }, DUCK_DELAY_MS);
+      return;
+    }
+    // Agent stopped speaking: drop any pending duck / unduck timers.
+    cancelDuckTimers();
+    if (!duckingMuted) return;
+    // Unmute immediately: any post-agent grace delays the start of the
+    // user's next turn reaching the agent's VAD/turn-detector, which is
+    // exactly what makes preemptive LLM generation fire late.
+    duckingMuted = false;
+    void getLocalAudioTrack(room)?.unmute();
+  };
+  /** Cancel the pending duck and unduck timers, whichever are scheduled. */
+  const cancelDuckTimers = () => {
+    const pendingDuck = duckTimerId;
+    const pendingUnduck = unduckTimerId;
+    duckTimerId = null;
+    unduckTimerId = null;
+    if (pendingDuck !== null) window.clearTimeout(pendingDuck);
+    if (pendingUnduck !== null) window.clearTimeout(pendingUnduck);
+  };
+  /**
+   * Flip the user's own mute state.
+   *
+   * A user toggle takes over from ducking: pending duck timers are cancelled
+   * and the ducking flag is cleared. The track is muted/unmuted at WebRTC
+   * level when a local audio track exists (much faster than (un)publishing
+   * and no audio gap on the agent side); otherwise the microphone is
+   * enabled/disabled through the participant.
+   *
+   * @returns The muted state after the toggle, or the unchanged state when
+   *   there is no room.
+   */
+  const toggleMute = async () => {
+    if (!room) return muted;
+    muted = !muted;
+    cancelDuckTimers();
+    duckingMuted = false;
+    const micTrack = getLocalAudioTrack(room);
+    if (!micTrack) {
+      await room.localParticipant.setMicrophoneEnabled(!muted);
+    } else if (muted) {
+      await micTrack.mute();
+    } else {
+      await micTrack.unmute();
+    }
+    emit("mute", { muted });
+    const labels = getAppearance().labels;
+    emit("call:status", { status: muted ? labels.muted : labels.listening });
+    return muted;
+  };
+  /**
+   * Send `payload` to the room as a reliable JSON data message on `topic`.
+   * Does nothing when there is no room. Serialisation and publish failures
+   * are logged as warnings and never rethrown.
+   */
+  const publishData = async (payload: unknown, topic: string) => {
+    const target = room;
+    if (!target) return;
+    try {
+      const bytes = new TextEncoder().encode(JSON.stringify(payload));
+      await target.localParticipant.publishData(bytes, { reliable: true, topic });
+    } catch (err) {
+      // eslint-disable-next-line no-console
+      console.warn("[voice-agent] failed to publish data:", err);
+    }
+  };
+  /**
+   * Attach/detach the deep-learning NS processor according to `engine`.
+   * Both engines are loaded on demand: Krisp through
+   * `attachKrispNoiseFilter`, DeepFilterNet3 through a dynamic `import()`
+   * of `deepFilterModuleUrl`, so the package only enters the bundle when
+   * the user actually picks it (and the widget still works if the package
+   * isn't installed at all).
+   *
+   * The outcome is recorded in `noiseFilterStatus`; the live DeepFilter
+   * instance (if any) is kept in `deepFilterProcessor`. When DeepFilterNet3
+   * cannot be attached, Krisp is tried as a best-effort fallback and the
+   * failure is only logged.
+   */
+  const applyNoiseFilter = async (engine: NoiseFilterEngine): Promise<void> => {
+    const track = room ? getLocalAudioTrack(room) : null;
+    if (!track) {
+      // Without a local track only "off" can be honored.
+      noiseFilterStatus = engine === "off" ? "off" : "failed";
+      deepFilterProcessor = null;
+      return;
+    }
+    // Detach whatever's currently attached before switching engines.
+    try {
+      await track.stopProcessor();
+    } catch {
+      /* no-op: no processor was attached */
+    }
+    deepFilterProcessor = null;
+    if (engine === "off") {
+      noiseFilterStatus = "off";
+      return;
+    }
+    if (engine === "krisp") {
+      noiseFilterStatus = await attachKrispNoiseFilter(track);
+      return;
+    }
+    if (engine === "deepfilter") {
+      try {
+        // Runtime-computed URL so Rollup's `inlineDynamicImports` can't
+        // statically resolve the path. The widget bundles cleanly even
+        // when `deepfilternet3-noise-filter` isn't installed locally;
+        // the package is fetched from esm.sh (or a self-hosted ESM
+        // mirror set via `setDeepFilterModuleUrl`) on first use.
+        const moduleUrl = deepFilterModuleUrl;
+        const mod = await import(/* @vite-ignore */ moduleUrl);
+        const Ctor = mod.DeepFilterNoiseFilterProcessor as new (
+          options: {
+            noiseReductionLevel?: number;
+            assetConfig?: { cdnUrl?: string };
+          },
+        ) => unknown;
+        if (typeof Ctor !== "function") {
+          throw new Error("DeepFilterNoiseFilterProcessor export missing");
+        }
+        const assetCfg: { cdnUrl?: string } = {};
+        if (deepFilterCdnUrl) assetCfg.cdnUrl = deepFilterCdnUrl;
+        const instance = new Ctor({
+          noiseReductionLevel: prefs.deepFilterStrength,
+          assetConfig: Object.keys(assetCfg).length > 0 ? assetCfg : undefined,
+        });
+        // The upstream package only honors `assetConfig.cdnUrl` and then
+        // appends fixed paths (`v2/pkg/df_bg.wasm`, `v2/models/DeepFilterNet3_onnx.tar.gz`).
+        // To support arbitrary self-hosted file URLs we temporarily patch
+        // `globalThis.fetch` while the processor initializes and rewrite
+        // any request matching those filenames to the override URLs.
+        const wasmOverride = deepFilterWasmUrl;
+        const modelOverride = deepFilterOnnxUrl;
+        // Keep the exact function object that was installed so it can be
+        // put back unchanged (restoring a `bind()` copy would leave a
+        // different, ever-deeper wrapper behind after each attach).
+        const nativeFetch = globalThis.fetch;
+        let installedPatch: typeof fetch | null = null;
+        if (wasmOverride || modelOverride) {
+          const forward = nativeFetch.bind(globalThis);
+          const patched: typeof fetch = (input, init) => {
+            const url =
+              typeof input === "string"
+                ? input
+                : input instanceof URL
+                  ? input.href
+                  : (input as Request).url;
+            if (wasmOverride && /\/df_bg\.wasm(?:$|[?#])/.test(url)) {
+              return forward(wasmOverride, init);
+            }
+            if (
+              modelOverride &&
+              /\/DeepFilterNet3_onnx\.tar\.gz(?:$|[?#])/.test(url)
+            ) {
+              return forward(modelOverride, init);
+            }
+            return forward(input, init);
+          };
+          globalThis.fetch = patched;
+          installedPatch = patched;
+        }
+        try {
+          await track.setProcessor(
+            instance as unknown as Parameters<typeof track.setProcessor>[0],
+          );
+        } finally {
+          // Only undo our own patch: if other code replaced `fetch` in the
+          // meantime, overwriting it here would silently drop their hook.
+          if (installedPatch !== null && globalThis.fetch === installedPatch) {
+            globalThis.fetch = nativeFetch;
+          }
+        }
+        deepFilterProcessor = instance as { setSuppressionLevel?: (n: number) => void };
+        noiseFilterStatus = "active";
+        // eslint-disable-next-line no-console
+        console.info(
+          "[voice-agent] DeepFilterNet3 noise filter attached (strength=" +
+            prefs.deepFilterStrength +
+            ")",
+        );
+      } catch (err) {
+        // eslint-disable-next-line no-console
+        console.warn(
+          "[voice-agent] DeepFilterNet3 unavailable, falling back to Krisp:",
+          err,
+        );
+        noiseFilterStatus = "unsupported";
+        // Best-effort fallback so the user isn't left with raw audio. It
+        // must not reject: a failing fallback keeps the "unsupported"
+        // status instead of aborting the caller.
+        try {
+          const krispResult = await attachKrispNoiseFilter(track);
+          if (krispResult === "active") {
+            noiseFilterStatus = "active";
+          }
+        } catch (krispErr) {
+          // eslint-disable-next-line no-console
+          console.warn("[voice-agent] Krisp fallback failed:", krispErr);
+        }
+      }
+    }
+  };
+  /** Emit an "audio" event carrying a fresh audio-state snapshot. */
+  const emitAudioState = () => {
+    const snapshot = snapshotAudioState();
+    emit("audio", snapshot);
+  };
+  /**
+   * Build a point-in-time view of the audio configuration: a copy of the
+   * stored preferences, the settings reported by the local microphone
+   * track's `getSettings()` (all `undefined` when there is no track), and
+   * the selected noise-filter engine with its current status.
+   */
+  const snapshotAudioState = (): AudioStateSnapshot => {
+    type LiveSettings = MediaTrackSettings & { voiceIsolation?: boolean };
+    const micTrack = room ? getLocalAudioTrack(room) : null;
+    const live = micTrack?.mediaStreamTrack?.getSettings() as
+      | LiveSettings
+      | undefined;
+    const applied = {
+      echoCancellation: live?.echoCancellation,
+      noiseSuppression: live?.noiseSuppression,
+      autoGainControl: live?.autoGainControl,
+      voiceIsolation: live?.voiceIsolation,
+      sampleRate: live?.sampleRate,
+      channelCount: live?.channelCount,
+      deviceId: live?.deviceId,
+    };
+    const noiseFilter = {
+      engine: prefs.noiseFilter,
+      status: noiseFilterStatus,
+    };
+    return { prefs: { ...prefs }, applied, noiseFilter };
+  };
+  /**
+   * Merge `delta` into the stored audio preferences and push every changed
+   * setting to the live call: capture constraints, noise-filter engine and
+   * strength, mic and speaker device, output volume, and headphones mode.
+   * Individual steps log and continue on failure. Afterwards the merged
+   * preferences are persisted and an "audio" event is emitted.
+   *
+   * @param delta subset of preferences to change.
+   * @returns the audio-state snapshot taken after all steps ran.
+   */
+  const updateAudioSettings = async (
+    delta: Partial<AudioPrefs>,
+  ): Promise<AudioStateSnapshot> => {
+    const next: AudioPrefs = { ...prefs, ...delta };
+    const localTrack = room ? getLocalAudioTrack(room) : null;
+    // 1. Standard MediaTrackSettings — flip live via applyConstraints. Falls
+    //    back to restartTrack only if applyConstraints rejects.
+    const constraintKeys = [
+      "echoCancellation",
+      "noiseSuppression",
+      "autoGainControl",
+      "voiceIsolation",
+    ] as const;
+    // applyConstraints replaces the track's whole constraint set, and any
+    // constraint left out falls back to its default. So once any flag
+    // changed we send all four from `next`, not just the changed ones —
+    // otherwise e.g. turning noise suppression off would silently reset a
+    // previously disabled echo cancellation.
+    const processingConstraints: MediaTrackConstraints & { voiceIsolation?: boolean } = {};
+    let constraintsChanged = false;
+    for (const k of constraintKeys) {
+      if (k in delta && delta[k] !== prefs[k]) constraintsChanged = true;
+      (processingConstraints as Record<string, unknown>)[k] = next[k];
+    }
+    if (localTrack && constraintsChanged) {
+      try {
+        await localTrack.mediaStreamTrack.applyConstraints(processingConstraints);
+      } catch (err) {
+        // eslint-disable-next-line no-console
+        console.warn(
+          "[voice-agent] applyConstraints rejected, falling back to restartTrack:",
+          err,
+        );
+        try {
+          await localTrack.restartTrack(buildCaptureOptions(next));
+        } catch (restartErr) {
+          // eslint-disable-next-line no-console
+          console.warn("[voice-agent] restartTrack failed:", restartErr);
+        }
+      }
+    }
+    // 2. Noise-filter engine — separate audio-graph attach/detach.
+    if ("noiseFilter" in delta && delta.noiseFilter !== prefs.noiseFilter) {
+      // Commit the new engine choice *and* strength into `prefs` before
+      // applying: applyNoiseFilter reads `prefs.deepFilterStrength`, so a
+      // delta changing both must not attach with the stale strength (and
+      // then need a second pass below to correct it).
+      prefs = {
+        ...prefs,
+        noiseFilter: next.noiseFilter,
+        deepFilterStrength: next.deepFilterStrength,
+      };
+      await applyNoiseFilter(next.noiseFilter);
+    }
+    // DeepFilterNet3 strength — live-adjust if the processor exposes the
+    // hook, otherwise reattach. Skip when not on deepfilter. Also skipped
+    // when step 2 already attached with the new strength.
+    if (
+      "deepFilterStrength" in delta &&
+      delta.deepFilterStrength !== prefs.deepFilterStrength &&
+      next.noiseFilter === "deepfilter"
+    ) {
+      prefs = { ...prefs, deepFilterStrength: next.deepFilterStrength };
+      if (deepFilterProcessor && typeof deepFilterProcessor.setSuppressionLevel === "function") {
+        try {
+          deepFilterProcessor.setSuppressionLevel(next.deepFilterStrength);
+        } catch (err) {
+          // eslint-disable-next-line no-console
+          console.warn("[voice-agent] DeepFilter setSuppressionLevel failed:", err);
+        }
+      } else {
+        await applyNoiseFilter("deepfilter");
+      }
+    }
+    // 3. Mic device — switchActiveDevice does the heavy lifting.
+    if ("micDeviceId" in delta && delta.micDeviceId !== prefs.micDeviceId && room) {
+      try {
+        await room.switchActiveDevice("audioinput", next.micDeviceId || "default");
+      } catch (err) {
+        // eslint-disable-next-line no-console
+        console.warn("[voice-agent] failed to switch mic device:", err);
+      }
+    }
+    // 4. Speaker device — HTMLAudioElement.setSinkId.
+    if (
+      "speakerDeviceId" in delta &&
+      delta.speakerDeviceId !== prefs.speakerDeviceId &&
+      audioEl
+    ) {
+      await applySinkId(audioEl, next.speakerDeviceId);
+    }
+    // 5. Volume — direct property on the element. A non-finite value would
+    //    survive the clamp as NaN and make the assignment throw, so it is
+    //    ignored instead of aborting the whole update.
+    if ("outputVolume" in delta && audioEl && Number.isFinite(next.outputVolume)) {
+      audioEl.volume = Math.max(0, Math.min(1, next.outputVolume / 100));
+    }
+    // 6. Headphones mode — toggling on while currently ducked needs to
+    //    immediately un-mute the local track; applyDucking handles both
+    //    directions via its early-return path.
+    if ("headphonesMode" in delta && delta.headphonesMode !== prefs.headphonesMode) {
+      // Re-run with current speaker state. If no one is currently
+      // speaking we pass false to force un-mute on next pass.
+      // NOTE(review): `prefs` still holds the previous headphonesMode at
+      // this point (it is replaced just below) — confirm applyDucking does
+      // not need the new value here.
+      applyDucking(false);
+    }
+    prefs = next;
+    persistPrefsToStorage();
+    const state = snapshotAudioState();
+    emit("audio", state);
+    return state;
+  };
+  /**
+   * Collect WebRTC stats for the local microphone track and every remote
+   * audio track, and condense them into a single `AudioStats` record.
+   * Resolves to `null` when no room is active. Tracks whose stats cannot
+   * be read are skipped. Jitter and round-trip time are multiplied by
+   * 1000 before being stored; when several entries match, the last one
+   * visited wins.
+   */
+  const getAudioStats = async (): Promise<AudioStats | null> => {
+    if (!room) return null;
+    const collected: RTCStatsReport[] = [];
+    const micTrack = getLocalAudioTrack(room);
+    if (micTrack) {
+      try {
+        const micReport = await micTrack.getRTCStatsReport();
+        if (micReport) collected.push(micReport);
+      } catch { /* ignore */ }
+    }
+    // Walk remote participants for inbound audio stats.
+    for (const remote of room.remoteParticipants.values()) {
+      for (const publication of remote.audioTrackPublications.values()) {
+        const remoteTrack = publication.track;
+        if (remoteTrack) {
+          try {
+            const remoteReport = await remoteTrack.getRTCStatsReport();
+            if (remoteReport) collected.push(remoteReport);
+          } catch { /* ignore */ }
+        }
+      }
+    }
+    const totals: AudioStats = {
+      outboundAudioLevel: 0,
+      inboundAudioLevel: 0,
+      packetsLost: 0,
+      jitter: 0,
+      roundTripTime: 0,
+    };
+    // Stats fields are read dynamically; anything that is not a number is
+    // treated as absent.
+    const asNumber = (value: unknown): number | undefined =>
+      typeof value === "number" ? value : undefined;
+    for (const report of collected) {
+      report.forEach((raw) => {
+        const e = raw as RTCStats & Record<string, unknown>;
+        switch (e.type) {
+          case "outbound-rtp": {
+            if (e.kind !== "audio") break;
+            const level = asNumber(e.audioLevel);
+            if (level !== undefined) totals.outboundAudioLevel = level;
+            break;
+          }
+          case "inbound-rtp": {
+            if (e.kind !== "audio") break;
+            const level = asNumber(e.audioLevel);
+            if (level !== undefined) totals.inboundAudioLevel = level;
+            const lost = asNumber(e.packetsLost);
+            if (lost !== undefined) totals.packetsLost = lost;
+            const jitter = asNumber(e.jitter);
+            if (jitter !== undefined) totals.jitter = jitter * 1000;
+            break;
+          }
+          case "candidate-pair": {
+            if (e.selected !== true && e.nominated !== true) break;
+            const rtt = asNumber(e.currentRoundTripTime);
+            if (rtt !== undefined) totals.roundTripTime = rtt * 1000;
+            break;
+          }
+          default:
+            break;
+        }
+      });
+    }
+    return totals;
+  };
+  // Controller object handed back to the caller; every member closes over
+  // the state declared in this function.
+  return {
+    start,
+    end,
+    toggleMute,
+    // A call counts as active for as long as a room object is held.
+    isActive: () => room !== null,
+    // Returns the current value of `currentSessionId`.
+    sessionId: () => currentSessionId,
+    publishData,
+    updateAudioSettings,
+    // Exposed under a getter-style name; same function as snapshotAudioState.
+    getAudioState: snapshotAudioState,
+    getAudioStats,
+  };
+}
+/**
+ * Translate stored audio preferences into microphone capture
+ * constraints: the four processing flags are copied through, capture is
+ * fixed to one channel at 48000 Hz, and the preferred mic (if set) is
+ * requested as an `ideal` device id rather than a hard requirement.
+ */
+function buildCaptureOptions(prefs: AudioPrefs): MediaTrackConstraints & {
+  voiceIsolation?: boolean;
+} {
+  const {
+    echoCancellation,
+    noiseSuppression,
+    autoGainControl,
+    voiceIsolation,
+    micDeviceId,
+  } = prefs;
+  const deviceId = micDeviceId ? { ideal: micDeviceId } : undefined;
+  return {
+    echoCancellation,
+    noiseSuppression,
+    autoGainControl,
+    voiceIsolation,
+    channelCount: 1,
+    sampleRate: 48000,
+    deviceId,
+  };
+}
+/**
+ * Look up the local participant's microphone publication and return its
+ * audio track, or `null` when there is no room, no publication, or the
+ * track is not a `LocalAudioTrack`.
+ */
+function getLocalAudioTrack(room: Room | null): LocalAudioTrack | null {
+  const candidate = room?.localParticipant.getTrackPublication(
+    Track.Source.Microphone,
+  )?.audioTrack;
+  if (candidate instanceof LocalAudioTrack) {
+    return candidate;
+  }
+  return null;
+}
+/**
+ * Route `el`'s output to the device `deviceId` via `setSinkId`. An empty
+ * id is passed through as "". Silently does nothing when the element has
+ * no `setSinkId` method; a rejected call is logged, not thrown.
+ */
+async function applySinkId(el: HTMLAudioElement, deviceId: string): Promise<void> {
+  type SinkCapable = { setSinkId?: (id: string) => Promise<void> };
+  const sink = (el as unknown as SinkCapable).setSinkId;
+  if (typeof sink === "function") {
+    try {
+      await sink.call(el, deviceId || "");
+    } catch (err) {
+      // eslint-disable-next-line no-console
+      console.warn("[voice-agent] setSinkId failed:", err);
+    }
+  }
+}
+/**
+ * Try to attach the Krisp noise filter to `track`.
+ *
+ * @returns "active" once the processor is attached, "unsupported" when
+ *   the package reports that this browser cannot run it, and "failed" when
+ *   loading the package or attaching the processor throws. The function
+ *   never rejects, so callers can use it as a best-effort step.
+ */
+async function attachKrispNoiseFilter(
+  track: LocalAudioTrack,
+): Promise<"active" | "failed" | "unsupported"> {
+  try {
+    // Loaded lazily: the package evaluates `class extends Worker` at module load,
+    // which throws in Node. Deferring the import keeps `import "@oshara/voice-sdk"`
+    // safe server-side; this path only ever runs in a browser call.
+    // The import lives inside the try so a missing or broken package
+    // yields "failed" instead of an unhandled rejection.
+    const { KrispNoiseFilter, isKrispNoiseFilterSupported } = await import(
+      "@livekit/krisp-noise-filter"
+    );
+    if (!isKrispNoiseFilterSupported()) {
+      // eslint-disable-next-line no-console
+      console.warn(
+        "[voice-agent] Krisp noise filter NOT supported in this browser — relying on browser noiseSuppression only",
+      );
+      return "unsupported";
+    }
+    await track.setProcessor(KrispNoiseFilter());
+    // eslint-disable-next-line no-console
+    console.info("[voice-agent] Krisp noise filter attached");
+    return "active";
+  } catch (err) {
+    // eslint-disable-next-line no-console
+    console.warn("[voice-agent] Failed to attach Krisp noise filter:", err);
+    return "failed";
+  }
+}