@oshara/voice-sdk 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/README.md +198 -0
  2. package/dist/appearance-CNWT8x1G.cjs +2 -0
  3. package/dist/appearance-CNWT8x1G.cjs.map +1 -0
  4. package/dist/appearance-i6QBkpCk.js +650 -0
  5. package/dist/appearance-i6QBkpCk.js.map +1 -0
  6. package/dist/consent-CK9VXNPa.js +54 -0
  7. package/dist/consent-CK9VXNPa.js.map +1 -0
  8. package/dist/consent-D7QNSkQD.cjs +2 -0
  9. package/dist/consent-D7QNSkQD.cjs.map +1 -0
  10. package/dist/core/analytics.d.ts +30 -0
  11. package/dist/core/appearance.d.ts +113 -0
  12. package/dist/core/audioSettings.d.ts +69 -0
  13. package/dist/core/consent.d.ts +17 -0
  14. package/dist/core/createVoiceAgent.d.ts +79 -0
  15. package/dist/core/events.d.ts +103 -0
  16. package/dist/core/formController.d.ts +28 -0
  17. package/dist/core/forms.d.ts +235 -0
  18. package/dist/core/index.d.ts +29 -0
  19. package/dist/core/prevContext.d.ts +26 -0
  20. package/dist/core/transport.d.ts +30 -0
  21. package/dist/core/types.d.ts +49 -0
  22. package/dist/core/voice.d.ts +79 -0
  23. package/dist/createVoiceAgent-BM3HODS6.js +1058 -0
  24. package/dist/createVoiceAgent-BM3HODS6.js.map +1 -0
  25. package/dist/createVoiceAgent-CJWxWzz6.cjs +4 -0
  26. package/dist/createVoiceAgent-CJWxWzz6.cjs.map +1 -0
  27. package/dist/index.cjs +2 -0
  28. package/dist/index.cjs.map +1 -0
  29. package/dist/index.js +44 -0
  30. package/dist/index.js.map +1 -0
  31. package/dist/react/index.d.ts +60 -0
  32. package/dist/react.cjs +2 -0
  33. package/dist/react.cjs.map +1 -0
  34. package/dist/react.js +115 -0
  35. package/dist/react.js.map +1 -0
  36. package/dist/styles.css +1838 -0
  37. package/dist/ui/index.d.ts +21 -0
  38. package/dist/ui/ui.d.ts +165 -0
  39. package/dist/ui.cjs +284 -0
  40. package/dist/ui.cjs.map +1 -0
  41. package/dist/ui.js +1153 -0
  42. package/dist/ui.js.map +1 -0
  43. package/package.json +67 -0
  44. package/src/core/analytics.ts +111 -0
  45. package/src/core/appearance.ts +464 -0
  46. package/src/core/audioSettings.ts +180 -0
  47. package/src/core/consent.ts +78 -0
  48. package/src/core/createVoiceAgent.ts +280 -0
  49. package/src/core/events.ts +120 -0
  50. package/src/core/formController.ts +317 -0
  51. package/src/core/forms.ts +861 -0
  52. package/src/core/index.ts +121 -0
  53. package/src/core/prevContext.ts +153 -0
  54. package/src/core/transport.ts +118 -0
  55. package/src/core/types.ts +66 -0
  56. package/src/core/voice.ts +1179 -0
  57. package/src/react/index.ts +238 -0
  58. package/src/ui/index.ts +507 -0
  59. package/src/ui/styles.css +1838 -0
  60. package/src/ui/ui.ts +1672 -0
  61. package/src/vite-env.d.ts +10 -0
@@ -0,0 +1,1179 @@
1
+ import {
2
+ AudioPresets,
3
+ ConnectionState,
4
+ LocalAudioTrack,
5
+ RemoteAudioTrack,
6
+ RemoteParticipant,
7
+ RemoteTrack,
8
+ RemoteTrackPublication,
9
+ Room,
10
+ RoomEvent,
11
+ Track,
12
+ TranscriptionSegment,
13
+ } from "livekit-client";
14
+ import {
15
+ AudioPrefs,
16
+ loadAudioPrefs,
17
+ NoiseFilterEngine,
18
+ saveAudioPrefs,
19
+ } from "./audioSettings";
20
+
21
+ import type { AppearanceConfig } from "./appearance";
22
+ import type { Emit } from "./events";
23
+ import {
24
+ DEFAULT_DEEPFILTER_MODULE_URL,
25
+ type DeepFilterUrls,
26
+ type OrbState,
27
+ type SessionInit,
28
+ } from "./types";
29
+ import {
30
+ formatPrevContextForAgent,
31
+ loadPrevContext,
32
+ PrevTurn,
33
+ savePrevContext,
34
+ } from "./prevContext";
35
+
36
+ export type { SessionInit } from "./types";
37
+
38
/**
 * Audio state handed to the UI once the microphone has been published and
 * again every time an audio setting changes.
 */
export interface AudioStateSnapshot {
  /** The preferences that were requested. */
  prefs: AudioPrefs;
  /**
   * Values read back from the live MediaStreamTrack via getSettings(). They
   * describe what the browser actually honored, which may differ from what
   * was requested in `prefs`.
   */
  applied: {
    echoCancellation: boolean | undefined;
    noiseSuppression: boolean | undefined;
    autoGainControl: boolean | undefined;
    voiceIsolation: boolean | undefined;
    sampleRate: number | undefined;
    channelCount: number | undefined;
    deviceId: string | undefined;
  };
  /** Effective state of the deep-learning noise-suppression engine. */
  noiseFilter: {
    /** What the user picked (off / krisp / deepfilter). */
    engine: NoiseFilterEngine;
    /**
     * What actually happened:
     * - "active"      — the processor is attached.
     * - "unsupported" — the chosen engine isn't available in this browser.
     * - "failed"      — attaching the processor errored.
     * - "off"         — the engine is "off".
     */
    status: "active" | "off" | "unsupported" | "failed";
  };
}
67
+
68
/** One polled sample of RTC audio statistics for the current call. */
export interface AudioStats {
  /** Level (0-1) of the local outbound audio, taken from RTCStats. */
  outboundAudioLevel: number;
  /** Level (0-1) of the remote inbound audio, taken from RTCStats. */
  inboundAudioLevel: number;
  /** Packets lost on the inbound (agent → user) stream. */
  packetsLost: number;
  /** Jitter on the inbound stream, in milliseconds. */
  jitter: number;
  /** Peer-connection round-trip time, in milliseconds. */
  roundTripTime: number;
}
80
+
81
/** Handle returned by `createVoiceController` for driving a voice call. */
export interface VoiceController {
  /** Start a call. */
  start: () => Promise<void>;
  /** End the current call. */
  end: () => Promise<void>;
  /** Flip the user's mute state; resolves to the resulting muted flag. */
  toggleMute: () => Promise<boolean>;
  /** Whether a call is currently active. */
  isActive: () => boolean;
  /** The session_id of the active session, or null when no session is running. */
  sessionId: () => string | null;
  /** Send a JSON data message back to the agent (used after form submit). */
  publishData: (payload: unknown, topic: string) => Promise<void>;
  /** Apply a partial change to the audio preferences live, without reconnecting. */
  updateAudioSettings: (delta: Partial<AudioPrefs>) => Promise<AudioStateSnapshot>;
  /** Current audio state: the preferences plus the actually-applied values. */
  getAudioState: () => AudioStateSnapshot;
  /** Poll a snapshot of audio RTC stats; resolves to null when there is no call. */
  getAudioStats: () => Promise<AudioStats | null>;
}
97
+
98
/** Construction options accepted by `createVoiceController`. */
export interface VoiceControllerOptions {
  /** Mints a LiveKit session (POST /api/agents/agent-session/). */
  fetchSession: () => Promise<SessionInit>;
  /** AICharacter slug; used as the namespace for persisted prefs / context. */
  agentSlug: string;
  /** Returns the live appearance config (labels, max_call_seconds). */
  getAppearance: () => AppearanceConfig;
  /** Typed event emitter; takes the place of the old direct UI calls. */
  emit: Emit;
  /** DeepFilterNet3 asset overrides for this instance. */
  deepFilter?: DeepFilterUrls;
  /** Audio prefs to start from, already resolved. Defaults to loadAudioPrefs(slug). */
  initialPrefs?: AudioPrefs;
  /** Whether pref changes are persisted to localStorage (default true). */
  persistPrefs?: boolean;
}
114
+
115
+ export function createVoiceController(
116
+ opts: VoiceControllerOptions,
117
+ ): VoiceController {
118
+ const {
119
+ fetchSession,
120
+ agentSlug,
121
+ getAppearance,
122
+ emit,
123
+ deepFilter = {},
124
+ persistPrefs = true,
125
+ } = opts;
126
+ const deepFilterModuleUrl =
127
+ (deepFilter.moduleUrl && deepFilter.moduleUrl.trim()) ||
128
+ DEFAULT_DEEPFILTER_MODULE_URL;
129
+ const deepFilterCdnUrl =
130
+ deepFilter.cdnUrl && deepFilter.cdnUrl.trim() ? deepFilter.cdnUrl.trim() : undefined;
131
+ const deepFilterWasmUrl =
132
+ deepFilter.wasmUrl && deepFilter.wasmUrl.trim() ? deepFilter.wasmUrl.trim() : undefined;
133
+ const deepFilterOnnxUrl =
134
+ deepFilter.onnxUrl && deepFilter.onnxUrl.trim() ? deepFilter.onnxUrl.trim() : undefined;
135
+
136
+ let room: Room | null = null;
137
+ let currentSessionId: string | null = null;
138
+ let audioEl: HTMLAudioElement | null = null;
139
+ let muted = false;
140
+ let duckingMuted = false;
141
+ let duckTimerId: number | null = null;
142
+ let unduckTimerId: number | null = null;
143
+ // Duck-in delay: long enough that a quick user barge-in / reply isn't
144
+ // chopped before the agent-side VAD locks on, short enough that the
145
+ // mic-vs-speaker echo path is closed before too much leaks back. 80ms
146
+ // (an earlier value) made preemptive generation fire late because the
147
+ // very start of user replies was getting muted before VAD confirmed
148
+ // speech. 200ms keeps the agent's turn detector happy. We unmute
149
+ // immediately when the agent stops speaking — any post-agent grace
150
+ // just delays the user's next turn reaching VAD.
151
+ const DUCK_DELAY_MS = 200;
152
+ let prefs: AudioPrefs = opts.initialPrefs
153
+ ? { ...opts.initialPrefs }
154
+ : loadAudioPrefs(agentSlug);
155
+ const persistPrefsToStorage = () => {
156
+ if (persistPrefs) saveAudioPrefs(agentSlug, prefs);
157
+ };
158
+ let noiseFilterStatus: AudioStateSnapshot["noiseFilter"]["status"] = "off";
159
+
160
+ // ── Orb-state reconciliation ──────────────────────────────────────
161
+ // The orb is driven by three independent signals: agent audio + user audio
162
+ // (RoomEvent.ActiveSpeakersChanged) and an explicit "thinking" status
163
+ // (voice.agent_status data events, with lk.agent.state as a fallback).
164
+ // Real audio is authoritative; the inferred "no speakers → idle" must not
165
+ // stomp a live thinking state. Priority: speaking > thinking > listening >
166
+ // idle.
167
+ let agentSpeaking = false;
168
+ let userSpeaking = false;
169
+ let agentThinking = false;
170
+ let lastAgentStatusLabel = "";
171
+ // Debounce the thinking → idle clear so a very fast tool doesn't flash the
172
+ // status line on then instantly off.
173
+ let thinkingClearTimer: number | null = null;
174
+ const THINKING_CLEAR_DELAY_MS = 150;
175
+
176
/**
 * Collapse the three orb signals into a single state and emit it.
 * Priority: speaking > thinking > listening > idle. The status label is
 * only reported while thinking, and an empty label is reported as null.
 */
const reconcileOrb = () => {
  const orb: OrbState = agentSpeaking
    ? "speaking"
    : agentThinking
      ? "thinking"
      : userSpeaking
        ? "listening"
        : "idle";
  const statusLabel = orb === "thinking" ? lastAgentStatusLabel || null : null;
  emit("state", { orb, statusLabel });
};
187
+
188
/** Drop a pending debounced "leave thinking" timer, if one is scheduled. */
const cancelThinkingClear = () => {
  if (thinkingClearTimer === null) return;
  window.clearTimeout(thinkingClearTimer);
  thinkingClearTimer = null;
};
194
+
195
/**
 * Switch to the thinking state. `label` is the contextual status text and
 * may be empty. Any pending debounced clear is cancelled first so it cannot
 * undo this transition.
 */
const setThinking = (label: string) => {
  cancelThinkingClear();
  lastAgentStatusLabel = label || "";
  agentThinking = true;
  reconcileOrb();
};
202
+
203
/**
 * Schedule leaving the thinking state after THINKING_CLEAR_DELAY_MS, so a
 * very brief tool call does not flicker the status line. Does nothing when
 * not thinking or when a clear is already scheduled.
 */
const clearThinking = () => {
  const clearAlreadyScheduled = thinkingClearTimer !== null;
  if (clearAlreadyScheduled || !agentThinking) return;

  const leaveThinking = () => {
    thinkingClearTimer = null;
    agentThinking = false;
    lastAgentStatusLabel = "";
    reconcileOrb();
  };
  thinkingClearTimer = window.setTimeout(leaveThinking, THINKING_CLEAR_DELAY_MS);
};
213
+ /** Live handle on the deepfilter processor so we can tweak strength without re-attaching. */
214
+ let deepFilterProcessor: { setSuppressionLevel?: (n: number) => void } | null = null;
215
+ let callDeadline: number | null = null;
216
+ let callTickId: number | null = null;
217
+ /** Final transcript turns captured during the current call, in arrival order. */
218
+ const turns = new Map<string, PrevTurn>();
219
+ let prevContextSent = false;
220
+ let unloadHandler: (() => void) | null = null;
221
+ /** Most-recent agent-generated session summary received over data channel. */
222
+ let latestSummary = "";
223
+ let pendingSummaryResolver: ((summary: string) => void) | null = null;
224
+
225
/**
 * Save the captured turns (and a summary) as the previous-call context for
 * this agent slug. Skipped when there are no turns and the `summary`
 * argument is missing or blank. When `summary` is null/undefined the cached
 * `latestSummary` is stored instead.
 */
const persistTurns = (summary?: string) => {
  const hasExplicitSummary = Boolean(summary && summary.trim());
  if (!hasExplicitSummary && turns.size === 0) return;

  const capturedTurns = Array.from(turns.values());
  const summaryToStore = summary ?? latestSummary;
  savePrevContext(agentSlug, capturedTurns, summaryToStore);
};
233
+
234
/**
 * Record one transcript segment in `turns`, keyed by `role:segmentId`.
 * Blank text is ignored. A final segment is always stored; a non-final
 * segment only overwrites an entry that already exists under the same key.
 */
const recordTranscriptSegment = (
  role: "user" | "agent",
  segmentId: string,
  text: string,
  isFinal: boolean,
) => {
  const trimmed = (text || "").trim();
  if (trimmed === "") return;

  const turnKey = `${role}:${segmentId}`;
  // NOTE(review): the earlier comment here said interim text is kept so a
  // call that dies before a final segment still persists something, but an
  // interim segment whose key is not yet in the map is dropped — confirm
  // which behavior is intended.
  if (!isFinal && !turns.has(turnKey)) return;
  turns.set(turnKey, { role, text: trimmed });
};
251
+
252
/**
 * Stop the call-length countdown: forget the deadline, stop the tick
 * interval if one is running, and tell listeners there is no timer.
 */
const clearCallTimeout = () => {
  const runningTick = callTickId;
  callDeadline = null;
  callTickId = null;
  if (runningTick !== null) window.clearInterval(runningTick);
  emit("call:timer", { remainingMs: null });
};
260
+
261
/**
 * Return all per-call state to its idle defaults and tell listeners about
 * it: orb idle, only "start" enabled, not muted, no call timer. Also drops
 * the remote <audio> element and the noise-filter bookkeeping.
 */
const reset = () => {
  // Orb signals.
  cancelThinkingClear();
  agentSpeaking = false;
  userSpeaking = false;
  agentThinking = false;
  lastAgentStatusLabel = "";

  // Announce the idle UI state.
  emit("state", { orb: "idle", statusLabel: null });
  emit("controls", { canStart: true, canMute: false, canEnd: false });
  emit("mute", { muted: false });

  // Mute / ducking state and its timers.
  muted = false;
  duckingMuted = false;
  cancelDuckTimers();

  clearCallTimeout();

  // Remote audio playback element.
  audioEl?.remove();
  audioEl = null;

  // Noise-filter bookkeeping.
  noiseFilterStatus = "off";
  deepFilterProcessor = null;
};
288
+
289
/**
 * Ask the agent for a session summary over the data channel and wait briefly
 * for the answer.
 *
 * The answer arrives through `pendingSummaryResolver`, which the
 * `voice.session_summary` data handler invokes. If the agent replies within
 * the timeout the fresh summary is returned; if the request cannot be
 * published, or no reply arrives in time, the cached `latestSummary` is
 * returned instead.
 *
 * @param target Room to publish the request on.
 * @returns The freshly received summary, or `latestSummary` as a fallback.
 */
const requestSessionSummary = async (target: Room): Promise<string> => {
  const TIMEOUT_MS = 2500;
  // Register the resolver before publishing so a fast reply can't be missed.
  const waiter = new Promise<string>((resolve) => {
    pendingSummaryResolver = resolve;
  });
  let timeoutId: number | undefined;
  try {
    try {
      await target.localParticipant.publishData(
        new TextEncoder().encode(JSON.stringify({ type: "request_summary" })),
        { reliable: true, topic: "voice.request_summary" },
      );
    } catch (err) {
      // eslint-disable-next-line no-console
      console.warn("[voice-agent] Failed to publish summary request:", err);
      return latestSummary;
    }
    const timeout = new Promise<string>((resolve) => {
      timeoutId = window.setTimeout(() => resolve(""), TIMEOUT_MS);
    });
    const result = await Promise.race([waiter, timeout]);
    return result || latestSummary;
  } finally {
    // Don't leave the timeout timer running once the race has been decided,
    // and never leave a stale resolver behind for the next request.
    window.clearTimeout(timeoutId);
    pendingSummaryResolver = null;
  }
};
315
+
316
/** Teardown currently in progress, shared by overlapping `end()` calls. */
let endInFlight: Promise<void> | null = null;

/**
 * End the current call: stop the call timer, unhook the unload listeners,
 * optionally ask the agent for a summary, disconnect the room, persist the
 * captured turns, reset local state and emit "disconnected".
 *
 * Teardown is single-flight. `room.disconnect()` triggers
 * `RoomEvent.Disconnected`, whose handler calls `end()` again; without the
 * guard that second pass re-emits "Saving…", persists/resets/emits a second
 * time and later disconnects whatever `room` points at by then. Overlapping
 * callers now receive the promise of the teardown already running.
 *
 * @param skipSummary When true, do not ask the agent for a summary first
 *   (used when the agent itself is shutting down).
 * @returns A promise that settles when teardown has finished. It rejects if
 *   `room.disconnect()` rejects, after local cleanup has still run.
 */
const end = (skipSummary = false): Promise<void> => {
  if (endInFlight) return endInFlight;

  const teardown = async (): Promise<void> => {
    // Pin the room being torn down so later steps never act on a room that
    // was created after this teardown began.
    const target = room;
    clearCallTimeout();
    if (unloadHandler) {
      window.removeEventListener("beforeunload", unloadHandler);
      window.removeEventListener("pagehide", unloadHandler);
      unloadHandler = null;
    }
    let summary = latestSummary;
    if (target && !skipSummary) {
      emit("call:status", { status: "Saving…" });
      try {
        summary = await requestSessionSummary(target);
      } catch {
        summary = latestSummary;
      }
    }
    try {
      if (target) {
        await target.disconnect();
      }
    } finally {
      if (room === target) {
        room = null;
        currentSessionId = null;
      }
      try { persistTurns(summary); } catch { /* ignore storage errors */ }
      reset();
      emit("connection", { phase: "disconnected" });
    }
  };

  const running = teardown().finally(() => {
    endInFlight = null;
  });
  endInFlight = running;
  return running;
};
344
+
345
/**
 * Send the previous call's saved context to the agent on the
 * `voice.previous_context` topic, at most once per call.
 *
 * When nothing is stored, or the stored record formats to empty text, the
 * context is marked as sent without publishing. A failed publish is logged
 * and leaves the flag unset so a later attempt can retry.
 *
 * @param target Room to publish on.
 */
const publishPreviousContext = async (target: Room): Promise<void> => {
  if (prevContextSent) return;

  const record = loadPrevContext(agentSlug);
  const text = record ? formatPrevContextForAgent(record) : "";
  if (!record || !text) {
    prevContextSent = true;
    return;
  }

  const message = JSON.stringify({
    type: "previous_context",
    text,
    saved_at: new Date(record.savedAt).toISOString(),
  });
  const bytes = new TextEncoder().encode(message);
  try {
    await target.localParticipant.publishData(bytes, {
      reliable: true,
      topic: "voice.previous_context",
    });
    prevContextSent = true;
  } catch (err) {
    // eslint-disable-next-line no-console
    console.warn("[voice-agent] Failed to publish previous context:", err);
  }
};
376
+
377
/**
 * Start the call-length countdown from the appearance config's
 * `max_call_seconds`. No countdown is started when that value is not a
 * finite positive number. While running, the remaining time is emitted
 * every 500 ms; at the deadline the call is ended.
 */
const startCallTimeout = () => {
  clearCallTimeout();

  const limitSeconds = getAppearance().max_call_seconds;
  const hasLimit = Number.isFinite(limitSeconds) && limitSeconds > 0;
  if (!hasLimit) return;

  const totalMs = limitSeconds * 1000;
  callDeadline = Date.now() + totalMs;
  emit("call:timer", { remainingMs: totalMs });

  const tick = () => {
    if (callDeadline === null) return;
    const remainingMs = callDeadline - Date.now();
    if (remainingMs > 0) {
      emit("call:timer", { remainingMs });
      return;
    }
    // Deadline reached: report zero, stop ticking, then end the call.
    emit("call:timer", { remainingMs: 0 });
    clearCallTimeout();
    emit("call:status", { status: getAppearance().labels.call_ended });
    void end();
  };
  callTickId = window.setInterval(tick, 500);
};
398
+
399
/** True while a `start()` call is still running, to reject overlapping starts. */
let startInFlight = false;

/**
 * Begin a call.
 *
 * Steps: reset per-call state and announce "connecting"; mint a session via
 * `fetchSession`; create a Room and wire its event handlers; connect and
 * publish the microphone; apply the saved mic device and noise filter;
 * start the call timer; schedule the previous-context publish; register
 * unload handlers that persist the transcript.
 *
 * A call that is already starting or already has a room is left alone:
 * starting a second one would orphan the first Room with its handlers
 * still attached.
 *
 * Failures are reported through "call:status", "error" and "connection"
 * events rather than by rejecting.
 */
const start = async () => {
  if (startInFlight || room) return;
  startInFlight = true;

  /** Normalize anything thrown into an Error so `.message` is always usable. */
  const toError = (e: unknown): Error =>
    e instanceof Error ? e : new Error(String(e));

  /**
   * Publish the previous-call context after a short grace period that gives
   * the agent time to register its data handler.
   */
  const PREV_CONTEXT_GRACE_MS = 750;
  const schedulePreviousContext = () => {
    window.setTimeout(() => {
      if (!room || prevContextSent) return;
      void publishPreviousContext(room);
    }, PREV_CONTEXT_GRACE_MS);
  };

  try {
    emit("controls", { canStart: false, canMute: false, canEnd: false });
    emit("transcript:clear", {});
    turns.clear();
    prevContextSent = false;
    latestSummary = "";
    pendingSummaryResolver = null;
    emit("connection", { phase: "connecting" });
    emit("state", { orb: "connecting", statusLabel: null });
    emit("call:status", { status: getAppearance().labels.connecting });

    let init: SessionInit;
    try {
      init = await fetchSession();
      currentSessionId = init.session_id;
    } catch (e) {
      const error = toError(e);
      emit("call:status", { status: `Error: ${error.message}` });
      emit("error", { scope: "session", error });
      emit("controls", { canStart: true, canMute: false, canEnd: false });
      emit("connection", { phase: "failed", error: error.message });
      return;
    }

    room = new Room({
      adaptiveStream: true,
      dynacast: true,
      webAudioMix: false,
    });

    room.on(RoomEvent.ConnectionStateChanged, (state: ConnectionState) => {
      if (state === ConnectionState.Connected) {
        emit("call:status", { status: "Connected" });
        emit("connection", { phase: "connected" });
      }
      if (state === ConnectionState.Disconnected) {
        emit("call:status", { status: "Disconnected" });
      }
    });

    // Surface autoplay blocks so the user can tap to start audio. Browsers
    // refuse to autoplay audio without a prior user gesture; in our flow
    // the FAB tap satisfies that, but not always (e.g. iframe contexts).
    room.on(RoomEvent.AudioPlaybackStatusChanged, () => {
      if (!room?.canPlaybackAudio) {
        // eslint-disable-next-line no-console
        console.warn("[voice-agent] Audio playback blocked by browser — call room.startAudio() after a user gesture");
      }
    });

    // Attach each remote audio track to a hidden <audio> element and apply
    // the saved output volume and output device.
    room.on(
      RoomEvent.TrackSubscribed,
      (track: RemoteTrack, _pub: RemoteTrackPublication, _p: RemoteParticipant) => {
        if (track.kind === Track.Kind.Audio) {
          const remoteAudio = track as RemoteAudioTrack;
          audioEl = remoteAudio.attach() as HTMLAudioElement;
          audioEl.autoplay = true;
          audioEl.setAttribute("playsinline", "");
          audioEl.style.display = "none";
          audioEl.volume = Math.max(0, Math.min(1, prefs.outputVolume / 100));
          document.body.appendChild(audioEl);
          // Pin the saved output device, if any. Silently no-ops on
          // browsers without setSinkId support (Safari/Firefox).
          void applySinkId(audioEl, prefs.speakerDeviceId);
        }
      },
    );

    // Any speaker that is not the local participant counts as the agent.
    room.on(RoomEvent.ActiveSpeakersChanged, (speakers) => {
      const localId = room?.localParticipant.identity;
      agentSpeaking = speakers.some((s) => s.identity !== localId);
      userSpeaking = speakers.some((s) => s.identity === localId);
      // Real audio is authoritative over an inferred thinking state — clear
      // it immediately (no debounce) when either side actually speaks.
      if (agentSpeaking || userSpeaking) {
        cancelThinkingClear();
        agentThinking = false;
        lastAgentStatusLabel = "";
      }
      reconcileOrb();
      applyDucking(agentSpeaking);
    });

    // Forward every transcript segment to listeners and record it for the
    // persisted previous-call context.
    room.on(
      RoomEvent.TranscriptionReceived,
      (segments: TranscriptionSegment[], participant) => {
        const isSelf = participant?.identity === room?.localParticipant.identity;
        const role: "user" | "agent" = isSelf ? "user" : "agent";
        for (const seg of segments) {
          emit("transcript", {
            role,
            segmentId: seg.id,
            text: seg.text,
            isFinal: seg.final,
          });
          recordTranscriptSegment(role, seg.id, seg.text, seg.final);
        }
      },
    );

    // Once the agent participant joins we have a real peer to receive
    // data on `voice.previous_context`. We give the agent a brief moment
    // to register its data handler (it runs after ctx.connect in main.py
    // but before AgentSession.start in session_agent.py) before sending.
    room.on(RoomEvent.ParticipantConnected, (participant: RemoteParticipant) => {
      if (!room || prevContextSent) return;
      schedulePreviousContext();
      // Acknowledge the unused identity in dev/typecheck without renaming.
      void participant;
    });

    // Session summary from the agent: cache it and resolve a pending
    // summary request, if any. The payload is either JSON with a string
    // `text` field or plain text.
    room.on(
      RoomEvent.DataReceived,
      (
        payload: Uint8Array,
        _participant?: RemoteParticipant,
        _kind?: unknown,
        topic?: string,
      ) => {
        if (topic !== "voice.session_summary") return;
        try {
          const decoded = new TextDecoder().decode(payload);
          if (!decoded) return;
          let parsed: unknown;
          try {
            parsed = JSON.parse(decoded);
          } catch {
            parsed = null;
          }
          const text =
            parsed && typeof parsed === "object" &&
            typeof (parsed as { text?: unknown }).text === "string"
              ? ((parsed as { text: string }).text || "").trim()
              : decoded.trim();
          if (!text) return;
          latestSummary = text;
          if (pendingSummaryResolver) {
            const resolve = pendingSummaryResolver;
            pendingSummaryResolver = null;
            resolve(text);
          }
        } catch (err) {
          // eslint-disable-next-line no-console
          console.warn("[voice-agent] Failed to handle session summary:", err);
        }
      },
    );

    room.on(
      RoomEvent.DataReceived,
      (_payload: Uint8Array, _participant?: RemoteParticipant, _kind?: unknown, topic?: string) => {
        if (topic !== "voice.end_call") return;
        void end(true); // agent is shutting down — skip summary request
      },
    );

    // Contextual processing status (source of truth) — drives the thinking
    // orb and the status line.
    room.on(
      RoomEvent.DataReceived,
      (payload, _participant?, _kind?, topic?) => {
        if (topic !== "voice.agent_status") return;
        try {
          const decoded = new TextDecoder().decode(payload);
          const parsed = decoded ? JSON.parse(decoded) : null;
          if (!parsed || typeof parsed !== "object") return;
          const state = String((parsed as { state?: unknown }).state || "");
          const label = String((parsed as { label?: unknown }).label || "");
          if (state === "thinking") setThinking(label);
          else clearThinking(); // idle / speaking → leave thinking
        } catch (err) {
          // eslint-disable-next-line no-console
          console.debug("[voice-agent] bad agent_status payload", err);
        }
      },
    );

    // Free secondary signal: the SDK publishes lk.agent.state
    // (connecting/thinking/listening/speaking) on the agent participant's
    // attributes. Used as a fallback when a voice.agent_status event is
    // missed (e.g. an older backend that doesn't emit them).
    room.on(
      RoomEvent.ParticipantAttributesChanged,
      (changed: Record<string, string>, participant) => {
        if (participant?.identity === room?.localParticipant.identity) return;
        const state = changed?.["lk.agent.state"];
        if (typeof state !== "string") return;
        if (state === "thinking") {
          // Don't clobber a richer contextual label already in flight.
          if (!agentThinking) setThinking("");
        } else if (state === "speaking") {
          cancelThinkingClear();
          agentThinking = false;
          lastAgentStatusLabel = "";
          reconcileOrb();
        } else if (state === "listening" || state === "idle") {
          clearThinking();
        }
      },
    );

    // Generic pass-through: every JSON data message, whatever its topic, is
    // re-emitted as a "data" event. Non-JSON payloads are ignored.
    room.on(RoomEvent.DataReceived, (payload, _participant, _kind, topic) => {
      try {
        const decoded = new TextDecoder().decode(payload);
        const data = decoded ? JSON.parse(decoded) : null;
        emit("data", { data, topic });
      } catch (err) {
        // eslint-disable-next-line no-console
        console.debug("[voice-agent] non-JSON data message ignored", err);
      }
    });

    room.on(RoomEvent.Disconnected, () => {
      void end();
    });

    // Backstop for agent-driven end-of-call. The agent signals end via a
    // `voice.end_call` data message, but reliable data can race the agent's
    // own session teardown. When the agent participant leaves the room
    // (session.aclose on the server), tear the call down here too — this is
    // a signaling event, so it can't be dropped like a data packet.
    room.on(
      RoomEvent.ParticipantDisconnected,
      (_participant: RemoteParticipant) => {
        if (!room) return;
        // In this 1:1 widget↔agent topology any remote leaving means the
        // agent is gone; if no remotes remain, end the call.
        if (room.remoteParticipants.size === 0) {
          void end(true);
        }
      },
    );

    try {
      await room.connect(init.livekit_url, init.token);
      await room.localParticipant.setMicrophoneEnabled(
        true,
        buildCaptureOptions(prefs),
        {
          // Opus "speech" preset — narrower band optimized for voice.
          audioPreset: AudioPresets.speech,
          // DTX intentionally OFF: it skips frames during silence, which
          // disrupts the steady cadence that the agent's contextual turn
          // detector + Silero VAD rely on to time end-of-turn. With DTX
          // on, preemptive LLM generation fires noticeably later.
          dtx: false,
          // RED intentionally OFF: redundancy adds decode jitter without
          // a clear win on the typical low-loss WiFi/wired path. Re-enable
          // if you observe audible packet-loss artifacts.
          red: false,
        },
      );
      // Honor saved mic device if the user picked one in a prior call.
      if (prefs.micDeviceId) {
        try {
          await room.switchActiveDevice("audioinput", prefs.micDeviceId);
        } catch (err) {
          // eslint-disable-next-line no-console
          console.warn("[voice-agent] saved mic device unavailable:", err);
        }
      }
      await applyNoiseFilter(prefs.noiseFilter);
      emitAudioState();
      emit("state", { orb: "listening", statusLabel: null });
      emit("call:status", { status: getAppearance().labels.listening });
      emit("controls", { canStart: true, canMute: true, canEnd: true });
      startCallTimeout();

      // The ParticipantConnected event only fires for participants that
      // join *after* we subscribe. If the agent worker is already in the
      // room when we connect, fire the publish path anyway after the same
      // grace period.
      if (room.remoteParticipants.size > 0) {
        schedulePreviousContext();
      }

      // Persist whatever we have if the user closes the tab mid-call.
      unloadHandler = () => persistTurns();
      window.addEventListener("beforeunload", unloadHandler);
      window.addEventListener("pagehide", unloadHandler);
    } catch (e) {
      const error = toError(e);
      emit("call:status", { status: `Connect failed: ${error.message}` });
      emit("error", { scope: "connect", error });
      await end();
    }
  } finally {
    startInFlight = false;
  }
};
690
+
691
/**
 * Half-duplex ducking: mute the user's mic while the agent is speaking so
 * speaker output cannot loop back into the published track (audible echo
 * and STT confusion).
 *
 * Muting uses `LocalAudioTrack.mute()` / `.unmute()`, which keeps the track
 * published. Muting is delayed by DUCK_DELAY_MS after the agent starts
 * speaking; unmuting happens immediately when the agent stops.
 *
 * Ducking is bypassed — and any ducking mute undone — when there is no
 * room, when the user has muted themselves, or when `prefs.headphonesMode`
 * is on (no acoustic echo path, so ducking would only chop barge-in).
 *
 * @param agentSpeaking Whether the agent is currently an active speaker.
 */
const applyDucking = (agentSpeaking: boolean) => {
  if (!room || muted || prefs.headphonesMode) {
    cancelDuckTimers();
    // A mute applied by ducking before e.g. headphones mode was switched
    // on has to be lifted here.
    if (duckingMuted) {
      duckingMuted = false;
      void getLocalAudioTrack(room)?.unmute();
    }
    return;
  }

  if (!agentSpeaking) {
    cancelDuckTimers();
    if (!duckingMuted) return;
    // Unmute immediately: any post-agent grace delays the start of the
    // user's next turn reaching the agent's VAD/turn-detector, which is
    // exactly what makes preemptive LLM generation fire late.
    duckingMuted = false;
    void getLocalAudioTrack(room)?.unmute();
    return;
  }

  // Agent is speaking: drop any pending unduck, then arm the duck timer
  // unless the mic is already ducked or a duck is already pending.
  if (unduckTimerId !== null) {
    window.clearTimeout(unduckTimerId);
    unduckTimerId = null;
  }
  const duckAlreadyHandled = duckingMuted || duckTimerId !== null;
  if (duckAlreadyHandled) return;

  duckTimerId = window.setTimeout(() => {
    duckTimerId = null;
    // Re-check: the situation may have changed during the delay.
    if (!room || muted || duckingMuted || prefs.headphonesMode) return;
    const micTrack = getLocalAudioTrack(room);
    if (!micTrack) return;
    duckingMuted = true;
    void micTrack.mute();
  }, DUCK_DELAY_MS);
};
747
+
748
/** Cancel the pending duck and unduck timers, whichever are scheduled. */
const cancelDuckTimers = () => {
  const pendingDuck = duckTimerId;
  const pendingUnduck = unduckTimerId;
  duckTimerId = null;
  unduckTimerId = null;
  if (pendingDuck !== null) window.clearTimeout(pendingDuck);
  if (pendingUnduck !== null) window.clearTimeout(pendingUnduck);
};
758
+
759
/**
 * Flip the user's own mute state and announce it.
 *
 * Any ducking in progress is abandoned first, since the user's choice takes
 * over the track's mute state. The mic track is muted/unmuted in place when
 * it exists; otherwise the microphone is enabled/disabled on the local
 * participant.
 *
 * @returns The muted flag after the toggle (unchanged when there is no room).
 */
const toggleMute = async () => {
  if (!room) return muted;

  muted = !muted;
  cancelDuckTimers();
  duckingMuted = false;

  // Same primitive as ducking: WebRTC-level mute/unmute is much faster than
  // (un)publishing the track and avoids a brief audio gap on the agent side.
  const micTrack = getLocalAudioTrack(room);
  if (!micTrack) {
    await room.localParticipant.setMicrophoneEnabled(!muted);
  } else if (muted) {
    await micTrack.mute();
  } else {
    await micTrack.unmute();
  }

  emit("mute", { muted });
  const labels = getAppearance().labels;
  emit("call:status", { status: muted ? labels.muted : labels.listening });
  return muted;
};
782
+
783
/**
 * Send `payload`, JSON-encoded, to the agent as a reliable data message on
 * `topic`. Does nothing when there is no room; a failure is logged, not
 * thrown.
 */
const publishData = async (payload: unknown, topic: string) => {
  if (!room) return;
  try {
    const bytes = new TextEncoder().encode(JSON.stringify(payload));
    await room.localParticipant.publishData(bytes, { reliable: true, topic });
  } catch (err) {
    // eslint-disable-next-line no-console
    console.warn("[voice-agent] failed to publish data:", err);
  }
};
795
+
796
/**
 * Attach/detach the deep-learning noise-suppression processor according to
 * `engine` and record the outcome in `noiseFilterStatus`.
 *
 * - No local microphone track: status becomes "off" for engine "off" and
 *   "failed" otherwise; nothing is attached.
 * - "off": the current processor is stopped and the status is "off".
 * - "krisp": delegated to `attachKrispNoiseFilter`.
 * - "deepfilter": the module is loaded with a runtime-computed dynamic
 *   `import()` so it only loads when the user actually picks it (and the
 *   widget still works if the package isn't installed at all). On any
 *   failure the status is "unsupported" and Krisp is tried as a fallback.
 *
 * @param engine the noise-filter engine to make active.
 */
const applyNoiseFilter = async (engine: NoiseFilterEngine): Promise<void> => {
  const track = room ? getLocalAudioTrack(room) : null;
  if (!track) {
    noiseFilterStatus = engine === "off" ? "off" : "failed";
    deepFilterProcessor = null;
    return;
  }

  // Detach whatever's currently attached before switching engines.
  try {
    await track.stopProcessor();
  } catch {
    /* no-op: no processor was attached */
  }
  deepFilterProcessor = null;

  if (engine === "off") {
    noiseFilterStatus = "off";
    return;
  }

  if (engine === "krisp") {
    noiseFilterStatus = await attachKrispNoiseFilter(track);
    return;
  }

  if (engine === "deepfilter") {
    try {
      // Runtime-computed URL so Rollup's `inlineDynamicImports` can't
      // statically resolve the path. The widget bundles cleanly even
      // when `deepfilternet3-noise-filter` isn't installed locally;
      // the package is fetched from esm.sh (or a self-hosted ESM
      // mirror set via `setDeepFilterModuleUrl`) on first use.
      const moduleUrl = deepFilterModuleUrl;
      const mod = await import(/* @vite-ignore */ moduleUrl);
      const Ctor = mod.DeepFilterNoiseFilterProcessor as new (
        options: {
          noiseReductionLevel?: number;
          assetConfig?: { cdnUrl?: string };
        },
      ) => unknown;
      if (typeof Ctor !== "function") {
        throw new Error("DeepFilterNoiseFilterProcessor export missing");
      }

      const assetCfg: { cdnUrl?: string } = {};
      if (deepFilterCdnUrl) assetCfg.cdnUrl = deepFilterCdnUrl;
      const instance = new Ctor({
        noiseReductionLevel: prefs.deepFilterStrength,
        assetConfig: Object.keys(assetCfg).length > 0 ? assetCfg : undefined,
      });

      // The upstream package only honors `assetConfig.cdnUrl` and then
      // appends fixed paths (`v2/pkg/df_bg.wasm`, `v2/models/DeepFilterNet3_onnx.tar.gz`).
      // To support arbitrary self-hosted file URLs we temporarily patch
      // `globalThis.fetch` while the processor initializes and rewrite
      // any request matching those filenames to the override URLs.
      const wasmOverride = deepFilterWasmUrl;
      const modelOverride = deepFilterOnnxUrl;
      const needsFetchPatch = Boolean(wasmOverride || modelOverride);
      // Keep the exact function object that was installed so it can be put
      // back unchanged; restoring a `.bind()` copy would permanently swap
      // the page's `fetch` for a different function.
      const previousFetch = globalThis.fetch;
      if (needsFetchPatch) {
        const forward = previousFetch.bind(globalThis);
        const patched: typeof fetch = (input, init) => {
          const url =
            typeof input === "string"
              ? input
              : input instanceof URL
                ? input.href
                : (input as Request).url;
          if (wasmOverride && /\/df_bg\.wasm(?:$|[?#])/.test(url)) {
            return forward(wasmOverride, init);
          }
          if (
            modelOverride &&
            /\/DeepFilterNet3_onnx\.tar\.gz(?:$|[?#])/.test(url)
          ) {
            return forward(modelOverride, init);
          }
          return forward(input, init);
        };
        globalThis.fetch = patched;
      }

      try {
        await track.setProcessor(
          instance as unknown as Parameters<typeof track.setProcessor>[0],
        );
      } finally {
        // Always undo the patch, whether initialization succeeded or threw.
        if (needsFetchPatch) globalThis.fetch = previousFetch;
      }
      deepFilterProcessor = instance as { setSuppressionLevel?: (n: number) => void };
      noiseFilterStatus = "active";
      // eslint-disable-next-line no-console
      console.info(
        "[voice-agent] DeepFilterNet3 noise filter attached (strength=" +
          prefs.deepFilterStrength +
          ")",
      );
    } catch (err) {
      // eslint-disable-next-line no-console
      console.warn(
        "[voice-agent] DeepFilterNet3 unavailable, falling back to Krisp:",
        err,
      );
      noiseFilterStatus = "unsupported";
      // Best-effort fallback so the user isn't left with raw audio.
      const krispResult = await attachKrispNoiseFilter(track);
      if (krispResult === "active") {
        noiseFilterStatus = "active";
      }
    }
  }
};
917
+
918
/** Broadcast the current audio state snapshot on the "audio" event. */
const emitAudioState = () => {
  const snapshot = snapshotAudioState();
  emit("audio", snapshot);
};
921
+
922
/**
 * Build a point-in-time view of the audio configuration:
 * - `prefs`: a shallow copy of the stored preferences,
 * - `applied`: what the local microphone's MediaStreamTrack reports via
 *   `getSettings()` (every field is `undefined` when there is no track),
 * - `noiseFilter`: the selected engine plus its last recorded status.
 */
const snapshotAudioState = (): AudioStateSnapshot => {
  type AppliedSettings = MediaTrackSettings & { voiceIsolation?: boolean };

  const micTrack = room ? getLocalAudioTrack(room) : null;
  const live = (micTrack?.mediaStreamTrack?.getSettings() ?? {}) as AppliedSettings;
  const {
    echoCancellation,
    noiseSuppression,
    autoGainControl,
    voiceIsolation,
    sampleRate,
    channelCount,
    deviceId,
  } = live;

  const applied = {
    echoCancellation,
    noiseSuppression,
    autoGainControl,
    voiceIsolation,
    sampleRate,
    channelCount,
    deviceId,
  };
  const noiseFilter = { engine: prefs.noiseFilter, status: noiseFilterStatus };

  return { prefs: { ...prefs }, applied, noiseFilter };
};
944
+
945
/**
 * Apply a partial change to the audio preferences and push each changed
 * setting to the live call, then persist the merged preferences and emit
 * the resulting state on the "audio" event.
 *
 * Each step is best-effort: a failure is logged as a warning and the
 * remaining steps still run.
 *
 * @param delta the preference fields to change; fields equal to the current
 *   value are treated as unchanged.
 * @returns the audio state snapshot taken after all changes were applied.
 */
const updateAudioSettings = async (
  delta: Partial<AudioPrefs>,
): Promise<AudioStateSnapshot> => {
  const next: AudioPrefs = { ...prefs, ...delta };
  const localTrack = room ? getLocalAudioTrack(room) : null;

  // 1. Standard MediaTrackSettings — flip live via applyConstraints. Falls
  //    back to restartTrack only if applyConstraints rejects.
  const constraintKeys = [
    "echoCancellation",
    "noiseSuppression",
    "autoGainControl",
    "voiceIsolation",
  ] as const;
  const constraintDelta: MediaTrackConstraints & { voiceIsolation?: boolean } = {};
  for (const k of constraintKeys) {
    if (k in delta && delta[k] !== prefs[k]) {
      (constraintDelta as Record<string, unknown>)[k] = next[k];
    }
  }
  if (localTrack && Object.keys(constraintDelta).length > 0) {
    try {
      await localTrack.mediaStreamTrack.applyConstraints(constraintDelta);
    } catch (err) {
      // eslint-disable-next-line no-console
      console.warn(
        "[voice-agent] applyConstraints rejected, falling back to restartTrack:",
        err,
      );
      try {
        await localTrack.restartTrack(buildCaptureOptions(next));
      } catch (restartErr) {
        // eslint-disable-next-line no-console
        console.warn("[voice-agent] restartTrack failed:", restartErr);
      }
    }
  }

  // 2. Noise-filter engine — separate audio-graph attach/detach.
  if ("noiseFilter" in delta && delta.noiseFilter !== prefs.noiseFilter) {
    // Commit the new engine choice AND the requested strength into `prefs`
    // *before* applying: applyNoiseFilter reads `prefs.deepFilterStrength`,
    // so the processor is created once at the right level instead of being
    // attached at the stale level and then adjusted or re-attached below.
    prefs = {
      ...prefs,
      noiseFilter: next.noiseFilter,
      deepFilterStrength: next.deepFilterStrength,
    };
    await applyNoiseFilter(next.noiseFilter);
  }

  // DeepFilterNet3 strength — live-adjust if the processor exposes the
  // hook, otherwise reattach. Skip when not on deepfilter. (Already
  // satisfied, and therefore skipped, when the engine was switched above.)
  if (
    "deepFilterStrength" in delta &&
    delta.deepFilterStrength !== prefs.deepFilterStrength &&
    next.noiseFilter === "deepfilter"
  ) {
    prefs = { ...prefs, deepFilterStrength: next.deepFilterStrength };
    if (deepFilterProcessor && typeof deepFilterProcessor.setSuppressionLevel === "function") {
      try {
        deepFilterProcessor.setSuppressionLevel(next.deepFilterStrength);
      } catch (err) {
        // eslint-disable-next-line no-console
        console.warn("[voice-agent] DeepFilter setSuppressionLevel failed:", err);
      }
    } else {
      await applyNoiseFilter("deepfilter");
    }
  }

  // 3. Mic device — switchActiveDevice does the heavy lifting.
  if ("micDeviceId" in delta && delta.micDeviceId !== prefs.micDeviceId && room) {
    try {
      await room.switchActiveDevice("audioinput", next.micDeviceId || "default");
    } catch (err) {
      // eslint-disable-next-line no-console
      console.warn("[voice-agent] failed to switch mic device:", err);
    }
  }

  // 4. Speaker device — HTMLAudioElement.setSinkId.
  if (
    "speakerDeviceId" in delta &&
    delta.speakerDeviceId !== prefs.speakerDeviceId &&
    audioEl
  ) {
    await applySinkId(audioEl, next.speakerDeviceId);
  }

  // 5. Volume — direct property on the element. Assigning a non-finite
  //    value to `volume` throws, which would abort the whole update, so a
  //    non-numeric percentage is ignored instead.
  if ("outputVolume" in delta && audioEl) {
    const fraction = next.outputVolume / 100;
    if (Number.isFinite(fraction)) {
      audioEl.volume = Math.max(0, Math.min(1, fraction));
    }
  }

  // 6. Headphones mode — toggling on while currently ducked needs to
  //    immediately un-mute the local track; applyDucking handles both
  //    directions via its early-return path.
  if ("headphonesMode" in delta && delta.headphonesMode !== prefs.headphonesMode) {
    // Re-run with current speaker state. If no one is currently
    // speaking we pass false to force un-mute on next pass.
    applyDucking(false);
  }

  prefs = next;
  persistPrefsToStorage();
  const state = snapshotAudioState();
  emit("audio", state);
  return state;
};
1050
+
1051
/**
 * Collect a small set of WebRTC audio statistics for the active call.
 *
 * Reports are requested from the local microphone track and from every
 * remote participant's audio track. The requests are independent, so they
 * are issued concurrently; a track whose report cannot be read is skipped.
 *
 * When several reports carry the same metric, the value from the last
 * report (local first, then remote tracks in iteration order) wins.
 * `jitter` and `roundTripTime` are multiplied by 1000 before being stored.
 *
 * @returns the aggregated stats, or `null` when there is no active room.
 *   Metrics that no report provided stay at 0.
 */
const getAudioStats = async (): Promise<AudioStats | null> => {
  if (!room) return null;

  type StatsSource = {
    getRTCStatsReport(): Promise<RTCStatsReport | undefined>;
  };
  const sources: StatsSource[] = [];

  const localTrack = getLocalAudioTrack(room);
  if (localTrack) sources.push(localTrack);

  // Walk remote participants for inbound audio stats.
  for (const participant of room.remoteParticipants.values()) {
    for (const pub of participant.audioTrackPublications.values()) {
      if (pub.track) sources.push(pub.track);
    }
  }

  // Promise.all keeps the results in `sources` order, so the aggregation
  // below sees the reports in the same order as a sequential walk would.
  const fetched = await Promise.all(
    sources.map(async (source) => {
      try {
        return await source.getRTCStatsReport();
      } catch {
        /* ignore: stats are best-effort */
        return undefined;
      }
    }),
  );
  const reports = fetched.filter((r): r is RTCStatsReport => Boolean(r));

  const stats: AudioStats = {
    outboundAudioLevel: 0,
    inboundAudioLevel: 0,
    packetsLost: 0,
    jitter: 0,
    roundTripTime: 0,
  };
  for (const report of reports) {
    report.forEach((entry) => {
      const e = entry as RTCStats & Record<string, unknown>;
      if (e.type === "outbound-rtp" && e.kind === "audio") {
        const lvl = e.audioLevel;
        if (typeof lvl === "number") stats.outboundAudioLevel = lvl;
      }
      if (e.type === "inbound-rtp" && e.kind === "audio") {
        const lvl = e.audioLevel;
        if (typeof lvl === "number") stats.inboundAudioLevel = lvl;
        const lost = e.packetsLost;
        if (typeof lost === "number") stats.packetsLost = lost;
        const j = e.jitter;
        if (typeof j === "number") stats.jitter = j * 1000;
      }
      if (e.type === "candidate-pair" && (e.selected === true || e.nominated === true)) {
        const rtt = e.currentRoundTripTime;
        if (typeof rtt === "number") stats.roundTripTime = rtt * 1000;
      }
    });
  }
  return stats;
};
1105
+
1106
+ return {
1107
+ start,
1108
+ end,
1109
+ toggleMute,
1110
+ isActive: () => room !== null,
1111
+ sessionId: () => currentSessionId,
1112
+ publishData,
1113
+ updateAudioSettings,
1114
+ getAudioState: snapshotAudioState,
1115
+ getAudioStats,
1116
+ };
1117
+ }
1118
+
1119
/**
 * Translate stored audio preferences into microphone capture constraints.
 * The processing toggles are copied through unchanged, capture is fixed to
 * one channel at 48000 Hz, and the preferred microphone (if any) is passed
 * as an `ideal` device id; `deviceId` is `undefined` when none is set.
 */
function buildCaptureOptions(prefs: AudioPrefs): MediaTrackConstraints & {
  voiceIsolation?: boolean;
} {
  const {
    echoCancellation,
    noiseSuppression,
    autoGainControl,
    voiceIsolation,
    micDeviceId,
  } = prefs;
  const deviceId = micDeviceId ? { ideal: micDeviceId } : undefined;

  return {
    echoCancellation,
    noiseSuppression,
    autoGainControl,
    voiceIsolation,
    channelCount: 1,
    sampleRate: 48000,
    deviceId,
  };
}
1132
+
1133
/**
 * Look up the local participant's microphone publication and return its
 * audio track. Yields `null` when there is no room, no microphone
 * publication, or the published track is not a `LocalAudioTrack`.
 */
function getLocalAudioTrack(room: Room | null): LocalAudioTrack | null {
  const candidate = room?.localParticipant.getTrackPublication(
    Track.Source.Microphone,
  )?.audioTrack;
  return candidate instanceof LocalAudioTrack ? candidate : null;
}
1139
+
1140
/**
 * Route an audio element's output to the given speaker device.
 * Silently does nothing when the element has no `setSinkId` method; an
 * empty `deviceId` is passed through as "". Rejections from `setSinkId`
 * are logged as a warning and never thrown.
 */
async function applySinkId(el: HTMLAudioElement, deviceId: string): Promise<void> {
  type SinkCapable = { setSinkId?: (id: string) => Promise<void> };

  const sink = (el as unknown as SinkCapable).setSinkId;
  if (typeof sink !== "function") return;

  const targetId = deviceId || "";
  try {
    // Invoke with the element as receiver, since the method was detached above.
    await sink.apply(el, [targetId]);
  } catch (err) {
    // eslint-disable-next-line no-console
    console.warn("[voice-agent] setSinkId failed:", err);
  }
}
1151
+
1152
/**
 * Attach the Krisp noise filter to `track` as its audio processor.
 *
 * @param track the local microphone track to process.
 * @returns "active" when the processor was attached, "unsupported" when the
 *   package reports that this browser cannot run it, and "failed" when the
 *   package could not be loaded or attaching the processor threw.
 *   This function never rejects, so callers can use it as a fallback from
 *   inside their own error handlers.
 */
async function attachKrispNoiseFilter(
  track: LocalAudioTrack,
): Promise<"active" | "failed" | "unsupported"> {
  // Loaded lazily: the package evaluates `class extends Worker` at module load,
  // which throws in Node. Deferring the import keeps `import "@oshara/voice-sdk"`
  // safe server-side; this path only ever runs in a browser call.
  const loadKrisp = async () => {
    try {
      return await import("@livekit/krisp-noise-filter");
    } catch (err) {
      // A failed chunk/module load is reported like any other attach
      // failure instead of escaping as an unhandled rejection.
      // eslint-disable-next-line no-console
      console.warn("[voice-agent] Failed to load Krisp noise filter module:", err);
      return null;
    }
  };

  const krisp = await loadKrisp();
  if (!krisp) return "failed";

  const { KrispNoiseFilter, isKrispNoiseFilterSupported } = krisp;
  if (!isKrispNoiseFilterSupported()) {
    // eslint-disable-next-line no-console
    console.warn(
      "[voice-agent] Krisp noise filter NOT supported in this browser — relying on browser noiseSuppression only",
    );
    return "unsupported";
  }

  try {
    await track.setProcessor(KrispNoiseFilter());
    // eslint-disable-next-line no-console
    console.info("[voice-agent] Krisp noise filter attached");
    return "active";
  } catch (err) {
    // eslint-disable-next-line no-console
    console.warn("[voice-agent] Failed to attach Krisp noise filter:", err);
    return "failed";
  }
}