npm - @livekit/agents - Versions diffs - 1.0.44 → 1.0.46 - Mend

@livekit/agents 1.0.44 → 1.0.46

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (157)

package/dist/ipc/supervised_proc.cjs +1 -1
package/dist/ipc/supervised_proc.cjs.map +1 -1
package/dist/ipc/supervised_proc.js +1 -1
package/dist/ipc/supervised_proc.js.map +1 -1
package/dist/llm/llm.cjs +1 -1
package/dist/llm/llm.cjs.map +1 -1
package/dist/llm/llm.js +1 -1
package/dist/llm/llm.js.map +1 -1
package/dist/log.cjs +13 -9
package/dist/log.cjs.map +1 -1
package/dist/log.d.cts +1 -1
package/dist/log.d.ts +1 -1
package/dist/log.d.ts.map +1 -1
package/dist/log.js +13 -9
package/dist/log.js.map +1 -1
package/dist/stream/index.cjs +3 -0
package/dist/stream/index.cjs.map +1 -1
package/dist/stream/index.d.cts +1 -0
package/dist/stream/index.d.ts +1 -0
package/dist/stream/index.d.ts.map +1 -1
package/dist/stream/index.js +2 -0
package/dist/stream/index.js.map +1 -1
package/dist/stream/multi_input_stream.cjs +139 -0
package/dist/stream/multi_input_stream.cjs.map +1 -0
package/dist/stream/multi_input_stream.d.cts +55 -0
package/dist/stream/multi_input_stream.d.ts +55 -0
package/dist/stream/multi_input_stream.d.ts.map +1 -0
package/dist/stream/multi_input_stream.js +115 -0
package/dist/stream/multi_input_stream.js.map +1 -0
package/dist/stream/multi_input_stream.test.cjs +340 -0
package/dist/stream/multi_input_stream.test.cjs.map +1 -0
package/dist/stream/multi_input_stream.test.js +339 -0
package/dist/stream/multi_input_stream.test.js.map +1 -0
package/dist/stt/stt.cjs +2 -2
package/dist/stt/stt.cjs.map +1 -1
package/dist/stt/stt.js +2 -2
package/dist/stt/stt.js.map +1 -1
package/dist/telemetry/trace_types.cjs +42 -0
package/dist/telemetry/trace_types.cjs.map +1 -1
package/dist/telemetry/trace_types.d.cts +14 -0
package/dist/telemetry/trace_types.d.ts +14 -0
package/dist/telemetry/trace_types.d.ts.map +1 -1
package/dist/telemetry/trace_types.js +28 -0
package/dist/telemetry/trace_types.js.map +1 -1
package/dist/tts/fallback_adapter.cjs +466 -0
package/dist/tts/fallback_adapter.cjs.map +1 -0
package/dist/tts/fallback_adapter.d.cts +110 -0
package/dist/tts/fallback_adapter.d.ts +110 -0
package/dist/tts/fallback_adapter.d.ts.map +1 -0
package/dist/tts/fallback_adapter.js +442 -0
package/dist/tts/fallback_adapter.js.map +1 -0
package/dist/tts/index.cjs +3 -0
package/dist/tts/index.cjs.map +1 -1
package/dist/tts/index.d.cts +1 -0
package/dist/tts/index.d.ts +1 -0
package/dist/tts/index.d.ts.map +1 -1
package/dist/tts/index.js +2 -0
package/dist/tts/index.js.map +1 -1
package/dist/tts/tts.cjs +2 -2
package/dist/tts/tts.cjs.map +1 -1
package/dist/tts/tts.js +2 -2
package/dist/tts/tts.js.map +1 -1
package/dist/utils.cjs +13 -0
package/dist/utils.cjs.map +1 -1
package/dist/utils.d.cts +1 -0
package/dist/utils.d.ts +1 -0
package/dist/utils.d.ts.map +1 -1
package/dist/utils.js +13 -0
package/dist/utils.js.map +1 -1
package/dist/vad.cjs +11 -10
package/dist/vad.cjs.map +1 -1
package/dist/vad.d.cts +5 -3
package/dist/vad.d.ts +5 -3
package/dist/vad.d.ts.map +1 -1
package/dist/vad.js +11 -10
package/dist/vad.js.map +1 -1
package/dist/voice/agent_activity.cjs +35 -10
package/dist/voice/agent_activity.cjs.map +1 -1
package/dist/voice/agent_activity.d.cts +1 -0
package/dist/voice/agent_activity.d.ts +1 -0
package/dist/voice/agent_activity.d.ts.map +1 -1
package/dist/voice/agent_activity.js +35 -10
package/dist/voice/agent_activity.js.map +1 -1
package/dist/voice/agent_session.cjs +19 -7
package/dist/voice/agent_session.cjs.map +1 -1
package/dist/voice/agent_session.d.cts +3 -2
package/dist/voice/agent_session.d.ts +3 -2
package/dist/voice/agent_session.d.ts.map +1 -1
package/dist/voice/agent_session.js +19 -7
package/dist/voice/agent_session.js.map +1 -1
package/dist/voice/audio_recognition.cjs +85 -36
package/dist/voice/audio_recognition.cjs.map +1 -1
package/dist/voice/audio_recognition.d.cts +22 -1
package/dist/voice/audio_recognition.d.ts +22 -1
package/dist/voice/audio_recognition.d.ts.map +1 -1
package/dist/voice/audio_recognition.js +89 -36
package/dist/voice/audio_recognition.js.map +1 -1
package/dist/voice/audio_recognition_span.test.cjs +233 -0
package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
package/dist/voice/audio_recognition_span.test.js +232 -0
package/dist/voice/audio_recognition_span.test.js.map +1 -0
package/dist/voice/io.cjs +6 -3
package/dist/voice/io.cjs.map +1 -1
package/dist/voice/io.d.cts +3 -2
package/dist/voice/io.d.ts +3 -2
package/dist/voice/io.d.ts.map +1 -1
package/dist/voice/io.js +6 -3
package/dist/voice/io.js.map +1 -1
package/dist/voice/recorder_io/recorder_io.cjs +3 -1
package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
package/dist/voice/recorder_io/recorder_io.js +3 -1
package/dist/voice/recorder_io/recorder_io.js.map +1 -1
package/dist/voice/room_io/_input.cjs +23 -20
package/dist/voice/room_io/_input.cjs.map +1 -1
package/dist/voice/room_io/_input.d.cts +2 -2
package/dist/voice/room_io/_input.d.ts +2 -2
package/dist/voice/room_io/_input.d.ts.map +1 -1
package/dist/voice/room_io/_input.js +13 -9
package/dist/voice/room_io/_input.js.map +1 -1
package/dist/voice/room_io/room_io.cjs +9 -0
package/dist/voice/room_io/room_io.cjs.map +1 -1
package/dist/voice/room_io/room_io.d.cts +3 -1
package/dist/voice/room_io/room_io.d.ts +3 -1
package/dist/voice/room_io/room_io.d.ts.map +1 -1
package/dist/voice/room_io/room_io.js +9 -0
package/dist/voice/room_io/room_io.js.map +1 -1
package/dist/voice/utils.cjs +47 -0
package/dist/voice/utils.cjs.map +1 -0
package/dist/voice/utils.d.cts +4 -0
package/dist/voice/utils.d.ts +4 -0
package/dist/voice/utils.d.ts.map +1 -0
package/dist/voice/utils.js +23 -0
package/dist/voice/utils.js.map +1 -0
package/package.json +1 -1
package/src/ipc/supervised_proc.ts +1 -1
package/src/llm/llm.ts +1 -1
package/src/log.ts +22 -11
package/src/stream/index.ts +1 -0
package/src/stream/multi_input_stream.test.ts +540 -0
package/src/stream/multi_input_stream.ts +172 -0
package/src/stt/stt.ts +2 -2
package/src/telemetry/trace_types.ts +18 -0
package/src/tts/fallback_adapter.ts +579 -0
package/src/tts/index.ts +1 -0
package/src/tts/tts.ts +2 -2
package/src/utils.ts +16 -0
package/src/vad.ts +12 -11
package/src/voice/agent_activity.ts +25 -0
package/src/voice/agent_session.ts +17 -11
package/src/voice/audio_recognition.ts +114 -38
package/src/voice/audio_recognition_span.test.ts +261 -0
package/src/voice/io.ts +7 -4
package/src/voice/recorder_io/recorder_io.ts +2 -1
package/src/voice/room_io/_input.ts +16 -10
package/src/voice/room_io/room_io.ts +12 -0
package/src/voice/utils.ts +29 -0

package/src/voice/audio_recognition_span.test.ts ADDED Viewed

@@ -0,0 +1,261 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import { ParticipantKind } from '@livekit/rtc-node';
+import { InMemorySpanExporter, SimpleSpanProcessor } from '@opentelemetry/sdk-trace-base';
+import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node';
+import { describe, expect, it, vi } from 'vitest';
+import { initializeLogger } from '../log.js';
+import { type SpeechEvent, SpeechEventType } from '../stt/stt.js';
+import { setTracerProvider } from '../telemetry/index.js';
+import { VAD, type VADEvent, VADEventType, type VADStream } from '../vad.js';
+import { AudioRecognition, type _TurnDetector } from './audio_recognition.js';
+/**
+ * Installs a fresh tracer provider whose spans are routed to an in-memory exporter.
+ *
+ * The provider is registered via `register()` and is also passed to the package's
+ * own `setTracerProvider`.
+ *
+ * @returns an object holding the in-memory exporter so a test can inspect finished spans
+ */
+function setupInMemoryTracing() {
+  const exporter = new InMemorySpanExporter();
+  const spanProcessor = new SimpleSpanProcessor(exporter);
+  const provider = new NodeTracerProvider();
+  // The processor must be attached before the provider is registered and shared.
+  provider.addSpanProcessor(spanProcessor);
+  provider.register();
+  setTracerProvider(provider);
+  return { exporter };
+}
+/**
+ * Looks up a span by its `name` property.
+ *
+ * @param spans - spans to search, in order
+ * @param name - exact span name to match
+ * @returns the first span whose `name` strictly equals `name`, or `undefined` if none does
+ */
+function spanByName(spans: any[], name: string) {
+  for (const candidate of spans) {
+    if (candidate.name === name) {
+      return candidate;
+    }
+  }
+  return undefined;
+}
+/**
+ * Scripted stand-in for a VAD stream: replays a fixed list of events through the
+ * async-iterator protocol and then reports completion.
+ */
+class FakeVADStream extends (Object as unknown as { new (): VADStream }) {
+  // We intentionally avoid extending the real VADStream (it is not exported as a value in JS output
+  // in some bundling contexts). Instead we emulate the async iterator shape used by AudioRecognition.
+  private events: VADEvent[];
+  private idx = 0;
+  constructor(events: VADEvent[]) {
+    super();
+    this.events = events;
+  }
+  // Stream-management methods are deliberate no-ops in this fake.
+  updateInputStream() {}
+  detachInputStream() {}
+  close() {}
+  // The stream acts as its own async iterator.
+  [Symbol.asyncIterator]() {
+    return this;
+  }
+  /** Resolves the next scripted event, or a `done` result once every event has been replayed. */
+  async next(): Promise<IteratorResult<VADEvent>> {
+    const position = this.idx;
+    if (position < this.events.length) {
+      this.idx = position + 1;
+      return { done: false, value: this.events[position]! };
+    }
+    return { done: true, value: undefined };
+  }
+}
+/**
+ * VAD test double: every call to `stream()` returns a new `FakeVADStream` that
+ * replays the events supplied at construction.
+ */
+class FakeVAD extends VAD {
+  label = 'fake-vad';
+  constructor(private events: VADEvent[]) {
+    super({ updateInterval: 1 });
+  }
+  /** Creates a scripted stream over the configured events. */
+  stream(): any {
+    const scriptedStream = new FakeVADStream(this.events);
+    return scriptedStream;
+  }
+}
+/**
+ * Turn-detector stub with fixed answers: every language is reported as supported,
+ * no unlikely-threshold is provided, and the end-of-turn prediction is always 1.0.
+ */
+const alwaysTrueTurnDetector: _TurnDetector = {
+  async supportsLanguage() {
+    return true;
+  },
+  async unlikelyThreshold() {
+    return undefined;
+  },
+  async predictEndOfTurn() {
+    return 1.0;
+  },
+};
+// Suite: checks that AudioRecognition emits a `user_turn` span and that the
+// `eou_detection` span is recorded as its child, in both 'stt' and 'vad' turn-detection modes.
+describe('AudioRecognition user_turn span parity', () => {
+  // Logger is configured as silent for this suite.
+  initializeLogger({ pretty: false, level: 'silent' });
+  it('creates user_turn and parents eou_detection under it (stt mode)', async () => {
+    const { exporter } = setupInMemoryTracing();
+    // Recognition hooks stubbed with vi.fn(); passed to AudioRecognition below via `as any`.
+    const hooks = {
+      onStartOfSpeech: vi.fn(),
+      onVADInferenceDone: vi.fn(),
+      onEndOfSpeech: vi.fn(),
+      onInterimTranscript: vi.fn(),
+      onFinalTranscript: vi.fn(),
+      onPreemptiveGeneration: vi.fn(),
+      // Minimal chat-context stand-in exposing only copy/addMessage/toJSON.
+      retrieveChatCtx: () =>
+        ({
+          copy() {
+            return this;
+          },
+          addMessage() {},
+          toJSON() {
+            return { items: [] };
+          },
+        }) as any,
+      onEndOfTurn: vi.fn(async () => true),
+    };
+    // Scripted STT output: start of speech, one final transcript ('hello'), end of speech.
+    const sttEvents: SpeechEvent[] = [
+      { type: SpeechEventType.START_OF_SPEECH },
+      {
+        type: SpeechEventType.FINAL_TRANSCRIPT,
+        alternatives: [
+          {
+            language: 'en',
+            text: 'hello',
+            startTime: 0,
+            endTime: 0,
+            confidence: 0.9,
+          },
+        ],
+      },
+      { type: SpeechEventType.END_OF_SPEECH },
+    ];
+    // STT node that enqueues every scripted event up front and then closes the stream.
+    const sttNode = async () =>
+      new ReadableStream<SpeechEvent>({
+        start(controller) {
+          for (const ev of sttEvents) controller.enqueue(ev);
+          controller.close();
+        },
+      });
+    // No VAD in this case; both endpointing delays are set to 0.
+    const ar = new AudioRecognition({
+      recognitionHooks: hooks as any,
+      stt: sttNode as any,
+      vad: undefined,
+      turnDetector: alwaysTrueTurnDetector,
+      turnDetectionMode: 'stt',
+      minEndpointingDelay: 0,
+      maxEndpointingDelay: 0,
+      sttModel: 'deepgram-nova2',
+      sttProvider: 'deepgram',
+      getLinkedParticipant: () => ({ sid: 'p1', identity: 'bob', kind: ParticipantKind.AGENT }),
+    });
+    await ar.start();
+    // allow background task to drain
+    // NOTE(review): fixed 20 ms wait; presumably long enough for the scripted events — confirm on slow CI.
+    await new Promise((r) => setTimeout(r, 20));
+    await ar.close();
+    const spans = exporter.getFinishedSpans();
+    const userTurn = spanByName(spans, 'user_turn');
+    const eou = spanByName(spans, 'eou_detection');
+    expect(userTurn, 'user_turn span missing').toBeTruthy();
+    expect(eou, 'eou_detection span missing').toBeTruthy();
+    // eou_detection must be a direct child of user_turn.
+    expect(eou.parentSpanId).toBe(userTurn.spanContext().spanId);
+    // creation-time attributes
+    expect(userTurn.attributes['lk.participant_id']).toBe('p1');
+    expect(userTurn.attributes['lk.participant_identity']).toBe('bob');
+    expect(userTurn.attributes['lk.participant_kind']).toBe('AGENT');
+    expect(userTurn.attributes['gen_ai.request.model']).toBe('deepgram-nova2');
+    expect(userTurn.attributes['gen_ai.provider.name']).toBe('deepgram');
+    // end-of-turn attributes
+    expect(userTurn.attributes['lk.user_transcript']).toContain('hello');
+    expect(userTurn.attributes['lk.transcript_confidence']).toBeGreaterThan(0);
+  });
+  it('creates user_turn from VAD startTime (vad mode) and keeps same parenting', async () => {
+    const { exporter } = setupInMemoryTracing();
+    // Same stubbed hooks as the stt-mode case; two of them are asserted on at the end.
+    const hooks = {
+      onStartOfSpeech: vi.fn(),
+      onVADInferenceDone: vi.fn(),
+      onEndOfSpeech: vi.fn(),
+      onInterimTranscript: vi.fn(),
+      onFinalTranscript: vi.fn(),
+      onPreemptiveGeneration: vi.fn(),
+      // Minimal chat-context stand-in exposing only copy/addMessage/toJSON.
+      retrieveChatCtx: () =>
+        ({
+          copy() {
+            return this;
+          },
+          addMessage() {},
+          toJSON() {
+            return { items: [] };
+          },
+        }) as any,
+      onEndOfTurn: vi.fn(async () => true),
+    };
+    const now = Date.now();
+    // Scripted VAD output: a start-of-speech event at `now`, then an end-of-speech event at `now + 200`.
+    const vadEvents: VADEvent[] = [
+      {
+        type: VADEventType.START_OF_SPEECH,
+        samplesIndex: 0,
+        timestamp: now,
+        speechDuration: 100,
+        silenceDuration: 0,
+        frames: [],
+        probability: 0,
+        inferenceDuration: 0,
+        speaking: true,
+        rawAccumulatedSilence: 0,
+        rawAccumulatedSpeech: 0,
+      },
+      {
+        type: VADEventType.END_OF_SPEECH,
+        samplesIndex: 0,
+        timestamp: now + 200,
+        speechDuration: 100,
+        silenceDuration: 100,
+        frames: [],
+        probability: 0,
+        inferenceDuration: 0,
+        speaking: false,
+        rawAccumulatedSilence: 0,
+        rawAccumulatedSpeech: 0,
+      },
+    ];
+    // STT contributes only a single final transcript ('test'); speech boundaries come from the VAD events.
+    const sttEvents: SpeechEvent[] = [
+      {
+        type: SpeechEventType.FINAL_TRANSCRIPT,
+        alternatives: [
+          {
+            language: 'en',
+            text: 'test',
+            startTime: 0,
+            endTime: 0,
+            confidence: 0.8,
+          },
+        ],
+      },
+    ];
+    // STT node that enqueues every scripted event up front and then closes the stream.
+    const sttNode = async () =>
+      new ReadableStream<SpeechEvent>({
+        start(controller) {
+          for (const ev of sttEvents) controller.enqueue(ev);
+          controller.close();
+        },
+      });
+    // FakeVAD replays `vadEvents`; both endpointing delays are set to 0.
+    const ar = new AudioRecognition({
+      recognitionHooks: hooks as any,
+      stt: sttNode as any,
+      vad: new FakeVAD(vadEvents) as any,
+      turnDetector: alwaysTrueTurnDetector,
+      turnDetectionMode: 'vad',
+      minEndpointingDelay: 0,
+      maxEndpointingDelay: 0,
+      sttModel: 'stt-model',
+      sttProvider: 'stt-provider',
+      getLinkedParticipant: () => ({ sid: 'p2', identity: 'alice', kind: ParticipantKind.AGENT }),
+    });
+    await ar.start();
+    // Fixed 20 ms wait before closing, mirroring the stt-mode case.
+    await new Promise((r) => setTimeout(r, 20));
+    await ar.close();
+    const spans = exporter.getFinishedSpans();
+    const userTurn = spanByName(spans, 'user_turn');
+    const eou = spanByName(spans, 'eou_detection');
+    expect(userTurn).toBeTruthy();
+    expect(eou).toBeTruthy();
+    // Same parent/child relationship as in stt mode.
+    expect(eou.parentSpanId).toBe(userTurn.spanContext().spanId);
+    // The scripted VAD events must have reached the speech hooks.
+    expect(hooks.onStartOfSpeech).toHaveBeenCalled();
+    expect(hooks.onEndOfSpeech).toHaveBeenCalled();
+  });
+});

package/src/voice/io.ts CHANGED Viewed

@@ -8,7 +8,7 @@ import type { ChatContext } from '../llm/chat_context.js';
 import type { ChatChunk } from '../llm/llm.js';
 import type { ToolContext } from '../llm/tool_context.js';
 import { log } from '../log.js';
-import { DeferredReadableStream } from '../stream/deferred_stream.js';
+import { MultiInputStream } from '../stream/multi_input_stream.js';
 import type { SpeechEvent } from '../stt/stt.js';
 import { Future } from '../utils.js';
 import type { ModelSettings } from './agent.js';
@@ -84,11 +84,14 @@ export interface AudioOutputCapabilities {
 }
 export abstract class AudioInput {
-  protected deferredStream: DeferredReadableStream<AudioFrame> =
-    new DeferredReadableStream<AudioFrame>();
+  protected multiStream: MultiInputStream<AudioFrame> = new MultiInputStream<AudioFrame>();
   get stream(): ReadableStream<AudioFrame> {
-    return this.deferredStream.stream;
+    return this.multiStream.stream;
+  }
+  async close(): Promise<void> {
+    await this.multiStream.close();
   }
   onAttached(): void {}

package/src/voice/recorder_io/recorder_io.ts CHANGED Viewed

@@ -105,6 +105,7 @@ export class RecorderIO {
       await this.outChan.close();
       await this.closeFuture.await;
       await cancelAndWait([this.forwardTask!, this.encodeTask!]);
+      await this.inRecord?.close();
       this.started = false;
     } finally {
@@ -378,7 +379,7 @@ class RecorderAudioInput extends AudioInput {
     this.source = source;
     // Set up the intercepting stream
-    this.deferredStream.setSource(this.createInterceptingStream());
+    this.multiStream.addInputStream(this.createInterceptingStream());
   }
   /**

package/src/voice/room_io/_input.ts CHANGED Viewed

@@ -1,9 +1,10 @@
 // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
-import { type AudioFrame, FrameProcessor } from '@livekit/rtc-node';
 import {
+  type AudioFrame,
   AudioStream,
+  FrameProcessor,
   type NoiseCancellationOptions,
   RemoteParticipant,
   type RemoteTrack,
@@ -25,7 +26,9 @@ export class ParticipantAudioInputStream extends AudioInput {
   private frameProcessor?: FrameProcessor<AudioFrame>;
   private publication: RemoteTrackPublication | null = null;
   private participantIdentity: string | null = null;
+  private currentInputId: string | null = null;
   private logger = log();
   constructor({
     room,
     sampleRate,
@@ -60,8 +63,10 @@ export class ParticipantAudioInputStream extends AudioInput {
     if (this.participantIdentity === participantIdentity) {
       return;
     }
+    if (this.participantIdentity) {
+      this.closeStream();
+    }
     this.participantIdentity = participantIdentity;
-    this.closeStream();
     if (!participantIdentity) {
       return;
@@ -119,12 +124,11 @@ export class ParticipantAudioInputStream extends AudioInput {
   };
   private closeStream() {
-    if (this.deferredStream.isSourceSet) {
-      this.deferredStream.detachSource();
+    if (this.currentInputId) {
+      void this.multiStream.removeInputStream(this.currentInputId);
+      this.currentInputId = null;
     }
-    this.frameProcessor?.close();
     this.publication = null;
   }
@@ -143,7 +147,7 @@ export class ParticipantAudioInputStream extends AudioInput {
     }
     this.closeStream();
     this.publication = publication;
-    this.deferredStream.setSource(
+    this.currentInputId = this.multiStream.addInputStream(
       resampleStream({
         stream: this.createStream(track),
         outputRate: this.sampleRate,
@@ -179,12 +183,14 @@ export class ParticipantAudioInputStream extends AudioInput {
     }) as unknown as ReadableStream<AudioFrame>;
   }
-  async close() {
+  override async close() {
     this.room.off(RoomEvent.TrackSubscribed, this.onTrackSubscribed);
     this.room.off(RoomEvent.TrackUnpublished, this.onTrackUnpublished);
     this.room.off(RoomEvent.TokenRefreshed, this.onTokenRefreshed);
     this.closeStream();
-    // Ignore errors - stream may be locked by RecorderIO or already cancelled
-    await this.deferredStream.stream.cancel().catch(() => {});
+    await super.close();
+    this.frameProcessor?.close();
+    this.frameProcessor = undefined;
   }
 }

package/src/voice/room_io/room_io.ts CHANGED Viewed

@@ -376,6 +376,18 @@ export class RoomIO {
     return this.participantAvailableFuture.done;
   }
+  get linkedParticipant(): RemoteParticipant | undefined {
+    if (!this.isParticipantAvailable) {
+      return undefined;
+    }
+    return this.participantAvailableFuture.result;
+  }
+  get localParticipant(): Participant | undefined {
+    return this.room.localParticipant ?? undefined;
+  }
   /** Switch to a different participant */
   setParticipant(participantIdentity: string | null) {
     this.logger.debug({ participantIdentity }, 'setting participant');

package/src/voice/utils.ts ADDED Viewed

@@ -0,0 +1,29 @@
+// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import type { Participant, ParticipantKind } from '@livekit/rtc-node';
+import type { Span } from '@opentelemetry/api';
+import { traceTypes } from '../telemetry/index.js';
+/**
+ * Copies a participant's identifying fields onto a span as attributes.
+ *
+ * The participant id attribute is written only when `sid` is truthy; the identity
+ * and kind attributes are always written. The kind is stored as a string produced
+ * by `participantKindName`.
+ *
+ * @param span - span that receives the attributes
+ * @param participant - source of the `sid`, `identity` and `kind` values
+ */
+export function setParticipantSpanAttributes(
+  span: Span,
+  participant: Pick<Participant, 'sid' | 'identity' | 'kind'>,
+): void {
+  const { ATTR_PARTICIPANT_ID, ATTR_PARTICIPANT_IDENTITY, ATTR_PARTICIPANT_KIND } = traceTypes;
+  if (participant.sid) {
+    span.setAttribute(ATTR_PARTICIPANT_ID, participant.sid);
+  }
+  span.setAttribute(ATTR_PARTICIPANT_IDENTITY, participant.identity);
+  const kindLabel = participantKindName(participant.kind);
+  span.setAttribute(ATTR_PARTICIPANT_KIND, kindLabel);
+}
+/**
+ * Converts a participant kind into a label for span attributes.
+ *
+ * Values 0 through 5 map to fixed names; any other value falls back to its
+ * `String()` form.
+ * NOTE(review): the numeric values presumably mirror the `ParticipantKind` enum
+ * in `@livekit/rtc-node` — confirm against that package.
+ *
+ * @param kind - participant kind to describe
+ * @returns the label for `kind`
+ */
+function participantKindName(kind: ParticipantKind): string {
+  // Position in this list is the numeric kind value.
+  const labels = ['STANDARD', 'INGRESS', 'EGRESS', 'SIP', 'AGENT', 'CONNECTOR'];
+  const label = labels[kind as number];
+  if (typeof label === 'string') {
+    return label;
+  }
+  return String(kind);
+}