npm - @livekit/agents - Versions diffs - 1.0.36 → 1.0.38 - Mend

@livekit/agents 1.0.36 → 1.0.38

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.

Files changed (149)

package/dist/cli.cjs.map +1 -1
package/dist/inference/api_protos.cjs +68 -0
package/dist/inference/api_protos.cjs.map +1 -1
package/dist/inference/api_protos.d.cts +345 -4
package/dist/inference/api_protos.d.ts +345 -4
package/dist/inference/api_protos.d.ts.map +1 -1
package/dist/inference/api_protos.js +60 -0
package/dist/inference/api_protos.js.map +1 -1
package/dist/inference/stt.cjs +32 -21
package/dist/inference/stt.cjs.map +1 -1
package/dist/inference/stt.d.ts.map +1 -1
package/dist/inference/stt.js +34 -21
package/dist/inference/stt.js.map +1 -1
package/dist/ipc/inference_proc_executor.cjs.map +1 -1
package/dist/ipc/job_proc_executor.cjs.map +1 -1
package/dist/stt/stt.cjs +10 -0
package/dist/stt/stt.cjs.map +1 -1
package/dist/stt/stt.d.cts +12 -0
package/dist/stt/stt.d.ts +12 -0
package/dist/stt/stt.d.ts.map +1 -1
package/dist/stt/stt.js +10 -0
package/dist/stt/stt.js.map +1 -1
package/dist/telemetry/traces.cjs +4 -3
package/dist/telemetry/traces.cjs.map +1 -1
package/dist/telemetry/traces.d.cts +2 -0
package/dist/telemetry/traces.d.ts +2 -0
package/dist/telemetry/traces.d.ts.map +1 -1
package/dist/telemetry/traces.js +4 -3
package/dist/telemetry/traces.js.map +1 -1
package/dist/utils.cjs +6 -0
package/dist/utils.cjs.map +1 -1
package/dist/utils.d.cts +2 -0
package/dist/utils.d.ts +2 -0
package/dist/utils.d.ts.map +1 -1
package/dist/utils.js +6 -0
package/dist/utils.js.map +1 -1
package/dist/voice/agent.cjs +5 -0
package/dist/voice/agent.cjs.map +1 -1
package/dist/voice/agent.d.ts.map +1 -1
package/dist/voice/agent.js +5 -0
package/dist/voice/agent.js.map +1 -1
package/dist/voice/agent_activity.cjs +49 -23
package/dist/voice/agent_activity.cjs.map +1 -1
package/dist/voice/agent_activity.d.cts +1 -1
package/dist/voice/agent_activity.d.ts +1 -1
package/dist/voice/agent_activity.d.ts.map +1 -1
package/dist/voice/agent_activity.js +50 -24
package/dist/voice/agent_activity.js.map +1 -1
package/dist/voice/agent_session.cjs +7 -5
package/dist/voice/agent_session.cjs.map +1 -1
package/dist/voice/agent_session.d.cts +5 -2
package/dist/voice/agent_session.d.ts +5 -2
package/dist/voice/agent_session.d.ts.map +1 -1
package/dist/voice/agent_session.js +7 -5
package/dist/voice/agent_session.js.map +1 -1
package/dist/voice/audio_recognition.cjs +3 -1
package/dist/voice/audio_recognition.cjs.map +1 -1
package/dist/voice/audio_recognition.d.ts.map +1 -1
package/dist/voice/audio_recognition.js +3 -1
package/dist/voice/audio_recognition.js.map +1 -1
package/dist/voice/avatar/datastream_io.cjs +6 -0
package/dist/voice/avatar/datastream_io.cjs.map +1 -1
package/dist/voice/avatar/datastream_io.d.cts +1 -0
package/dist/voice/avatar/datastream_io.d.ts +1 -0
package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
package/dist/voice/avatar/datastream_io.js +6 -0
package/dist/voice/avatar/datastream_io.js.map +1 -1
package/dist/voice/background_audio.cjs.map +1 -1
package/dist/voice/generation.cjs +14 -5
package/dist/voice/generation.cjs.map +1 -1
package/dist/voice/generation.d.cts +3 -2
package/dist/voice/generation.d.ts +3 -2
package/dist/voice/generation.d.ts.map +1 -1
package/dist/voice/generation.js +14 -5
package/dist/voice/generation.js.map +1 -1
package/dist/voice/io.cjs +12 -0
package/dist/voice/io.cjs.map +1 -1
package/dist/voice/io.d.cts +19 -1
package/dist/voice/io.d.ts +19 -1
package/dist/voice/io.d.ts.map +1 -1
package/dist/voice/io.js +12 -0
package/dist/voice/io.js.map +1 -1
package/dist/voice/recorder_io/recorder_io.cjs +91 -28
package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
package/dist/voice/recorder_io/recorder_io.d.cts +7 -1
package/dist/voice/recorder_io/recorder_io.d.ts +7 -1
package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
package/dist/voice/recorder_io/recorder_io.js +91 -28
package/dist/voice/recorder_io/recorder_io.js.map +1 -1
package/dist/voice/room_io/_input.cjs +40 -11
package/dist/voice/room_io/_input.cjs.map +1 -1
package/dist/voice/room_io/_input.d.cts +4 -1
package/dist/voice/room_io/_input.d.ts +4 -1
package/dist/voice/room_io/_input.d.ts.map +1 -1
package/dist/voice/room_io/_input.js +31 -2
package/dist/voice/room_io/_input.js.map +1 -1
package/dist/voice/room_io/_output.cjs +6 -0
package/dist/voice/room_io/_output.cjs.map +1 -1
package/dist/voice/room_io/_output.d.cts +1 -0
package/dist/voice/room_io/_output.d.ts +1 -0
package/dist/voice/room_io/_output.d.ts.map +1 -1
package/dist/voice/room_io/_output.js +6 -0
package/dist/voice/room_io/_output.js.map +1 -1
package/dist/voice/room_io/room_io.cjs.map +1 -1
package/dist/voice/room_io/room_io.d.cts +2 -2
package/dist/voice/room_io/room_io.d.ts +2 -2
package/dist/voice/room_io/room_io.d.ts.map +1 -1
package/dist/voice/room_io/room_io.js.map +1 -1
package/dist/voice/speech_handle.cjs +2 -0
package/dist/voice/speech_handle.cjs.map +1 -1
package/dist/voice/speech_handle.d.cts +3 -0
package/dist/voice/speech_handle.d.ts +3 -0
package/dist/voice/speech_handle.d.ts.map +1 -1
package/dist/voice/speech_handle.js +2 -0
package/dist/voice/speech_handle.js.map +1 -1
package/dist/voice/testing/index.cjs +2 -0
package/dist/voice/testing/index.cjs.map +1 -1
package/dist/voice/testing/index.d.cts +1 -1
package/dist/voice/testing/index.d.ts +1 -1
package/dist/voice/testing/index.d.ts.map +1 -1
package/dist/voice/testing/index.js +2 -0
package/dist/voice/testing/index.js.map +1 -1
package/dist/voice/testing/run_result.cjs +294 -5
package/dist/voice/testing/run_result.cjs.map +1 -1
package/dist/voice/testing/run_result.d.cts +149 -1
package/dist/voice/testing/run_result.d.ts +149 -1
package/dist/voice/testing/run_result.d.ts.map +1 -1
package/dist/voice/testing/run_result.js +293 -5
package/dist/voice/testing/run_result.js.map +1 -1
package/package.json +1 -1
package/src/inference/api_protos.ts +83 -0
package/src/inference/stt.ts +39 -22
package/src/stt/stt.ts +21 -0
package/src/telemetry/traces.ts +6 -2
package/src/utils.ts +7 -0
package/src/voice/agent.ts +9 -0
package/src/voice/agent_activity.ts +72 -26
package/src/voice/agent_session.ts +6 -5
package/src/voice/audio_recognition.ts +2 -0
package/src/voice/avatar/datastream_io.ts +8 -0
package/src/voice/generation.ts +24 -12
package/src/voice/io.ts +27 -5
package/src/voice/recorder_io/recorder_io.ts +123 -31
package/src/voice/room_io/_input.ts +32 -4
package/src/voice/room_io/_output.ts +8 -0
package/src/voice/room_io/room_io.ts +3 -1
package/src/voice/speech_handle.ts +4 -0
package/src/voice/testing/index.ts +1 -0
package/src/voice/testing/run_result.ts +373 -12

package/src/voice/io.ts CHANGED

@@ -30,12 +30,14 @@ export type TTSNode = (
 ) => Promise<ReadableStream<AudioFrame> | null>;
 /**
- * A string with timing information for word-level alignment.
+ *A string with optional start and end timestamps for word-level alignment.
  */
 export interface TimedString {
   text: string;
   startTime?: number; // seconds
   endTime?: number; // seconds
+  confidence?: number;
+  startTimeOffset?: number;
 }
 export interface AudioOutputCapabilities {
@@ -57,6 +59,7 @@ export abstract class AudioInput {
 }
 export abstract class AudioOutput extends EventEmitter {
+  static readonly EVENT_PLAYBACK_STARTED = 'playbackStarted';
   static readonly EVENT_PLAYBACK_FINISHED = 'playbackFinished';
   private playbackFinishedFuture: Future<void> = new Future();
@@ -77,7 +80,11 @@ export abstract class AudioOutput extends EventEmitter {
   ) {
     super();
     this.capabilities = capabilities;
     if (this.nextInChain) {
+      this.nextInChain.on(AudioOutput.EVENT_PLAYBACK_STARTED, (ev: PlaybackStartedEvent) =>
+        this.onPlaybackStarted(ev.createdAt),
+      );
       this.nextInChain.on(AudioOutput.EVENT_PLAYBACK_FINISHED, (ev: PlaybackFinishedEvent) =>
         this.onPlaybackFinished(ev),
       );
@@ -117,6 +124,14 @@ export abstract class AudioOutput extends EventEmitter {
     return this.lastPlaybackEvent;
   }
+  /**
+   * Called when playback actually starts (first frame is sent to output).
+   * Developers building audio sinks should call this when the first frame is captured.
+   */
+  onPlaybackStarted(createdAt: number): void {
+    this.emit(AudioOutput.EVENT_PLAYBACK_STARTED, { createdAt } as PlaybackStartedEvent);
+  }
   /**
    * Developers building audio sinks must call this method when a playback/segment is finished.
    * Segments are segmented by calls to flush() or clearBuffer()
@@ -174,15 +189,22 @@ export abstract class AudioOutput extends EventEmitter {
 }
 export interface PlaybackFinishedEvent {
-  // How much of the audio was played back
+  /** How much of the audio was played back, in seconds */
   playbackPosition: number;
-  // Interrupted is True if playback was interrupted (clearBuffer() was called)
+  /** True if playback was interrupted (clearBuffer() was called) */
   interrupted: boolean;
-  // Transcript synced with playback; may be partial if the audio was interrupted
-  // When null, the transcript is not synchronized with the playback
+  /**
+   * Transcript synced with playback; may be partial if the audio was interrupted.
+   * When undefined, the transcript is not synchronized with the playback.
+   */
   synchronizedTranscript?: string;
 }
+export interface PlaybackStartedEvent {
+  /** The timestamp (Date.now()) when the playback started */
+  createdAt: number;
+}
 export abstract class TextOutput {
   constructor(protected readonly nextInChain?: TextOutput) {}

package/src/voice/recorder_io/recorder_io.ts CHANGED

@@ -123,7 +123,7 @@ export class RecorderIO {
   }
   private writeCb(buf: AudioFrame[]): void {
-    const inputBuf = this.inRecord!.takeBuf();
+    const inputBuf = this.inRecord!.takeBuf(this.outRecord?._lastSpeechEndTime);
     this.inChan.write(inputBuf);
     this.outChan.write(buf);
   }
@@ -137,8 +137,18 @@ export class RecorderIO {
   }
   get recordingStartedAt(): number | undefined {
-    // Use session start time to align with trace timestamps
-    return this.session._startedAt;
+    const inT = this.inRecord?.startedWallTime;
+    const outT = this.outRecord?.startedWallTime;
+    if (inT === undefined) {
+      return outT;
+    }
+    if (outT === undefined) {
+      return inT;
+    }
+    return Math.min(inT, outT);
   }
   /**
@@ -159,7 +169,7 @@ export class RecorderIO {
       }
       // Flush input buffer
-      const inputBuf = this.inRecord!.takeBuf();
+      const inputBuf = this.inRecord!.takeBuf(this.outRecord!._lastSpeechEndTime);
       this.inChan
         .write(inputBuf)
         .catch((err) => this.logger.error({ err }, 'Error writing RecorderIO input buffer'));
@@ -359,6 +369,8 @@ class RecorderAudioInput extends AudioInput {
   private recorderIO: RecorderIO;
   private accFrames: AudioFrame[] = [];
   private _startedWallTime?: number;
+  private _padded: boolean = false;
+  private logger = log();
   constructor(recorderIO: RecorderIO, source: AudioInput) {
     super();
@@ -378,10 +390,46 @@ class RecorderAudioInput extends AudioInput {
   /**
    * Take accumulated frames and clear the buffer
+   * @param padSince - If provided and input started after this time, pad with silence
    */
-  takeBuf(): AudioFrame[] {
-    const frames = this.accFrames;
+  takeBuf(padSince?: number): AudioFrame[] {
+    let frames = this.accFrames;
     this.accFrames = [];
+    if (
+      padSince !== undefined &&
+      this._startedWallTime !== undefined &&
+      this._startedWallTime > padSince &&
+      !this._padded &&
+      frames.length > 0
+    ) {
+      const padding = this._startedWallTime - padSince;
+      this.logger.warn(
+        {
+          lastAgentSpeechTime: padSince,
+          inputStartedTime: this._startedWallTime,
+        },
+        'input speech started after last agent speech ended',
+      );
+      this._padded = true;
+      const firstFrame = frames[0]!;
+      frames = [
+        createSilenceFrame(padding / 1000, firstFrame.sampleRate, firstFrame.channels),
+        ...frames,
+      ];
+    } else if (
+      padSince !== undefined &&
+      this._startedWallTime === undefined &&
+      !this._padded &&
+      frames.length === 0
+    ) {
+      // We could pad with silence here with some fixed SR and channels,
+      // but it's better for the user to know that this is happening
+      this.logger.warn(
+        "input speech hasn't started yet, skipping silence padding, recording may be inaccurate until the speech starts",
+      );
+    }
     return frames;
   }
@@ -455,6 +503,10 @@ class RecorderAudioOutput extends AudioOutput {
   private writeFn: (buf: AudioFrame[]) => void;
   private accFrames: AudioFrame[] = [];
   private _startedWallTime?: number;
+  private _logger = log();
+  _lastSpeechEndTime?: number;
+  private _lastSpeechStartTime?: number;
   // Pause tracking
   private currentPauseStart?: number;
@@ -508,9 +560,32 @@ class RecorderAudioOutput extends AudioOutput {
   }
   onPlaybackFinished(options: PlaybackFinishedEvent): void {
-    const finishTime = Date.now();
+    const finishTime = this.currentPauseStart ?? Date.now();
+    const trailingSilenceDuration = Math.max(0, Date.now() - finishTime);
+    // Convert playbackPosition from seconds to ms for internal calculations
+    let playbackPosition = options.playbackPosition * 1000;
+    if (this._lastSpeechStartTime === undefined) {
+      this._logger.warn(
+        {
+          finishTime,
+          playbackPosition,
+          interrupted: options.interrupted,
+        },
+        'playback finished before speech started',
+      );
+      playbackPosition = 0;
+    }
+    // Clamp playbackPosition to actual elapsed time (all in ms)
+    playbackPosition = Math.max(
+      0,
+      Math.min(finishTime - (this._lastSpeechStartTime ?? 0), playbackPosition),
+    );
-    super.onPlaybackFinished(options);
+    // Convert back to seconds for the event
+    super.onPlaybackFinished({ ...options, playbackPosition: playbackPosition / 1000 });
     if (!this.recorderIO.recording) {
       return;
@@ -523,28 +598,29 @@ class RecorderAudioOutput extends AudioOutput {
     if (this.accFrames.length === 0) {
       this.resetPauseState();
+      this._lastSpeechEndTime = Date.now();
+      this._lastSpeechStartTime = undefined;
       return;
     }
-    const playbackPosition = options.playbackPosition;
+    // pauseEvents stores (position, duration) in ms
     const pauseEvents: Array<[number, number]> = [];
+    let playbackStartTime = finishTime - playbackPosition;
     if (this.pauseWallTimes.length > 0) {
       const totalPauseDuration = this.pauseWallTimes.reduce(
         (sum, [start, end]) => sum + (end - start),
         0,
       );
-      // Convert playbackPosition from seconds to milliseconds for wall time calculations
-      const playbackStartTime = finishTime - playbackPosition * 1000 - totalPauseDuration;
+      playbackStartTime = finishTime - playbackPosition - totalPauseDuration;
       let accumulatedPause = 0;
       for (const [pauseStart, pauseEnd] of this.pauseWallTimes) {
-        let position = (pauseStart - playbackStartTime - accumulatedPause) / 1000; // Convert to seconds
-        const duration = (pauseEnd - pauseStart) / 1000; // Convert to seconds
+        let position = pauseStart - playbackStartTime - accumulatedPause;
+        const duration = pauseEnd - pauseStart;
         position = Math.max(0, Math.min(position, playbackPosition));
         pauseEvents.push([position, duration]);
-        accumulatedPause += pauseEnd - pauseStart;
+        accumulatedPause += duration;
       }
     }
@@ -558,10 +634,10 @@ class RecorderAudioOutput extends AudioOutput {
     for (const frame of this.accFrames) {
       let currentFrame = frame;
-      const frameDuration = frame.samplesPerChannel / frame.sampleRate;
+      const frameDuration = (frame.samplesPerChannel / frame.sampleRate) * 1000;
       if (frameDuration + accDur > playbackPosition) {
-        const [left] = splitFrame(currentFrame, playbackPosition - accDur);
+        const [left] = splitFrame(currentFrame, (playbackPosition - accDur) / 1000);
         currentFrame = left;
         shouldBreak = true;
       }
@@ -569,27 +645,29 @@ class RecorderAudioOutput extends AudioOutput {
       // Process any pauses before this frame starts
       while (pauseIdx < pauseEvents.length && pauseEvents[pauseIdx]![0] <= accDur) {
         const [, pauseDur] = pauseEvents[pauseIdx]!;
-        buf.push(createSilenceFrame(pauseDur, sampleRate, numChannels));
+        buf.push(createSilenceFrame(pauseDur / 1000, sampleRate, numChannels));
         pauseIdx++;
       }
       // Process any pauses within this frame
-      const currentFrameDuration = currentFrame.samplesPerChannel / currentFrame.sampleRate;
+      const currentFrameDuration =
+        (currentFrame.samplesPerChannel / currentFrame.sampleRate) * 1000;
       while (
         pauseIdx < pauseEvents.length &&
         pauseEvents[pauseIdx]![0] < accDur + currentFrameDuration
       ) {
         const [pausePos, pauseDur] = pauseEvents[pauseIdx]!;
-        const [left, right] = splitFrame(currentFrame, pausePos - accDur);
+        const [left, right] = splitFrame(currentFrame, (pausePos - accDur) / 1000);
         buf.push(left);
-        accDur += left.samplesPerChannel / left.sampleRate;
-        buf.push(createSilenceFrame(pauseDur, sampleRate, numChannels));
+        accDur += (left.samplesPerChannel / left.sampleRate) * 1000;
+        buf.push(createSilenceFrame(pauseDur / 1000, sampleRate, numChannels));
         currentFrame = right;
         pauseIdx++;
       }
       buf.push(currentFrame);
-      accDur += currentFrame.samplesPerChannel / currentFrame.sampleRate;
+      accDur += (currentFrame.samplesPerChannel / currentFrame.sampleRate) * 1000;
       if (shouldBreak) {
         break;
@@ -600,31 +678,41 @@ class RecorderAudioOutput extends AudioOutput {
     while (pauseIdx < pauseEvents.length) {
       const [pausePos, pauseDur] = pauseEvents[pauseIdx]!;
       if (pausePos <= playbackPosition) {
-        buf.push(createSilenceFrame(pauseDur, sampleRate, numChannels));
+        buf.push(createSilenceFrame(pauseDur / 1000, sampleRate, numChannels));
       }
       pauseIdx++;
     }
     if (buf.length > 0) {
+      if (trailingSilenceDuration > 0) {
+        buf.push(createSilenceFrame(trailingSilenceDuration / 1000, sampleRate, numChannels));
+      }
       this.writeFn(buf);
     }
     this.accFrames = [];
     this.resetPauseState();
+    this._lastSpeechEndTime = Date.now();
+    this._lastSpeechStartTime = undefined;
   }
   async captureFrame(frame: AudioFrame): Promise<void> {
+    if (this.nextInChain) {
+      await this.nextInChain.captureFrame(frame);
+    }
     await super.captureFrame(frame);
     if (this.recorderIO.recording) {
-      if (this._startedWallTime === undefined) {
-        this._startedWallTime = Date.now();
-      }
       this.accFrames.push(frame);
     }
-    if (this.nextInChain) {
-      await this.nextInChain.captureFrame(frame);
+    if (this._startedWallTime === undefined) {
+      this._startedWallTime = Date.now();
+    }
+    if (this._lastSpeechStartTime === undefined) {
+      this._lastSpeechStartTime = Date.now();
     }
   }
@@ -646,8 +734,12 @@ class RecorderAudioOutput extends AudioOutput {
 /**
  * Create a silent audio frame with the given duration
  */
-function createSilenceFrame(duration: number, sampleRate: number, numChannels: number): AudioFrame {
-  const samples = Math.floor(duration * sampleRate);
+function createSilenceFrame(
+  durationInS: number,
+  sampleRate: number,
+  numChannels: number,
+): AudioFrame {
+  const samples = Math.floor(durationInS * sampleRate);
   const data = new Int16Array(samples * numChannels); // Zero-filled by default
   return new AudioFrame(data, sampleRate, numChannels, samples);
 }

package/src/voice/room_io/_input.ts CHANGED

@@ -1,7 +1,7 @@
 // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
-import type { AudioFrame } from '@livekit/rtc-node';
+import { type AudioFrame, FrameProcessor } from '@livekit/rtc-node';
 import {
   AudioStream,
   type NoiseCancellationOptions,
@@ -22,6 +22,7 @@ export class ParticipantAudioInputStream extends AudioInput {
   private sampleRate: number;
   private numChannels: number;
   private noiseCancellation?: NoiseCancellationOptions;
+  private frameProcessor?: FrameProcessor<AudioFrame>;
   private publication: RemoteTrackPublication | null = null;
   private participantIdentity: string | null = null;
   private logger = log();
@@ -34,16 +35,21 @@ export class ParticipantAudioInputStream extends AudioInput {
     room: Room;
     sampleRate: number;
     numChannels: number;
-    noiseCancellation?: NoiseCancellationOptions;
+    noiseCancellation?: NoiseCancellationOptions | FrameProcessor<AudioFrame>;
   }) {
     super();
     this.room = room;
     this.sampleRate = sampleRate;
     this.numChannels = numChannels;
-    this.noiseCancellation = noiseCancellation;
+    if (noiseCancellation instanceof FrameProcessor) {
+      this.frameProcessor = noiseCancellation;
+    } else {
+      this.noiseCancellation = noiseCancellation;
+    }
     this.room.on(RoomEvent.TrackSubscribed, this.onTrackSubscribed);
     this.room.on(RoomEvent.TrackUnpublished, this.onTrackUnpublished);
+    this.room.on(RoomEvent.TokenRefreshed, this.onTokenRefreshed);
   }
   setParticipant(participant: RemoteParticipant | string | null) {
@@ -116,6 +122,9 @@ export class ParticipantAudioInputStream extends AudioInput {
     if (this.deferredStream.isSourceSet) {
       this.deferredStream.detachSource();
     }
+    this.frameProcessor?.close();
     this.publication = null;
   }
@@ -140,14 +149,32 @@ export class ParticipantAudioInputStream extends AudioInput {
         outputRate: this.sampleRate,
       }),
     );
+    this.frameProcessor?.onStreamInfoUpdated({
+      participantIdentity: participant.identity,
+      roomName: this.room.name!,
+      publicationSid: publication.sid!,
+    });
+    this.frameProcessor?.onCredentialsUpdated({
+      token: this.room.token!,
+      url: this.room.serverUrl!,
+    });
     return true;
   };
+  private onTokenRefreshed = () => {
+    if (this.room.token && this.room.serverUrl) {
+      this.frameProcessor?.onCredentialsUpdated({
+        token: this.room.token,
+        url: this.room.serverUrl,
+      });
+    }
+  };
   private createStream(track: RemoteTrack): ReadableStream<AudioFrame> {
     return new AudioStream(track, {
       sampleRate: this.sampleRate,
       numChannels: this.numChannels,
-      noiseCancellation: this.noiseCancellation,
+      noiseCancellation: this.frameProcessor || this.noiseCancellation,
       // TODO(AJS-269): resolve compatibility issue with node-sdk to remove the forced type casting
     }) as unknown as ReadableStream<AudioFrame>;
   }
@@ -155,6 +182,7 @@ export class ParticipantAudioInputStream extends AudioInput {
   async close() {
     this.room.off(RoomEvent.TrackSubscribed, this.onTrackSubscribed);
     this.room.off(RoomEvent.TrackUnpublished, this.onTrackUnpublished);
+    this.room.off(RoomEvent.TokenRefreshed, this.onTokenRefreshed);
     this.closeStream();
     // Ignore errors - stream may be locked by RecorderIO or already cancelled
     await this.deferredStream.stream.cancel().catch(() => {});

package/src/voice/room_io/_output.ts CHANGED

@@ -326,6 +326,7 @@ export class ParticipantAudioOutput extends AudioOutput {
   private pushedDuration: number = 0;
   private startedFuture: Future<void> = new Future();
   private interruptedFuture: Future<void> = new Future();
+  private firstFrameEmitted: boolean = false;
   constructor(room: Room, options: AudioOutputOptions) {
     super(options.sampleRate, undefined, { pause: true });
@@ -347,6 +348,11 @@ export class ParticipantAudioOutput extends AudioOutput {
     super.captureFrame(frame);
+    if (!this.firstFrameEmitted) {
+      this.firstFrameEmitted = true;
+      this.onPlaybackStarted(Date.now());
+    }
     // TODO(AJS-102): use frame.durationMs once available in rtc-node
     this.pushedDuration += frame.samplesPerChannel / frame.sampleRate;
     await this.audioSource.captureFrame(frame);
@@ -382,6 +388,8 @@ export class ParticipantAudioOutput extends AudioOutput {
     this.pushedDuration = 0;
     this.interruptedFuture = new Future();
+    this.firstFrameEmitted = false;
     this.onPlaybackFinished({
       playbackPosition: pushedDuration,
       interrupted,

package/src/voice/room_io/room_io.ts CHANGED

@@ -2,8 +2,10 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 import {
+  type AudioFrame,
   ConnectionState,
   DisconnectReason,
+  type FrameProcessor,
   type NoiseCancellationOptions,
   type Participant,
   ParticipantKind,
@@ -75,7 +77,7 @@ export interface RoomInputOptions {
     Can be overridden by the `participant` argument of RoomIO constructor or `set_participant`.
   */
   participantIdentity?: string;
-  noiseCancellation?: NoiseCancellationOptions;
+  noiseCancellation?: NoiseCancellationOptions | FrameProcessor<AudioFrame>;
   textInputCallback?: TextInputCallback;
   /** Participant kinds accepted for auto subscription. If not provided,
     accept `DEFAULT_PARTICIPANT_KINDS`

package/src/voice/speech_handle.ts CHANGED

@@ -1,6 +1,7 @@
 // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
+import type { Context } from '@opentelemetry/api';
 import type { ChatItem } from '../llm/index.js';
 import type { Task } from '../utils.js';
 import { Event, Future, shortuuid } from '../utils.js';
@@ -42,6 +43,9 @@ export class SpeechHandle {
   /** @internal */
   _numSteps = 1;
+  /** @internal - OpenTelemetry context for the agent turn span */
+  _agentTurnContext?: Context;
   private itemAddedCallbacks: Set<(item: ChatItem) => void> = new Set();
   private doneCallbacks: Set<(sh: SpeechHandle) => void> = new Set();

package/src/voice/testing/index.ts CHANGED

@@ -24,6 +24,7 @@ export {
   AgentHandoffAssert,
   AssertionError,
   EventAssert,
+  EventRangeAssert,
   FunctionCallAssert,
   FunctionCallOutputAssert,
   MessageAssert,