@livekit/agents 1.0.36 → 1.0.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. package/dist/cli.cjs.map +1 -1
  2. package/dist/inference/api_protos.cjs +68 -0
  3. package/dist/inference/api_protos.cjs.map +1 -1
  4. package/dist/inference/api_protos.d.cts +345 -4
  5. package/dist/inference/api_protos.d.ts +345 -4
  6. package/dist/inference/api_protos.d.ts.map +1 -1
  7. package/dist/inference/api_protos.js +60 -0
  8. package/dist/inference/api_protos.js.map +1 -1
  9. package/dist/inference/stt.cjs +32 -21
  10. package/dist/inference/stt.cjs.map +1 -1
  11. package/dist/inference/stt.d.ts.map +1 -1
  12. package/dist/inference/stt.js +34 -21
  13. package/dist/inference/stt.js.map +1 -1
  14. package/dist/ipc/inference_proc_executor.cjs.map +1 -1
  15. package/dist/ipc/job_proc_executor.cjs.map +1 -1
  16. package/dist/stt/stt.cjs +10 -0
  17. package/dist/stt/stt.cjs.map +1 -1
  18. package/dist/stt/stt.d.cts +12 -0
  19. package/dist/stt/stt.d.ts +12 -0
  20. package/dist/stt/stt.d.ts.map +1 -1
  21. package/dist/stt/stt.js +10 -0
  22. package/dist/stt/stt.js.map +1 -1
  23. package/dist/telemetry/traces.cjs +4 -3
  24. package/dist/telemetry/traces.cjs.map +1 -1
  25. package/dist/telemetry/traces.d.cts +2 -0
  26. package/dist/telemetry/traces.d.ts +2 -0
  27. package/dist/telemetry/traces.d.ts.map +1 -1
  28. package/dist/telemetry/traces.js +4 -3
  29. package/dist/telemetry/traces.js.map +1 -1
  30. package/dist/utils.cjs +6 -0
  31. package/dist/utils.cjs.map +1 -1
  32. package/dist/utils.d.cts +2 -0
  33. package/dist/utils.d.ts +2 -0
  34. package/dist/utils.d.ts.map +1 -1
  35. package/dist/utils.js +6 -0
  36. package/dist/utils.js.map +1 -1
  37. package/dist/voice/agent.cjs +5 -0
  38. package/dist/voice/agent.cjs.map +1 -1
  39. package/dist/voice/agent.d.ts.map +1 -1
  40. package/dist/voice/agent.js +5 -0
  41. package/dist/voice/agent.js.map +1 -1
  42. package/dist/voice/agent_activity.cjs +49 -23
  43. package/dist/voice/agent_activity.cjs.map +1 -1
  44. package/dist/voice/agent_activity.d.cts +1 -1
  45. package/dist/voice/agent_activity.d.ts +1 -1
  46. package/dist/voice/agent_activity.d.ts.map +1 -1
  47. package/dist/voice/agent_activity.js +50 -24
  48. package/dist/voice/agent_activity.js.map +1 -1
  49. package/dist/voice/agent_session.cjs +7 -5
  50. package/dist/voice/agent_session.cjs.map +1 -1
  51. package/dist/voice/agent_session.d.cts +5 -2
  52. package/dist/voice/agent_session.d.ts +5 -2
  53. package/dist/voice/agent_session.d.ts.map +1 -1
  54. package/dist/voice/agent_session.js +7 -5
  55. package/dist/voice/agent_session.js.map +1 -1
  56. package/dist/voice/audio_recognition.cjs +3 -1
  57. package/dist/voice/audio_recognition.cjs.map +1 -1
  58. package/dist/voice/audio_recognition.d.ts.map +1 -1
  59. package/dist/voice/audio_recognition.js +3 -1
  60. package/dist/voice/audio_recognition.js.map +1 -1
  61. package/dist/voice/avatar/datastream_io.cjs +6 -0
  62. package/dist/voice/avatar/datastream_io.cjs.map +1 -1
  63. package/dist/voice/avatar/datastream_io.d.cts +1 -0
  64. package/dist/voice/avatar/datastream_io.d.ts +1 -0
  65. package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
  66. package/dist/voice/avatar/datastream_io.js +6 -0
  67. package/dist/voice/avatar/datastream_io.js.map +1 -1
  68. package/dist/voice/background_audio.cjs.map +1 -1
  69. package/dist/voice/generation.cjs +14 -5
  70. package/dist/voice/generation.cjs.map +1 -1
  71. package/dist/voice/generation.d.cts +3 -2
  72. package/dist/voice/generation.d.ts +3 -2
  73. package/dist/voice/generation.d.ts.map +1 -1
  74. package/dist/voice/generation.js +14 -5
  75. package/dist/voice/generation.js.map +1 -1
  76. package/dist/voice/io.cjs +12 -0
  77. package/dist/voice/io.cjs.map +1 -1
  78. package/dist/voice/io.d.cts +19 -1
  79. package/dist/voice/io.d.ts +19 -1
  80. package/dist/voice/io.d.ts.map +1 -1
  81. package/dist/voice/io.js +12 -0
  82. package/dist/voice/io.js.map +1 -1
  83. package/dist/voice/recorder_io/recorder_io.cjs +91 -28
  84. package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
  85. package/dist/voice/recorder_io/recorder_io.d.cts +7 -1
  86. package/dist/voice/recorder_io/recorder_io.d.ts +7 -1
  87. package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
  88. package/dist/voice/recorder_io/recorder_io.js +91 -28
  89. package/dist/voice/recorder_io/recorder_io.js.map +1 -1
  90. package/dist/voice/room_io/_input.cjs +40 -11
  91. package/dist/voice/room_io/_input.cjs.map +1 -1
  92. package/dist/voice/room_io/_input.d.cts +4 -1
  93. package/dist/voice/room_io/_input.d.ts +4 -1
  94. package/dist/voice/room_io/_input.d.ts.map +1 -1
  95. package/dist/voice/room_io/_input.js +31 -2
  96. package/dist/voice/room_io/_input.js.map +1 -1
  97. package/dist/voice/room_io/_output.cjs +6 -0
  98. package/dist/voice/room_io/_output.cjs.map +1 -1
  99. package/dist/voice/room_io/_output.d.cts +1 -0
  100. package/dist/voice/room_io/_output.d.ts +1 -0
  101. package/dist/voice/room_io/_output.d.ts.map +1 -1
  102. package/dist/voice/room_io/_output.js +6 -0
  103. package/dist/voice/room_io/_output.js.map +1 -1
  104. package/dist/voice/room_io/room_io.cjs.map +1 -1
  105. package/dist/voice/room_io/room_io.d.cts +2 -2
  106. package/dist/voice/room_io/room_io.d.ts +2 -2
  107. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  108. package/dist/voice/room_io/room_io.js.map +1 -1
  109. package/dist/voice/speech_handle.cjs +2 -0
  110. package/dist/voice/speech_handle.cjs.map +1 -1
  111. package/dist/voice/speech_handle.d.cts +3 -0
  112. package/dist/voice/speech_handle.d.ts +3 -0
  113. package/dist/voice/speech_handle.d.ts.map +1 -1
  114. package/dist/voice/speech_handle.js +2 -0
  115. package/dist/voice/speech_handle.js.map +1 -1
  116. package/dist/voice/testing/index.cjs +2 -0
  117. package/dist/voice/testing/index.cjs.map +1 -1
  118. package/dist/voice/testing/index.d.cts +1 -1
  119. package/dist/voice/testing/index.d.ts +1 -1
  120. package/dist/voice/testing/index.d.ts.map +1 -1
  121. package/dist/voice/testing/index.js +2 -0
  122. package/dist/voice/testing/index.js.map +1 -1
  123. package/dist/voice/testing/run_result.cjs +294 -5
  124. package/dist/voice/testing/run_result.cjs.map +1 -1
  125. package/dist/voice/testing/run_result.d.cts +149 -1
  126. package/dist/voice/testing/run_result.d.ts +149 -1
  127. package/dist/voice/testing/run_result.d.ts.map +1 -1
  128. package/dist/voice/testing/run_result.js +293 -5
  129. package/dist/voice/testing/run_result.js.map +1 -1
  130. package/package.json +1 -1
  131. package/src/inference/api_protos.ts +83 -0
  132. package/src/inference/stt.ts +39 -22
  133. package/src/stt/stt.ts +21 -0
  134. package/src/telemetry/traces.ts +6 -2
  135. package/src/utils.ts +7 -0
  136. package/src/voice/agent.ts +9 -0
  137. package/src/voice/agent_activity.ts +72 -26
  138. package/src/voice/agent_session.ts +6 -5
  139. package/src/voice/audio_recognition.ts +2 -0
  140. package/src/voice/avatar/datastream_io.ts +8 -0
  141. package/src/voice/generation.ts +24 -12
  142. package/src/voice/io.ts +27 -5
  143. package/src/voice/recorder_io/recorder_io.ts +123 -31
  144. package/src/voice/room_io/_input.ts +32 -4
  145. package/src/voice/room_io/_output.ts +8 -0
  146. package/src/voice/room_io/room_io.ts +3 -1
  147. package/src/voice/speech_handle.ts +4 -0
  148. package/src/voice/testing/index.ts +1 -0
  149. package/src/voice/testing/run_result.ts +373 -12
package/src/inference/stt.ts CHANGED
@@ -16,6 +16,12 @@ import {
 } from '../stt/index.js';
 import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
 import { type AudioBuffer, Event, Task, cancelAndWait, shortuuid, waitForAbort } from '../utils.js';
+import type { TimedString } from '../voice/io.js';
+import {
+  type SttServerEvent,
+  type SttTranscriptEvent,
+  sttServerEventSchema,
+} from './api_protos.js';
 import { type AnyString, connectWs, createAccessToken } from './utils.js';

 export type DeepgramModels =
@@ -122,7 +128,7 @@ export class STT<TModel extends STTModels> extends BaseSTT {
     apiSecret?: string;
     modelOptions?: STTOptions<TModel>;
   }) {
-    super({ streaming: true, interimResults: true });
+    super({ streaming: true, interimResults: true, alignedTranscript: 'word' });

     const {
       model,
@@ -271,7 +277,6 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
     let closing = false;
     let finalReceived = false;

-    type SttServerEvent = Record<string, any>;
     const eventChannel = createStreamChannel<SttServerEvent>();

     const resourceCleanup = () => {
@@ -380,10 +385,19 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
         if (signal.aborted) return;
         if (result.done) return;

-        const json = result.value;
-        const type = json.type as string | undefined;
+        // Parse and validate with Zod schema
+        const parseResult = await sttServerEventSchema.safeParseAsync(result.value);
+        if (!parseResult.success) {
+          this.#logger.warn(
+            { error: parseResult.error, rawData: result.value },
+            'Failed to parse STT server event',
+          );
+          continue;
+        }
+
+        const event: SttServerEvent = parseResult.data;

-        switch (type) {
+        switch (event.type) {
           case 'session.created':
           case 'session.finalized':
             break;
@@ -392,21 +406,15 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
             resourceCleanup();
             break;
           case 'interim_transcript':
-            this.processTranscript(json, false);
+            this.processTranscript(event, false);
             break;
           case 'final_transcript':
-            this.processTranscript(json, true);
+            this.processTranscript(event, true);
             break;
           case 'error':
-            this.#logger.error({ error: json }, 'Received error from LiveKit STT');
+            this.#logger.error({ error: event }, 'Received error from LiveKit STT');
             resourceCleanup();
-            throw new APIError(`LiveKit STT returned error: ${JSON.stringify(json)}`);
-          default:
-            this.#logger.warn(
-              { message: json },
-              'Received unexpected message from LiveKit STT',
-            );
-            break;
+            throw new APIError(`LiveKit STT returned error: ${JSON.stringify(event)}`);
         }
       }
     } finally {
@@ -457,13 +465,13 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
     }
   }

-  private processTranscript(data: Record<string, any>, isFinal: boolean) {
+  private processTranscript(data: SttTranscriptEvent, isFinal: boolean) {
     // Check if queue is closed to avoid race condition during disconnect
     if (this.queue.closed) return;

-    const requestId = data.request_id ?? this.requestId;
-    const text = data.transcript ?? '';
-    const language = data.language ?? this.opts.language ?? 'en';
+    const requestId = data.session_id || this.requestId;
+    const text = data.transcript;
+    const language = data.language || this.opts.language || 'en';

     if (!text && !isFinal) return;

@@ -476,10 +484,19 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {

     const speechData: SpeechData = {
       language,
-      startTime: data.start ?? 0,
-      endTime: data.duration ?? 0,
-      confidence: data.confidence ?? 1.0,
+      startTime: this.startTimeOffset + data.start,
+      endTime: this.startTimeOffset + data.start + data.duration,
+      confidence: data.confidence,
       text,
+      words: data.words.map(
+        (word): TimedString => ({
+          text: word.word,
+          startTime: word.start + this.startTimeOffset,
+          endTime: word.end + this.startTimeOffset,
+          startTimeOffset: this.startTimeOffset,
+          confidence: word.confidence,
+        }),
+      ),
     };

     if (isFinal) {
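With word-level alignment enabled above, final transcripts now carry per-word timing. A minimal consumer-side sketch, assuming SpeechEvent exposes an alternatives array whose first entry is the SpeechData shown in this diff (import path and stream setup are illustrative):

import { SpeechEventType, type SpeechEvent } from '@livekit/agents';

// Log each recognized word with its absolute timestamp on the session
// timeline. `words` is the optional TimedString[] added to SpeechData in
// this release; providers without aligned transcripts simply omit it.
async function logAlignedWords(events: AsyncIterable<SpeechEvent>) {
  for await (const ev of events) {
    if (ev.type !== SpeechEventType.FINAL_TRANSCRIPT) continue;
    const alt = ev.alternatives?.[0];
    for (const word of alt?.words ?? []) {
      // startTime/endTime already include the stream's startTimeOffset
      console.log(`${word.startTime.toFixed(2)}s-${word.endTime.toFixed(2)}s ${word.text}`);
    }
  }
}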
package/src/stt/stt.ts CHANGED
@@ -13,6 +13,7 @@ import { DeferredReadableStream } from '../stream/deferred_stream.js';
 import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS, intervalForRetry } from '../types.js';
 import type { AudioBuffer } from '../utils.js';
 import { AsyncIterableQueue, delay, startSoon, toError } from '../utils.js';
+import type { TimedString } from '../voice/index.js';

 /** Indicates start/middle/end of speech */
 export enum SpeechEventType {
@@ -53,6 +54,7 @@ export interface SpeechData {
   startTime: number;
   endTime: number;
   confidence: number;
+  words?: TimedString[];
 }

 export interface RecognitionUsage {
@@ -76,6 +78,13 @@ export interface SpeechEvent {
 export interface STTCapabilities {
   streaming: boolean;
   interimResults: boolean;
+  /**
+   * Whether this STT supports aligned transcripts with word/chunk timestamps.
+   * - 'word': Provider returns word-level timestamps
+   * - 'chunk': Provider returns chunk-level timestamps (e.g., sentence/phrase boundaries)
+   * - false: Provider does not support aligned transcripts
+   */
+  alignedTranscript?: 'word' | 'chunk' | false;
 }

 export interface STTError {
@@ -176,6 +185,7 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
   private deferredInputStream: DeferredReadableStream<AudioFrame>;
   private logger = log();
   private _connOptions: APIConnectOptions;
+  private _startTimeOffset: number = 0;

   protected abortController = new AbortController();

@@ -300,6 +310,17 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
     return this.abortController.signal;
   }

+  get startTimeOffset(): number {
+    return this._startTimeOffset;
+  }
+
+  set startTimeOffset(value: number) {
+    if (value < 0) {
+      throw new Error('startTimeOffset must be non-negative');
+    }
+    this._startTimeOffset = value;
+  }
+
   updateInputStream(audioStream: ReadableStream<AudioFrame>) {
     this.deferredInputStream.setSource(audioStream);
   }
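Downstream features can be gated on the new capability flag instead of probing event payloads. A short sketch, assuming the STT type and its capabilities getter are importable as shown (paths illustrative):

import type { STT } from '@livekit/agents';

// `alignedTranscript` is optional on STTCapabilities, so a missing value is
// treated the same as `false` (no aligned transcript support).
function supportsWordTimestamps(stt: STT): boolean {
  return stt.capabilities.alignedTranscript === 'word';
}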
package/src/telemetry/traces.ts CHANGED
@@ -37,6 +37,8 @@ export interface StartSpanOptions {
   attributes?: Attributes;
   /** Whether to end the span when the function exits (default: true) */
   endOnExit?: boolean;
+  /** Optional start time for the span in milliseconds (Date.now() format) */
+  startTime?: number;
 }

 /**
@@ -79,10 +81,12 @@ class DynamicTracer {
    */
   startSpan(options: StartSpanOptions): Span {
     const ctx = options.context || otelContext.active();
+
     const span = this.tracer.startSpan(
       options.name,
       {
         attributes: options.attributes,
+        startTime: options.startTime,
       },
       ctx,
     );
@@ -101,7 +105,7 @@ class DynamicTracer {
   async startActiveSpan<T>(fn: (span: Span) => Promise<T>, options: StartSpanOptions): Promise<T> {
     const ctx = options.context || otelContext.active();
     const endOnExit = options.endOnExit === undefined ? true : options.endOnExit; // default true
-    const opts: SpanOptions = { attributes: options.attributes };
+    const opts: SpanOptions = { attributes: options.attributes, startTime: options.startTime };

     // Directly return the tracer's startActiveSpan result - it handles async correctly
     return await this.tracer.startActiveSpan(options.name, opts, ctx, async (span) => {
@@ -125,7 +129,7 @@ class DynamicTracer {
   startActiveSpanSync<T>(fn: (span: Span) => T, options: StartSpanOptions): T {
     const ctx = options.context || otelContext.active();
     const endOnExit = options.endOnExit === undefined ? true : options.endOnExit; // default true
-    const opts: SpanOptions = { attributes: options.attributes };
+    const opts: SpanOptions = { attributes: options.attributes, startTime: options.startTime };

     return this.tracer.startActiveSpan(options.name, opts, ctx, (span) => {
       try {
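The pass-through works because SpanOptions.startTime in @opentelemetry/api accepts a TimeInput, which includes epoch milliseconds, so spans can be backdated to when the underlying event actually began. A standalone sketch against the raw OTel API:

import { trace } from '@opentelemetry/api';

const tracer = trace.getTracer('example');

// VAD reports the speech has already been running for 1.5s by the time the
// callback fires; start the span retroactively so its duration covers the
// whole utterance, then end it at an explicit timestamp.
const speechDurationMs = 1500;
const span = tracer.startSpan('user_speaking', { startTime: Date.now() - speechDurationMs });
span.end(Date.now());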
package/src/utils.ts CHANGED
@@ -125,6 +125,7 @@ export class Future<T = void> {
   #resolvePromise!: (value: T) => void;
   #rejectPromise!: (error: Error) => void;
   #done: boolean = false;
+  #rejected: boolean = false;

   constructor() {
     this.#await = new Promise<T>((resolve, reject) => {
@@ -141,6 +142,11 @@ export class Future<T = void> {
     return this.#done;
   }

+  /** Whether the future was rejected (cancelled) */
+  get rejected() {
+    return this.#rejected;
+  }
+
   resolve(value: T) {
     this.#done = true;
     this.#resolvePromise(value);
@@ -148,6 +154,7 @@ export class Future<T = void> {

   reject(error: Error) {
     this.#done = true;
+    this.#rejected = true;
     this.#rejectPromise(error);
   }
 }
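A short sketch of what the new flag distinguishes: `done` is true after either resolve() or reject(), so `done` alone cannot tell a completed future from a cancelled one (the import path is illustrative):

import { Future } from '@livekit/agents';

const fut = new Future<number>();
fut.await.catch(() => {}); // swallow the rejection for this sketch

fut.reject(new Error('cancelled'));

console.log(fut.done);     // true - the future has settled...
console.log(fut.rejected); // true - ...but by rejection, not with a value

if (fut.done && !fut.rejected) {
  // only here is it safe to treat the future as successfully resolved
}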
package/src/voice/agent.ts CHANGED
@@ -271,6 +271,15 @@ export class Agent<UserData = any> {

     const connOptions = activity.agentSession.connOptions.sttConnOptions;
     const stream = wrapped_stt.stream({ connOptions });
+
+    // Set startTimeOffset to provide linear timestamps across reconnections
+    const audioInputStartedAt =
+      activity.agentSession._recorderIO?.recordingStartedAt ?? // Use recording start time if available
+      activity.agentSession._startedAt ?? // Fallback to session start time
+      Date.now(); // Fallback to current time
+
+    stream.startTimeOffset = (Date.now() - audioInputStartedAt) / 1000;
+
     stream.updateInputStream(audio);

     let cleaned = false;
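The offset arithmetic as a worked example (values illustrative): the offset is the number of seconds between the start of the audio input and the creation of this STT stream, so provider-relative timestamps stay on one linear timeline across reconnections:

// Recording began 12.5s before this STT stream was (re)created.
const recordingStartedAt = Date.now() - 12_500;
const startTimeOffset = (Date.now() - recordingStartedAt) / 1000; // 12.5

// A word the provider reports at 1.2s into *this* stream maps to ~13.7s on
// the recording's timeline, the same scale an earlier, pre-reconnect stream
// would have used.
const absoluteStart = startTimeOffset + 1.2; // ≈ 13.7
console.log(absoluteStart);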
package/src/voice/agent_activity.ts CHANGED
@@ -4,7 +4,7 @@
 import { Mutex } from '@livekit/mutex';
 import type { AudioFrame } from '@livekit/rtc-node';
 import type { Span } from '@opentelemetry/api';
-import { ROOT_CONTEXT, trace } from '@opentelemetry/api';
+import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api';
 import { Heap } from 'heap-js';
 import { AsyncLocalStorage } from 'node:async_hooks';
 import { ReadableStream } from 'node:stream/web';
@@ -637,9 +637,12 @@ export class AgentActivity implements RecognitionHooks {
   }

   // recognition hooks
-
-  onStartOfSpeech(_ev: VADEvent): void {
-    this.agentSession._updateUserState('speaking');
+  onStartOfSpeech(ev: VADEvent): void {
+    let speechStartTime = Date.now();
+    if (ev) {
+      speechStartTime = speechStartTime - ev.speechDuration;
+    }
+    this.agentSession._updateUserState('speaking', speechStartTime);
   }

   onEndOfSpeech(ev: VADEvent): void {
@@ -1168,6 +1171,8 @@ export class AgentActivity implements RecognitionHooks {
     replyAbortController: AbortController,
     audio?: ReadableStream<AudioFrame> | null,
   ): Promise<void> {
+    speechHandle._agentTurnContext = otelContext.active();
+
     speechHandleStorage.enterWith(speechHandle);

     const transcriptionOutput = this.agentSession.output.transcriptionEnabled
@@ -1212,13 +1217,18 @@ export class AgentActivity implements RecognitionHooks {
       tasks.push(textForwardTask);
     }

-    const onFirstFrame = () => {
-      this.agentSession._updateAgentState('speaking');
+    const onFirstFrame = (startedSpeakingAt?: number) => {
+      this.agentSession._updateAgentState('speaking', {
+        startTime: startedSpeakingAt,
+        otelContext: speechHandle._agentTurnContext,
+      });
     };

     if (!audioOutput) {
       if (textOut) {
-        textOut.firstTextFut.await.finally(onFirstFrame);
+        textOut.firstTextFut.await
+          .then(() => onFirstFrame())
+          .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
       }
     } else {
       let audioOut: _AudioOut | null = null;
@@ -1249,7 +1259,9 @@ export class AgentActivity implements RecognitionHooks {
         tasks.push(forwardTask);
         audioOut = _audioOut;
       }
-      audioOut.firstFrameFut.await.finally(onFirstFrame);
+      audioOut.firstFrameFut.await
+        .then((ts) => onFirstFrame(ts))
+        .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
     }

     await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
@@ -1303,6 +1315,8 @@ export class AgentActivity implements RecognitionHooks {
     toolsMessages?: ChatItem[];
     span: Span;
   }): Promise<void> => {
+    speechHandle._agentTurnContext = otelContext.active();
+
     span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
     if (instructions) {
       span.setAttribute(traceTypes.ATTR_INSTRUCTIONS, instructions);
@@ -1402,8 +1416,11 @@ export class AgentActivity implements RecognitionHooks {
       textOut = _textOut;
     }

-    const onFirstFrame = () => {
-      this.agentSession._updateAgentState('speaking');
+    const onFirstFrame = (startedSpeakingAt?: number) => {
+      this.agentSession._updateAgentState('speaking', {
+        startTime: startedSpeakingAt,
+        otelContext: speechHandle._agentTurnContext,
+      });
     };

     let audioOut: _AudioOut | null = null;
@@ -1416,12 +1433,16 @@ export class AgentActivity implements RecognitionHooks {
         );
         audioOut = _audioOut;
         tasks.push(forwardTask);
-        audioOut.firstFrameFut.await.finally(onFirstFrame);
+        audioOut.firstFrameFut.await
+          .then((ts) => onFirstFrame(ts))
+          .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
       } else {
         throw Error('ttsStream is null when audioOutput is enabled');
       }
     } else {
-      textOut?.firstTextFut.await.finally(onFirstFrame);
+      textOut?.firstTextFut.await
+        .then(() => onFirstFrame())
+        .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
     }

     //TODO(AJS-272): before executing tools, make sure we generated all the text
@@ -1462,8 +1483,14 @@ export class AgentActivity implements RecognitionHooks {
         msg.createdAt = replyStartedAt;
       }
       this.agent._chatCtx.insert(toolsMessages);
-      // Also add to session history (matches Python agent_session.py _tool_items_added)
-      this.agentSession._toolItemsAdded(toolsMessages as (FunctionCall | FunctionCallOutput)[]);
+      // Only add FunctionCallOutput items to session history since FunctionCall items
+      // were already added by onToolExecutionStarted when the tool execution began
+      const toolCallOutputs = toolsMessages.filter(
+        (m): m is FunctionCallOutput => m.type === 'function_call_output',
+      );
+      if (toolCallOutputs.length > 0) {
+        this.agentSession._toolItemsAdded(toolCallOutputs);
+      }
     }

     if (speechHandle.interrupted) {
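The `(m): m is FunctionCallOutput` predicate above is what lets TypeScript narrow the filtered array from ChatItem[] to FunctionCallOutput[]. The idiom in isolation, with simplified stand-ins for the package's chat item types:

interface FunctionCall { type: 'function_call'; name: string }
interface FunctionCallOutput { type: 'function_call_output'; output: string }
type ChatItem = FunctionCall | FunctionCallOutput;

const items: ChatItem[] = [
  { type: 'function_call', name: 'lookup' },
  { type: 'function_call_output', output: '42' },
];

// Without the predicate, filter() would still return ChatItem[]; with it,
// `outputs` is typed FunctionCallOutput[] and `.output` is accessible.
const outputs = items.filter(
  (m): m is FunctionCallOutput => m.type === 'function_call_output',
);
console.log(outputs[0]?.output); // '42'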
@@ -1487,10 +1514,10 @@ export class AgentActivity implements RecognitionHooks {

     if (audioOutput) {
       const playbackEv = await audioOutput.waitForPlayout();
-      if (audioOut?.firstFrameFut.done) {
+      if (audioOut?.firstFrameFut.done && !audioOut.firstFrameFut.rejected) {
         // playback EV is valid only if the first frame was already played
         this.logger.info(
-          { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
+          { speech_id: speechHandle.id, playbackPositionInS: playbackEv.playbackPosition },
           'playout interrupted',
         );
         if (playbackEv.synchronizedTranscript) {
@@ -1656,8 +1683,18 @@ export class AgentActivity implements RecognitionHooks {
       for (const msg of toolMessages) {
         msg.createdAt = replyStartedAt;
       }
+
       this.agent._chatCtx.insert(toolMessages);
-      this.agentSession._toolItemsAdded(toolMessages as (FunctionCall | FunctionCallOutput)[]);
+
+      // Only add FunctionCallOutput items to session history since FunctionCall items
+      // were already added by onToolExecutionStarted when the tool execution began
+      const toolCallOutputs = toolMessages.filter(
+        (m): m is FunctionCallOutput => m.type === 'function_call_output',
+      );
+
+      if (toolCallOutputs.length > 0) {
+        this.agentSession._toolItemsAdded(toolCallOutputs);
+      }
     }
   };

@@ -1725,6 +1762,8 @@ export class AgentActivity implements RecognitionHooks {
     replyAbortController: AbortController;
     span: Span;
   }): Promise<void> {
+    speechHandle._agentTurnContext = otelContext.active();
+
     span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);

     speechHandleStorage.enterWith(speechHandle);
@@ -1762,8 +1801,11 @@ export class AgentActivity implements RecognitionHooks {
       return;
     }

-    const onFirstFrame = () => {
-      this.agentSession._updateAgentState('speaking');
+    const onFirstFrame = (startedSpeakingAt?: number) => {
+      this.agentSession._updateAgentState('speaking', {
+        startTime: startedSpeakingAt,
+        otelContext: speechHandle._agentTurnContext,
+      });
     };

     const readMessages = async (
@@ -1851,10 +1893,14 @@ export class AgentActivity implements RecognitionHooks {
         );
         forwardTasks.push(forwardTask);
         audioOut = _audioOut;
-        audioOut.firstFrameFut.await.finally(onFirstFrame);
+        audioOut.firstFrameFut.await
+          .then((ts) => onFirstFrame(ts))
+          .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
       }
     } else if (textOut) {
-      textOut.firstTextFut.await.finally(onFirstFrame);
+      textOut.firstTextFut.await
+        .then(() => onFirstFrame())
+        .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
     }
     outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
   }
@@ -1955,11 +2001,11 @@ export class AgentActivity implements RecognitionHooks {
     if (audioOutput) {
       audioOutput.clearBuffer();
       const playbackEv = await audioOutput.waitForPlayout();
-      let playbackPosition = playbackEv.playbackPosition;
-      if (audioOut?.firstFrameFut.done) {
+      let playbackPositionInS = playbackEv.playbackPosition;
+      if (audioOut?.firstFrameFut.done && !audioOut.firstFrameFut.rejected) {
         // playback EV is valid only if the first frame was already played
         this.logger.info(
-          { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
+          { speech_id: speechHandle.id, playbackPositionInS },
          'playout interrupted',
        );
        if (playbackEv.synchronizedTranscript) {
@@ -1967,13 +2013,13 @@ export class AgentActivity implements RecognitionHooks {
       }
     } else {
       forwardedText = '';
-      playbackPosition = 0;
+      playbackPositionInS = 0;
     }

     // truncate server-side message
     this.realtimeSession.truncate({
       messageId: msgId,
-      audioEndMs: Math.floor(playbackPosition),
+      audioEndMs: Math.floor(playbackPositionInS * 1000),
       modalities: msgModalities,
       audioTranscript: forwardedText,
     });
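The unit handling in the truncate call as plain arithmetic, reading playbackPosition from waitForPlayout() as seconds (which the InS suffix above makes explicit):

const playbackPositionInS = 2.35; // seconds of audio actually played
const audioEndMs = Math.floor(playbackPositionInS * 1000); // 2350 ms, not 2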
package/src/voice/agent_session.ts CHANGED
@@ -677,7 +677,7 @@ export class AgentSession<
   }

   /** @internal */
-  _updateAgentState(state: AgentState) {
+  _updateAgentState(state: AgentState, options?: { startTime?: number; otelContext?: Context }) {
     if (this._agentState === state) {
       return;
     }
@@ -690,7 +690,8 @@ export class AgentSession<
     if (this.agentSpeakingSpan === undefined) {
       this.agentSpeakingSpan = tracer.startSpan({
         name: 'agent_speaking',
-        context: this.rootSpanContext,
+        context: options?.otelContext ?? this.rootSpanContext,
+        startTime: options?.startTime,
       });

       // TODO(brian): PR4 - Set participant attributes if roomIO.room.localParticipant is available
@@ -719,7 +720,7 @@ export class AgentSession<
   }

   /** @internal */
-  _updateUserState(state: UserState, _lastSpeakingTime?: number) {
+  _updateUserState(state: UserState, lastSpeakingTime?: number) {
     if (this.userState === state) {
       return;
     }
@@ -728,13 +729,13 @@ export class AgentSession<
       this.userSpeakingSpan = tracer.startSpan({
         name: 'user_speaking',
         context: this.rootSpanContext,
+        startTime: lastSpeakingTime,
       });

       // TODO(brian): PR4 - Set participant attributes if roomIO.linkedParticipant is available
       // (Ref: Python agent_session.py line 1192-1195)
     } else if (this.userSpeakingSpan !== undefined) {
-      // TODO(brian): PR4 - Set ATTR_END_TIME attribute with lastSpeakingTime if available
-      this.userSpeakingSpan.end();
+      this.userSpeakingSpan.end(lastSpeakingTime);
       this.userSpeakingSpan = undefined;
     }

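Passing an otelContext through lets the agent_speaking span nest under the turn span even when it is created from a callback that runs outside the turn's active scope. A standalone sketch with the raw OTel API:

import { context as otelContext, trace } from '@opentelemetry/api';

const tracer = trace.getTracer('example');

// Capture the context while the turn span is active...
const turnSpan = tracer.startSpan('agent_turn');
const turnContext = trace.setSpan(otelContext.active(), turnSpan);

// ...and later create the speaking span as its child by passing the captured
// context explicitly instead of relying on the (now different) active one.
const speakingSpan = tracer.startSpan('agent_speaking', {}, turnContext);
speakingSpan.end();
turnSpan.end();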
package/src/voice/audio_recognition.ts CHANGED
@@ -566,9 +566,11 @@ export class AudioRecognition {
     this.speaking = true;

     if (!this.userTurnSpan) {
+      const startTime = Date.now() - ev.speechDuration;
       this.userTurnSpan = tracer.startSpan({
         name: 'user_turn',
         context: this.rootSpanContext,
+        startTime,
       });
     }

package/src/voice/avatar/datastream_io.ts CHANGED
@@ -47,6 +47,7 @@ export class DataStreamAudioOutput extends AudioOutput {
   private started: boolean = false;
   private lock = new Mutex();
   private startTask?: Task<void>;
+  private firstFrameEmitted: boolean = false;

   #logger = log();

@@ -146,6 +147,11 @@ export class DataStreamAudioOutput extends AudioOutput {
     await this.startTask.result;
     await super.captureFrame(frame);

+    if (!this.firstFrameEmitted) {
+      this.firstFrameEmitted = true;
+      this.onPlaybackStarted(Date.now());
+    }
+
     if (!this.streamWriter) {
       this.streamWriter = await this.room.localParticipant!.streamBytes({
         name: shortuuid('AUDIO_'),
@@ -174,6 +180,8 @@ export class DataStreamAudioOutput extends AudioOutput {
     this.streamWriter.close().finally(() => {
       this.streamWriter = undefined;
     });
+
+    this.firstFrameEmitted = false;
   }

   clearBuffer(): void {
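The once-per-stream latch used above, reduced to a standalone sketch (the class and callback names are illustrative, not part of the package API):

// Report a playback-started timestamp exactly once per stream; reset the
// latch on flush so the next utterance reports its own first frame.
class FirstFrameLatch {
  private emitted = false;

  constructor(private readonly onStarted: (timestampMs: number) => void) {}

  onFrame(): void {
    if (this.emitted) return;
    this.emitted = true;
    this.onStarted(Date.now());
  }

  reset(): void {
    this.emitted = false;
  }
}

const latch = new FirstFrameLatch((ts) => console.log('playback started at', ts));
latch.onFrame(); // fires the callback
latch.onFrame(); // no-op
latch.reset();   // the next onFrame() will fire again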
package/src/voice/generation.ts CHANGED
@@ -27,7 +27,7 @@ import { traceTypes, tracer } from '../telemetry/index.js';
 import { Future, Task, shortuuid, toError, waitForAbort } from '../utils.js';
 import { type Agent, type ModelSettings, asyncLocalStorage, isStopResponse } from './agent.js';
 import type { AgentSession } from './agent_session.js';
-import type { AudioOutput, LLMNode, TTSNode, TextOutput } from './io.js';
+import { AudioOutput, type LLMNode, type TTSNode, type TextOutput } from './io.js';
 import { RunContext } from './run_context.js';
 import type { SpeechHandle } from './speech_handle.js';
@@ -608,7 +608,8 @@ export function performTextForwarding(

 export interface _AudioOut {
   audio: Array<AudioFrame>;
-  firstFrameFut: Future;
+  /** Future that will be set with the timestamp of the first frame's capture */
+  firstFrameFut: Future<number>;
 }

 async function forwardAudio(
@@ -620,7 +621,16 @@ async function forwardAudio(
   const reader = ttsStream.getReader();
   let resampler: AudioResampler | null = null;

+  const onPlaybackStarted = (ev: { createdAt: number }) => {
+    if (!out.firstFrameFut.done) {
+      out.firstFrameFut.resolve(ev.createdAt);
+    }
+  };
+
   try {
+    audioOuput.on(AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted);
+    audioOuput.resume();
+
     while (true) {
       if (signal?.aborted) {
         break;
@@ -647,20 +657,21 @@ async function forwardAudio(
       } else {
         await audioOuput.captureFrame(frame);
       }
-
-      // set the first frame future if not already set
-      // (after completing the first frame)
-      if (!out.firstFrameFut.done) {
-        out.firstFrameFut.resolve();
-      }
     }
-  } finally {
-    reader?.releaseLock();
+
     if (resampler) {
       for (const f of resampler.flush()) {
         await audioOuput.captureFrame(f);
       }
     }
+  } finally {
+    audioOuput.off(AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted);
+
+    if (!out.firstFrameFut.done) {
+      out.firstFrameFut.reject(new Error('audio forwarding cancelled before playback started'));
+    }
+
+    reader?.releaseLock();
     audioOuput.flush();
   }
 }
@@ -670,10 +681,11 @@ export function performAudioForwarding(
   audioOutput: AudioOutput,
   controller: AbortController,
 ): [Task<void>, _AudioOut] {
-  const out = {
+  const out: _AudioOut = {
     audio: [],
-    firstFrameFut: new Future(),
+    firstFrameFut: new Future<number>(),
   };
+
   return [
     Task.from(
       (controller) => forwardAudio(ttsStream, audioOutput, out, controller.signal),
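The settle-on-event / reject-on-cancel pattern from forwardAudio, as a standalone sketch using Node's EventEmitter in place of AudioOutput (event and field names are illustrative):

import { EventEmitter } from 'node:events';

// Resolve with the timestamp carried by the first 'playbackStarted' event,
// or reject if the stream is torn down before playback ever begins,
// mirroring how firstFrameFut is settled in the diff above.
function firstPlayback(emitter: EventEmitter, signal: AbortSignal): Promise<number> {
  return new Promise<number>((resolve, reject) => {
    const onStarted = (ev: { createdAt: number }) => resolve(ev.createdAt);
    emitter.once('playbackStarted', onStarted);
    signal.addEventListener('abort', () => {
      emitter.off('playbackStarted', onStarted);
      reject(new Error('audio forwarding cancelled before playback started'));
    });
  });
}

const emitter = new EventEmitter();
const ac = new AbortController();
firstPlayback(emitter, ac.signal).then(
  (ts) => console.log('first frame at', ts),
  (err) => console.log('cancelled:', (err as Error).message),
);
emitter.emit('playbackStarted', { createdAt: Date.now() });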