@livekit/agents 1.0.37 → 1.0.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. package/dist/cli.cjs.map +1 -1
  2. package/dist/inference/api_protos.cjs +68 -0
  3. package/dist/inference/api_protos.cjs.map +1 -1
  4. package/dist/inference/api_protos.d.cts +345 -4
  5. package/dist/inference/api_protos.d.ts +345 -4
  6. package/dist/inference/api_protos.d.ts.map +1 -1
  7. package/dist/inference/api_protos.js +60 -0
  8. package/dist/inference/api_protos.js.map +1 -1
  9. package/dist/inference/llm.cjs +7 -3
  10. package/dist/inference/llm.cjs.map +1 -1
  11. package/dist/inference/llm.d.cts +5 -6
  12. package/dist/inference/llm.d.ts +5 -6
  13. package/dist/inference/llm.d.ts.map +1 -1
  14. package/dist/inference/llm.js +7 -3
  15. package/dist/inference/llm.js.map +1 -1
  16. package/dist/inference/stt.cjs +32 -21
  17. package/dist/inference/stt.cjs.map +1 -1
  18. package/dist/inference/stt.d.cts +5 -4
  19. package/dist/inference/stt.d.ts +5 -4
  20. package/dist/inference/stt.d.ts.map +1 -1
  21. package/dist/inference/stt.js +34 -21
  22. package/dist/inference/stt.js.map +1 -1
  23. package/dist/inference/tts.cjs.map +1 -1
  24. package/dist/inference/tts.d.cts +10 -7
  25. package/dist/inference/tts.d.ts +10 -7
  26. package/dist/inference/tts.d.ts.map +1 -1
  27. package/dist/inference/tts.js.map +1 -1
  28. package/dist/ipc/inference_proc_executor.cjs.map +1 -1
  29. package/dist/ipc/job_proc_executor.cjs.map +1 -1
  30. package/dist/stt/stream_adapter.cjs +9 -1
  31. package/dist/stt/stream_adapter.cjs.map +1 -1
  32. package/dist/stt/stream_adapter.d.ts.map +1 -1
  33. package/dist/stt/stream_adapter.js +9 -1
  34. package/dist/stt/stream_adapter.js.map +1 -1
  35. package/dist/stt/stt.cjs +10 -0
  36. package/dist/stt/stt.cjs.map +1 -1
  37. package/dist/stt/stt.d.cts +12 -0
  38. package/dist/stt/stt.d.ts +12 -0
  39. package/dist/stt/stt.d.ts.map +1 -1
  40. package/dist/stt/stt.js +10 -0
  41. package/dist/stt/stt.js.map +1 -1
  42. package/dist/telemetry/traces.cjs +4 -3
  43. package/dist/telemetry/traces.cjs.map +1 -1
  44. package/dist/telemetry/traces.d.cts +2 -0
  45. package/dist/telemetry/traces.d.ts +2 -0
  46. package/dist/telemetry/traces.d.ts.map +1 -1
  47. package/dist/telemetry/traces.js +4 -3
  48. package/dist/telemetry/traces.js.map +1 -1
  49. package/dist/utils.cjs +11 -0
  50. package/dist/utils.cjs.map +1 -1
  51. package/dist/utils.d.cts +10 -0
  52. package/dist/utils.d.ts +10 -0
  53. package/dist/utils.d.ts.map +1 -1
  54. package/dist/utils.js +10 -0
  55. package/dist/utils.js.map +1 -1
  56. package/dist/voice/agent.cjs +6 -2
  57. package/dist/voice/agent.cjs.map +1 -1
  58. package/dist/voice/agent.d.ts.map +1 -1
  59. package/dist/voice/agent.js +6 -2
  60. package/dist/voice/agent.js.map +1 -1
  61. package/dist/voice/agent_activity.cjs +72 -37
  62. package/dist/voice/agent_activity.cjs.map +1 -1
  63. package/dist/voice/agent_activity.d.cts +2 -1
  64. package/dist/voice/agent_activity.d.ts +2 -1
  65. package/dist/voice/agent_activity.d.ts.map +1 -1
  66. package/dist/voice/agent_activity.js +73 -38
  67. package/dist/voice/agent_activity.js.map +1 -1
  68. package/dist/voice/agent_session.cjs +7 -5
  69. package/dist/voice/agent_session.cjs.map +1 -1
  70. package/dist/voice/agent_session.d.cts +5 -2
  71. package/dist/voice/agent_session.d.ts +5 -2
  72. package/dist/voice/agent_session.d.ts.map +1 -1
  73. package/dist/voice/agent_session.js +7 -5
  74. package/dist/voice/agent_session.js.map +1 -1
  75. package/dist/voice/audio_recognition.cjs +3 -1
  76. package/dist/voice/audio_recognition.cjs.map +1 -1
  77. package/dist/voice/audio_recognition.d.ts.map +1 -1
  78. package/dist/voice/audio_recognition.js +3 -1
  79. package/dist/voice/audio_recognition.js.map +1 -1
  80. package/dist/voice/avatar/datastream_io.cjs +6 -0
  81. package/dist/voice/avatar/datastream_io.cjs.map +1 -1
  82. package/dist/voice/avatar/datastream_io.d.cts +1 -0
  83. package/dist/voice/avatar/datastream_io.d.ts +1 -0
  84. package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
  85. package/dist/voice/avatar/datastream_io.js +6 -0
  86. package/dist/voice/avatar/datastream_io.js.map +1 -1
  87. package/dist/voice/background_audio.cjs.map +1 -1
  88. package/dist/voice/generation.cjs +14 -5
  89. package/dist/voice/generation.cjs.map +1 -1
  90. package/dist/voice/generation.d.cts +3 -2
  91. package/dist/voice/generation.d.ts +3 -2
  92. package/dist/voice/generation.d.ts.map +1 -1
  93. package/dist/voice/generation.js +14 -5
  94. package/dist/voice/generation.js.map +1 -1
  95. package/dist/voice/io.cjs +12 -0
  96. package/dist/voice/io.cjs.map +1 -1
  97. package/dist/voice/io.d.cts +19 -1
  98. package/dist/voice/io.d.ts +19 -1
  99. package/dist/voice/io.d.ts.map +1 -1
  100. package/dist/voice/io.js +12 -0
  101. package/dist/voice/io.js.map +1 -1
  102. package/dist/voice/recorder_io/recorder_io.cjs +91 -28
  103. package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
  104. package/dist/voice/recorder_io/recorder_io.d.cts +7 -1
  105. package/dist/voice/recorder_io/recorder_io.d.ts +7 -1
  106. package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
  107. package/dist/voice/recorder_io/recorder_io.js +91 -28
  108. package/dist/voice/recorder_io/recorder_io.js.map +1 -1
  109. package/dist/voice/room_io/_input.cjs +40 -11
  110. package/dist/voice/room_io/_input.cjs.map +1 -1
  111. package/dist/voice/room_io/_input.d.cts +4 -1
  112. package/dist/voice/room_io/_input.d.ts +4 -1
  113. package/dist/voice/room_io/_input.d.ts.map +1 -1
  114. package/dist/voice/room_io/_input.js +31 -2
  115. package/dist/voice/room_io/_input.js.map +1 -1
  116. package/dist/voice/room_io/_output.cjs +6 -0
  117. package/dist/voice/room_io/_output.cjs.map +1 -1
  118. package/dist/voice/room_io/_output.d.cts +1 -0
  119. package/dist/voice/room_io/_output.d.ts +1 -0
  120. package/dist/voice/room_io/_output.d.ts.map +1 -1
  121. package/dist/voice/room_io/_output.js +6 -0
  122. package/dist/voice/room_io/_output.js.map +1 -1
  123. package/dist/voice/room_io/room_io.cjs.map +1 -1
  124. package/dist/voice/room_io/room_io.d.cts +2 -2
  125. package/dist/voice/room_io/room_io.d.ts +2 -2
  126. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  127. package/dist/voice/room_io/room_io.js.map +1 -1
  128. package/dist/voice/speech_handle.cjs +2 -0
  129. package/dist/voice/speech_handle.cjs.map +1 -1
  130. package/dist/voice/speech_handle.d.cts +3 -0
  131. package/dist/voice/speech_handle.d.ts +3 -0
  132. package/dist/voice/speech_handle.d.ts.map +1 -1
  133. package/dist/voice/speech_handle.js +2 -0
  134. package/dist/voice/speech_handle.js.map +1 -1
  135. package/package.json +2 -2
  136. package/src/inference/api_protos.ts +83 -0
  137. package/src/inference/llm.ts +20 -15
  138. package/src/inference/stt.ts +48 -29
  139. package/src/inference/tts.ts +36 -16
  140. package/src/stt/stream_adapter.ts +12 -1
  141. package/src/stt/stt.ts +21 -0
  142. package/src/telemetry/traces.ts +6 -2
  143. package/src/utils.ts +21 -0
  144. package/src/voice/agent.ts +11 -2
  145. package/src/voice/agent_activity.ts +108 -41
  146. package/src/voice/agent_session.ts +6 -5
  147. package/src/voice/audio_recognition.ts +2 -0
  148. package/src/voice/avatar/datastream_io.ts +8 -0
  149. package/src/voice/generation.ts +24 -12
  150. package/src/voice/io.ts +27 -5
  151. package/src/voice/recorder_io/recorder_io.ts +123 -31
  152. package/src/voice/room_io/_input.ts +32 -4
  153. package/src/voice/room_io/_output.ts +8 -0
  154. package/src/voice/room_io/room_io.ts +3 -1
  155. package/src/voice/speech_handle.ts +4 -0
package/src/voice/agent_activity.ts CHANGED
@@ -4,7 +4,7 @@
 import { Mutex } from '@livekit/mutex';
 import type { AudioFrame } from '@livekit/rtc-node';
 import type { Span } from '@opentelemetry/api';
-import { ROOT_CONTEXT, trace } from '@opentelemetry/api';
+import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api';
 import { Heap } from 'heap-js';
 import { AsyncLocalStorage } from 'node:async_hooks';
 import { ReadableStream } from 'node:stream/web';
@@ -194,12 +194,13 @@ export class AgentActivity implements RecognitionHooks {
     if (
       !this.vad &&
       this.stt &&
+      !this.stt.capabilities.streaming &&
       this.llm instanceof LLM &&
       this.allowInterruptions &&
       this.turnDetectionMode === undefined
     ) {
       this.logger.warn(
-        'VAD is not set. Enabling VAD is recommended when using LLM and STT ' +
+        'VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT ' +
          'for more responsive interruption handling.',
       );
     }
@@ -637,9 +638,12 @@ export class AgentActivity implements RecognitionHooks {
   }

   // recognition hooks
-
-  onStartOfSpeech(_ev: VADEvent): void {
-    this.agentSession._updateUserState('speaking');
+  onStartOfSpeech(ev: VADEvent): void {
+    let speechStartTime = Date.now();
+    if (ev) {
+      speechStartTime = speechStartTime - ev.speechDuration;
+    }
+    this.agentSession._updateUserState('speaking', speechStartTime);
   }

   onEndOfSpeech(ev: VADEvent): void {
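Note: onStartOfSpeech now back-dates the user's 'speaking' state by the VAD-reported speech duration, so the user_speaking span (see the agent_session.ts hunks below) starts at the estimated speech onset rather than at event arrival. A minimal sketch of the arithmetic, assuming ev.speechDuration is in milliseconds (consistent with its subtraction from Date.now()):

    // Illustrative sketch, not package source.
    const now = Date.now();                         // e.g. 1_700_000_001_500
    const speechDurationMs = 500;                   // VAD already observed 500 ms of speech
    const speechStartTime = now - speechDurationMs; // 1_700_000_001_000
    // _updateUserState('speaking', speechStartTime) starts the span at onset.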
@@ -656,12 +660,14 @@
       return;
     }

-    if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
-      // skip speech handle interruption if server side turn detection is enabled
-      return;
+    if (ev.speechDuration >= this.agentSession.options.minInterruptionDuration) {
+      this.interruptByAudioActivity();
     }
+  }

-    if (ev.speechDuration < this.agentSession.options.minInterruptionDuration) {
+  private interruptByAudioActivity(): void {
+    if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
+      // skip speech handle interruption if server side turn detection is enabled
       return;
     }

@@ -691,7 +697,10 @@
       !this._currentSpeech.interrupted &&
       this._currentSpeech.allowInterruptions
     ) {
-      this.logger.info({ 'speech id': this._currentSpeech.id }, 'speech interrupted by VAD');
+      this.logger.info(
+        { 'speech id': this._currentSpeech.id },
+        'speech interrupted by audio activity',
+      );
       this.realtimeSession?.interrupt();
       this._currentSpeech.interrupt();
     }
@@ -712,6 +721,10 @@
         // TODO(AJS-106): add multi participant support
       }),
     );
+
+    if (ev.alternatives![0].text) {
+      this.interruptByAudioActivity();
+    }
   }

   onFinalTranscript(ev: SpeechEvent): void {
@@ -729,6 +742,20 @@
         // TODO(AJS-106): add multi participant support
       }),
     );
+
+    // agent speech might not be interrupted if VAD failed and a final transcript is received
+    // we call interruptByAudioActivity (idempotent) to pause the speech, if possible
+    if (
+      this.audioRecognition &&
+      this.turnDetection !== 'manual' &&
+      this.turnDetection !== 'realtime_llm'
+    ) {
+      this.interruptByAudioActivity();
+
+      // TODO: resume false interruption - schedule a resume timer if interrupted after end_of_speech
+    }
+
+    // TODO: resume false interruption - start interrupt paused speech task
   }

   onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
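Note: together these hunks route every interruption trigger through the new idempotent interruptByAudioActivity(): VAD end-of-speech (once the speech reaches minInterruptionDuration), a non-empty interim transcript, and a final transcript as a fallback for when VAD missed the speech entirely. A condensed, illustrative sketch of the flow (method shapes simplified; not package source):

    class InterruptionFlowSketch {
      onEndOfSpeech(speechDuration: number, minInterruptionDuration: number): void {
        if (speechDuration >= minInterruptionDuration) this.interruptByAudioActivity();
      }
      onInterimTranscript(text: string): void {
        if (text) this.interruptByAudioActivity(); // STT heard words mid-playback
      }
      onFinalTranscript(): void {
        this.interruptByAudioActivity(); // fallback when VAD failed to fire
      }
      private interruptByAudioActivity(): void {
        // idempotent: no-op when server-side turn detection is active,
        // when speech is already interrupted, or when interruptions are disallowed
      }
    }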
@@ -1168,6 +1195,8 @@
     replyAbortController: AbortController,
     audio?: ReadableStream<AudioFrame> | null,
   ): Promise<void> {
+    speechHandle._agentTurnContext = otelContext.active();
+
     speechHandleStorage.enterWith(speechHandle);

     const transcriptionOutput = this.agentSession.output.transcriptionEnabled
@@ -1212,13 +1241,18 @@
       tasks.push(textForwardTask);
     }

-    const onFirstFrame = () => {
-      this.agentSession._updateAgentState('speaking');
+    const onFirstFrame = (startedSpeakingAt?: number) => {
+      this.agentSession._updateAgentState('speaking', {
+        startTime: startedSpeakingAt,
+        otelContext: speechHandle._agentTurnContext,
+      });
     };

     if (!audioOutput) {
       if (textOut) {
-        textOut.firstTextFut.await.finally(onFirstFrame);
+        textOut.firstTextFut.await
+          .then(() => onFirstFrame())
+          .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
       }
     } else {
       let audioOut: _AudioOut | null = null;
@@ -1249,7 +1283,9 @@
         tasks.push(forwardTask);
         audioOut = _audioOut;
       }
-      audioOut.firstFrameFut.await.finally(onFirstFrame);
+      audioOut.firstFrameFut.await
+        .then((ts) => onFirstFrame(ts))
+        .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
     }

     await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
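Note: the repeated .finally(onFirstFrame) to .then(...).catch(...) rewrite is load-bearing: firstFrameFut is now a Future<number> that rejects when forwarding is cancelled before playback starts (see the generation.ts hunks below). A sketch of the behavioral difference:

    // Before: ran unconditionally, marking the agent 'speaking' even when
    // the future rejected because forwarding was cancelled.
    firstFrameFut.await.finally(onFirstFrame);

    // After: only a resolved future (carrying the first frame's timestamp)
    // flips the agent state; rejection is logged and swallowed.
    firstFrameFut.await
      .then((ts) => onFirstFrame(ts))
      .catch(() => logger.debug('firstFrameFut cancelled before first frame'));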
@@ -1303,6 +1339,8 @@
     toolsMessages?: ChatItem[];
     span: Span;
   }): Promise<void> => {
+    speechHandle._agentTurnContext = otelContext.active();
+
     span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
     if (instructions) {
       span.setAttribute(traceTypes.ATTR_INSTRUCTIONS, instructions);
@@ -1402,8 +1440,11 @@
       textOut = _textOut;
     }

-    const onFirstFrame = () => {
-      this.agentSession._updateAgentState('speaking');
+    const onFirstFrame = (startedSpeakingAt?: number) => {
+      this.agentSession._updateAgentState('speaking', {
+        startTime: startedSpeakingAt,
+        otelContext: speechHandle._agentTurnContext,
+      });
     };

     let audioOut: _AudioOut | null = null;
@@ -1416,12 +1457,16 @@
         );
         audioOut = _audioOut;
         tasks.push(forwardTask);
-        audioOut.firstFrameFut.await.finally(onFirstFrame);
+        audioOut.firstFrameFut.await
+          .then((ts) => onFirstFrame(ts))
+          .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
       } else {
         throw Error('ttsStream is null when audioOutput is enabled');
       }
     } else {
-      textOut?.firstTextFut.await.finally(onFirstFrame);
+      textOut?.firstTextFut.await
+        .then(() => onFirstFrame())
+        .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
     }

     //TODO(AJS-272): before executing tools, make sure we generated all the text
@@ -1462,8 +1507,14 @@
         msg.createdAt = replyStartedAt;
       }
       this.agent._chatCtx.insert(toolsMessages);
-      // Also add to session history (matches Python agent_session.py _tool_items_added)
-      this.agentSession._toolItemsAdded(toolsMessages as (FunctionCall | FunctionCallOutput)[]);
+      // Only add FunctionCallOutput items to session history since FunctionCall items
+      // were already added by onToolExecutionStarted when the tool execution began
+      const toolCallOutputs = toolsMessages.filter(
+        (m): m is FunctionCallOutput => m.type === 'function_call_output',
+      );
+      if (toolCallOutputs.length > 0) {
+        this.agentSession._toolItemsAdded(toolCallOutputs);
+      }
     }

     if (speechHandle.interrupted) {
@@ -1487,10 +1538,10 @@

     if (audioOutput) {
       const playbackEv = await audioOutput.waitForPlayout();
-      if (audioOut?.firstFrameFut.done) {
+      if (audioOut?.firstFrameFut.done && !audioOut.firstFrameFut.rejected) {
         // playback EV is valid only if the first frame was already played
         this.logger.info(
-          { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
+          { speech_id: speechHandle.id, playbackPositionInS: playbackEv.playbackPosition },
           'playout interrupted',
         );
         if (playbackEv.synchronizedTranscript) {
@@ -1656,8 +1707,18 @@
       for (const msg of toolMessages) {
         msg.createdAt = replyStartedAt;
       }
+
       this.agent._chatCtx.insert(toolMessages);
-      this.agentSession._toolItemsAdded(toolMessages as (FunctionCall | FunctionCallOutput)[]);
+
+      // Only add FunctionCallOutput items to session history since FunctionCall items
+      // were already added by onToolExecutionStarted when the tool execution began
+      const toolCallOutputs = toolMessages.filter(
+        (m): m is FunctionCallOutput => m.type === 'function_call_output',
+      );
+
+      if (toolCallOutputs.length > 0) {
+        this.agentSession._toolItemsAdded(toolCallOutputs);
+      }
     }
   };

@@ -1725,6 +1786,8 @@
     replyAbortController: AbortController;
     span: Span;
   }): Promise<void> {
+    speechHandle._agentTurnContext = otelContext.active();
+
     span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);

     speechHandleStorage.enterWith(speechHandle);
@@ -1762,8 +1825,11 @@
       return;
     }

-    const onFirstFrame = () => {
-      this.agentSession._updateAgentState('speaking');
+    const onFirstFrame = (startedSpeakingAt?: number) => {
+      this.agentSession._updateAgentState('speaking', {
+        startTime: startedSpeakingAt,
+        otelContext: speechHandle._agentTurnContext,
+      });
     };

     const readMessages = async (
@@ -1851,10 +1917,14 @@
           );
           forwardTasks.push(forwardTask);
           audioOut = _audioOut;
-          audioOut.firstFrameFut.await.finally(onFirstFrame);
+          audioOut.firstFrameFut.await
+            .then((ts) => onFirstFrame(ts))
+            .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
         }
       } else if (textOut) {
-        textOut.firstTextFut.await.finally(onFirstFrame);
+        textOut.firstTextFut.await
+          .then(() => onFirstFrame())
+          .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
       }
       outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
     }
@@ -1936,7 +2006,6 @@

     if (audioOutput) {
       await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
-      this.agentSession._updateAgentState('listening');
     }

     if (speechHandle.interrupted) {
@@ -1955,11 +2024,11 @@
       if (audioOutput) {
         audioOutput.clearBuffer();
         const playbackEv = await audioOutput.waitForPlayout();
-        let playbackPosition = playbackEv.playbackPosition;
-        if (audioOut?.firstFrameFut.done) {
+        let playbackPositionInS = playbackEv.playbackPosition;
+        if (audioOut?.firstFrameFut.done && !audioOut.firstFrameFut.rejected) {
           // playback EV is valid only if the first frame was already played
           this.logger.info(
-            { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
+            { speech_id: speechHandle.id, playbackPositionInS },
            'playout interrupted',
          );
          if (playbackEv.synchronizedTranscript) {
@@ -1967,13 +2036,13 @@
          }
        } else {
          forwardedText = '';
-          playbackPosition = 0;
+          playbackPositionInS = 0;
        }

        // truncate server-side message
        this.realtimeSession.truncate({
          messageId: msgId,
-          audioEndMs: Math.floor(playbackPosition),
+          audioEndMs: Math.floor(playbackPositionInS * 1000),
          modalities: msgModalities,
          audioTranscript: forwardedText,
        });
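Note: the rename to playbackPositionInS documents the unit bug this hunk fixes. playbackPosition is reported in seconds (see the PlaybackFinishedEvent doc comment in io.ts below), while truncate() expects milliseconds. A worked example with illustrative values:

    const playbackPositionInS = 2.5;        // 2.5 s of audio actually played
    Math.floor(playbackPositionInS);        // before: 2    -> truncated at 2 ms
    Math.floor(playbackPositionInS * 1000); // after:  2500 -> truncated at 2.5 s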
@@ -2023,17 +2092,15 @@
     speechHandle._markGenerationDone();
     // TODO(brian): close tees

-    toolOutput.firstToolStartedFuture.await.finally(() => {
-      this.agentSession._updateAgentState('thinking');
-    });
-
     await executeToolsTask.result;

+    if (toolOutput.output.length > 0) {
+      this.agentSession._updateAgentState('thinking');
+    } else if (this.agentSession.agentState === 'speaking') {
+      this.agentSession._updateAgentState('listening');
+    }
+
     if (toolOutput.output.length === 0) {
-      // return to listening state for thinking-only turns (no audio output, no tools)
-      if (!speechHandle.interrupted) {
-        this.agentSession._updateAgentState('listening');
-      }
       return;
     }

package/src/voice/agent_session.ts CHANGED
@@ -677,7 +677,7 @@ export class AgentSession<
   }

   /** @internal */
-  _updateAgentState(state: AgentState) {
+  _updateAgentState(state: AgentState, options?: { startTime?: number; otelContext?: Context }) {
     if (this._agentState === state) {
       return;
     }
@@ -690,7 +690,8 @@
     if (this.agentSpeakingSpan === undefined) {
       this.agentSpeakingSpan = tracer.startSpan({
         name: 'agent_speaking',
-        context: this.rootSpanContext,
+        context: options?.otelContext ?? this.rootSpanContext,
+        startTime: options?.startTime,
       });

       // TODO(brian): PR4 - Set participant attributes if roomIO.room.localParticipant is available
@@ -719,7 +720,7 @@
   }

   /** @internal */
-  _updateUserState(state: UserState, _lastSpeakingTime?: number) {
+  _updateUserState(state: UserState, lastSpeakingTime?: number) {
     if (this.userState === state) {
       return;
     }
@@ -728,13 +729,13 @@
       this.userSpeakingSpan = tracer.startSpan({
         name: 'user_speaking',
         context: this.rootSpanContext,
+        startTime: lastSpeakingTime,
       });

       // TODO(brian): PR4 - Set participant attributes if roomIO.linkedParticipant is available
       // (Ref: Python agent_session.py line 1192-1195)
     } else if (this.userSpeakingSpan !== undefined) {
-      // TODO(brian): PR4 - Set ATTR_END_TIME attribute with lastSpeakingTime if available
-      this.userSpeakingSpan.end();
+      this.userSpeakingSpan.end(lastSpeakingTime);
       this.userSpeakingSpan = undefined;
     }

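Note: the user_speaking span now carries real wall-clock timing: it starts at the back-dated onset supplied by onStartOfSpeech and ends at lastSpeakingTime (OpenTelemetry's Span.end() accepts an optional end timestamp). A sketch with illustrative values; _updateUserState is internal API, shown only to trace the data flow:

    // Entering 'speaking': span starts ~500 ms before the event arrived.
    const speechStartTime = Date.now() - 500;
    agentSession._updateUserState('speaking', speechStartTime);

    // Leaving 'speaking': span ends at the recorded time, not at call time.
    // internally: this.userSpeakingSpan.end(lastSpeakingTime);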
package/src/voice/audio_recognition.ts CHANGED
@@ -566,9 +566,11 @@ export class AudioRecognition {
     this.speaking = true;

     if (!this.userTurnSpan) {
+      const startTime = Date.now() - ev.speechDuration;
       this.userTurnSpan = tracer.startSpan({
         name: 'user_turn',
         context: this.rootSpanContext,
+        startTime,
       });
     }

package/src/voice/avatar/datastream_io.ts CHANGED
@@ -47,6 +47,7 @@ export class DataStreamAudioOutput extends AudioOutput {
   private started: boolean = false;
   private lock = new Mutex();
   private startTask?: Task<void>;
+  private firstFrameEmitted: boolean = false;

   #logger = log();

@@ -146,6 +147,11 @@
     await this.startTask.result;
     await super.captureFrame(frame);

+    if (!this.firstFrameEmitted) {
+      this.firstFrameEmitted = true;
+      this.onPlaybackStarted(Date.now());
+    }
+
     if (!this.streamWriter) {
       this.streamWriter = await this.room.localParticipant!.streamBytes({
         name: shortuuid('AUDIO_'),
@@ -174,6 +180,8 @@
     this.streamWriter.close().finally(() => {
       this.streamWriter = undefined;
     });
+
+    this.firstFrameEmitted = false;
   }

   clearBuffer(): void {
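Note: this is the reference pattern for the new sink contract introduced by EVENT_PLAYBACK_STARTED (defined in the io.ts hunks below): call onPlaybackStarted() when the first frame of a segment is actually captured, and reset on flush() so the next segment reports its own start. A minimal sketch of a conforming custom sink; the class name and frame delivery are assumptions, and the base-class constructor arguments are elided:

    class MyAudioSink extends AudioOutput {
      private firstFrameEmitted = false;

      async captureFrame(frame: AudioFrame): Promise<void> {
        await super.captureFrame(frame);
        if (!this.firstFrameEmitted) {
          this.firstFrameEmitted = true;
          this.onPlaybackStarted(Date.now()); // emits EVENT_PLAYBACK_STARTED
        }
        // ...deliver the frame to the actual output here...
      }

      flush(): void {
        this.firstFrameEmitted = false; // segment boundary: next frame starts a new playback
      }

      clearBuffer(): void {}
    }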
package/src/voice/generation.ts CHANGED
@@ -27,7 +27,7 @@ import { traceTypes, tracer } from '../telemetry/index.js';
 import { Future, Task, shortuuid, toError, waitForAbort } from '../utils.js';
 import { type Agent, type ModelSettings, asyncLocalStorage, isStopResponse } from './agent.js';
 import type { AgentSession } from './agent_session.js';
-import type { AudioOutput, LLMNode, TTSNode, TextOutput } from './io.js';
+import { AudioOutput, type LLMNode, type TTSNode, type TextOutput } from './io.js';
 import { RunContext } from './run_context.js';
 import type { SpeechHandle } from './speech_handle.js';

@@ -608,7 +608,8 @@ export function performTextForwarding(

 export interface _AudioOut {
   audio: Array<AudioFrame>;
-  firstFrameFut: Future;
+  /** Future that will be set with the timestamp of the first frame's capture */
+  firstFrameFut: Future<number>;
 }

 async function forwardAudio(
@@ -620,7 +621,16 @@
   const reader = ttsStream.getReader();
   let resampler: AudioResampler | null = null;

+  const onPlaybackStarted = (ev: { createdAt: number }) => {
+    if (!out.firstFrameFut.done) {
+      out.firstFrameFut.resolve(ev.createdAt);
+    }
+  };
+
   try {
+    audioOuput.on(AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted);
+    audioOuput.resume();
+
     while (true) {
       if (signal?.aborted) {
         break;
@@ -647,20 +657,21 @@
       } else {
         await audioOuput.captureFrame(frame);
       }
-
-      // set the first frame future if not already set
-      // (after completing the first frame)
-      if (!out.firstFrameFut.done) {
-        out.firstFrameFut.resolve();
-      }
     }
-  } finally {
-    reader?.releaseLock();
+
     if (resampler) {
       for (const f of resampler.flush()) {
         await audioOuput.captureFrame(f);
       }
     }
+  } finally {
+    audioOuput.off(AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted);
+
+    if (!out.firstFrameFut.done) {
+      out.firstFrameFut.reject(new Error('audio forwarding cancelled before playback started'));
+    }
+
+    reader?.releaseLock();
     audioOuput.flush();
   }
 }
@@ -670,10 +681,11 @@ export function performAudioForwarding(
   audioOutput: AudioOutput,
   controller: AbortController,
 ): [Task<void>, _AudioOut] {
-  const out = {
+  const out: _AudioOut = {
     audio: [],
-    firstFrameFut: new Future(),
+    firstFrameFut: new Future<number>(),
   };
+
   return [
     Task.from(
       (controller) => forwardAudio(ttsStream, audioOutput, out, controller.signal),
package/src/voice/io.ts CHANGED
@@ -30,12 +30,14 @@ export type TTSNode = (
 ) => Promise<ReadableStream<AudioFrame> | null>;

 /**
- * A string with timing information for word-level alignment.
+ * A string with optional start and end timestamps for word-level alignment.
  */
 export interface TimedString {
   text: string;
   startTime?: number; // seconds
   endTime?: number; // seconds
+  confidence?: number;
+  startTimeOffset?: number;
 }

 export interface AudioOutputCapabilities {
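Note: TimedString gains two optional fields that this diff does not document. The example below is illustrative only; reading confidence as recognition/alignment confidence and startTimeOffset as an offset applied to the start timestamp are assumptions:

    const word: TimedString = {
      text: 'hello',
      startTime: 0.12,    // seconds
      endTime: 0.48,      // seconds
      confidence: 0.93,   // assumed: recognition/alignment confidence
      startTimeOffset: 0, // assumed: offset applied to the start timestamp
    };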
@@ -57,6 +59,7 @@ export abstract class AudioInput {
 }

 export abstract class AudioOutput extends EventEmitter {
+  static readonly EVENT_PLAYBACK_STARTED = 'playbackStarted';
   static readonly EVENT_PLAYBACK_FINISHED = 'playbackFinished';

   private playbackFinishedFuture: Future<void> = new Future();
@@ -77,7 +80,11 @@
   ) {
     super();
     this.capabilities = capabilities;
+
     if (this.nextInChain) {
+      this.nextInChain.on(AudioOutput.EVENT_PLAYBACK_STARTED, (ev: PlaybackStartedEvent) =>
+        this.onPlaybackStarted(ev.createdAt),
+      );
       this.nextInChain.on(AudioOutput.EVENT_PLAYBACK_FINISHED, (ev: PlaybackFinishedEvent) =>
         this.onPlaybackFinished(ev),
       );
@@ -117,6 +124,14 @@
     return this.lastPlaybackEvent;
   }

+  /**
+   * Called when playback actually starts (first frame is sent to output).
+   * Developers building audio sinks should call this when the first frame is captured.
+   */
+  onPlaybackStarted(createdAt: number): void {
+    this.emit(AudioOutput.EVENT_PLAYBACK_STARTED, { createdAt } as PlaybackStartedEvent);
+  }
+
   /**
    * Developers building audio sinks must call this method when a playback/segment is finished.
    * Segments are segmented by calls to flush() or clearBuffer()
@@ -174,15 +189,22 @@
 }

 export interface PlaybackFinishedEvent {
-  // How much of the audio was played back
+  /** How much of the audio was played back, in seconds */
   playbackPosition: number;
-  // Interrupted is True if playback was interrupted (clearBuffer() was called)
+  /** True if playback was interrupted (clearBuffer() was called) */
   interrupted: boolean;
-  // Transcript synced with playback; may be partial if the audio was interrupted
-  // When null, the transcript is not synchronized with the playback
+  /**
+   * Transcript synced with playback; may be partial if the audio was interrupted.
+   * When undefined, the transcript is not synchronized with the playback.
+   */
   synchronizedTranscript?: string;
 }

+export interface PlaybackStartedEvent {
+  /** The timestamp (Date.now()) when the playback started */
+  createdAt: number;
+}
+
 export abstract class TextOutput {
   constructor(protected readonly nextInChain?: TextOutput) {}

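Note: a consumer-side sketch of the new event, using the event name and payload shape defined above (the audioOutput instance is assumed):

    audioOutput.on(AudioOutput.EVENT_PLAYBACK_STARTED, (ev: PlaybackStartedEvent) => {
      console.log(`playback started at ${new Date(ev.createdAt).toISOString()}`);
    });

Because the constructor now wires nextInChain's EVENT_PLAYBACK_STARTED to onPlaybackStarted(), the timestamp propagates up a chain of outputs the same way EVENT_PLAYBACK_FINISHED already did.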