@livekit/agents 1.0.40 → 1.0.42
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs +20 -18
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +20 -18
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +5 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -1
- package/dist/inference/stt.cjs +2 -1
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +2 -1
- package/dist/inference/stt.js.map +1 -1
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +5 -1
- package/dist/llm/realtime.d.ts +5 -1
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js.map +1 -1
- package/dist/tts/stream_adapter.cjs +15 -1
- package/dist/tts/stream_adapter.cjs.map +1 -1
- package/dist/tts/stream_adapter.d.ts.map +1 -1
- package/dist/tts/stream_adapter.js +15 -1
- package/dist/tts/stream_adapter.js.map +1 -1
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.cts +9 -1
- package/dist/tts/tts.d.ts +9 -1
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js.map +1 -1
- package/dist/types.cjs +3 -0
- package/dist/types.cjs.map +1 -1
- package/dist/types.d.cts +4 -0
- package/dist/types.d.ts +4 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -1
- package/dist/voice/agent.cjs +11 -1
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +7 -3
- package/dist/voice/agent.d.ts +7 -3
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +11 -1
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent_activity.cjs +30 -14
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +1 -0
- package/dist/voice/agent_activity.d.ts +1 -0
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +30 -14
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +5 -1
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +2 -0
- package/dist/voice/agent_session.d.ts +2 -0
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +5 -1
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +1 -1
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +1 -1
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/background_audio.cjs +2 -1
- package/dist/voice/background_audio.cjs.map +1 -1
- package/dist/voice/background_audio.d.cts +4 -2
- package/dist/voice/background_audio.d.ts +4 -2
- package/dist/voice/background_audio.d.ts.map +1 -1
- package/dist/voice/background_audio.js +2 -1
- package/dist/voice/background_audio.js.map +1 -1
- package/dist/voice/generation.cjs +58 -5
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.cts +17 -3
- package/dist/voice/generation.d.ts +17 -3
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +63 -6
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -1
- package/dist/voice/index.d.ts +1 -1
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/io.cjs +22 -2
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +21 -5
- package/dist/voice/io.d.ts +21 -5
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +18 -1
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/room_io/_output.cjs +3 -2
- package/dist/voice/room_io/_output.cjs.map +1 -1
- package/dist/voice/room_io/_output.d.cts +3 -3
- package/dist/voice/room_io/_output.d.ts +3 -3
- package/dist/voice/room_io/_output.d.ts.map +1 -1
- package/dist/voice/room_io/_output.js +4 -3
- package/dist/voice/room_io/_output.js.map +1 -1
- package/dist/voice/transcription/synchronizer.cjs +137 -13
- package/dist/voice/transcription/synchronizer.cjs.map +1 -1
- package/dist/voice/transcription/synchronizer.d.cts +34 -4
- package/dist/voice/transcription/synchronizer.d.ts +34 -4
- package/dist/voice/transcription/synchronizer.d.ts.map +1 -1
- package/dist/voice/transcription/synchronizer.js +141 -14
- package/dist/voice/transcription/synchronizer.js.map +1 -1
- package/dist/voice/transcription/synchronizer.test.cjs +151 -0
- package/dist/voice/transcription/synchronizer.test.cjs.map +1 -0
- package/dist/voice/transcription/synchronizer.test.js +150 -0
- package/dist/voice/transcription/synchronizer.test.js.map +1 -0
- package/package.json +1 -1
- package/src/cli.ts +20 -18
- package/src/index.ts +1 -0
- package/src/inference/stt.ts +9 -8
- package/src/llm/realtime.ts +5 -1
- package/src/tts/stream_adapter.ts +23 -1
- package/src/tts/tts.ts +10 -1
- package/src/types.ts +5 -0
- package/src/voice/agent.ts +19 -4
- package/src/voice/agent_activity.ts +38 -13
- package/src/voice/agent_session.ts +6 -0
- package/src/voice/audio_recognition.ts +2 -1
- package/src/voice/background_audio.ts +6 -3
- package/src/voice/generation.ts +115 -10
- package/src/voice/index.ts +1 -1
- package/src/voice/io.ts +40 -5
- package/src/voice/room_io/_output.ts +6 -5
- package/src/voice/transcription/synchronizer.test.ts +206 -0
- package/src/voice/transcription/synchronizer.ts +202 -17
package/src/voice/agent.ts
CHANGED

@@ -26,9 +26,11 @@ import { StreamAdapter as STTStreamAdapter } from '../stt/index.js';
 import { SentenceTokenizer as BasicSentenceTokenizer } from '../tokenize/basic/index.js';
 import type { TTS } from '../tts/index.js';
 import { SynthesizeStream, StreamAdapter as TTSStreamAdapter } from '../tts/index.js';
+import { USERDATA_TIMED_TRANSCRIPT } from '../types.js';
 import type { VAD } from '../vad.js';
 import type { AgentActivity } from './agent_activity.js';
 import type { AgentSession, TurnDetectionMode } from './agent_session.js';
+import type { TimedString } from './io.js';
 
 export const asyncLocalStorage = new AsyncLocalStorage<{ functionCall?: FunctionCall }>();
 export const STOP_RESPONSE_SYMBOL = Symbol('StopResponse');
@@ -70,6 +72,7 @@ export interface AgentOptions<UserData> {
   tts?: TTS | TTSModelString;
   allowInterruptions?: boolean;
   minConsecutiveSpeechDelay?: number;
+  useTtsAlignedTranscript?: boolean;
 }
 
 export class Agent<UserData = any> {
@@ -79,6 +82,7 @@ export class Agent<UserData = any> {
   private _vad?: VAD;
   private _llm?: LLM | RealtimeModel;
   private _tts?: TTS;
+  private _useTtsAlignedTranscript?: boolean;
 
   /** @internal */
   _agentActivity?: AgentActivity;
@@ -102,6 +106,7 @@ export class Agent<UserData = any> {
     vad,
     llm,
     tts,
+    useTtsAlignedTranscript,
   }: AgentOptions<UserData>) {
     if (id) {
       this._id = id;
@@ -147,6 +152,8 @@
       this._tts = tts;
     }
 
+    this._useTtsAlignedTranscript = useTtsAlignedTranscript;
+
     this._agentActivity = undefined;
   }
 
@@ -166,6 +173,10 @@
     return this._tts;
   }
 
+  get useTtsAlignedTranscript(): boolean | undefined {
+    return this._useTtsAlignedTranscript;
+  }
+
   get chatCtx(): ReadonlyChatContext {
     return new ReadonlyChatContext(this._chatCtx.items);
   }
@@ -191,9 +202,9 @@
   async onExit(): Promise<void> {}
 
   async transcriptionNode(
-    text: ReadableStream<string>,
+    text: ReadableStream<string | TimedString>,
     modelSettings: ModelSettings,
-  ): Promise<ReadableStream<string> | null> {
+  ): Promise<ReadableStream<string | TimedString> | null> {
     return Agent.default.transcriptionNode(this, text, modelSettings);
   }
 
@@ -395,6 +406,10 @@
           if (chunk === SynthesizeStream.END_OF_STREAM) {
            break;
          }
+          // Attach timed transcripts to frame.userdata
+          if (chunk.timedTranscripts && chunk.timedTranscripts.length > 0) {
+            chunk.frame.userdata[USERDATA_TIMED_TRANSCRIPT] = chunk.timedTranscripts;
+          }
           controller.enqueue(chunk.frame);
         }
         controller.close();
@@ -410,9 +425,9 @@
 
   async transcriptionNode(
     agent: Agent,
-    text: ReadableStream<string>,
+    text: ReadableStream<string | TimedString>,
     _modelSettings: ModelSettings,
-  ): Promise<ReadableStream<string> | null> {
+  ): Promise<ReadableStream<string | TimedString> | null> {
     return text;
   },
 
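The new `useTtsAlignedTranscript` option pairs with the widened `transcriptionNode` signature above. A minimal sketch (not part of this diff) of a subclass consuming the `string | TimedString` stream; the import path and the public availability of `isTimedString` are assumptions:

```ts
// Sketch only: log word-level timings flowing through transcriptionNode.
// Import surface assumed, not confirmed by the diff.
import { Agent, isTimedString, type ModelSettings, type TimedString } from '@livekit/agents';

class TimingAwareAgent extends Agent {
  async transcriptionNode(
    text: ReadableStream<string | TimedString>,
    _modelSettings: ModelSettings,
  ): Promise<ReadableStream<string | TimedString> | null> {
    // Pass chunks through unchanged; log timings when the TTS aligned them.
    return text.pipeThrough(
      new TransformStream<string | TimedString, string | TimedString>({
        transform(chunk, controller) {
          if (isTimedString(chunk)) {
            console.log(`"${chunk.text}" [${chunk.startTime ?? '?'}s - ${chunk.endTime ?? '?'}s]`);
          }
          controller.enqueue(chunk);
        },
      }),
    );
  }
}
```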
package/src/voice/agent_activity.ts
CHANGED

@@ -60,7 +60,7 @@ import {
   createSpeechCreatedEvent,
   createUserInputTranscribedEvent,
 } from './events.js';
-import type { ToolExecutionOutput } from './generation.js';
+import type { ToolExecutionOutput, _TTSGenerationData } from './generation.js';
 import {
   type _AudioOut,
   type _TextOut,
@@ -72,6 +72,7 @@
   removeInstructions,
   updateInstructions,
 } from './generation.js';
+import type { TimedString } from './io.js';
 import { SpeechHandle } from './speech_handle.js';
 
 const speechHandleStorage = new AsyncLocalStorage<SpeechHandle>();
@@ -359,6 +360,11 @@ export class AgentActivity implements RecognitionHooks {
     return this.agentSession.options.allowInterruptions;
   }
 
+  get useTtsAlignedTranscript(): boolean {
+    // Agent setting takes precedence over session setting
+    return this.agent.useTtsAlignedTranscript ?? this.agentSession.useTtsAlignedTranscript;
+  }
+
   get turnDetection(): TurnDetectionMode | undefined {
     // TODO(brian): prioritize using agent.turn_detection
     return this.agentSession.turnDetection;
@@ -1258,7 +1264,7 @@
     let audioOut: _AudioOut | null = null;
     if (!audio) {
       // generate audio using TTS
-      const [ttsTask,
+      const [ttsTask, ttsGenData] = performTTSInference(
         (...args) => this.agent.ttsNode(...args),
         audioSource,
         modelSettings,
@@ -1267,7 +1273,7 @@
       tasks.push(ttsTask);
 
       const [forwardTask, _audioOut] = performAudioForwarding(
-
+        ttsGenData.audioStream,
         audioOutput,
         replyAbortController,
       );
@@ -1389,14 +1395,14 @@
     tasks.push(llmTask);
 
     let ttsTask: Task<void> | null = null;
-    let
+    let ttsGenData: _TTSGenerationData | null = null;
     let llmOutput: ReadableStream<string>;
 
     if (audioOutput) {
       // Only tee the stream when we need TTS
       const [ttsTextInput, textOutput] = llmGenData.textStream.tee();
       llmOutput = textOutput;
-      [ttsTask,
+      [ttsTask, ttsGenData] = performTTSInference(
         (...args) => this.agent.ttsNode(...args),
         ttsTextInput,
         modelSettings,
@@ -1428,7 +1434,26 @@
     speechHandle._clearAuthorization();
 
     const replyStartedAt = Date.now();
-
+
+    // Determine the transcription input source
+    let transcriptionInput: ReadableStream<string | TimedString> = llmOutput;
+
+    // Check if we should use TTS aligned transcripts
+    if (this.useTtsAlignedTranscript && this.tts?.capabilities.alignedTranscript && ttsGenData) {
+      // Race timedTextsFut with ttsTask to avoid hanging if TTS fails before resolving the future
+      const timedTextsStream = await Promise.race([
+        ttsGenData.timedTextsFut.await,
+        ttsTask?.result.catch(() =>
+          this.logger.warn('TTS task failed before resolving timedTextsFut'),
+        ) ?? Promise.resolve(),
+      ]);
+      if (timedTextsStream) {
+        this.logger.debug('Using TTS aligned transcripts for transcription node input');
+        transcriptionInput = timedTextsStream;
+      }
+    }
+
+    const trNodeResult = await this.agent.transcriptionNode(transcriptionInput, modelSettings);
     let textOut: _TextOut | null = null;
     if (trNodeResult) {
       const [textForwardTask, _textOut] = performTextForwarding(
@@ -1449,9 +1474,9 @@
 
     let audioOut: _AudioOut | null = null;
     if (audioOutput) {
-      if (
+      if (ttsGenData) {
         const [forwardTask, _audioOut] = performAudioForwarding(
-
+          ttsGenData.audioStream,
           audioOutput,
           replyAbortController,
         );
@@ -1461,7 +1486,7 @@
           .then((ts) => onFirstFrame(ts))
           .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
       } else {
-        throw Error('
+        throw Error('ttsGenData is null when audioOutput is enabled');
       }
     } else {
       textOut?.firstTextFut.await
@@ -1851,8 +1876,8 @@
     }
 
     const msgModalities = msg.modalities ? await msg.modalities : undefined;
-    let ttsTextInput: ReadableStream<string> | null = null;
-    let trTextInput: ReadableStream<string>;
+    let ttsTextInput: ReadableStream<string | TimedString> | null = null;
+    let trTextInput: ReadableStream<string | TimedString>;
 
     if (msgModalities && !msgModalities.includes('audio') && this.tts) {
       if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
@@ -1884,14 +1909,14 @@
     let realtimeAudioResult: ReadableStream<AudioFrame> | null = null;
 
     if (ttsTextInput) {
-      const [ttsTask,
+      const [ttsTask, ttsGenData] = performTTSInference(
         (...args) => this.agent.ttsNode(...args),
         ttsTextInput,
         modelSettings,
         abortController,
       );
       tasks.push(ttsTask);
-      realtimeAudioResult =
+      realtimeAudioResult = ttsGenData.audioStream;
     } else if (msgModalities && msgModalities.includes('audio')) {
       realtimeAudioResult = await this.agent.realtimeAudioOutputNode(
         msg.audioStream,
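The `Promise.race` guard in the hunk above is worth isolating: if the TTS task rejects before it resolves `timedTextsFut`, awaiting the future alone would hang forever. A standalone sketch of the same pattern (the helper name is illustrative, not from the package):

```ts
// Wait for `future`, but settle with undefined if the producing task finishes
// (or fails) first, mirroring the race in the hunk above.
async function awaitUnlessProducerSettles<T>(
  future: Promise<T>,
  producer?: Promise<unknown>,
): Promise<T | undefined> {
  const guard: Promise<undefined> = (producer ?? Promise.resolve()).then(
    () => undefined,
    () => undefined, // a rejected producer resolves the race instead of hanging it
  );
  return Promise.race([future, guard]);
}
```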
package/src/voice/agent_session.ts
CHANGED

@@ -73,6 +73,7 @@ export interface VoiceOptions {
   maxToolSteps: number;
   preemptiveGeneration: boolean;
   userAwayTimeout?: number | null;
+  useTtsAlignedTranscript: boolean;
 }
 
 const defaultVoiceOptions: VoiceOptions = {
@@ -85,6 +86,7 @@ const defaultVoiceOptions: VoiceOptions = {
   maxToolSteps: 3,
   preemptiveGeneration: false,
   userAwayTimeout: 15.0,
+  useTtsAlignedTranscript: true,
 } as const;
 
 export type TurnDetectionMode = 'stt' | 'vad' | 'realtime_llm' | 'manual' | _TurnDetector;
@@ -264,6 +266,10 @@ export class AgentSession<
     return this._connOptions;
   }
 
+  get useTtsAlignedTranscript(): boolean {
+    return this.options.useTtsAlignedTranscript;
+  }
+
   set userData(value: UserData) {
     this._userData = value;
   }
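Taken together with the `AgentActivity` getter earlier in this diff, the option resolves agent-first, then session, with a session default of `true`. A usage sketch (constructor shapes assumed from `AgentOptions` and `VoiceOptions`; other required options omitted):

```ts
// Disable aligned transcripts session-wide, re-enable for a single agent.
const session = new AgentSession({
  useTtsAlignedTranscript: false, // session default would otherwise be true
});

const agent = new Agent({
  instructions: 'You are a helpful voice assistant.',
  useTtsAlignedTranscript: true, // agent setting wins via `??` in AgentActivity
});
```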
package/src/voice/audio_recognition.ts
CHANGED

@@ -161,7 +161,6 @@ export class AudioRecognition {
 
     switch (ev.type) {
       case SpeechEventType.FINAL_TRANSCRIPT:
-        this.hooks.onFinalTranscript(ev);
         const transcript = ev.alternatives?.[0]?.text;
         const confidence = ev.alternatives?.[0]?.confidence ?? 0;
         this.lastLanguage = ev.alternatives?.[0]?.language;
@@ -171,6 +170,8 @@
           return;
         }
 
+        this.hooks.onFinalTranscript(ev);
+
         this.logger.debug(
           {
             user_transcript: transcript,
package/src/voice/background_audio.ts
CHANGED

@@ -63,8 +63,10 @@ export interface BackgroundAudioPlayerOptions {
   thinkingSound?: AudioSourceType | AudioConfig | AudioConfig[];
 
   /**
-   * Stream timeout in milliseconds
-   *
+   * Stream timeout in milliseconds for the audio mixer.
+   * Controls how long the mixer waits for a stream to produce data before timing out.
+   * Higher values are more tolerant of network latency and processing delays.
+   * @defaultValue 2000
    */
   streamTimeoutMs?: number;
 }
@@ -78,6 +80,7 @@ export interface BackgroundAudioStartOptions {
 // Queue size for AudioSource buffer (400ms)
 // Kept small to avoid abrupt cutoffs when removing sounds
 const AUDIO_SOURCE_BUFFER_MS = 400;
+const STREAM_TIMEOUT_MS = 2000;
 
 export class PlayHandle {
   private doneFuture = new Future<void>();
@@ -155,7 +158,7 @@ export class BackgroundAudioPlayer {
   #logger = log();
 
   constructor(options?: BackgroundAudioPlayerOptions) {
-    const { ambientSound, thinkingSound, streamTimeoutMs =
+    const { ambientSound, thinkingSound, streamTimeoutMs = STREAM_TIMEOUT_MS } = options || {};
 
     this.ambientSound = ambientSound;
     this.thinkingSound = thinkingSound;
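The constructor now falls back to the named `STREAM_TIMEOUT_MS` constant instead of an inline default. A usage sketch based on the option shape shown above:

```ts
// Raise the mixer's stream timeout above the 2000 ms default for
// high-latency audio sources.
const player = new BackgroundAudioPlayer({
  streamTimeoutMs: 5000,
});
```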
package/src/voice/generation.ts
CHANGED

@@ -24,10 +24,19 @@ import { isZodSchema, parseZodSchema } from '../llm/zod-utils.js';
 import { log } from '../log.js';
 import { IdentityTransform } from '../stream/identity_transform.js';
 import { traceTypes, tracer } from '../telemetry/index.js';
+import { USERDATA_TIMED_TRANSCRIPT } from '../types.js';
 import { Future, Task, shortuuid, toError, waitForAbort } from '../utils.js';
 import { type Agent, type ModelSettings, asyncLocalStorage, isStopResponse } from './agent.js';
 import type { AgentSession } from './agent_session.js';
-import {
+import {
+  AudioOutput,
+  type LLMNode,
+  type TTSNode,
+  type TextOutput,
+  type TimedString,
+  createTimedString,
+  isTimedString,
+} from './io.js';
 import { RunContext } from './run_context.js';
 import type { SpeechHandle } from './speech_handle.js';
 
@@ -46,6 +55,21 @@ export class _LLMGenerationData {
   }
 }
 
+/**
+ * TTS generation data containing audio stream and optional timed transcripts.
+ * @internal
+ */
+export interface _TTSGenerationData {
+  /** Audio frame stream from TTS */
+  audioStream: ReadableStream<AudioFrame>;
+  /**
+   * Future that resolves to a stream of timed transcripts, or null if TTS doesn't support it.
+   */
+  timedTextsFut: Future<ReadableStream<TimedString> | null>;
+  /** Time to first byte (set when first audio frame is received) */
+  ttfb?: number;
+}
+
 // TODO(brian): remove this class in favor of ToolOutput
 export class _ToolOutput {
   output: _JsOutput[];
@@ -494,35 +518,105 @@ export function performLLMInference(
 
 export function performTTSInference(
   node: TTSNode,
-  text: ReadableStream<string>,
+  text: ReadableStream<string | TimedString>,
   modelSettings: ModelSettings,
   controller: AbortController,
-): [Task<void>,
+): [Task<void>, _TTSGenerationData] {
   const audioStream = new IdentityTransform<AudioFrame>();
   const outputWriter = audioStream.writable.getWriter();
   const audioOutputStream = audioStream.readable;
 
+  const timedTextsFut = new Future<ReadableStream<TimedString> | null>();
+  const timedTextsStream = new IdentityTransform<TimedString>();
+  const timedTextsWriter = timedTextsStream.writable.getWriter();
+
+  // Transform stream to extract text from TimedString objects
+  const textOnlyStream = new IdentityTransform<string>();
+  const textOnlyWriter = textOnlyStream.writable.getWriter();
+  (async () => {
+    const reader = text.getReader();
+    try {
+      while (true) {
+        const { done, value } = await reader.read();
+        if (done) {
+          break;
+        }
+        const textValue = typeof value === 'string' ? value : value.text;
+        await textOnlyWriter.write(textValue);
+      }
+      await textOnlyWriter.close();
+    } catch (e) {
+      await textOnlyWriter.abort(e as Error);
+    } finally {
+      reader.releaseLock();
+    }
+  })();
+
   const _performTTSInferenceImpl = async (signal: AbortSignal) => {
     let ttsStreamReader: ReadableStreamDefaultReader<AudioFrame> | null = null;
     let ttsStream: ReadableStream<AudioFrame> | null = null;
+    let pushedDuration = 0;
 
     try {
-      ttsStream = await node(
+      ttsStream = await node(textOnlyStream.readable, modelSettings);
       if (ttsStream === null) {
+        timedTextsFut.resolve(null);
        await outputWriter.close();
+        await timedTextsWriter.close();
        return;
      }
 
+      // This is critical: the future must be resolved with the channel/stream before the loop
+      // so that agent_activity can start reading while we write
+      if (!timedTextsFut.done) {
+        timedTextsFut.resolve(timedTextsStream.readable);
+      }
+
      ttsStreamReader = ttsStream.getReader();
+
+      // In Python, perform_tts_inference has a while loop processing multiple input segments
+      // (separated by FlushSentinel), with pushed_duration accumulating across segments.
+      // JS currently only does single inference, so initialPushedDuration is always 0.
+      // TODO: Add FlushSentinel + multi-segment loop
+      const initialPushedDuration = pushedDuration;
+
      while (true) {
        if (signal.aborted) {
          break;
        }
-        const { done, value:
+        const { done, value: frame } = await ttsStreamReader.read();
        if (done) {
          break;
        }
-
+
+        // Write the audio frame to the output stream
+        await outputWriter.write(frame);
+
+        const timedTranscripts = frame.userdata[USERDATA_TIMED_TRANSCRIPT] as
+          | TimedString[]
+          | undefined;
+        if (timedTranscripts && timedTranscripts.length > 0) {
+          for (const timedText of timedTranscripts) {
+            // Uses the INITIAL value (from previous inferences), not the accumulated value
+            const adjustedTimedText = createTimedString({
+              text: timedText.text,
+              startTime:
+                timedText.startTime !== undefined
+                  ? timedText.startTime + initialPushedDuration
+                  : undefined,
+              endTime:
+                timedText.endTime !== undefined
+                  ? timedText.endTime + initialPushedDuration
+                  : undefined,
+              confidence: timedText.confidence,
+              startTimeOffset: timedText.startTimeOffset,
+            });
+            await timedTextsWriter.write(adjustedTimedText);
+          }
+        }
+
+        const frameDuration = frame.samplesPerChannel / frame.sampleRate;
+        pushedDuration += frameDuration;
      }
    } catch (error) {
      if (error instanceof DOMException && error.name === 'AbortError') {
@@ -534,6 +628,7 @@ export function performTTSInference(
       ttsStreamReader?.releaseLock();
       await ttsStream?.cancel();
       await outputWriter.close();
+      await timedTextsWriter.close();
     }
   };
 
@@ -546,9 +641,14 @@
     context: currentContext,
   });
 
+  const genData: _TTSGenerationData = {
+    audioStream: audioOutputStream,
+    timedTextsFut,
+  };
+
   return [
     Task.from((controller) => inferenceTask(controller.signal), controller, 'performTTSInference'),
-
+    genData,
   ];
 }
 
@@ -558,7 +658,7 @@ export interface _TextOut {
 }
 
 async function forwardText(
-  source: ReadableStream<string>,
+  source: ReadableStream<string | TimedString>,
   out: _TextOut,
   signal: AbortSignal,
   textOutput: TextOutput | null,
@@ -571,8 +671,13 @@ async function forwardText(
     }
     const { done, value: delta } = await reader.read();
     if (done) break;
-
+
+    const deltaIsTimedString = isTimedString(delta);
+    const textDelta = deltaIsTimedString ? delta.text : delta;
+
+    out.text += textDelta;
     if (textOutput !== null) {
+      // Pass TimedString to textOutput for synchronized transcription
       await textOutput.captureText(delta);
     }
     if (!out.firstTextFut.done) {
@@ -588,7 +693,7 @@
 }
 
 export function performTextForwarding(
-  source: ReadableStream<string>,
+  source: ReadableStream<string | TimedString>,
   controller: AbortController,
   textOutput: TextOutput | null,
 ): [Task<void>, _TextOut] {
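The per-frame bookkeeping in `performTTSInference` is simple arithmetic: each frame contributes `samplesPerChannel / sampleRate` seconds, and aligned timestamps are shifted by the duration pushed before the current segment. As the in-code comment notes, the JS port currently runs a single segment, so `initialPushedDuration` is always 0 today; the worked numbers below (illustrative values, not from the package) show the intended effect once multi-segment inference lands:

```ts
const sampleRate = 24_000; // Hz
const samplesPerChannel = 480; // one 20 ms frame
const frameDuration = samplesPerChannel / sampleRate; // 0.02 s

// After 50 such frames, one second of audio has been pushed:
const pushedDuration = 50 * frameDuration; // 1.0 s

// A word aligned at 0.10–0.35 s inside the next segment would surface at:
const startTime = 0.1 + pushedDuration; // 1.10 s
const endTime = 0.35 + pushedDuration; // 1.35 s
console.log({ frameDuration, startTime, endTime });
```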
package/src/voice/index.ts
CHANGED

@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 export { Agent, StopResponse, type AgentOptions, type ModelSettings } from './agent.js';
-export { AgentSession, type AgentSessionOptions } from './agent_session.js';
+export { AgentSession, type AgentSessionOptions, type VoiceOptions } from './agent_session.js';
 export * from './avatar/index.js';
 export * from './background_audio.js';
 export * from './events.js';
package/src/voice/io.ts
CHANGED

@@ -30,9 +30,15 @@ export type TTSNode = (
 ) => Promise<ReadableStream<AudioFrame> | null>;
 
 /**
- *
+ * Symbol used to identify TimedString objects.
+ */
+export const TIMED_STRING_SYMBOL = Symbol.for('lk.TimedString');
+
+/**
+ * A string with optional start and end timestamps for word-level alignment.
  */
 export interface TimedString {
+  readonly [TIMED_STRING_SYMBOL]: true;
   text: string;
   startTime?: number; // seconds
   endTime?: number; // seconds
@@ -40,6 +46,38 @@
   startTimeOffset?: number;
 }
 
+/**
+ * Factory function to create a TimedString object.
+ */
+export function createTimedString(opts: {
+  text: string;
+  startTime?: number;
+  endTime?: number;
+  confidence?: number;
+  startTimeOffset?: number;
+}): TimedString {
+  return {
+    [TIMED_STRING_SYMBOL]: true,
+    text: opts.text,
+    startTime: opts.startTime,
+    endTime: opts.endTime,
+    confidence: opts.confidence,
+    startTimeOffset: opts.startTimeOffset,
+  };
+}
+
+/**
+ * Type guard to check if a value is a TimedString.
+ */
+export function isTimedString(value: unknown): value is TimedString {
+  return (
+    typeof value === 'object' &&
+    value !== null &&
+    TIMED_STRING_SYMBOL in value &&
+    (value as TimedString)[TIMED_STRING_SYMBOL] === true
+  );
+}
+
 export interface AudioOutputCapabilities {
   /** Whether this output supports pause/resume functionality */
   pause: boolean;
@@ -208,10 +246,7 @@ export interface PlaybackStartedEvent {
 export abstract class TextOutput {
   constructor(protected readonly nextInChain?: TextOutput) {}
 
-
-   * Capture a text segment (Used by the output of LLM nodes)
-   */
-  abstract captureText(text: string): Promise<void>;
+  abstract captureText(text: string | TimedString): Promise<void>;
 
   /**
    * Mark the current text segment as complete (e.g LLM generation is complete)
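A short sketch of the new helpers in use (relative import path assumed):

```ts
import { createTimedString, isTimedString, type TimedString } from './io.js';

const word: TimedString = createTimedString({
  text: 'hello',
  startTime: 0.12,
  endTime: 0.48,
});

// The symbol brand keeps the guard strict: plain objects with a `text` field
// are not mistaken for TimedStrings.
function plainText(chunk: string | TimedString): string {
  return isTimedString(chunk) ? chunk.text : chunk;
}

plainText(word); // 'hello'
plainText('world'); // 'world'
```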
package/src/voice/room_io/_output.ts
CHANGED

@@ -23,7 +23,7 @@ import {
 } from '../../constants.js';
 import { log } from '../../log.js';
 import { Future, Task, shortuuid } from '../../utils.js';
-import { AudioOutput, TextOutput } from '../io.js';
+import { AudioOutput, TextOutput, type TimedString, isTimedString } from '../io.js';
 import { findMicrophoneTrackId } from '../transcription/index.js';
 
 abstract class BaseParticipantTranscriptionOutput extends TextOutput {
@@ -102,13 +102,14 @@ abstract class BaseParticipantTranscriptionOutput extends TextOutput {
     this.latestText = '';
   }
 
-  async captureText(text: string) {
+  async captureText(text: string | TimedString) {
     if (!this.participantIdentity) {
       return;
     }
 
-
-
+    const textStr = isTimedString(text) ? text.text : text;
+    this.latestText = textStr;
+    await this.handleCaptureText(textStr);
   }
 
   flush() {
@@ -298,7 +299,7 @@ export class ParalellTextOutput extends TextOutput {
     this._sinks = sinks;
   }
 
-  async captureText(text: string) {
+  async captureText(text: string | TimedString) {
     await Promise.all(this._sinks.map((sink) => sink.captureText(text)));
   }
 
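The widened `captureText(text: string | TimedString)` contract applies to every `TextOutput` sink. A sketch of a custom sink against that contract (only `captureText` is shown; any other abstract members of `TextOutput` are omitted here and would need implementing):

```ts
import { TextOutput, isTimedString, type TimedString } from '../io.js';

class ConsoleTextOutput extends TextOutput {
  async captureText(text: string | TimedString): Promise<void> {
    // Normalize to plain text the same way BaseParticipantTranscriptionOutput does.
    const value = isTimedString(text) ? text.text : text;
    console.log(value);
  }
}
```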