@livekit/agents 1.0.40 → 1.0.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs +20 -18
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +20 -18
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +5 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -1
- package/dist/inference/stt.cjs +2 -1
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +2 -1
- package/dist/inference/stt.js.map +1 -1
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +5 -1
- package/dist/llm/realtime.d.ts +5 -1
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js.map +1 -1
- package/dist/tts/stream_adapter.cjs +15 -1
- package/dist/tts/stream_adapter.cjs.map +1 -1
- package/dist/tts/stream_adapter.d.ts.map +1 -1
- package/dist/tts/stream_adapter.js +15 -1
- package/dist/tts/stream_adapter.js.map +1 -1
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.cts +9 -1
- package/dist/tts/tts.d.ts +9 -1
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js.map +1 -1
- package/dist/types.cjs +3 -0
- package/dist/types.cjs.map +1 -1
- package/dist/types.d.cts +4 -0
- package/dist/types.d.ts +4 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -1
- package/dist/voice/agent.cjs +11 -1
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +7 -3
- package/dist/voice/agent.d.ts +7 -3
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +11 -1
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent_activity.cjs +30 -14
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +1 -0
- package/dist/voice/agent_activity.d.ts +1 -0
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +30 -14
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +5 -1
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +2 -0
- package/dist/voice/agent_session.d.ts +2 -0
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +5 -1
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/background_audio.cjs +2 -1
- package/dist/voice/background_audio.cjs.map +1 -1
- package/dist/voice/background_audio.d.cts +4 -2
- package/dist/voice/background_audio.d.ts +4 -2
- package/dist/voice/background_audio.d.ts.map +1 -1
- package/dist/voice/background_audio.js +2 -1
- package/dist/voice/background_audio.js.map +1 -1
- package/dist/voice/generation.cjs +58 -5
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.cts +17 -3
- package/dist/voice/generation.d.ts +17 -3
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +63 -6
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -1
- package/dist/voice/index.d.ts +1 -1
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/io.cjs +22 -2
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +21 -5
- package/dist/voice/io.d.ts +21 -5
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +18 -1
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/room_io/_output.cjs +3 -2
- package/dist/voice/room_io/_output.cjs.map +1 -1
- package/dist/voice/room_io/_output.d.cts +3 -3
- package/dist/voice/room_io/_output.d.ts +3 -3
- package/dist/voice/room_io/_output.d.ts.map +1 -1
- package/dist/voice/room_io/_output.js +4 -3
- package/dist/voice/room_io/_output.js.map +1 -1
- package/dist/voice/transcription/synchronizer.cjs +137 -13
- package/dist/voice/transcription/synchronizer.cjs.map +1 -1
- package/dist/voice/transcription/synchronizer.d.cts +34 -4
- package/dist/voice/transcription/synchronizer.d.ts +34 -4
- package/dist/voice/transcription/synchronizer.d.ts.map +1 -1
- package/dist/voice/transcription/synchronizer.js +141 -14
- package/dist/voice/transcription/synchronizer.js.map +1 -1
- package/dist/voice/transcription/synchronizer.test.cjs +151 -0
- package/dist/voice/transcription/synchronizer.test.cjs.map +1 -0
- package/dist/voice/transcription/synchronizer.test.js +150 -0
- package/dist/voice/transcription/synchronizer.test.js.map +1 -0
- package/package.json +1 -1
- package/src/cli.ts +20 -18
- package/src/index.ts +1 -0
- package/src/inference/stt.ts +9 -8
- package/src/llm/realtime.ts +5 -1
- package/src/tts/stream_adapter.ts +23 -1
- package/src/tts/tts.ts +10 -1
- package/src/types.ts +5 -0
- package/src/voice/agent.ts +19 -4
- package/src/voice/agent_activity.ts +38 -13
- package/src/voice/agent_session.ts +6 -0
- package/src/voice/background_audio.ts +6 -3
- package/src/voice/generation.ts +115 -10
- package/src/voice/index.ts +1 -1
- package/src/voice/io.ts +40 -5
- package/src/voice/room_io/_output.ts +6 -5
- package/src/voice/transcription/synchronizer.test.ts +206 -0
- package/src/voice/transcription/synchronizer.ts +202 -17
package/src/voice/transcription/synchronizer.test.ts
@@ -0,0 +1,206 @@
+// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import { describe, expect, it } from 'vitest';
+import { SpeakingRateData } from './synchronizer.js';
+
+describe('SpeakingRateData', () => {
+  describe('constructor', () => {
+    it('should initialize with empty arrays', () => {
+      const data = new SpeakingRateData();
+      expect(data.timestamps).toEqual([]);
+      expect(data.speakingRate).toEqual([]);
+      expect(data.speakIntegrals).toEqual([]);
+      expect(data.pushedDuration).toBe(0);
+    });
+  });
+
+  describe('addByRate', () => {
+    it('should add a single rate entry', () => {
+      const data = new SpeakingRateData();
+      data.addByRate(1.0, 5.0);
+
+      expect(data.timestamps).toEqual([1.0]);
+      expect(data.speakingRate).toEqual([5.0]);
+      // integral = 0 + 5.0 * (1.0 - 0) = 5.0
+      expect(data.speakIntegrals).toEqual([5.0]);
+      expect(data.pushedDuration).toBe(1.0);
+    });
+
+    it('should accumulate integrals across multiple entries', () => {
+      const data = new SpeakingRateData();
+      data.addByRate(1.0, 4.0); // integral = 0 + 4.0 * 1.0 = 4.0
+      data.addByRate(2.0, 6.0); // integral = 4.0 + 6.0 * 1.0 = 10.0
+      data.addByRate(3.5, 2.0); // integral = 10.0 + 2.0 * 1.5 = 13.0
+
+      expect(data.timestamps).toEqual([1.0, 2.0, 3.5]);
+      expect(data.speakingRate).toEqual([4.0, 6.0, 2.0]);
+      expect(data.speakIntegrals).toEqual([4.0, 10.0, 13.0]);
+      expect(data.pushedDuration).toBe(3.5);
+    });
+
+    it('should handle zero rate', () => {
+      const data = new SpeakingRateData();
+      data.addByRate(1.0, 0.0);
+
+      expect(data.timestamps).toEqual([1.0]);
+      expect(data.speakingRate).toEqual([0.0]);
+      expect(data.speakIntegrals).toEqual([0.0]);
+    });
+  });
+
+  describe('addByAnnotation', () => {
+    it('should buffer text without startTime', () => {
+      const data = new SpeakingRateData();
+      data.addByAnnotation('hello', undefined, undefined);
+
+      // Text is buffered, no timestamp entry yet
+      expect(data.timestamps).toEqual([]);
+      expect(data.pushedDuration).toBe(0);
+    });
+
+    it('should add entry when startTime is provided', () => {
+      const data = new SpeakingRateData();
+      data.addByAnnotation('hello', undefined, undefined); // buffer "hello"
+      data.addByAnnotation('world', 1.0, undefined); // flush with startTime
+
+      expect(data.timestamps).toEqual([1.0]);
+      // textLen = 5 (hello), dt = 1.0, rate = 5/1 = 5.0
+      expect(data.speakingRate).toEqual([5.0]);
+      expect(data.speakIntegrals).toEqual([5.0]);
+    });
+
+    it('should handle startTime and endTime together', () => {
+      const data = new SpeakingRateData();
+      data.addByAnnotation('hello ', 0.0, 0.5);
+      data.addByAnnotation('world', 0.5, 1.0);
+
+      // First annotation: startTime=0.0, text="hello ", then recursively calls with endTime=0.5
+      // Second annotation: startTime=0.5, text="world", then recursively calls with endTime=1.0
+      expect(data.timestamps.length).toBeGreaterThanOrEqual(2);
+      expect(data.pushedDuration).toBe(1.0);
+    });
+
+    it('should calculate rate based on buffered text length', () => {
+      const data = new SpeakingRateData();
+      data.addByAnnotation('ab', undefined, undefined); // buffer 2 chars
+      data.addByAnnotation('cde', undefined, undefined); // buffer 3 more chars
+      data.addByAnnotation('', 2.0, undefined); // flush: textLen=5, dt=2.0, rate=2.5
+
+      expect(data.timestamps).toEqual([2.0]);
+      expect(data.speakingRate).toEqual([2.5]);
+      expect(data.speakIntegrals).toEqual([5.0]);
+    });
+
+    it('should handle zero time delta gracefully', () => {
+      const data = new SpeakingRateData();
+      data.addByAnnotation('hello', 0.0, undefined); // dt=0, rate should be 0
+
+      expect(data.timestamps).toEqual([0.0]);
+      expect(data.speakingRate).toEqual([0.0]);
+      expect(data.speakIntegrals).toEqual([0.0]);
+    });
+  });
+
+  describe('accumulateTo', () => {
+    it('should return 0 for empty data', () => {
+      const data = new SpeakingRateData();
+      expect(data.accumulateTo(1.0)).toBe(0);
+    });
+
+    it('should return 0 for timestamp before first entry', () => {
+      const data = new SpeakingRateData();
+      data.addByRate(1.0, 5.0);
+      expect(data.accumulateTo(0.5)).toBe(0);
+    });
+
+    it('should return exact integral at timestamp', () => {
+      const data = new SpeakingRateData();
+      data.addByRate(1.0, 4.0); // integral = 4.0
+      data.addByRate(2.0, 6.0); // integral = 10.0
+
+      expect(data.accumulateTo(1.0)).toBe(4.0);
+      expect(data.accumulateTo(2.0)).toBe(10.0);
+    });
+
+    it('should interpolate between timestamps', () => {
+      const data = new SpeakingRateData();
+      data.addByRate(1.0, 4.0); // integral = 4.0
+      data.addByRate(2.0, 6.0); // integral = 10.0
+
+      // At 1.5: integral = 4.0 + 6.0 * 0.5 = 7.0
+      expect(data.accumulateTo(1.5)).toBe(7.0);
+    });
+
+    it('should extrapolate beyond last timestamp', () => {
+      const data = new SpeakingRateData();
+      data.addByRate(1.0, 4.0); // integral = 4.0
+      data.addByRate(2.0, 6.0); // integral = 10.0
+
+      // At 3.0: integral = 10.0 + 6.0 * 1.0 = 16.0
+      expect(data.accumulateTo(3.0)).toBe(16.0);
+    });
+
+    it('should not exceed next integral when interpolating', () => {
+      const data = new SpeakingRateData();
+      data.addByRate(1.0, 100.0); // integral = 100.0 (very high rate)
+      data.addByRate(2.0, 1.0); // integral = 101.0
+
+      // At 1.5 with rate 1.0: would be 100.0 + 1.0 * 0.5 = 100.5
+      // But capped at next integral 101.0, so result is min(100.5, 101.0) = 100.5
+      expect(data.accumulateTo(1.5)).toBe(100.5);
+    });
+  });
+
+  describe('pushedDuration', () => {
+    it('should return 0 when empty', () => {
+      const data = new SpeakingRateData();
+      expect(data.pushedDuration).toBe(0);
+    });
+
+    it('should return last timestamp', () => {
+      const data = new SpeakingRateData();
+      data.addByRate(1.0, 5.0);
+      data.addByRate(2.5, 3.0);
+      data.addByRate(4.0, 7.0);
+
+      expect(data.pushedDuration).toBe(4.0);
+    });
+  });
+
+  describe('integration scenarios', () => {
+    it('should handle typical TTS word timing scenario', () => {
+      const data = new SpeakingRateData();
+
+      // Simulating words with timing: "Hello " at 0-0.3s, "world" at 0.3-0.6s
+      data.addByAnnotation('Hello ', 0.0, 0.3);
+      data.addByAnnotation('world', 0.3, 0.6);
+
+      // Should have accumulated text lengths at each timestamp
+      expect(data.pushedDuration).toBe(0.6);
+
+      // At 0.15s (middle of first word), should be partway through
+      const mid1 = data.accumulateTo(0.15);
+      expect(mid1).toBeGreaterThan(0);
+      expect(mid1).toBeLessThan(6); // "Hello " is 6 chars
+
+      // At 0.45s (middle of second word), should be past first word
+      const mid2 = data.accumulateTo(0.45);
+      expect(mid2).toBeGreaterThan(6);
+    });
+
+    it('should handle mixed rate and annotation data', () => {
+      const data = new SpeakingRateData();
+
+      // Start with rate-based data
+      data.addByRate(0.5, 4.0); // integral = 2.0
+
+      // Then add annotation
+      data.addByAnnotation('test', undefined, undefined);
+      data.addByAnnotation('', 1.0, undefined); // textLen=4, dt=0.5, rate=8.0, integral = 2.0 + 4.0 = 6.0
+
+      expect(data.timestamps).toEqual([0.5, 1.0]);
+      expect(data.speakIntegrals).toEqual([2.0, 6.0]);
+    });
+  });
+});
package/src/voice/transcription/synchronizer.ts
@@ -8,7 +8,13 @@ import { IdentityTransform } from '../../stream/identity_transform.js';
 import type { SentenceStream, SentenceTokenizer } from '../../tokenize/index.js';
 import { basic } from '../../tokenize/index.js';
 import { Future, Task, delay } from '../../utils.js';
-import {
+import {
+  AudioOutput,
+  type PlaybackFinishedEvent,
+  TextOutput,
+  type TimedString,
+  isTimedString,
+} from '../io.js';
 
 const STANDARD_SPEECH_RATE = 3.83; // hyphens (syllables) per second
 
@@ -27,9 +33,110 @@ interface TextData {
   forwardedText: string;
 }
 
+/**
+ * Tracks speaking rate data from TTS timing annotations.
+ * @internal Exported for testing purposes.
+ */
+export class SpeakingRateData {
+  /** Timestamps of the speaking rate. */
+  timestamps: number[] = [];
+  /** Speed at the timestamp. */
+  speakingRate: number[] = [];
+  /** Accumulated speaking units up to the timestamp. */
+  speakIntegrals: number[] = [];
+  /** Buffer for text without timing annotations yet. */
+  private textBuffer: string[] = [];
+
+  /**
+   * Add by speaking rate estimation.
+   */
+  addByRate(timestamp: number, speakingRate: number): void {
+    const integral =
+      this.speakIntegrals.length > 0 ? this.speakIntegrals[this.speakIntegrals.length - 1]! : 0;
+    const dt = timestamp - this.pushedDuration;
+    const newIntegral = integral + speakingRate * dt;
+
+    this.timestamps.push(timestamp);
+    this.speakingRate.push(speakingRate);
+    this.speakIntegrals.push(newIntegral);
+  }
+
+  /**
+   * Add annotation from TimedString with start_time/end_time.
+   */
+  addByAnnotation(text: string, startTime: number | undefined, endTime: number | undefined): void {
+    if (startTime !== undefined) {
+      // Calculate the integral of the speaking rate up to the start time
+      const integral =
+        this.speakIntegrals.length > 0 ? this.speakIntegrals[this.speakIntegrals.length - 1]! : 0;
+
+      const dt = startTime - this.pushedDuration;
+      // Use the length of the text directly instead of hyphens
+      const textLen = this.textBuffer.reduce((sum, t) => sum + t.length, 0);
+      const newIntegral = integral + textLen;
+      const rate = dt > 0 ? textLen / dt : 0;
+
+      this.timestamps.push(startTime);
+      this.speakingRate.push(rate);
+      this.speakIntegrals.push(newIntegral);
+      this.textBuffer = [];
+    }
+
+    this.textBuffer.push(text);
+
+    if (endTime !== undefined) {
+      this.addByAnnotation('', endTime, undefined);
+    }
+  }
+
+  /**
+   * Get accumulated speaking units up to the given timestamp.
+   */
+  accumulateTo(timestamp: number): number {
+    if (this.timestamps.length === 0) {
+      return 0;
+    }
+
+    // Binary search for the right position (equivalent to np.searchsorted with side="right")
+    let idx = 0;
+    for (let i = 0; i < this.timestamps.length; i++) {
+      if (this.timestamps[i]! <= timestamp) {
+        idx = i + 1;
+      } else {
+        break;
+      }
+    }
+
+    if (idx === 0) {
+      return 0;
+    }
+
+    let integralT = this.speakIntegrals[idx - 1]!;
+
+    // Fill the tail assuming the speaking rate is constant
+    const dt = timestamp - this.timestamps[idx - 1]!;
+    const rate =
+      idx < this.speakingRate.length ? this.speakingRate[idx]! : this.speakingRate[idx - 1]!;
+    integralT += rate * dt;
+
+    // If there is a next timestamp, make sure the integral does not exceed the next
+    if (idx < this.timestamps.length) {
+      integralT = Math.min(integralT, this.speakIntegrals[idx]!);
+    }
+
+    return integralT;
+  }
+
+  /** Get the last pushed timestamp. */
+  get pushedDuration(): number {
+    return this.timestamps.length > 0 ? this.timestamps[this.timestamps.length - 1]! : 0;
+  }
+}
+
 interface AudioData {
   pushedDuration: number;
   done: boolean;
+  annotatedRate: SpeakingRateData | null;
 }
 
 class SegmentSynchronizerImpl {
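For orientation, `SpeakingRateData` is a piecewise-constant-rate integrator: each `addByRate`/`addByAnnotation` call records a timestamp, the rate observed since the previous timestamp, and the running integral of "speaking units" (characters or hyphens), and `accumulateTo` interpolates or extrapolates that integral at an arbitrary playback time. The sketch below mirrors only the `accumulateTo` interpolation so it can be sanity-checked against the numbers used in the new unit tests; `RatePoint` and the standalone `accumulateTo` helper are hypothetical names used for illustration, not part of the package (the real class is marked `@internal`).

// Illustrative only: a standalone mirror of SpeakingRateData.accumulateTo's
// interpolation logic, checked against the same numbers as the unit tests.
interface RatePoint {
  timestamp: number; // seconds since segment start
  rate: number; // speaking units per second since the previous point
  integral: number; // accumulated speaking units up to `timestamp`
}

function accumulateTo(points: RatePoint[], timestamp: number): number {
  // Index of the first point strictly after `timestamp` (searchsorted, side="right").
  let idx = 0;
  while (idx < points.length && points[idx]!.timestamp <= timestamp) idx++;
  if (idx === 0) return 0; // before the first data point

  const prev = points[idx - 1]!;
  // Extend the tail assuming the next known rate (or the last one) stays constant...
  const rate = idx < points.length ? points[idx]!.rate : prev.rate;
  let integral = prev.integral + rate * (timestamp - prev.timestamp);
  // ...but never overshoot the next recorded integral.
  if (idx < points.length) integral = Math.min(integral, points[idx]!.integral);
  return integral;
}

// Same data as the "interpolate between timestamps" test:
const points: RatePoint[] = [
  { timestamp: 1.0, rate: 4.0, integral: 4.0 },
  { timestamp: 2.0, rate: 6.0, integral: 10.0 },
];
console.log(accumulateTo(points, 1.5)); // 7  (4.0 + 6.0 * 0.5)
console.log(accumulateTo(points, 3.0)); // 16 (10.0 + 6.0 * 1.0, extrapolated)

The `Math.min` cap at the next recorded integral is the behavior exercised by the "should not exceed next integral when interpolating" test.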
@@ -62,6 +169,7 @@ class SegmentSynchronizerImpl {
     this.audioData = {
       pushedDuration: 0,
       done: false,
+      annotatedRate: null,
     };
     this.outputStream = new IdentityTransform();
     this.outputStreamWriter = this.outputStream.writable.getWriter();
@@ -88,6 +196,10 @@ class SegmentSynchronizerImpl {
     return this.textData.done;
   }
 
+  get hasPendingText(): boolean {
+    return this.textData.pushedText.length > this.textData.forwardedText.length;
+  }
+
   get readable(): ReadableStream<string> {
     return this.outputStream.readable;
   }
@@ -117,14 +229,36 @@ class SegmentSynchronizerImpl {
     this.audioData.done = true;
   }
 
-  pushText(text: string) {
+  pushText(text: string | TimedString) {
     if (this.closed) {
       this.logger.warn('SegmentSynchronizerImpl.pushText called after close');
       return;
    }
 
-
-
+    // Check if text is a TimedString (has timing information)
+    let textStr: string;
+    let startTime: number | undefined;
+    let endTime: number | undefined;
+
+    if (isTimedString(text)) {
+      // This is a TimedString
+      textStr = text.text;
+      startTime = text.startTime;
+      endTime = text.endTime;
+
+      // Create annotatedRate if it doesn't exist
+      if (!this.audioData.annotatedRate) {
+        this.audioData.annotatedRate = new SpeakingRateData();
+      }
+
+      // Add the timing annotation
+      this.audioData.annotatedRate.addByAnnotation(textStr, startTime, endTime);
+    } else {
+      textStr = text;
+    }
+
+    this.textData.sentenceStream.pushText(textStr);
+    this.textData.pushedText += textStr;
   }
 
   endTextInput() {
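The buffering in `addByAnnotation` is the subtle part of this path: a chunk's `startTime` closes out whatever text was buffered before it, the chunk's own text is then buffered, and an `endTime` immediately flushes it again through the recursive call with an empty string. Below is a worked trace, purely illustrative and derived from the code above, for the two timed chunks used in the integration test ("Hello " over 0-0.3 s, "world" over 0.3-0.6 s).

// Worked trace of addByAnnotation (illustrative; follows the implementation above).
//
// addByAnnotation('Hello ', 0.0, 0.3)
//   startTime=0.0 -> flush empty buffer: push t=0.0, rate=0, integral=0
//   buffer 'Hello ' (6 chars)
//   endTime=0.3   -> addByAnnotation('', 0.3, undefined):
//                    dt=0.3, textLen=6, rate=20, integral=6; buffer now ['']
//
// addByAnnotation('world', 0.3, 0.6)
//   startTime=0.3 -> dt=0, textLen=0, rate=0, integral stays 6 (t=0.3 pushed again)
//   buffer 'world' (5 chars)
//   endTime=0.6   -> flush: dt=0.3, textLen=5, rate ≈ 16.7, integral=11
//
// Final state: timestamps=[0.0, 0.3, 0.3, 0.6], speakIntegrals=[0, 6, 6, 11],
// pushedDuration=0.6, and accumulateTo(0.45) = 8.5 (already past "Hello "),
// which is what the 'typical TTS word timing scenario' test asserts.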
@@ -148,6 +282,10 @@ class SegmentSynchronizerImpl {
         { textDone: this.textData.done, audioDone: this.audioData.done },
         'SegmentSynchronizerImpl.markPlaybackFinished called before text/audio input is done',
       );
+      // This allows mainTask to flush remaining text even if audio wasn't formally ended
+      if (!interrupted) {
+        this.playbackCompleted = true;
+      }
       return;
     }
 
@@ -166,13 +304,13 @@ class SegmentSynchronizerImpl {
   private async captureTaskImpl() {
     // Don't use a for-await loop here, because exiting the loop will close the writer in the
     // outputStream, which will cause an error in the mainTask.then method.
+    // NOTE: forwardedText is updated in mainTask, NOT here
     const reader = this.outputStream.readable.getReader();
     while (true) {
       const { done, value: text } = await reader.read();
       if (done) {
         break;
       }
-      this.textData.forwardedText += text;
       await this.nextInChain.captureText(text);
     }
     reader.releaseLock();
@@ -211,19 +349,42 @@ class SegmentSynchronizerImpl {
 
         const wordHphens = this.options.hyphenateWord(word).length;
         const elapsedSeconds = (Date.now() - this.startWallTime) / 1000;
-
-
-
+
+        let dHyphens = 0;
+        const annotated = this.audioData.annotatedRate;
+
+        if (annotated && annotated.pushedDuration >= elapsedSeconds) {
+          // Use actual TTS timing annotations for accurate sync
+          const targetLen = Math.floor(annotated.accumulateTo(elapsedSeconds));
+          const forwardedLen = this.textData.forwardedText.length;
+
+          if (targetLen >= forwardedLen) {
+            const dText = this.textData.pushedText.slice(forwardedLen, targetLen);
+            dHyphens = this.calcHyphens(dText).length;
+          } else {
+            const dText = this.textData.pushedText.slice(targetLen, forwardedLen);
+            dHyphens = -this.calcHyphens(dText).length;
+          }
+        } else {
+          // Fall back to estimated hyphens-per-second calculation
+          const targetHyphens = elapsedSeconds * this.options.speed;
+          dHyphens = Math.max(0, targetHyphens - this.textData.forwardedHyphens);
+        }
+
+        let delayTime = Math.max(0, wordHphens - dHyphens) / this.speed;
 
         if (this.playbackCompleted) {
-
+          delayTime = 0;
         }
 
-        await this.sleepIfNotClosed(
-
-
+        await this.sleepIfNotClosed(delayTime / 2);
+        const forwardedWord = sentence.slice(textCursor, endPos);
+        this.outputStreamWriter.write(forwardedWord);
+
+        await this.sleepIfNotClosed(delayTime / 2);
 
         this.textData.forwardedHyphens += wordHphens;
+        this.textData.forwardedText += forwardedWord;
         textCursor = endPos;
       }
 
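To make the pacing arithmetic in the loop above concrete: the annotated branch turns "characters the TTS reports as spoken by now" minus "characters already forwarded" into a hyphen count, the fallback branch estimates the same thing from a constant hyphens-per-second speed, and in both cases the loop only sleeps for the part of the current word not yet covered, split into two halves around the write. The numbers below are invented for illustration, and a single `speed` constant stands in for both `this.options.speed` and `this.speed` from the real code.

// Illustrative arithmetic for the word-pacing computation above (values are invented).
const elapsedSeconds = 1.2; // wall-clock seconds since playback started
const wordHyphens = 2; // hyphens in the word about to be forwarded
const speed = 3.83; // hyphens per second (the STANDARD_SPEECH_RATE fallback)

// Annotated branch: suppose the gap between accumulateTo(elapsedSeconds) and the
// already-forwarded text works out to 3 hyphens.
const dHyphensAnnotated = 3;
const delayAnnotated = Math.max(0, wordHyphens - dHyphensAnnotated) / speed;
console.log(delayAnnotated); // 0 -> behind the audio, forward the word immediately

// Fallback branch: no annotations, estimate progress from the constant rate.
const forwardedHyphens = 4; // hyphens forwarded so far
const dHyphensEstimated = Math.max(0, elapsedSeconds * speed - forwardedHyphens);
const delayEstimated = Math.max(0, wordHyphens - dHyphensEstimated) / speed;
console.log(delayEstimated.toFixed(3)); // "0.367" -> slept as two delayTime / 2 halves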
@@ -234,6 +395,15 @@ class SegmentSynchronizerImpl {
     }
   }
 
+  private calcHyphens(text: string): string[] {
+    const words = this.options.splitWords(text);
+    const hyphens: string[] = [];
+    for (const [word] of words) {
+      hyphens.push(...this.options.hyphenateWord(word));
+    }
+    return hyphens;
+  }
+
   private async sleepIfNotClosed(sleepTimeSeconds: number) {
     if (this.closed) {
       return;
@@ -350,6 +520,7 @@ export class TranscriptionSynchronizer {
       if (abort.aborted) {
         return;
       }
+
      await this._impl.close();
      this._impl = new SegmentSynchronizerImpl(this.options, this.textOutput.nextInChain);
    }
@@ -399,7 +570,15 @@ class SyncedAudioOutput extends AudioOutput {
     }
 
     if (!this.pushedDuration) {
-      //
+      // For timed texts, audio goes directly to room without going through synchronizer.
+      // If text was pushed but no audio, still end audio input so text can be processed.
+      // Only rotate if there's also no text (truly empty segment).
+      if (this.synchronizer._impl.hasPendingText) {
+        // Text is pending - end audio input to allow text processing
+        this.synchronizer._impl.endAudioInput();
+        return;
+      }
+      // No text and no audio - rotate the segment
       this.synchronizer.rotateSegment();
       return;
     }
@@ -441,12 +620,14 @@ class SyncedTextOutput extends TextOutput {
     super(nextInChain);
   }
 
-  async captureText(text: string): Promise<void> {
+  async captureText(text: string | TimedString): Promise<void> {
     await this.synchronizer.barrier();
 
+    const textStr = isTimedString(text) ? text.text : text;
+
     if (!this.synchronizer.enabled) {
-      // pass through to the next in chain
-      await this.nextInChain.captureText(
+      // pass through to the next in chain (extract string from TimedString if needed)
+      await this.nextInChain.captureText(textStr);
       return;
     }
 
@@ -458,10 +639,14 @@ class SyncedTextOutput extends TextOutput {
       this.synchronizer.rotateSegment();
       await this.synchronizer.barrier();
     }
+    // Pass the TimedString to pushText for timing extraction
     this.synchronizer._impl.pushText(text);
   }
 
-  flush() {
+  async flush() {
+    // Wait for any pending rotation to complete before accessing _impl
+    await this.synchronizer.barrier();
+
     if (!this.synchronizer.enabled) {
       this.nextInChain.flush(); // passthrough text if the synchronizer is disabled
       return;