@livekit/agents 1.0.39 → 1.0.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154)
  1. package/dist/cli.cjs +20 -18
  2. package/dist/cli.cjs.map +1 -1
  3. package/dist/cli.d.ts.map +1 -1
  4. package/dist/cli.js +20 -18
  5. package/dist/cli.js.map +1 -1
  6. package/dist/http_server.cjs +9 -6
  7. package/dist/http_server.cjs.map +1 -1
  8. package/dist/http_server.d.cts +5 -1
  9. package/dist/http_server.d.ts +5 -1
  10. package/dist/http_server.d.ts.map +1 -1
  11. package/dist/http_server.js +9 -6
  12. package/dist/http_server.js.map +1 -1
  13. package/dist/index.cjs +5 -0
  14. package/dist/index.cjs.map +1 -1
  15. package/dist/index.d.cts +1 -0
  16. package/dist/index.d.ts +1 -0
  17. package/dist/index.d.ts.map +1 -1
  18. package/dist/index.js +3 -0
  19. package/dist/index.js.map +1 -1
  20. package/dist/inference/stt.cjs +2 -1
  21. package/dist/inference/stt.cjs.map +1 -1
  22. package/dist/inference/stt.d.ts.map +1 -1
  23. package/dist/inference/stt.js +2 -1
  24. package/dist/inference/stt.js.map +1 -1
  25. package/dist/ipc/supervised_proc.cjs +4 -0
  26. package/dist/ipc/supervised_proc.cjs.map +1 -1
  27. package/dist/ipc/supervised_proc.d.cts +1 -0
  28. package/dist/ipc/supervised_proc.d.ts +1 -0
  29. package/dist/ipc/supervised_proc.d.ts.map +1 -1
  30. package/dist/ipc/supervised_proc.js +4 -0
  31. package/dist/ipc/supervised_proc.js.map +1 -1
  32. package/dist/llm/realtime.cjs.map +1 -1
  33. package/dist/llm/realtime.d.cts +5 -1
  34. package/dist/llm/realtime.d.ts +5 -1
  35. package/dist/llm/realtime.d.ts.map +1 -1
  36. package/dist/llm/realtime.js.map +1 -1
  37. package/dist/tokenize/basic/sentence.cjs +3 -3
  38. package/dist/tokenize/basic/sentence.cjs.map +1 -1
  39. package/dist/tokenize/basic/sentence.js +3 -3
  40. package/dist/tokenize/basic/sentence.js.map +1 -1
  41. package/dist/tokenize/tokenizer.test.cjs +3 -1
  42. package/dist/tokenize/tokenizer.test.cjs.map +1 -1
  43. package/dist/tokenize/tokenizer.test.js +3 -1
  44. package/dist/tokenize/tokenizer.test.js.map +1 -1
  45. package/dist/tts/stream_adapter.cjs +15 -1
  46. package/dist/tts/stream_adapter.cjs.map +1 -1
  47. package/dist/tts/stream_adapter.d.ts.map +1 -1
  48. package/dist/tts/stream_adapter.js +15 -1
  49. package/dist/tts/stream_adapter.js.map +1 -1
  50. package/dist/tts/tts.cjs.map +1 -1
  51. package/dist/tts/tts.d.cts +9 -1
  52. package/dist/tts/tts.d.ts +9 -1
  53. package/dist/tts/tts.d.ts.map +1 -1
  54. package/dist/tts/tts.js.map +1 -1
  55. package/dist/types.cjs +3 -0
  56. package/dist/types.cjs.map +1 -1
  57. package/dist/types.d.cts +4 -0
  58. package/dist/types.d.ts +4 -0
  59. package/dist/types.d.ts.map +1 -1
  60. package/dist/types.js +2 -0
  61. package/dist/types.js.map +1 -1
  62. package/dist/voice/agent.cjs +11 -1
  63. package/dist/voice/agent.cjs.map +1 -1
  64. package/dist/voice/agent.d.cts +7 -3
  65. package/dist/voice/agent.d.ts +7 -3
  66. package/dist/voice/agent.d.ts.map +1 -1
  67. package/dist/voice/agent.js +11 -1
  68. package/dist/voice/agent.js.map +1 -1
  69. package/dist/voice/agent_activity.cjs +30 -14
  70. package/dist/voice/agent_activity.cjs.map +1 -1
  71. package/dist/voice/agent_activity.d.cts +1 -0
  72. package/dist/voice/agent_activity.d.ts +1 -0
  73. package/dist/voice/agent_activity.d.ts.map +1 -1
  74. package/dist/voice/agent_activity.js +30 -14
  75. package/dist/voice/agent_activity.js.map +1 -1
  76. package/dist/voice/agent_session.cjs +5 -1
  77. package/dist/voice/agent_session.cjs.map +1 -1
  78. package/dist/voice/agent_session.d.cts +2 -0
  79. package/dist/voice/agent_session.d.ts +2 -0
  80. package/dist/voice/agent_session.d.ts.map +1 -1
  81. package/dist/voice/agent_session.js +5 -1
  82. package/dist/voice/agent_session.js.map +1 -1
  83. package/dist/voice/background_audio.cjs +2 -1
  84. package/dist/voice/background_audio.cjs.map +1 -1
  85. package/dist/voice/background_audio.d.cts +4 -2
  86. package/dist/voice/background_audio.d.ts +4 -2
  87. package/dist/voice/background_audio.d.ts.map +1 -1
  88. package/dist/voice/background_audio.js +2 -1
  89. package/dist/voice/background_audio.js.map +1 -1
  90. package/dist/voice/generation.cjs +58 -5
  91. package/dist/voice/generation.cjs.map +1 -1
  92. package/dist/voice/generation.d.cts +17 -3
  93. package/dist/voice/generation.d.ts +17 -3
  94. package/dist/voice/generation.d.ts.map +1 -1
  95. package/dist/voice/generation.js +63 -6
  96. package/dist/voice/generation.js.map +1 -1
  97. package/dist/voice/index.cjs.map +1 -1
  98. package/dist/voice/index.d.cts +1 -1
  99. package/dist/voice/index.d.ts +1 -1
  100. package/dist/voice/index.d.ts.map +1 -1
  101. package/dist/voice/index.js.map +1 -1
  102. package/dist/voice/io.cjs +22 -2
  103. package/dist/voice/io.cjs.map +1 -1
  104. package/dist/voice/io.d.cts +21 -5
  105. package/dist/voice/io.d.ts +21 -5
  106. package/dist/voice/io.d.ts.map +1 -1
  107. package/dist/voice/io.js +18 -1
  108. package/dist/voice/io.js.map +1 -1
  109. package/dist/voice/room_io/_output.cjs +3 -2
  110. package/dist/voice/room_io/_output.cjs.map +1 -1
  111. package/dist/voice/room_io/_output.d.cts +3 -3
  112. package/dist/voice/room_io/_output.d.ts +3 -3
  113. package/dist/voice/room_io/_output.d.ts.map +1 -1
  114. package/dist/voice/room_io/_output.js +4 -3
  115. package/dist/voice/room_io/_output.js.map +1 -1
  116. package/dist/voice/transcription/synchronizer.cjs +137 -13
  117. package/dist/voice/transcription/synchronizer.cjs.map +1 -1
  118. package/dist/voice/transcription/synchronizer.d.cts +34 -4
  119. package/dist/voice/transcription/synchronizer.d.ts +34 -4
  120. package/dist/voice/transcription/synchronizer.d.ts.map +1 -1
  121. package/dist/voice/transcription/synchronizer.js +141 -14
  122. package/dist/voice/transcription/synchronizer.js.map +1 -1
  123. package/dist/voice/transcription/synchronizer.test.cjs +151 -0
  124. package/dist/voice/transcription/synchronizer.test.cjs.map +1 -0
  125. package/dist/voice/transcription/synchronizer.test.js +150 -0
  126. package/dist/voice/transcription/synchronizer.test.js.map +1 -0
  127. package/dist/worker.cjs +12 -2
  128. package/dist/worker.cjs.map +1 -1
  129. package/dist/worker.d.ts.map +1 -1
  130. package/dist/worker.js +12 -2
  131. package/dist/worker.js.map +1 -1
  132. package/package.json +1 -1
  133. package/src/cli.ts +20 -18
  134. package/src/http_server.ts +18 -6
  135. package/src/index.ts +1 -0
  136. package/src/inference/stt.ts +9 -8
  137. package/src/ipc/supervised_proc.ts +4 -0
  138. package/src/llm/realtime.ts +5 -1
  139. package/src/tokenize/basic/sentence.ts +3 -3
  140. package/src/tokenize/tokenizer.test.ts +4 -0
  141. package/src/tts/stream_adapter.ts +23 -1
  142. package/src/tts/tts.ts +10 -1
  143. package/src/types.ts +5 -0
  144. package/src/voice/agent.ts +19 -4
  145. package/src/voice/agent_activity.ts +38 -13
  146. package/src/voice/agent_session.ts +6 -0
  147. package/src/voice/background_audio.ts +6 -3
  148. package/src/voice/generation.ts +115 -10
  149. package/src/voice/index.ts +1 -1
  150. package/src/voice/io.ts +40 -5
  151. package/src/voice/room_io/_output.ts +6 -5
  152. package/src/voice/transcription/synchronizer.test.ts +206 -0
  153. package/src/voice/transcription/synchronizer.ts +202 -17
  154. package/src/worker.ts +24 -2
@@ -60,7 +60,7 @@ import {
60
60
  createSpeechCreatedEvent,
61
61
  createUserInputTranscribedEvent,
62
62
  } from './events.js';
63
- import type { ToolExecutionOutput } from './generation.js';
63
+ import type { ToolExecutionOutput, _TTSGenerationData } from './generation.js';
64
64
  import {
65
65
  type _AudioOut,
66
66
  type _TextOut,
@@ -72,6 +72,7 @@ import {
72
72
  removeInstructions,
73
73
  updateInstructions,
74
74
  } from './generation.js';
75
+ import type { TimedString } from './io.js';
75
76
  import { SpeechHandle } from './speech_handle.js';
76
77
 
77
78
  const speechHandleStorage = new AsyncLocalStorage<SpeechHandle>();
@@ -359,6 +360,11 @@ export class AgentActivity implements RecognitionHooks {
359
360
  return this.agentSession.options.allowInterruptions;
360
361
  }
361
362
 
363
+ get useTtsAlignedTranscript(): boolean {
364
+ // Agent setting takes precedence over session setting
365
+ return this.agent.useTtsAlignedTranscript ?? this.agentSession.useTtsAlignedTranscript;
366
+ }
367
+
362
368
  get turnDetection(): TurnDetectionMode | undefined {
363
369
  // TODO(brian): prioritize using agent.turn_detection
364
370
  return this.agentSession.turnDetection;
@@ -1258,7 +1264,7 @@ export class AgentActivity implements RecognitionHooks {
1258
1264
  let audioOut: _AudioOut | null = null;
1259
1265
  if (!audio) {
1260
1266
  // generate audio using TTS
1261
- const [ttsTask, ttsStream] = performTTSInference(
1267
+ const [ttsTask, ttsGenData] = performTTSInference(
1262
1268
  (...args) => this.agent.ttsNode(...args),
1263
1269
  audioSource,
1264
1270
  modelSettings,
@@ -1267,7 +1273,7 @@ export class AgentActivity implements RecognitionHooks {
1267
1273
  tasks.push(ttsTask);
1268
1274
 
1269
1275
  const [forwardTask, _audioOut] = performAudioForwarding(
1270
- ttsStream,
1276
+ ttsGenData.audioStream,
1271
1277
  audioOutput,
1272
1278
  replyAbortController,
1273
1279
  );
@@ -1389,14 +1395,14 @@ export class AgentActivity implements RecognitionHooks {
1389
1395
  tasks.push(llmTask);
1390
1396
 
1391
1397
  let ttsTask: Task<void> | null = null;
1392
- let ttsStream: ReadableStream<AudioFrame> | null = null;
1398
+ let ttsGenData: _TTSGenerationData | null = null;
1393
1399
  let llmOutput: ReadableStream<string>;
1394
1400
 
1395
1401
  if (audioOutput) {
1396
1402
  // Only tee the stream when we need TTS
1397
1403
  const [ttsTextInput, textOutput] = llmGenData.textStream.tee();
1398
1404
  llmOutput = textOutput;
1399
- [ttsTask, ttsStream] = performTTSInference(
1405
+ [ttsTask, ttsGenData] = performTTSInference(
1400
1406
  (...args) => this.agent.ttsNode(...args),
1401
1407
  ttsTextInput,
1402
1408
  modelSettings,
@@ -1428,7 +1434,26 @@ export class AgentActivity implements RecognitionHooks {
1428
1434
  speechHandle._clearAuthorization();
1429
1435
 
1430
1436
  const replyStartedAt = Date.now();
1431
- const trNodeResult = await this.agent.transcriptionNode(llmOutput, modelSettings);
1437
+
1438
+ // Determine the transcription input source
1439
+ let transcriptionInput: ReadableStream<string | TimedString> = llmOutput;
1440
+
1441
+ // Check if we should use TTS aligned transcripts
1442
+ if (this.useTtsAlignedTranscript && this.tts?.capabilities.alignedTranscript && ttsGenData) {
1443
+ // Race timedTextsFut with ttsTask to avoid hanging if TTS fails before resolving the future
1444
+ const timedTextsStream = await Promise.race([
1445
+ ttsGenData.timedTextsFut.await,
1446
+ ttsTask?.result.catch(() =>
1447
+ this.logger.warn('TTS task failed before resolving timedTextsFut'),
1448
+ ) ?? Promise.resolve(),
1449
+ ]);
1450
+ if (timedTextsStream) {
1451
+ this.logger.debug('Using TTS aligned transcripts for transcription node input');
1452
+ transcriptionInput = timedTextsStream;
1453
+ }
1454
+ }
1455
+
1456
+ const trNodeResult = await this.agent.transcriptionNode(transcriptionInput, modelSettings);
1432
1457
  let textOut: _TextOut | null = null;
1433
1458
  if (trNodeResult) {
1434
1459
  const [textForwardTask, _textOut] = performTextForwarding(
@@ -1449,9 +1474,9 @@ export class AgentActivity implements RecognitionHooks {
1449
1474
 
1450
1475
  let audioOut: _AudioOut | null = null;
1451
1476
  if (audioOutput) {
1452
- if (ttsStream) {
1477
+ if (ttsGenData) {
1453
1478
  const [forwardTask, _audioOut] = performAudioForwarding(
1454
- ttsStream,
1479
+ ttsGenData.audioStream,
1455
1480
  audioOutput,
1456
1481
  replyAbortController,
1457
1482
  );
@@ -1461,7 +1486,7 @@ export class AgentActivity implements RecognitionHooks {
1461
1486
  .then((ts) => onFirstFrame(ts))
1462
1487
  .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
1463
1488
  } else {
1464
- throw Error('ttsStream is null when audioOutput is enabled');
1489
+ throw Error('ttsGenData is null when audioOutput is enabled');
1465
1490
  }
1466
1491
  } else {
1467
1492
  textOut?.firstTextFut.await
@@ -1851,8 +1876,8 @@ export class AgentActivity implements RecognitionHooks {
1851
1876
  }
1852
1877
 
1853
1878
  const msgModalities = msg.modalities ? await msg.modalities : undefined;
1854
- let ttsTextInput: ReadableStream<string> | null = null;
1855
- let trTextInput: ReadableStream<string>;
1879
+ let ttsTextInput: ReadableStream<string | TimedString> | null = null;
1880
+ let trTextInput: ReadableStream<string | TimedString>;
1856
1881
 
1857
1882
  if (msgModalities && !msgModalities.includes('audio') && this.tts) {
1858
1883
  if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
@@ -1884,14 +1909,14 @@ export class AgentActivity implements RecognitionHooks {
1884
1909
  let realtimeAudioResult: ReadableStream<AudioFrame> | null = null;
1885
1910
 
1886
1911
  if (ttsTextInput) {
1887
- const [ttsTask, ttsStream] = performTTSInference(
1912
+ const [ttsTask, ttsGenData] = performTTSInference(
1888
1913
  (...args) => this.agent.ttsNode(...args),
1889
1914
  ttsTextInput,
1890
1915
  modelSettings,
1891
1916
  abortController,
1892
1917
  );
1893
1918
  tasks.push(ttsTask);
1894
- realtimeAudioResult = ttsStream;
1919
+ realtimeAudioResult = ttsGenData.audioStream;
1895
1920
  } else if (msgModalities && msgModalities.includes('audio')) {
1896
1921
  realtimeAudioResult = await this.agent.realtimeAudioOutputNode(
1897
1922
  msg.audioStream,
@@ -73,6 +73,7 @@ export interface VoiceOptions {
73
73
  maxToolSteps: number;
74
74
  preemptiveGeneration: boolean;
75
75
  userAwayTimeout?: number | null;
76
+ useTtsAlignedTranscript: boolean;
76
77
  }
77
78
 
78
79
  const defaultVoiceOptions: VoiceOptions = {
@@ -85,6 +86,7 @@ const defaultVoiceOptions: VoiceOptions = {
85
86
  maxToolSteps: 3,
86
87
  preemptiveGeneration: false,
87
88
  userAwayTimeout: 15.0,
89
+ useTtsAlignedTranscript: true,
88
90
  } as const;
89
91
 
90
92
  export type TurnDetectionMode = 'stt' | 'vad' | 'realtime_llm' | 'manual' | _TurnDetector;
@@ -264,6 +266,10 @@ export class AgentSession<
264
266
  return this._connOptions;
265
267
  }
266
268
 
269
+ get useTtsAlignedTranscript(): boolean {
270
+ return this.options.useTtsAlignedTranscript;
271
+ }
272
+
267
273
  set userData(value: UserData) {
268
274
  this._userData = value;
269
275
  }
@@ -63,8 +63,10 @@ export interface BackgroundAudioPlayerOptions {
63
63
  thinkingSound?: AudioSourceType | AudioConfig | AudioConfig[];
64
64
 
65
65
  /**
66
- * Stream timeout in milliseconds
67
- * @defaultValue 200
66
+ * Stream timeout in milliseconds for the audio mixer.
67
+ * Controls how long the mixer waits for a stream to produce data before timing out.
68
+ * Higher values are more tolerant of network latency and processing delays.
69
+ * @defaultValue 2000
68
70
  */
69
71
  streamTimeoutMs?: number;
70
72
  }
@@ -78,6 +80,7 @@ export interface BackgroundAudioStartOptions {
78
80
  // Queue size for AudioSource buffer (400ms)
79
81
  // Kept small to avoid abrupt cutoffs when removing sounds
80
82
  const AUDIO_SOURCE_BUFFER_MS = 400;
83
+ const STREAM_TIMEOUT_MS = 2000;
81
84
 
82
85
  export class PlayHandle {
83
86
  private doneFuture = new Future<void>();
@@ -155,7 +158,7 @@ export class BackgroundAudioPlayer {
155
158
  #logger = log();
156
159
 
157
160
  constructor(options?: BackgroundAudioPlayerOptions) {
158
- const { ambientSound, thinkingSound, streamTimeoutMs = 200 } = options || {};
161
+ const { ambientSound, thinkingSound, streamTimeoutMs = STREAM_TIMEOUT_MS } = options || {};
159
162
 
160
163
  this.ambientSound = ambientSound;
161
164
  this.thinkingSound = thinkingSound;
@@ -24,10 +24,19 @@ import { isZodSchema, parseZodSchema } from '../llm/zod-utils.js';
24
24
  import { log } from '../log.js';
25
25
  import { IdentityTransform } from '../stream/identity_transform.js';
26
26
  import { traceTypes, tracer } from '../telemetry/index.js';
27
+ import { USERDATA_TIMED_TRANSCRIPT } from '../types.js';
27
28
  import { Future, Task, shortuuid, toError, waitForAbort } from '../utils.js';
28
29
  import { type Agent, type ModelSettings, asyncLocalStorage, isStopResponse } from './agent.js';
29
30
  import type { AgentSession } from './agent_session.js';
30
- import { AudioOutput, type LLMNode, type TTSNode, type TextOutput } from './io.js';
31
+ import {
32
+ AudioOutput,
33
+ type LLMNode,
34
+ type TTSNode,
35
+ type TextOutput,
36
+ type TimedString,
37
+ createTimedString,
38
+ isTimedString,
39
+ } from './io.js';
31
40
  import { RunContext } from './run_context.js';
32
41
  import type { SpeechHandle } from './speech_handle.js';
33
42
 
@@ -46,6 +55,21 @@ export class _LLMGenerationData {
46
55
  }
47
56
  }
48
57
 
58
+ /**
59
+ * TTS generation data containing audio stream and optional timed transcripts.
60
+ * @internal
61
+ */
62
+ export interface _TTSGenerationData {
63
+ /** Audio frame stream from TTS */
64
+ audioStream: ReadableStream<AudioFrame>;
65
+ /**
66
+ * Future that resolves to a stream of timed transcripts, or null if TTS doesn't support it.
67
+ */
68
+ timedTextsFut: Future<ReadableStream<TimedString> | null>;
69
+ /** Time to first byte (set when first audio frame is received) */
70
+ ttfb?: number;
71
+ }
72
+
49
73
  // TODO(brian): remove this class in favor of ToolOutput
50
74
  export class _ToolOutput {
51
75
  output: _JsOutput[];
@@ -494,35 +518,105 @@ export function performLLMInference(
494
518
 
495
519
  export function performTTSInference(
496
520
  node: TTSNode,
497
- text: ReadableStream<string>,
521
+ text: ReadableStream<string | TimedString>,
498
522
  modelSettings: ModelSettings,
499
523
  controller: AbortController,
500
- ): [Task<void>, ReadableStream<AudioFrame>] {
524
+ ): [Task<void>, _TTSGenerationData] {
501
525
  const audioStream = new IdentityTransform<AudioFrame>();
502
526
  const outputWriter = audioStream.writable.getWriter();
503
527
  const audioOutputStream = audioStream.readable;
504
528
 
529
+ const timedTextsFut = new Future<ReadableStream<TimedString> | null>();
530
+ const timedTextsStream = new IdentityTransform<TimedString>();
531
+ const timedTextsWriter = timedTextsStream.writable.getWriter();
532
+
533
+ // Transform stream to extract text from TimedString objects
534
+ const textOnlyStream = new IdentityTransform<string>();
535
+ const textOnlyWriter = textOnlyStream.writable.getWriter();
536
+ (async () => {
537
+ const reader = text.getReader();
538
+ try {
539
+ while (true) {
540
+ const { done, value } = await reader.read();
541
+ if (done) {
542
+ break;
543
+ }
544
+ const textValue = typeof value === 'string' ? value : value.text;
545
+ await textOnlyWriter.write(textValue);
546
+ }
547
+ await textOnlyWriter.close();
548
+ } catch (e) {
549
+ await textOnlyWriter.abort(e as Error);
550
+ } finally {
551
+ reader.releaseLock();
552
+ }
553
+ })();
554
+
505
555
  const _performTTSInferenceImpl = async (signal: AbortSignal) => {
506
556
  let ttsStreamReader: ReadableStreamDefaultReader<AudioFrame> | null = null;
507
557
  let ttsStream: ReadableStream<AudioFrame> | null = null;
558
+ let pushedDuration = 0;
508
559
 
509
560
  try {
510
- ttsStream = await node(text, modelSettings);
561
+ ttsStream = await node(textOnlyStream.readable, modelSettings);
511
562
  if (ttsStream === null) {
563
+ timedTextsFut.resolve(null);
512
564
  await outputWriter.close();
565
+ await timedTextsWriter.close();
513
566
  return;
514
567
  }
515
568
 
569
+ // This is critical: the future must be resolved with the channel/stream before the loop
570
+ // so that agent_activity can start reading while we write
571
+ if (!timedTextsFut.done) {
572
+ timedTextsFut.resolve(timedTextsStream.readable);
573
+ }
574
+
516
575
  ttsStreamReader = ttsStream.getReader();
576
+
577
+ // In Python, perform_tts_inference has a while loop processing multiple input segments
578
+ // (separated by FlushSentinel), with pushed_duration accumulating across segments.
579
+ // JS currently only does single inference, so initialPushedDuration is always 0.
580
+ // TODO: Add FlushSentinel + multi-segment loop
581
+ const initialPushedDuration = pushedDuration;
582
+
517
583
  while (true) {
518
584
  if (signal.aborted) {
519
585
  break;
520
586
  }
521
- const { done, value: chunk } = await ttsStreamReader.read();
587
+ const { done, value: frame } = await ttsStreamReader.read();
522
588
  if (done) {
523
589
  break;
524
590
  }
525
- await outputWriter.write(chunk);
591
+
592
+ // Write the audio frame to the output stream
593
+ await outputWriter.write(frame);
594
+
595
+ const timedTranscripts = frame.userdata[USERDATA_TIMED_TRANSCRIPT] as
596
+ | TimedString[]
597
+ | undefined;
598
+ if (timedTranscripts && timedTranscripts.length > 0) {
599
+ for (const timedText of timedTranscripts) {
600
+ // Uses the INITIAL value (from previous inferences), not the accumulated value
601
+ const adjustedTimedText = createTimedString({
602
+ text: timedText.text,
603
+ startTime:
604
+ timedText.startTime !== undefined
605
+ ? timedText.startTime + initialPushedDuration
606
+ : undefined,
607
+ endTime:
608
+ timedText.endTime !== undefined
609
+ ? timedText.endTime + initialPushedDuration
610
+ : undefined,
611
+ confidence: timedText.confidence,
612
+ startTimeOffset: timedText.startTimeOffset,
613
+ });
614
+ await timedTextsWriter.write(adjustedTimedText);
615
+ }
616
+ }
617
+
618
+ const frameDuration = frame.samplesPerChannel / frame.sampleRate;
619
+ pushedDuration += frameDuration;
526
620
  }
527
621
  } catch (error) {
528
622
  if (error instanceof DOMException && error.name === 'AbortError') {
@@ -534,6 +628,7 @@ export function performTTSInference(
534
628
  ttsStreamReader?.releaseLock();
535
629
  await ttsStream?.cancel();
536
630
  await outputWriter.close();
631
+ await timedTextsWriter.close();
537
632
  }
538
633
  };
539
634
 
@@ -546,9 +641,14 @@ export function performTTSInference(
546
641
  context: currentContext,
547
642
  });
548
643
 
644
+ const genData: _TTSGenerationData = {
645
+ audioStream: audioOutputStream,
646
+ timedTextsFut,
647
+ };
648
+
549
649
  return [
550
650
  Task.from((controller) => inferenceTask(controller.signal), controller, 'performTTSInference'),
551
- audioOutputStream,
651
+ genData,
552
652
  ];
553
653
  }
554
654
 
@@ -558,7 +658,7 @@ export interface _TextOut {
558
658
  }
559
659
 
560
660
  async function forwardText(
561
- source: ReadableStream<string>,
661
+ source: ReadableStream<string | TimedString>,
562
662
  out: _TextOut,
563
663
  signal: AbortSignal,
564
664
  textOutput: TextOutput | null,
@@ -571,8 +671,13 @@ async function forwardText(
571
671
  }
572
672
  const { done, value: delta } = await reader.read();
573
673
  if (done) break;
574
- out.text += delta;
674
+
675
+ const deltaIsTimedString = isTimedString(delta);
676
+ const textDelta = deltaIsTimedString ? delta.text : delta;
677
+
678
+ out.text += textDelta;
575
679
  if (textOutput !== null) {
680
+ // Pass TimedString to textOutput for synchronized transcription
576
681
  await textOutput.captureText(delta);
577
682
  }
578
683
  if (!out.firstTextFut.done) {
@@ -588,7 +693,7 @@ async function forwardText(
588
693
  }
589
694
 
590
695
  export function performTextForwarding(
591
- source: ReadableStream<string>,
696
+ source: ReadableStream<string | TimedString>,
592
697
  controller: AbortController,
593
698
  textOutput: TextOutput | null,
594
699
  ): [Task<void>, _TextOut] {
@@ -2,7 +2,7 @@
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  export { Agent, StopResponse, type AgentOptions, type ModelSettings } from './agent.js';
5
- export { AgentSession, type AgentSessionOptions } from './agent_session.js';
5
+ export { AgentSession, type AgentSessionOptions, type VoiceOptions } from './agent_session.js';
6
6
  export * from './avatar/index.js';
7
7
  export * from './background_audio.js';
8
8
  export * from './events.js';
package/src/voice/io.ts CHANGED
@@ -30,9 +30,15 @@ export type TTSNode = (
30
30
  ) => Promise<ReadableStream<AudioFrame> | null>;
31
31
 
32
32
  /**
33
- *A string with optional start and end timestamps for word-level alignment.
33
+ * Symbol used to identify TimedString objects.
34
+ */
35
+ export const TIMED_STRING_SYMBOL = Symbol.for('lk.TimedString');
36
+
37
+ /**
38
+ * A string with optional start and end timestamps for word-level alignment.
34
39
  */
35
40
  export interface TimedString {
41
+ readonly [TIMED_STRING_SYMBOL]: true;
36
42
  text: string;
37
43
  startTime?: number; // seconds
38
44
  endTime?: number; // seconds
@@ -40,6 +46,38 @@ export interface TimedString {
40
46
  startTimeOffset?: number;
41
47
  }
42
48
 
49
+ /**
50
+ * Factory function to create a TimedString object.
51
+ */
52
+ export function createTimedString(opts: {
53
+ text: string;
54
+ startTime?: number;
55
+ endTime?: number;
56
+ confidence?: number;
57
+ startTimeOffset?: number;
58
+ }): TimedString {
59
+ return {
60
+ [TIMED_STRING_SYMBOL]: true,
61
+ text: opts.text,
62
+ startTime: opts.startTime,
63
+ endTime: opts.endTime,
64
+ confidence: opts.confidence,
65
+ startTimeOffset: opts.startTimeOffset,
66
+ };
67
+ }
68
+
69
+ /**
70
+ * Type guard to check if a value is a TimedString.
71
+ */
72
+ export function isTimedString(value: unknown): value is TimedString {
73
+ return (
74
+ typeof value === 'object' &&
75
+ value !== null &&
76
+ TIMED_STRING_SYMBOL in value &&
77
+ (value as TimedString)[TIMED_STRING_SYMBOL] === true
78
+ );
79
+ }
80
+
43
81
  export interface AudioOutputCapabilities {
44
82
  /** Whether this output supports pause/resume functionality */
45
83
  pause: boolean;
@@ -208,10 +246,7 @@ export interface PlaybackStartedEvent {
208
246
  export abstract class TextOutput {
209
247
  constructor(protected readonly nextInChain?: TextOutput) {}
210
248
 
211
- /**
212
- * Capture a text segment (Used by the output of LLM nodes)
213
- */
214
- abstract captureText(text: string): Promise<void>;
249
+ abstract captureText(text: string | TimedString): Promise<void>;
215
250
 
216
251
  /**
217
252
  * Mark the current text segment as complete (e.g LLM generation is complete)
@@ -23,7 +23,7 @@ import {
23
23
  } from '../../constants.js';
24
24
  import { log } from '../../log.js';
25
25
  import { Future, Task, shortuuid } from '../../utils.js';
26
- import { AudioOutput, TextOutput } from '../io.js';
26
+ import { AudioOutput, TextOutput, type TimedString, isTimedString } from '../io.js';
27
27
  import { findMicrophoneTrackId } from '../transcription/index.js';
28
28
 
29
29
  abstract class BaseParticipantTranscriptionOutput extends TextOutput {
@@ -102,13 +102,14 @@ abstract class BaseParticipantTranscriptionOutput extends TextOutput {
102
102
  this.latestText = '';
103
103
  }
104
104
 
105
- async captureText(text: string) {
105
+ async captureText(text: string | TimedString) {
106
106
  if (!this.participantIdentity) {
107
107
  return;
108
108
  }
109
109
 
110
- this.latestText = text;
111
- await this.handleCaptureText(text);
110
+ const textStr = isTimedString(text) ? text.text : text;
111
+ this.latestText = textStr;
112
+ await this.handleCaptureText(textStr);
112
113
  }
113
114
 
114
115
  flush() {
@@ -298,7 +299,7 @@ export class ParalellTextOutput extends TextOutput {
298
299
  this._sinks = sinks;
299
300
  }
300
301
 
301
- async captureText(text: string) {
302
+ async captureText(text: string | TimedString) {
302
303
  await Promise.all(this._sinks.map((sink) => sink.captureText(text)));
303
304
  }
304
305