@livekit/agents 1.0.37 → 1.0.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. package/dist/cli.cjs.map +1 -1
  2. package/dist/inference/api_protos.cjs +68 -0
  3. package/dist/inference/api_protos.cjs.map +1 -1
  4. package/dist/inference/api_protos.d.cts +345 -4
  5. package/dist/inference/api_protos.d.ts +345 -4
  6. package/dist/inference/api_protos.d.ts.map +1 -1
  7. package/dist/inference/api_protos.js +60 -0
  8. package/dist/inference/api_protos.js.map +1 -1
  9. package/dist/inference/llm.cjs +7 -3
  10. package/dist/inference/llm.cjs.map +1 -1
  11. package/dist/inference/llm.d.cts +5 -6
  12. package/dist/inference/llm.d.ts +5 -6
  13. package/dist/inference/llm.d.ts.map +1 -1
  14. package/dist/inference/llm.js +7 -3
  15. package/dist/inference/llm.js.map +1 -1
  16. package/dist/inference/stt.cjs +32 -21
  17. package/dist/inference/stt.cjs.map +1 -1
  18. package/dist/inference/stt.d.cts +5 -4
  19. package/dist/inference/stt.d.ts +5 -4
  20. package/dist/inference/stt.d.ts.map +1 -1
  21. package/dist/inference/stt.js +34 -21
  22. package/dist/inference/stt.js.map +1 -1
  23. package/dist/inference/tts.cjs.map +1 -1
  24. package/dist/inference/tts.d.cts +10 -7
  25. package/dist/inference/tts.d.ts +10 -7
  26. package/dist/inference/tts.d.ts.map +1 -1
  27. package/dist/inference/tts.js.map +1 -1
  28. package/dist/ipc/inference_proc_executor.cjs.map +1 -1
  29. package/dist/ipc/job_proc_executor.cjs.map +1 -1
  30. package/dist/stt/stream_adapter.cjs +9 -1
  31. package/dist/stt/stream_adapter.cjs.map +1 -1
  32. package/dist/stt/stream_adapter.d.ts.map +1 -1
  33. package/dist/stt/stream_adapter.js +9 -1
  34. package/dist/stt/stream_adapter.js.map +1 -1
  35. package/dist/stt/stt.cjs +10 -0
  36. package/dist/stt/stt.cjs.map +1 -1
  37. package/dist/stt/stt.d.cts +12 -0
  38. package/dist/stt/stt.d.ts +12 -0
  39. package/dist/stt/stt.d.ts.map +1 -1
  40. package/dist/stt/stt.js +10 -0
  41. package/dist/stt/stt.js.map +1 -1
  42. package/dist/telemetry/traces.cjs +4 -3
  43. package/dist/telemetry/traces.cjs.map +1 -1
  44. package/dist/telemetry/traces.d.cts +2 -0
  45. package/dist/telemetry/traces.d.ts +2 -0
  46. package/dist/telemetry/traces.d.ts.map +1 -1
  47. package/dist/telemetry/traces.js +4 -3
  48. package/dist/telemetry/traces.js.map +1 -1
  49. package/dist/utils.cjs +11 -0
  50. package/dist/utils.cjs.map +1 -1
  51. package/dist/utils.d.cts +10 -0
  52. package/dist/utils.d.ts +10 -0
  53. package/dist/utils.d.ts.map +1 -1
  54. package/dist/utils.js +10 -0
  55. package/dist/utils.js.map +1 -1
  56. package/dist/voice/agent.cjs +6 -2
  57. package/dist/voice/agent.cjs.map +1 -1
  58. package/dist/voice/agent.d.ts.map +1 -1
  59. package/dist/voice/agent.js +6 -2
  60. package/dist/voice/agent.js.map +1 -1
  61. package/dist/voice/agent_activity.cjs +72 -37
  62. package/dist/voice/agent_activity.cjs.map +1 -1
  63. package/dist/voice/agent_activity.d.cts +2 -1
  64. package/dist/voice/agent_activity.d.ts +2 -1
  65. package/dist/voice/agent_activity.d.ts.map +1 -1
  66. package/dist/voice/agent_activity.js +73 -38
  67. package/dist/voice/agent_activity.js.map +1 -1
  68. package/dist/voice/agent_session.cjs +7 -5
  69. package/dist/voice/agent_session.cjs.map +1 -1
  70. package/dist/voice/agent_session.d.cts +5 -2
  71. package/dist/voice/agent_session.d.ts +5 -2
  72. package/dist/voice/agent_session.d.ts.map +1 -1
  73. package/dist/voice/agent_session.js +7 -5
  74. package/dist/voice/agent_session.js.map +1 -1
  75. package/dist/voice/audio_recognition.cjs +3 -1
  76. package/dist/voice/audio_recognition.cjs.map +1 -1
  77. package/dist/voice/audio_recognition.d.ts.map +1 -1
  78. package/dist/voice/audio_recognition.js +3 -1
  79. package/dist/voice/audio_recognition.js.map +1 -1
  80. package/dist/voice/avatar/datastream_io.cjs +6 -0
  81. package/dist/voice/avatar/datastream_io.cjs.map +1 -1
  82. package/dist/voice/avatar/datastream_io.d.cts +1 -0
  83. package/dist/voice/avatar/datastream_io.d.ts +1 -0
  84. package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
  85. package/dist/voice/avatar/datastream_io.js +6 -0
  86. package/dist/voice/avatar/datastream_io.js.map +1 -1
  87. package/dist/voice/background_audio.cjs.map +1 -1
  88. package/dist/voice/generation.cjs +14 -5
  89. package/dist/voice/generation.cjs.map +1 -1
  90. package/dist/voice/generation.d.cts +3 -2
  91. package/dist/voice/generation.d.ts +3 -2
  92. package/dist/voice/generation.d.ts.map +1 -1
  93. package/dist/voice/generation.js +14 -5
  94. package/dist/voice/generation.js.map +1 -1
  95. package/dist/voice/io.cjs +12 -0
  96. package/dist/voice/io.cjs.map +1 -1
  97. package/dist/voice/io.d.cts +19 -1
  98. package/dist/voice/io.d.ts +19 -1
  99. package/dist/voice/io.d.ts.map +1 -1
  100. package/dist/voice/io.js +12 -0
  101. package/dist/voice/io.js.map +1 -1
  102. package/dist/voice/recorder_io/recorder_io.cjs +91 -28
  103. package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
  104. package/dist/voice/recorder_io/recorder_io.d.cts +7 -1
  105. package/dist/voice/recorder_io/recorder_io.d.ts +7 -1
  106. package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
  107. package/dist/voice/recorder_io/recorder_io.js +91 -28
  108. package/dist/voice/recorder_io/recorder_io.js.map +1 -1
  109. package/dist/voice/room_io/_input.cjs +40 -11
  110. package/dist/voice/room_io/_input.cjs.map +1 -1
  111. package/dist/voice/room_io/_input.d.cts +4 -1
  112. package/dist/voice/room_io/_input.d.ts +4 -1
  113. package/dist/voice/room_io/_input.d.ts.map +1 -1
  114. package/dist/voice/room_io/_input.js +31 -2
  115. package/dist/voice/room_io/_input.js.map +1 -1
  116. package/dist/voice/room_io/_output.cjs +6 -0
  117. package/dist/voice/room_io/_output.cjs.map +1 -1
  118. package/dist/voice/room_io/_output.d.cts +1 -0
  119. package/dist/voice/room_io/_output.d.ts +1 -0
  120. package/dist/voice/room_io/_output.d.ts.map +1 -1
  121. package/dist/voice/room_io/_output.js +6 -0
  122. package/dist/voice/room_io/_output.js.map +1 -1
  123. package/dist/voice/room_io/room_io.cjs.map +1 -1
  124. package/dist/voice/room_io/room_io.d.cts +2 -2
  125. package/dist/voice/room_io/room_io.d.ts +2 -2
  126. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  127. package/dist/voice/room_io/room_io.js.map +1 -1
  128. package/dist/voice/speech_handle.cjs +2 -0
  129. package/dist/voice/speech_handle.cjs.map +1 -1
  130. package/dist/voice/speech_handle.d.cts +3 -0
  131. package/dist/voice/speech_handle.d.ts +3 -0
  132. package/dist/voice/speech_handle.d.ts.map +1 -1
  133. package/dist/voice/speech_handle.js +2 -0
  134. package/dist/voice/speech_handle.js.map +1 -1
  135. package/package.json +2 -2
  136. package/src/inference/api_protos.ts +83 -0
  137. package/src/inference/llm.ts +20 -15
  138. package/src/inference/stt.ts +48 -29
  139. package/src/inference/tts.ts +36 -16
  140. package/src/stt/stream_adapter.ts +12 -1
  141. package/src/stt/stt.ts +21 -0
  142. package/src/telemetry/traces.ts +6 -2
  143. package/src/utils.ts +21 -0
  144. package/src/voice/agent.ts +11 -2
  145. package/src/voice/agent_activity.ts +108 -41
  146. package/src/voice/agent_session.ts +6 -5
  147. package/src/voice/audio_recognition.ts +2 -0
  148. package/src/voice/avatar/datastream_io.ts +8 -0
  149. package/src/voice/generation.ts +24 -12
  150. package/src/voice/io.ts +27 -5
  151. package/src/voice/recorder_io/recorder_io.ts +123 -31
  152. package/src/voice/room_io/_input.ts +32 -4
  153. package/src/voice/room_io/_output.ts +8 -0
  154. package/src/voice/room_io/room_io.ts +3 -1
  155. package/src/voice/speech_handle.ts +4 -0
package/src/voice/agent_activity.ts CHANGED
@@ -4,7 +4,7 @@
 import { Mutex } from '@livekit/mutex';
 import type { AudioFrame } from '@livekit/rtc-node';
 import type { Span } from '@opentelemetry/api';
-import { ROOT_CONTEXT, trace } from '@opentelemetry/api';
+import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api';
 import { Heap } from 'heap-js';
 import { AsyncLocalStorage } from 'node:async_hooks';
 import { ReadableStream } from 'node:stream/web';
@@ -194,12 +194,13 @@ export class AgentActivity implements RecognitionHooks {
     if (
       !this.vad &&
       this.stt &&
+      !this.stt.capabilities.streaming &&
       this.llm instanceof LLM &&
       this.allowInterruptions &&
       this.turnDetectionMode === undefined
     ) {
       this.logger.warn(
-        'VAD is not set. Enabling VAD is recommended when using LLM and STT ' +
+        'VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT ' +
          'for more responsive interruption handling.',
       );
     }
@@ -637,9 +638,12 @@ export class AgentActivity implements RecognitionHooks {
   }

   // recognition hooks
-
-  onStartOfSpeech(_ev: VADEvent): void {
-    this.agentSession._updateUserState('speaking');
+  onStartOfSpeech(ev: VADEvent): void {
+    let speechStartTime = Date.now();
+    if (ev) {
+      speechStartTime = speechStartTime - ev.speechDuration;
+    }
+    this.agentSession._updateUserState('speaking', speechStartTime);
   }

   onEndOfSpeech(ev: VADEvent): void {
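Note: onStartOfSpeech now back-dates the user's 'speaking' state by the VAD-reported speech duration, so the user_speaking span (see the agent_session.ts hunks below) starts at the estimated speech onset rather than at event arrival. A minimal sketch of the arithmetic, assuming ev.speechDuration is in milliseconds (consistent with its subtraction from Date.now()):

    // Illustrative sketch, not package source.
    const now = Date.now();                         // e.g. 1_700_000_001_500
    const speechDurationMs = 500;                   // VAD already observed 500 ms of speech
    const speechStartTime = now - speechDurationMs; // 1_700_000_001_000
    // _updateUserState('speaking', speechStartTime) starts the span at onset.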
@@ -656,12 +660,14 @@
       return;
     }

-    if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
-      // skip speech handle interruption if server side turn detection is enabled
-      return;
+    if (ev.speechDuration >= this.agentSession.options.minInterruptionDuration) {
+      this.interruptByAudioActivity();
     }
+  }

-    if (ev.speechDuration < this.agentSession.options.minInterruptionDuration) {
+  private interruptByAudioActivity(): void {
+    if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
+      // skip speech handle interruption if server side turn detection is enabled
       return;
     }

@@ -691,7 +697,10 @@
       !this._currentSpeech.interrupted &&
       this._currentSpeech.allowInterruptions
     ) {
-      this.logger.info({ 'speech id': this._currentSpeech.id }, 'speech interrupted by VAD');
+      this.logger.info(
+        { 'speech id': this._currentSpeech.id },
+        'speech interrupted by audio activity',
+      );
       this.realtimeSession?.interrupt();
       this._currentSpeech.interrupt();
     }
@@ -712,6 +721,10 @@
         // TODO(AJS-106): add multi participant support
       }),
     );
+
+    if (ev.alternatives![0].text) {
+      this.interruptByAudioActivity();
+    }
   }

   onFinalTranscript(ev: SpeechEvent): void {
@@ -729,6 +742,20 @@
         // TODO(AJS-106): add multi participant support
       }),
     );
+
+    // agent speech might not be interrupted if VAD failed and a final transcript is received
+    // we call interruptByAudioActivity (idempotent) to pause the speech, if possible
+    if (
+      this.audioRecognition &&
+      this.turnDetection !== 'manual' &&
+      this.turnDetection !== 'realtime_llm'
+    ) {
+      this.interruptByAudioActivity();
+
+      // TODO: resume false interruption - schedule a resume timer if interrupted after end_of_speech
+    }
+
+    // TODO: resume false interruption - start interrupt paused speech task
   }

   onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
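Note: together these hunks route every interruption trigger through the new idempotent interruptByAudioActivity(): VAD end-of-speech (once the speech reaches minInterruptionDuration), a non-empty interim transcript, and a final transcript as a fallback for when VAD missed the speech entirely. A condensed, illustrative sketch of the flow (method shapes simplified; not package source):

    class InterruptionFlowSketch {
      onEndOfSpeech(speechDuration: number, minInterruptionDuration: number): void {
        if (speechDuration >= minInterruptionDuration) this.interruptByAudioActivity();
      }
      onInterimTranscript(text: string): void {
        if (text) this.interruptByAudioActivity(); // STT heard words mid-playback
      }
      onFinalTranscript(): void {
        this.interruptByAudioActivity(); // fallback when VAD failed to fire
      }
      private interruptByAudioActivity(): void {
        // idempotent: no-op when server-side turn detection is active,
        // when speech is already interrupted, or when interruptions are disallowed
      }
    }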
@@ -1168,6 +1195,8 @@
     replyAbortController: AbortController,
     audio?: ReadableStream<AudioFrame> | null,
   ): Promise<void> {
+    speechHandle._agentTurnContext = otelContext.active();
+
     speechHandleStorage.enterWith(speechHandle);

     const transcriptionOutput = this.agentSession.output.transcriptionEnabled
@@ -1212,13 +1241,18 @@
       tasks.push(textForwardTask);
     }

-    const onFirstFrame = () => {
-      this.agentSession._updateAgentState('speaking');
+    const onFirstFrame = (startedSpeakingAt?: number) => {
+      this.agentSession._updateAgentState('speaking', {
+        startTime: startedSpeakingAt,
+        otelContext: speechHandle._agentTurnContext,
+      });
     };

     if (!audioOutput) {
       if (textOut) {
-        textOut.firstTextFut.await.finally(onFirstFrame);
+        textOut.firstTextFut.await
+          .then(() => onFirstFrame())
+          .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
       }
     } else {
       let audioOut: _AudioOut | null = null;
@@ -1249,7 +1283,9 @@
         tasks.push(forwardTask);
         audioOut = _audioOut;
       }
-      audioOut.firstFrameFut.await.finally(onFirstFrame);
+      audioOut.firstFrameFut.await
+        .then((ts) => onFirstFrame(ts))
+        .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
     }

     await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
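Note: the repeated .finally(onFirstFrame) to .then(...).catch(...) rewrite is load-bearing: firstFrameFut is now a Future<number> that rejects when forwarding is cancelled before playback starts (see the generation.ts hunks below). A sketch of the behavioral difference:

    // Before: ran unconditionally, marking the agent 'speaking' even when
    // the future rejected because forwarding was cancelled.
    firstFrameFut.await.finally(onFirstFrame);

    // After: only a resolved future (carrying the first frame's timestamp)
    // flips the agent state; rejection is logged and swallowed.
    firstFrameFut.await
      .then((ts) => onFirstFrame(ts))
      .catch(() => logger.debug('firstFrameFut cancelled before first frame'));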
@@ -1303,6 +1339,8 @@
     toolsMessages?: ChatItem[];
     span: Span;
   }): Promise<void> => {
+    speechHandle._agentTurnContext = otelContext.active();
+
     span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
     if (instructions) {
       span.setAttribute(traceTypes.ATTR_INSTRUCTIONS, instructions);
@@ -1402,8 +1440,11 @@
       textOut = _textOut;
     }

-    const onFirstFrame = () => {
-      this.agentSession._updateAgentState('speaking');
+    const onFirstFrame = (startedSpeakingAt?: number) => {
+      this.agentSession._updateAgentState('speaking', {
+        startTime: startedSpeakingAt,
+        otelContext: speechHandle._agentTurnContext,
+      });
     };

     let audioOut: _AudioOut | null = null;
@@ -1416,12 +1457,16 @@
         );
         audioOut = _audioOut;
         tasks.push(forwardTask);
-        audioOut.firstFrameFut.await.finally(onFirstFrame);
+        audioOut.firstFrameFut.await
+          .then((ts) => onFirstFrame(ts))
+          .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
       } else {
         throw Error('ttsStream is null when audioOutput is enabled');
       }
     } else {
-      textOut?.firstTextFut.await.finally(onFirstFrame);
+      textOut?.firstTextFut.await
+        .then(() => onFirstFrame())
+        .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
     }

     //TODO(AJS-272): before executing tools, make sure we generated all the text
@@ -1462,8 +1507,14 @@
         msg.createdAt = replyStartedAt;
       }
       this.agent._chatCtx.insert(toolsMessages);
-      // Also add to session history (matches Python agent_session.py _tool_items_added)
-      this.agentSession._toolItemsAdded(toolsMessages as (FunctionCall | FunctionCallOutput)[]);
+      // Only add FunctionCallOutput items to session history since FunctionCall items
+      // were already added by onToolExecutionStarted when the tool execution began
+      const toolCallOutputs = toolsMessages.filter(
+        (m): m is FunctionCallOutput => m.type === 'function_call_output',
+      );
+      if (toolCallOutputs.length > 0) {
+        this.agentSession._toolItemsAdded(toolCallOutputs);
+      }
     }

     if (speechHandle.interrupted) {
@@ -1487,10 +1538,10 @@

     if (audioOutput) {
       const playbackEv = await audioOutput.waitForPlayout();
-      if (audioOut?.firstFrameFut.done) {
+      if (audioOut?.firstFrameFut.done && !audioOut.firstFrameFut.rejected) {
         // playback EV is valid only if the first frame was already played
         this.logger.info(
-          { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
+          { speech_id: speechHandle.id, playbackPositionInS: playbackEv.playbackPosition },
           'playout interrupted',
         );
         if (playbackEv.synchronizedTranscript) {
@@ -1656,8 +1707,18 @@
       for (const msg of toolMessages) {
         msg.createdAt = replyStartedAt;
       }
+
       this.agent._chatCtx.insert(toolMessages);
-      this.agentSession._toolItemsAdded(toolMessages as (FunctionCall | FunctionCallOutput)[]);
+
+      // Only add FunctionCallOutput items to session history since FunctionCall items
+      // were already added by onToolExecutionStarted when the tool execution began
+      const toolCallOutputs = toolMessages.filter(
+        (m): m is FunctionCallOutput => m.type === 'function_call_output',
+      );
+
+      if (toolCallOutputs.length > 0) {
+        this.agentSession._toolItemsAdded(toolCallOutputs);
+      }
     }
   };

@@ -1725,6 +1786,8 @@
     replyAbortController: AbortController;
     span: Span;
   }): Promise<void> {
+    speechHandle._agentTurnContext = otelContext.active();
+
     span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);

     speechHandleStorage.enterWith(speechHandle);
@@ -1762,8 +1825,11 @@
       return;
     }

-    const onFirstFrame = () => {
-      this.agentSession._updateAgentState('speaking');
+    const onFirstFrame = (startedSpeakingAt?: number) => {
+      this.agentSession._updateAgentState('speaking', {
+        startTime: startedSpeakingAt,
+        otelContext: speechHandle._agentTurnContext,
+      });
     };

     const readMessages = async (
@@ -1851,10 +1917,14 @@
           );
           forwardTasks.push(forwardTask);
           audioOut = _audioOut;
-          audioOut.firstFrameFut.await.finally(onFirstFrame);
+          audioOut.firstFrameFut.await
+            .then((ts) => onFirstFrame(ts))
+            .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
         }
       } else if (textOut) {
-        textOut.firstTextFut.await.finally(onFirstFrame);
+        textOut.firstTextFut.await
+          .then(() => onFirstFrame())
+          .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
       }
       outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
     }
@@ -1936,7 +2006,6 @@

     if (audioOutput) {
       await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
-      this.agentSession._updateAgentState('listening');
     }

     if (speechHandle.interrupted) {
@@ -1955,11 +2024,11 @@
       if (audioOutput) {
         audioOutput.clearBuffer();
         const playbackEv = await audioOutput.waitForPlayout();
-        let playbackPosition = playbackEv.playbackPosition;
-        if (audioOut?.firstFrameFut.done) {
+        let playbackPositionInS = playbackEv.playbackPosition;
+        if (audioOut?.firstFrameFut.done && !audioOut.firstFrameFut.rejected) {
           // playback EV is valid only if the first frame was already played
           this.logger.info(
-            { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
+            { speech_id: speechHandle.id, playbackPositionInS },
            'playout interrupted',
          );
          if (playbackEv.synchronizedTranscript) {
@@ -1967,13 +2036,13 @@
          }
        } else {
          forwardedText = '';
-          playbackPosition = 0;
+          playbackPositionInS = 0;
        }

        // truncate server-side message
        this.realtimeSession.truncate({
          messageId: msgId,
-          audioEndMs: Math.floor(playbackPosition),
+          audioEndMs: Math.floor(playbackPositionInS * 1000),
          modalities: msgModalities,
          audioTranscript: forwardedText,
        });
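Note: the rename to playbackPositionInS documents the unit bug this hunk fixes. playbackPosition is reported in seconds (see the PlaybackFinishedEvent doc comment in io.ts below), while truncate() expects milliseconds. A worked example with illustrative values:

    const playbackPositionInS = 2.5;        // 2.5 s of audio actually played
    Math.floor(playbackPositionInS);        // before: 2    -> truncated at 2 ms
    Math.floor(playbackPositionInS * 1000); // after:  2500 -> truncated at 2.5 s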
@@ -2023,17 +2092,15 @@
     speechHandle._markGenerationDone();
     // TODO(brian): close tees

-    toolOutput.firstToolStartedFuture.await.finally(() => {
-      this.agentSession._updateAgentState('thinking');
-    });
-
     await executeToolsTask.result;

+    if (toolOutput.output.length > 0) {
+      this.agentSession._updateAgentState('thinking');
+    } else if (this.agentSession.agentState === 'speaking') {
+      this.agentSession._updateAgentState('listening');
+    }
+
     if (toolOutput.output.length === 0) {
-      // return to listening state for thinking-only turns (no audio output, no tools)
-      if (!speechHandle.interrupted) {
-        this.agentSession._updateAgentState('listening');
-      }
       return;
     }

package/src/voice/agent_session.ts CHANGED
@@ -677,7 +677,7 @@ export class AgentSession<
   }

   /** @internal */
-  _updateAgentState(state: AgentState) {
+  _updateAgentState(state: AgentState, options?: { startTime?: number; otelContext?: Context }) {
     if (this._agentState === state) {
       return;
     }
@@ -690,7 +690,8 @@
     if (this.agentSpeakingSpan === undefined) {
       this.agentSpeakingSpan = tracer.startSpan({
         name: 'agent_speaking',
-        context: this.rootSpanContext,
+        context: options?.otelContext ?? this.rootSpanContext,
+        startTime: options?.startTime,
       });

       // TODO(brian): PR4 - Set participant attributes if roomIO.room.localParticipant is available
@@ -719,7 +720,7 @@
   }

   /** @internal */
-  _updateUserState(state: UserState, _lastSpeakingTime?: number) {
+  _updateUserState(state: UserState, lastSpeakingTime?: number) {
     if (this.userState === state) {
       return;
     }
@@ -728,13 +729,13 @@
       this.userSpeakingSpan = tracer.startSpan({
         name: 'user_speaking',
         context: this.rootSpanContext,
+        startTime: lastSpeakingTime,
       });

       // TODO(brian): PR4 - Set participant attributes if roomIO.linkedParticipant is available
       // (Ref: Python agent_session.py line 1192-1195)
     } else if (this.userSpeakingSpan !== undefined) {
-      // TODO(brian): PR4 - Set ATTR_END_TIME attribute with lastSpeakingTime if available
-      this.userSpeakingSpan.end();
+      this.userSpeakingSpan.end(lastSpeakingTime);
       this.userSpeakingSpan = undefined;
     }

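Note: the user_speaking span now carries real wall-clock timing: it starts at the back-dated onset supplied by onStartOfSpeech and ends at lastSpeakingTime (OpenTelemetry's Span.end() accepts an optional end timestamp). A sketch with illustrative values; _updateUserState is internal API, shown only to trace the data flow:

    // Entering 'speaking': span starts ~500 ms before the event arrived.
    const speechStartTime = Date.now() - 500;
    agentSession._updateUserState('speaking', speechStartTime);

    // Leaving 'speaking': span ends at the recorded time, not at call time.
    // internally: this.userSpeakingSpan.end(lastSpeakingTime);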
package/src/voice/audio_recognition.ts CHANGED
@@ -566,9 +566,11 @@ export class AudioRecognition {
     this.speaking = true;

     if (!this.userTurnSpan) {
+      const startTime = Date.now() - ev.speechDuration;
       this.userTurnSpan = tracer.startSpan({
         name: 'user_turn',
         context: this.rootSpanContext,
+        startTime,
       });
     }

package/src/voice/avatar/datastream_io.ts CHANGED
@@ -47,6 +47,7 @@ export class DataStreamAudioOutput extends AudioOutput {
   private started: boolean = false;
   private lock = new Mutex();
   private startTask?: Task<void>;
+  private firstFrameEmitted: boolean = false;

   #logger = log();

@@ -146,6 +147,11 @@
     await this.startTask.result;
     await super.captureFrame(frame);

+    if (!this.firstFrameEmitted) {
+      this.firstFrameEmitted = true;
+      this.onPlaybackStarted(Date.now());
+    }
+
     if (!this.streamWriter) {
       this.streamWriter = await this.room.localParticipant!.streamBytes({
         name: shortuuid('AUDIO_'),
@@ -174,6 +180,8 @@
     this.streamWriter.close().finally(() => {
       this.streamWriter = undefined;
     });
+
+    this.firstFrameEmitted = false;
   }

   clearBuffer(): void {
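Note: this is the reference pattern for the new sink contract introduced by EVENT_PLAYBACK_STARTED (defined in the io.ts hunks below): call onPlaybackStarted() when the first frame of a segment is actually captured, and reset on flush() so the next segment reports its own start. A minimal sketch of a conforming custom sink; the class name and frame delivery are assumptions, and the base-class constructor arguments are elided:

    class MyAudioSink extends AudioOutput {
      private firstFrameEmitted = false;

      async captureFrame(frame: AudioFrame): Promise<void> {
        await super.captureFrame(frame);
        if (!this.firstFrameEmitted) {
          this.firstFrameEmitted = true;
          this.onPlaybackStarted(Date.now()); // emits EVENT_PLAYBACK_STARTED
        }
        // ...deliver the frame to the actual output here...
      }

      flush(): void {
        this.firstFrameEmitted = false; // segment boundary: next frame starts a new playback
      }

      clearBuffer(): void {}
    }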
package/src/voice/generation.ts CHANGED
@@ -27,7 +27,7 @@ import { traceTypes, tracer } from '../telemetry/index.js';
 import { Future, Task, shortuuid, toError, waitForAbort } from '../utils.js';
 import { type Agent, type ModelSettings, asyncLocalStorage, isStopResponse } from './agent.js';
 import type { AgentSession } from './agent_session.js';
-import type { AudioOutput, LLMNode, TTSNode, TextOutput } from './io.js';
+import { AudioOutput, type LLMNode, type TTSNode, type TextOutput } from './io.js';
 import { RunContext } from './run_context.js';
 import type { SpeechHandle } from './speech_handle.js';

@@ -608,7 +608,8 @@ export function performTextForwarding(

 export interface _AudioOut {
   audio: Array<AudioFrame>;
-  firstFrameFut: Future;
+  /** Future that will be set with the timestamp of the first frame's capture */
+  firstFrameFut: Future<number>;
 }

 async function forwardAudio(
@@ -620,7 +621,16 @@
   const reader = ttsStream.getReader();
   let resampler: AudioResampler | null = null;

+  const onPlaybackStarted = (ev: { createdAt: number }) => {
+    if (!out.firstFrameFut.done) {
+      out.firstFrameFut.resolve(ev.createdAt);
+    }
+  };
+
   try {
+    audioOuput.on(AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted);
+    audioOuput.resume();
+
     while (true) {
       if (signal?.aborted) {
         break;
@@ -647,20 +657,21 @@
       } else {
         await audioOuput.captureFrame(frame);
       }
-
-      // set the first frame future if not already set
-      // (after completing the first frame)
-      if (!out.firstFrameFut.done) {
-        out.firstFrameFut.resolve();
-      }
     }
-  } finally {
-    reader?.releaseLock();
+
     if (resampler) {
       for (const f of resampler.flush()) {
         await audioOuput.captureFrame(f);
       }
     }
+  } finally {
+    audioOuput.off(AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted);
+
+    if (!out.firstFrameFut.done) {
+      out.firstFrameFut.reject(new Error('audio forwarding cancelled before playback started'));
+    }
+
+    reader?.releaseLock();
     audioOuput.flush();
   }
 }
@@ -670,10 +681,11 @@ export function performAudioForwarding(
   audioOutput: AudioOutput,
   controller: AbortController,
 ): [Task<void>, _AudioOut] {
-  const out = {
+  const out: _AudioOut = {
     audio: [],
-    firstFrameFut: new Future(),
+    firstFrameFut: new Future<number>(),
   };
+
   return [
     Task.from(
       (controller) => forwardAudio(ttsStream, audioOutput, out, controller.signal),
package/src/voice/io.ts CHANGED
@@ -30,12 +30,14 @@ export type TTSNode = (
 ) => Promise<ReadableStream<AudioFrame> | null>;

 /**
- * A string with timing information for word-level alignment.
+ * A string with optional start and end timestamps for word-level alignment.
  */
 export interface TimedString {
   text: string;
   startTime?: number; // seconds
   endTime?: number; // seconds
+  confidence?: number;
+  startTimeOffset?: number;
 }

 export interface AudioOutputCapabilities {
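Note: TimedString gains two optional fields that this diff does not document. The example below is illustrative only; reading confidence as recognition/alignment confidence and startTimeOffset as an offset applied to the start timestamp are assumptions:

    const word: TimedString = {
      text: 'hello',
      startTime: 0.12,    // seconds
      endTime: 0.48,      // seconds
      confidence: 0.93,   // assumed: recognition/alignment confidence
      startTimeOffset: 0, // assumed: offset applied to the start timestamp
    };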
@@ -57,6 +59,7 @@ export abstract class AudioInput {
 }

 export abstract class AudioOutput extends EventEmitter {
+  static readonly EVENT_PLAYBACK_STARTED = 'playbackStarted';
   static readonly EVENT_PLAYBACK_FINISHED = 'playbackFinished';

   private playbackFinishedFuture: Future<void> = new Future();
@@ -77,7 +80,11 @@
   ) {
     super();
     this.capabilities = capabilities;
+
     if (this.nextInChain) {
+      this.nextInChain.on(AudioOutput.EVENT_PLAYBACK_STARTED, (ev: PlaybackStartedEvent) =>
+        this.onPlaybackStarted(ev.createdAt),
+      );
       this.nextInChain.on(AudioOutput.EVENT_PLAYBACK_FINISHED, (ev: PlaybackFinishedEvent) =>
         this.onPlaybackFinished(ev),
       );
@@ -117,6 +124,14 @@
     return this.lastPlaybackEvent;
   }

+  /**
+   * Called when playback actually starts (first frame is sent to output).
+   * Developers building audio sinks should call this when the first frame is captured.
+   */
+  onPlaybackStarted(createdAt: number): void {
+    this.emit(AudioOutput.EVENT_PLAYBACK_STARTED, { createdAt } as PlaybackStartedEvent);
+  }
+
   /**
    * Developers building audio sinks must call this method when a playback/segment is finished.
    * Segments are segmented by calls to flush() or clearBuffer()
@@ -174,15 +189,22 @@
 }

 export interface PlaybackFinishedEvent {
-  // How much of the audio was played back
+  /** How much of the audio was played back, in seconds */
   playbackPosition: number;
-  // Interrupted is True if playback was interrupted (clearBuffer() was called)
+  /** True if playback was interrupted (clearBuffer() was called) */
   interrupted: boolean;
-  // Transcript synced with playback; may be partial if the audio was interrupted
-  // When null, the transcript is not synchronized with the playback
+  /**
+   * Transcript synced with playback; may be partial if the audio was interrupted.
+   * When undefined, the transcript is not synchronized with the playback.
+   */
   synchronizedTranscript?: string;
 }

+export interface PlaybackStartedEvent {
+  /** The timestamp (Date.now()) when the playback started */
+  createdAt: number;
+}
+
 export abstract class TextOutput {
   constructor(protected readonly nextInChain?: TextOutput) {}

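Note: a consumer-side sketch of the new event, using the event name and payload shape defined above (the audioOutput instance is assumed):

    audioOutput.on(AudioOutput.EVENT_PLAYBACK_STARTED, (ev: PlaybackStartedEvent) => {
      console.log(`playback started at ${new Date(ev.createdAt).toISOString()}`);
    });

Because the constructor now wires nextInChain's EVENT_PLAYBACK_STARTED to onPlaybackStarted(), the timestamp propagates up a chain of outputs the same way EVENT_PLAYBACK_FINISHED already did.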