@livekit/agents 1.0.37 → 1.0.39

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. package/dist/cli.cjs.map +1 -1
  2. package/dist/inference/api_protos.cjs +68 -0
  3. package/dist/inference/api_protos.cjs.map +1 -1
  4. package/dist/inference/api_protos.d.cts +345 -4
  5. package/dist/inference/api_protos.d.ts +345 -4
  6. package/dist/inference/api_protos.d.ts.map +1 -1
  7. package/dist/inference/api_protos.js +60 -0
  8. package/dist/inference/api_protos.js.map +1 -1
  9. package/dist/inference/llm.cjs +7 -3
  10. package/dist/inference/llm.cjs.map +1 -1
  11. package/dist/inference/llm.d.cts +5 -6
  12. package/dist/inference/llm.d.ts +5 -6
  13. package/dist/inference/llm.d.ts.map +1 -1
  14. package/dist/inference/llm.js +7 -3
  15. package/dist/inference/llm.js.map +1 -1
  16. package/dist/inference/stt.cjs +32 -21
  17. package/dist/inference/stt.cjs.map +1 -1
  18. package/dist/inference/stt.d.cts +5 -4
  19. package/dist/inference/stt.d.ts +5 -4
  20. package/dist/inference/stt.d.ts.map +1 -1
  21. package/dist/inference/stt.js +34 -21
  22. package/dist/inference/stt.js.map +1 -1
  23. package/dist/inference/tts.cjs.map +1 -1
  24. package/dist/inference/tts.d.cts +10 -7
  25. package/dist/inference/tts.d.ts +10 -7
  26. package/dist/inference/tts.d.ts.map +1 -1
  27. package/dist/inference/tts.js.map +1 -1
  28. package/dist/ipc/inference_proc_executor.cjs.map +1 -1
  29. package/dist/ipc/job_proc_executor.cjs.map +1 -1
  30. package/dist/stt/stream_adapter.cjs +9 -1
  31. package/dist/stt/stream_adapter.cjs.map +1 -1
  32. package/dist/stt/stream_adapter.d.ts.map +1 -1
  33. package/dist/stt/stream_adapter.js +9 -1
  34. package/dist/stt/stream_adapter.js.map +1 -1
  35. package/dist/stt/stt.cjs +10 -0
  36. package/dist/stt/stt.cjs.map +1 -1
  37. package/dist/stt/stt.d.cts +12 -0
  38. package/dist/stt/stt.d.ts +12 -0
  39. package/dist/stt/stt.d.ts.map +1 -1
  40. package/dist/stt/stt.js +10 -0
  41. package/dist/stt/stt.js.map +1 -1
  42. package/dist/telemetry/traces.cjs +4 -3
  43. package/dist/telemetry/traces.cjs.map +1 -1
  44. package/dist/telemetry/traces.d.cts +2 -0
  45. package/dist/telemetry/traces.d.ts +2 -0
  46. package/dist/telemetry/traces.d.ts.map +1 -1
  47. package/dist/telemetry/traces.js +4 -3
  48. package/dist/telemetry/traces.js.map +1 -1
  49. package/dist/utils.cjs +11 -0
  50. package/dist/utils.cjs.map +1 -1
  51. package/dist/utils.d.cts +10 -0
  52. package/dist/utils.d.ts +10 -0
  53. package/dist/utils.d.ts.map +1 -1
  54. package/dist/utils.js +10 -0
  55. package/dist/utils.js.map +1 -1
  56. package/dist/voice/agent.cjs +6 -2
  57. package/dist/voice/agent.cjs.map +1 -1
  58. package/dist/voice/agent.d.ts.map +1 -1
  59. package/dist/voice/agent.js +6 -2
  60. package/dist/voice/agent.js.map +1 -1
  61. package/dist/voice/agent_activity.cjs +72 -37
  62. package/dist/voice/agent_activity.cjs.map +1 -1
  63. package/dist/voice/agent_activity.d.cts +2 -1
  64. package/dist/voice/agent_activity.d.ts +2 -1
  65. package/dist/voice/agent_activity.d.ts.map +1 -1
  66. package/dist/voice/agent_activity.js +73 -38
  67. package/dist/voice/agent_activity.js.map +1 -1
  68. package/dist/voice/agent_session.cjs +7 -5
  69. package/dist/voice/agent_session.cjs.map +1 -1
  70. package/dist/voice/agent_session.d.cts +5 -2
  71. package/dist/voice/agent_session.d.ts +5 -2
  72. package/dist/voice/agent_session.d.ts.map +1 -1
  73. package/dist/voice/agent_session.js +7 -5
  74. package/dist/voice/agent_session.js.map +1 -1
  75. package/dist/voice/audio_recognition.cjs +3 -1
  76. package/dist/voice/audio_recognition.cjs.map +1 -1
  77. package/dist/voice/audio_recognition.d.ts.map +1 -1
  78. package/dist/voice/audio_recognition.js +3 -1
  79. package/dist/voice/audio_recognition.js.map +1 -1
  80. package/dist/voice/avatar/datastream_io.cjs +6 -0
  81. package/dist/voice/avatar/datastream_io.cjs.map +1 -1
  82. package/dist/voice/avatar/datastream_io.d.cts +1 -0
  83. package/dist/voice/avatar/datastream_io.d.ts +1 -0
  84. package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
  85. package/dist/voice/avatar/datastream_io.js +6 -0
  86. package/dist/voice/avatar/datastream_io.js.map +1 -1
  87. package/dist/voice/background_audio.cjs.map +1 -1
  88. package/dist/voice/generation.cjs +14 -5
  89. package/dist/voice/generation.cjs.map +1 -1
  90. package/dist/voice/generation.d.cts +3 -2
  91. package/dist/voice/generation.d.ts +3 -2
  92. package/dist/voice/generation.d.ts.map +1 -1
  93. package/dist/voice/generation.js +14 -5
  94. package/dist/voice/generation.js.map +1 -1
  95. package/dist/voice/io.cjs +12 -0
  96. package/dist/voice/io.cjs.map +1 -1
  97. package/dist/voice/io.d.cts +19 -1
  98. package/dist/voice/io.d.ts +19 -1
  99. package/dist/voice/io.d.ts.map +1 -1
  100. package/dist/voice/io.js +12 -0
  101. package/dist/voice/io.js.map +1 -1
  102. package/dist/voice/recorder_io/recorder_io.cjs +91 -28
  103. package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
  104. package/dist/voice/recorder_io/recorder_io.d.cts +7 -1
  105. package/dist/voice/recorder_io/recorder_io.d.ts +7 -1
  106. package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
  107. package/dist/voice/recorder_io/recorder_io.js +91 -28
  108. package/dist/voice/recorder_io/recorder_io.js.map +1 -1
  109. package/dist/voice/room_io/_input.cjs +40 -11
  110. package/dist/voice/room_io/_input.cjs.map +1 -1
  111. package/dist/voice/room_io/_input.d.cts +4 -1
  112. package/dist/voice/room_io/_input.d.ts +4 -1
  113. package/dist/voice/room_io/_input.d.ts.map +1 -1
  114. package/dist/voice/room_io/_input.js +31 -2
  115. package/dist/voice/room_io/_input.js.map +1 -1
  116. package/dist/voice/room_io/_output.cjs +6 -0
  117. package/dist/voice/room_io/_output.cjs.map +1 -1
  118. package/dist/voice/room_io/_output.d.cts +1 -0
  119. package/dist/voice/room_io/_output.d.ts +1 -0
  120. package/dist/voice/room_io/_output.d.ts.map +1 -1
  121. package/dist/voice/room_io/_output.js +6 -0
  122. package/dist/voice/room_io/_output.js.map +1 -1
  123. package/dist/voice/room_io/room_io.cjs.map +1 -1
  124. package/dist/voice/room_io/room_io.d.cts +2 -2
  125. package/dist/voice/room_io/room_io.d.ts +2 -2
  126. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  127. package/dist/voice/room_io/room_io.js.map +1 -1
  128. package/dist/voice/speech_handle.cjs +2 -0
  129. package/dist/voice/speech_handle.cjs.map +1 -1
  130. package/dist/voice/speech_handle.d.cts +3 -0
  131. package/dist/voice/speech_handle.d.ts +3 -0
  132. package/dist/voice/speech_handle.d.ts.map +1 -1
  133. package/dist/voice/speech_handle.js +2 -0
  134. package/dist/voice/speech_handle.js.map +1 -1
  135. package/package.json +2 -2
  136. package/src/inference/api_protos.ts +83 -0
  137. package/src/inference/llm.ts +20 -15
  138. package/src/inference/stt.ts +48 -29
  139. package/src/inference/tts.ts +36 -16
  140. package/src/stt/stream_adapter.ts +12 -1
  141. package/src/stt/stt.ts +21 -0
  142. package/src/telemetry/traces.ts +6 -2
  143. package/src/utils.ts +21 -0
  144. package/src/voice/agent.ts +11 -2
  145. package/src/voice/agent_activity.ts +108 -41
  146. package/src/voice/agent_session.ts +6 -5
  147. package/src/voice/audio_recognition.ts +2 -0
  148. package/src/voice/avatar/datastream_io.ts +8 -0
  149. package/src/voice/generation.ts +24 -12
  150. package/src/voice/io.ts +27 -5
  151. package/src/voice/recorder_io/recorder_io.ts +123 -31
  152. package/src/voice/room_io/_input.ts +32 -4
  153. package/src/voice/room_io/_output.ts +8 -0
  154. package/src/voice/room_io/room_io.ts +3 -1
  155. package/src/voice/speech_handle.ts +4 -0
@@ -122,9 +122,9 @@ class AgentActivity {
122
122
  );
123
123
  this.turnDetectionMode = void 0;
124
124
  }
125
- if (!this.vad && this.stt && this.llm instanceof import_llm.LLM && this.allowInterruptions && this.turnDetectionMode === void 0) {
125
+ if (!this.vad && this.stt && !this.stt.capabilities.streaming && this.llm instanceof import_llm.LLM && this.allowInterruptions && this.turnDetectionMode === void 0) {
126
126
  this.logger.warn(
127
- "VAD is not set. Enabling VAD is recommended when using LLM and STT for more responsive interruption handling."
127
+ "VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT for more responsive interruption handling."
128
128
  );
129
129
  }
130
130
  }
@@ -458,8 +458,12 @@ class AgentActivity {
458
458
  this.scheduleSpeech(handle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
459
459
  }
460
460
  // recognition hooks
461
- onStartOfSpeech(_ev) {
462
- this.agentSession._updateUserState("speaking");
461
+ onStartOfSpeech(ev) {
462
+ let speechStartTime = Date.now();
463
+ if (ev) {
464
+ speechStartTime = speechStartTime - ev.speechDuration;
465
+ }
466
+ this.agentSession._updateUserState("speaking", speechStartTime);
463
467
  }
464
468
  onEndOfSpeech(ev) {
465
469
  let speechEndTime = Date.now();
@@ -469,14 +473,16 @@ class AgentActivity {
469
473
  this.agentSession._updateUserState("listening", speechEndTime);
470
474
  }
471
475
  onVADInferenceDone(ev) {
472
- var _a, _b;
473
476
  if (this.turnDetection === "manual" || this.turnDetection === "realtime_llm") {
474
477
  return;
475
478
  }
476
- if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.turnDetection) {
477
- return;
479
+ if (ev.speechDuration >= this.agentSession.options.minInterruptionDuration) {
480
+ this.interruptByAudioActivity();
478
481
  }
479
- if (ev.speechDuration < this.agentSession.options.minInterruptionDuration) {
482
+ }
483
+ interruptByAudioActivity() {
484
+ var _a, _b;
485
+ if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.turnDetection) {
480
486
  return;
481
487
  }
482
488
  if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
@@ -489,7 +495,10 @@ class AgentActivity {
489
495
  }
490
496
  (_a = this.realtimeSession) == null ? void 0 : _a.startUserActivity();
491
497
  if (this._currentSpeech && !this._currentSpeech.interrupted && this._currentSpeech.allowInterruptions) {
492
- this.logger.info({ "speech id": this._currentSpeech.id }, "speech interrupted by VAD");
498
+ this.logger.info(
499
+ { "speech id": this._currentSpeech.id },
500
+ "speech interrupted by audio activity"
501
+ );
493
502
  (_b = this.realtimeSession) == null ? void 0 : _b.interrupt();
494
503
  this._currentSpeech.interrupt();
495
504
  }
@@ -507,6 +516,9 @@ class AgentActivity {
507
516
  // TODO(AJS-106): add multi participant support
508
517
  })
509
518
  );
519
+ if (ev.alternatives[0].text) {
520
+ this.interruptByAudioActivity();
521
+ }
510
522
  }
511
523
  onFinalTranscript(ev) {
512
524
  if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.userTranscription) {
@@ -521,6 +533,9 @@ class AgentActivity {
521
533
  // TODO(AJS-106): add multi participant support
522
534
  })
523
535
  );
536
+ if (this.audioRecognition && this.turnDetection !== "manual" && this.turnDetection !== "realtime_llm") {
537
+ this.interruptByAudioActivity();
538
+ }
524
539
  }
525
540
  onPreemptiveGeneration(info) {
526
541
  if (!this.agentSession.options.preemptiveGeneration || this.draining || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof import_llm.LLM)) {
@@ -836,6 +851,7 @@ ${instructions}` : instructions,
836
851
  );
837
852
  }
838
853
  async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
854
+ speechHandle._agentTurnContext = import_api.context.active();
839
855
  speechHandleStorage.enterWith(speechHandle);
840
856
  const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
841
857
  const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
@@ -867,12 +883,15 @@ ${instructions}` : instructions,
867
883
  textOut = _textOut;
868
884
  tasks.push(textForwardTask);
869
885
  }
870
- const onFirstFrame = () => {
871
- this.agentSession._updateAgentState("speaking");
886
+ const onFirstFrame = (startedSpeakingAt) => {
887
+ this.agentSession._updateAgentState("speaking", {
888
+ startTime: startedSpeakingAt,
889
+ otelContext: speechHandle._agentTurnContext
890
+ });
872
891
  };
873
892
  if (!audioOutput) {
874
893
  if (textOut) {
875
- textOut.firstTextFut.await.finally(onFirstFrame);
894
+ textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
876
895
  }
877
896
  } else {
878
897
  let audioOut = null;
@@ -900,7 +919,7 @@ ${instructions}` : instructions,
900
919
  tasks.push(forwardTask);
901
920
  audioOut = _audioOut;
902
921
  }
903
- audioOut.firstFrameFut.await.finally(onFirstFrame);
922
+ audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
904
923
  }
905
924
  await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
906
925
  if (audioOutput) {
@@ -939,6 +958,7 @@ ${instructions}` : instructions,
939
958
  span
940
959
  }) => {
941
960
  var _a, _b, _c;
961
+ speechHandle._agentTurnContext = import_api.context.active();
942
962
  span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_ID, speechHandle.id);
943
963
  if (instructions) {
944
964
  span.setAttribute(import_telemetry.traceTypes.ATTR_INSTRUCTIONS, instructions);
@@ -1015,8 +1035,11 @@ ${instructions}` : instructions,
1015
1035
  tasks.push(textForwardTask);
1016
1036
  textOut = _textOut;
1017
1037
  }
1018
- const onFirstFrame = () => {
1019
- this.agentSession._updateAgentState("speaking");
1038
+ const onFirstFrame = (startedSpeakingAt) => {
1039
+ this.agentSession._updateAgentState("speaking", {
1040
+ startTime: startedSpeakingAt,
1041
+ otelContext: speechHandle._agentTurnContext
1042
+ });
1020
1043
  };
1021
1044
  let audioOut = null;
1022
1045
  if (audioOutput) {
@@ -1028,12 +1051,12 @@ ${instructions}` : instructions,
1028
1051
  );
1029
1052
  audioOut = _audioOut;
1030
1053
  tasks.push(forwardTask);
1031
- audioOut.firstFrameFut.await.finally(onFirstFrame);
1054
+ audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
1032
1055
  } else {
1033
1056
  throw Error("ttsStream is null when audioOutput is enabled");
1034
1057
  }
1035
1058
  } else {
1036
- textOut == null ? void 0 : textOut.firstTextFut.await.finally(onFirstFrame);
1059
+ textOut == null ? void 0 : textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
1037
1060
  }
1038
1061
  const onToolExecutionStarted = (f) => {
1039
1062
  speechHandle._itemAdded([f]);
@@ -1064,7 +1087,12 @@ ${instructions}` : instructions,
1064
1087
  msg.createdAt = replyStartedAt;
1065
1088
  }
1066
1089
  this.agent._chatCtx.insert(toolsMessages);
1067
- this.agentSession._toolItemsAdded(toolsMessages);
1090
+ const toolCallOutputs = toolsMessages.filter(
1091
+ (m) => m.type === "function_call_output"
1092
+ );
1093
+ if (toolCallOutputs.length > 0) {
1094
+ this.agentSession._toolItemsAdded(toolCallOutputs);
1095
+ }
1068
1096
  }
1069
1097
  if (speechHandle.interrupted) {
1070
1098
  this.logger.debug(
@@ -1081,9 +1109,9 @@ ${instructions}` : instructions,
1081
1109
  let forwardedText = (textOut == null ? void 0 : textOut.text) || "";
1082
1110
  if (audioOutput) {
1083
1111
  const playbackEv = await audioOutput.waitForPlayout();
1084
- if (audioOut == null ? void 0 : audioOut.firstFrameFut.done) {
1112
+ if ((audioOut == null ? void 0 : audioOut.firstFrameFut.done) && !audioOut.firstFrameFut.rejected) {
1085
1113
  this.logger.info(
1086
- { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
1114
+ { speech_id: speechHandle.id, playbackPositionInS: playbackEv.playbackPosition },
1087
1115
  "playout interrupted"
1088
1116
  );
1089
1117
  if (playbackEv.synchronizedTranscript) {
@@ -1221,7 +1249,12 @@ ${instructions}` : instructions,
1221
1249
  msg.createdAt = replyStartedAt;
1222
1250
  }
1223
1251
  this.agent._chatCtx.insert(toolMessages);
1224
- this.agentSession._toolItemsAdded(toolMessages);
1252
+ const toolCallOutputs = toolMessages.filter(
1253
+ (m) => m.type === "function_call_output"
1254
+ );
1255
+ if (toolCallOutputs.length > 0) {
1256
+ this.agentSession._toolItemsAdded(toolCallOutputs);
1257
+ }
1225
1258
  }
1226
1259
  };
1227
1260
  pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages) => import_telemetry.tracer.startActiveSpan(
@@ -1264,6 +1297,7 @@ ${instructions}` : instructions,
1264
1297
  span
1265
1298
  }) {
1266
1299
  var _a, _b, _c;
1300
+ speechHandle._agentTurnContext = import_api.context.active();
1267
1301
  span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_ID, speechHandle.id);
1268
1302
  speechHandleStorage.enterWith(speechHandle);
1269
1303
  if (!this.realtimeSession) {
@@ -1288,8 +1322,11 @@ ${instructions}` : instructions,
1288
1322
  if (speechHandle.interrupted) {
1289
1323
  return;
1290
1324
  }
1291
- const onFirstFrame = () => {
1292
- this.agentSession._updateAgentState("speaking");
1325
+ const onFirstFrame = (startedSpeakingAt) => {
1326
+ this.agentSession._updateAgentState("speaking", {
1327
+ startTime: startedSpeakingAt,
1328
+ otelContext: speechHandle._agentTurnContext
1329
+ });
1293
1330
  };
1294
1331
  const readMessages = async (abortController, outputs) => {
1295
1332
  replyAbortController.signal.addEventListener("abort", () => abortController.abort(), {
@@ -1364,10 +1401,10 @@ ${instructions}` : instructions,
1364
1401
  );
1365
1402
  forwardTasks.push(forwardTask);
1366
1403
  audioOut = _audioOut;
1367
- audioOut.firstFrameFut.await.finally(onFirstFrame);
1404
+ audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
1368
1405
  }
1369
1406
  } else if (textOut) {
1370
- textOut.firstTextFut.await.finally(onFirstFrame);
1407
+ textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
1371
1408
  }
1372
1409
  outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
1373
1410
  }
@@ -1431,7 +1468,6 @@ ${instructions}` : instructions,
1431
1468
  await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
1432
1469
  if (audioOutput) {
1433
1470
  await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
1434
- this.agentSession._updateAgentState("listening");
1435
1471
  }
1436
1472
  if (speechHandle.interrupted) {
1437
1473
  this.logger.debug(
@@ -1446,10 +1482,10 @@ ${instructions}` : instructions,
1446
1482
  if (audioOutput) {
1447
1483
  audioOutput.clearBuffer();
1448
1484
  const playbackEv = await audioOutput.waitForPlayout();
1449
- let playbackPosition = playbackEv.playbackPosition;
1450
- if (audioOut == null ? void 0 : audioOut.firstFrameFut.done) {
1485
+ let playbackPositionInS = playbackEv.playbackPosition;
1486
+ if ((audioOut == null ? void 0 : audioOut.firstFrameFut.done) && !audioOut.firstFrameFut.rejected) {
1451
1487
  this.logger.info(
1452
- { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
1488
+ { speech_id: speechHandle.id, playbackPositionInS },
1453
1489
  "playout interrupted"
1454
1490
  );
1455
1491
  if (playbackEv.synchronizedTranscript) {
@@ -1457,11 +1493,11 @@ ${instructions}` : instructions,
1457
1493
  }
1458
1494
  } else {
1459
1495
  forwardedText = "";
1460
- playbackPosition = 0;
1496
+ playbackPositionInS = 0;
1461
1497
  }
1462
1498
  this.realtimeSession.truncate({
1463
1499
  messageId: msgId,
1464
- audioEndMs: Math.floor(playbackPosition),
1500
+ audioEndMs: Math.floor(playbackPositionInS * 1e3),
1465
1501
  modalities: msgModalities,
1466
1502
  audioTranscript: forwardedText
1467
1503
  });
@@ -1499,14 +1535,13 @@ ${instructions}` : instructions,
1499
1535
  this.agentSession._conversationItemAdded(message);
1500
1536
  }
1501
1537
  speechHandle._markGenerationDone();
1502
- toolOutput.firstToolStartedFuture.await.finally(() => {
1503
- this.agentSession._updateAgentState("thinking");
1504
- });
1505
1538
  await executeToolsTask.result;
1539
+ if (toolOutput.output.length > 0) {
1540
+ this.agentSession._updateAgentState("thinking");
1541
+ } else if (this.agentSession.agentState === "speaking") {
1542
+ this.agentSession._updateAgentState("listening");
1543
+ }
1506
1544
  if (toolOutput.output.length === 0) {
1507
- if (!speechHandle.interrupted) {
1508
- this.agentSession._updateAgentState("listening");
1509
- }
1510
1545
  return;
1511
1546
  }
1512
1547
  const { maxToolSteps } = this.agentSession.options;