@livekit/agents 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. package/dist/index.cjs +2 -5
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.cts +2 -3
  4. package/dist/index.d.ts +2 -3
  5. package/dist/index.d.ts.map +1 -1
  6. package/dist/index.js +1 -3
  7. package/dist/index.js.map +1 -1
  8. package/dist/ipc/job_proc_lazy_main.cjs +3 -2
  9. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  10. package/dist/ipc/job_proc_lazy_main.js +4 -3
  11. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  12. package/dist/job.cjs +20 -14
  13. package/dist/job.cjs.map +1 -1
  14. package/dist/job.d.cts +11 -5
  15. package/dist/job.d.ts +11 -5
  16. package/dist/job.d.ts.map +1 -1
  17. package/dist/job.js +17 -12
  18. package/dist/job.js.map +1 -1
  19. package/dist/tokenize/basic/hyphenator.cjs.map +1 -1
  20. package/dist/tokenize/basic/hyphenator.js.map +1 -1
  21. package/dist/utils.cjs +77 -0
  22. package/dist/utils.cjs.map +1 -1
  23. package/dist/utils.d.cts +21 -0
  24. package/dist/utils.d.ts +21 -0
  25. package/dist/utils.d.ts.map +1 -1
  26. package/dist/utils.js +76 -1
  27. package/dist/utils.js.map +1 -1
  28. package/dist/voice/agent_activity.cjs +112 -71
  29. package/dist/voice/agent_activity.cjs.map +1 -1
  30. package/dist/voice/agent_activity.d.ts.map +1 -1
  31. package/dist/voice/agent_activity.js +112 -71
  32. package/dist/voice/agent_activity.js.map +1 -1
  33. package/dist/voice/agent_session.cjs +9 -2
  34. package/dist/voice/agent_session.cjs.map +1 -1
  35. package/dist/voice/agent_session.d.ts.map +1 -1
  36. package/dist/voice/agent_session.js +9 -2
  37. package/dist/voice/agent_session.js.map +1 -1
  38. package/dist/voice/avatar/datastream_io.cjs +204 -0
  39. package/dist/voice/avatar/datastream_io.cjs.map +1 -0
  40. package/dist/voice/avatar/datastream_io.d.cts +37 -0
  41. package/dist/voice/avatar/datastream_io.d.ts +37 -0
  42. package/dist/voice/avatar/datastream_io.d.ts.map +1 -0
  43. package/dist/voice/avatar/datastream_io.js +188 -0
  44. package/dist/voice/avatar/datastream_io.js.map +1 -0
  45. package/dist/{multimodal → voice/avatar}/index.cjs +4 -4
  46. package/dist/voice/avatar/index.cjs.map +1 -0
  47. package/dist/voice/avatar/index.d.cts +2 -0
  48. package/dist/voice/avatar/index.d.ts +2 -0
  49. package/dist/voice/avatar/index.d.ts.map +1 -0
  50. package/dist/voice/avatar/index.js +2 -0
  51. package/dist/voice/avatar/index.js.map +1 -0
  52. package/dist/voice/index.cjs +2 -0
  53. package/dist/voice/index.cjs.map +1 -1
  54. package/dist/voice/index.d.cts +1 -0
  55. package/dist/voice/index.d.ts +1 -0
  56. package/dist/voice/index.d.ts.map +1 -1
  57. package/dist/voice/index.js +1 -0
  58. package/dist/voice/index.js.map +1 -1
  59. package/dist/voice/io.cjs.map +1 -1
  60. package/dist/voice/io.d.cts +1 -1
  61. package/dist/voice/io.d.ts +1 -1
  62. package/dist/voice/io.d.ts.map +1 -1
  63. package/dist/voice/io.js.map +1 -1
  64. package/dist/voice/room_io/_input.cjs +2 -1
  65. package/dist/voice/room_io/_input.cjs.map +1 -1
  66. package/dist/voice/room_io/_input.d.ts.map +1 -1
  67. package/dist/voice/room_io/_input.js +2 -1
  68. package/dist/voice/room_io/_input.js.map +1 -1
  69. package/dist/voice/run_context.cjs +13 -0
  70. package/dist/voice/run_context.cjs.map +1 -1
  71. package/dist/voice/run_context.d.cts +10 -0
  72. package/dist/voice/run_context.d.ts +10 -0
  73. package/dist/voice/run_context.d.ts.map +1 -1
  74. package/dist/voice/run_context.js +13 -0
  75. package/dist/voice/run_context.js.map +1 -1
  76. package/dist/voice/speech_handle.cjs +152 -30
  77. package/dist/voice/speech_handle.cjs.map +1 -1
  78. package/dist/voice/speech_handle.d.cts +67 -16
  79. package/dist/voice/speech_handle.d.ts +67 -16
  80. package/dist/voice/speech_handle.d.ts.map +1 -1
  81. package/dist/voice/speech_handle.js +153 -31
  82. package/dist/voice/speech_handle.js.map +1 -1
  83. package/dist/worker.cjs +4 -1
  84. package/dist/worker.cjs.map +1 -1
  85. package/dist/worker.d.ts.map +1 -1
  86. package/dist/worker.js +4 -1
  87. package/dist/worker.js.map +1 -1
  88. package/package.json +2 -2
  89. package/src/index.ts +2 -3
  90. package/src/ipc/job_proc_lazy_main.ts +6 -3
  91. package/src/job.ts +27 -12
  92. package/src/tokenize/basic/hyphenator.ts +1 -1
  93. package/src/utils.ts +121 -1
  94. package/src/voice/agent_activity.ts +128 -78
  95. package/src/voice/agent_session.ts +11 -2
  96. package/src/voice/avatar/datastream_io.ts +247 -0
  97. package/src/voice/avatar/index.ts +4 -0
  98. package/src/voice/index.ts +2 -0
  99. package/src/voice/io.ts +1 -1
  100. package/src/voice/room_io/_input.ts +8 -3
  101. package/src/voice/run_context.ts +16 -2
  102. package/src/voice/speech_handle.ts +183 -38
  103. package/src/worker.ts +5 -1
  104. package/dist/multimodal/agent_playout.cjs +0 -233
  105. package/dist/multimodal/agent_playout.cjs.map +0 -1
  106. package/dist/multimodal/agent_playout.d.cts +0 -34
  107. package/dist/multimodal/agent_playout.d.ts +0 -34
  108. package/dist/multimodal/agent_playout.d.ts.map +0 -1
  109. package/dist/multimodal/agent_playout.js +0 -207
  110. package/dist/multimodal/agent_playout.js.map +0 -1
  111. package/dist/multimodal/index.cjs.map +0 -1
  112. package/dist/multimodal/index.d.cts +0 -2
  113. package/dist/multimodal/index.d.ts +0 -2
  114. package/dist/multimodal/index.d.ts.map +0 -1
  115. package/dist/multimodal/index.js +0 -2
  116. package/dist/multimodal/index.js.map +0 -1
  117. package/src/multimodal/agent_playout.ts +0 -266
  118. package/src/multimodal/index.ts +0 -4
@@ -190,7 +190,7 @@ class AgentActivity {
190
190
  this.started = true;
191
191
  this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
192
192
  this.createSpeechTask({
193
- promise: this.agent.onEnter(),
193
+ task: Task.from(() => this.agent.onEnter()),
194
194
  name: "AgentActivity_onEnter"
195
195
  });
196
196
  } finally {
@@ -309,7 +309,9 @@ class AgentActivity {
309
309
  })
310
310
  );
311
311
  const task = this.createSpeechTask({
312
- promise: this.ttsTask(handle, text, addToChatCtx, {}, audio),
312
+ task: Task.from(
313
+ (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio)
314
+ ),
313
315
  ownedSpeechHandle: handle,
314
316
  name: "AgentActivity.say_tts"
315
317
  });
@@ -413,7 +415,9 @@ class AgentActivity {
413
415
  );
414
416
  this.logger.info({ speech_id: handle.id }, "Creating speech handle");
415
417
  this.createSpeechTask({
416
- promise: this.realtimeGenerationTask(handle, ev, {}),
418
+ task: Task.from(
419
+ (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController)
420
+ ),
417
421
  ownedSpeechHandle: handle,
418
422
  name: "AgentActivity.realtimeGeneration"
419
423
  });
@@ -477,16 +481,23 @@ class AgentActivity {
477
481
  );
478
482
  }
479
483
  createSpeechTask(options) {
480
- const { promise, ownedSpeechHandle } = options;
481
- this.speechTasks.add(promise);
482
- promise.finally(() => {
483
- this.speechTasks.delete(promise);
484
- if (ownedSpeechHandle) {
485
- ownedSpeechHandle._markPlayoutDone();
486
- }
484
+ const { task, ownedSpeechHandle } = options;
485
+ this.speechTasks.add(task);
486
+ task.addDoneCallback(() => {
487
+ this.speechTasks.delete(task);
488
+ });
489
+ if (ownedSpeechHandle) {
490
+ ownedSpeechHandle._tasks.push(task);
491
+ task.addDoneCallback(() => {
492
+ if (ownedSpeechHandle._tasks.every((t) => t.done)) {
493
+ ownedSpeechHandle._markDone();
494
+ }
495
+ });
496
+ }
497
+ task.addDoneCallback(() => {
487
498
  this.wakeupMainTask();
488
499
  });
489
- return promise;
500
+ return task.result;
490
501
  }
491
502
  async onEndOfTurn(info) {
492
503
  if (this.draining) {
@@ -499,7 +510,7 @@ class AgentActivity {
499
510
  }
500
511
  const oldTask = this._userTurnCompletedTask;
501
512
  this._userTurnCompletedTask = this.createSpeechTask({
502
- promise: this.userTurnCompleted(info, oldTask),
513
+ task: Task.from(() => this.userTurnCompleted(info, oldTask)),
503
514
  name: "AgentActivity.userTurnCompleted"
504
515
  });
505
516
  return true;
@@ -525,8 +536,8 @@ class AgentActivity {
525
536
  }
526
537
  const speechHandle = heapItem[2];
527
538
  this._currentSpeech = speechHandle;
528
- speechHandle._authorizePlayout();
529
- await speechHandle.waitForPlayout();
539
+ speechHandle._authorizeGeneration();
540
+ await speechHandle._waitForGeneration();
530
541
  this._currentSpeech = void 0;
531
542
  }
532
543
  if (this.draining && this.speechTasks.size === 0) {
@@ -579,16 +590,19 @@ class AgentActivity {
579
590
  this.logger.info({ speech_id: handle.id }, "Creating speech handle");
580
591
  if (this.llm instanceof RealtimeModel) {
581
592
  this.createSpeechTask({
582
- promise: this.realtimeReplyTask({
583
- speechHandle: handle,
584
- // TODO(brian): support llm.ChatMessage for the realtime model
585
- userInput: userMessage == null ? void 0 : userMessage.textContent,
586
- instructions,
587
- modelSettings: {
588
- // isGiven(toolChoice) = toolChoice !== undefined
589
- toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
590
- }
591
- }),
593
+ task: Task.from(
594
+ (abortController) => this.realtimeReplyTask({
595
+ speechHandle: handle,
596
+ // TODO(brian): support llm.ChatMessage for the realtime model
597
+ userInput: userMessage == null ? void 0 : userMessage.textContent,
598
+ instructions,
599
+ modelSettings: {
600
+ // isGiven(toolChoice) = toolChoice !== undefined
601
+ toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
602
+ },
603
+ abortController
604
+ })
605
+ ),
592
606
  ownedSpeechHandle: handle,
593
607
  name: "AgentActivity.realtimeReply"
594
608
  });
@@ -598,14 +612,19 @@ class AgentActivity {
598
612
  ${instructions}`;
599
613
  }
600
614
  const task = this.createSpeechTask({
601
- promise: this.pipelineReplyTask(
602
- handle,
603
- chatCtx ?? this.agent.chatCtx,
604
- this.agent.toolCtx,
605
- { toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice) },
606
- instructions ? `${this.agent.instructions}
615
+ task: Task.from(
616
+ (abortController) => this.pipelineReplyTask(
617
+ handle,
618
+ chatCtx ?? this.agent.chatCtx,
619
+ this.agent.toolCtx,
620
+ {
621
+ toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
622
+ },
623
+ abortController,
624
+ instructions ? `${this.agent.instructions}
607
625
  ${instructions}` : instructions,
608
- userMessage
626
+ userMessage
627
+ )
609
628
  ),
610
629
  ownedSpeechHandle: handle,
611
630
  name: "AgentActivity.pipelineReply"
@@ -627,7 +646,7 @@ ${instructions}` : instructions,
627
646
  if (currentSpeech === void 0) {
628
647
  future.resolve();
629
648
  } else {
630
- currentSpeech.then(() => {
649
+ currentSpeech.addDoneCallback(() => {
631
650
  if (future.done) return;
632
651
  future.resolve();
633
652
  });
@@ -635,7 +654,7 @@ ${instructions}` : instructions,
635
654
  return future;
636
655
  }
637
656
  onPipelineReplyDone() {
638
- if (!this.speechQueue.peek() && (!this._currentSpeech || this._currentSpeech.done)) {
657
+ if (!this.speechQueue.peek() && (!this._currentSpeech || this._currentSpeech.done())) {
639
658
  this.agentSession._updateAgentState("listening");
640
659
  }
641
660
  }
@@ -699,11 +718,10 @@ ${instructions}` : instructions,
699
718
  createMetricsCollectedEvent({ metrics: eouMetrics })
700
719
  );
701
720
  }
702
- async ttsTask(speechHandle, text, addToChatCtx, modelSettings, audio) {
721
+ async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
703
722
  speechHandleStorage.enterWith(speechHandle);
704
723
  const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
705
724
  const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
706
- const replyAbortController = new AbortController();
707
725
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
708
726
  if (speechHandle.interrupted) {
709
727
  return;
@@ -792,10 +810,9 @@ ${instructions}` : instructions,
792
810
  this.agentSession._updateAgentState("listening");
793
811
  }
794
812
  }
795
- async pipelineReplyTask(speechHandle, chatCtx, toolCtx, modelSettings, instructions, newMessage, toolsMessages) {
813
+ async pipelineReplyTask(speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages) {
796
814
  var _a, _b, _c;
797
815
  speechHandleStorage.enterWith(speechHandle);
798
- const replyAbortController = new AbortController();
799
816
  const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
800
817
  const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
801
818
  chatCtx = chatCtx.copy();
@@ -838,12 +855,20 @@ ${instructions}` : instructions,
838
855
  );
839
856
  tasks.push(ttsTask);
840
857
  }
841
- await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
858
+ await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
859
+ if (newMessage && speechHandle.scheduled) {
860
+ chatCtx.insert(newMessage);
861
+ this.agent._chatCtx.insert(newMessage);
862
+ this.agentSession._conversationItemAdded(newMessage);
863
+ }
842
864
  if (speechHandle.interrupted) {
843
865
  replyAbortController.abort();
844
866
  await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
845
867
  return;
846
868
  }
869
+ this.agentSession._updateAgentState("thinking");
870
+ await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
871
+ speechHandle._clearAuthorization();
847
872
  const replyStartedAt = Date.now();
848
873
  const trNodeResult = await this.agent.transcriptionNode(llmOutput, modelSettings);
849
874
  let textOut = null;
@@ -890,7 +915,6 @@ ${instructions}` : instructions,
890
915
  onToolExecutionStarted,
891
916
  onToolExecutionCompleted
892
917
  });
893
- tasks.push(executeToolsTask);
894
918
  await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
895
919
  if (audioOutput) {
896
920
  await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
@@ -945,7 +969,7 @@ ${instructions}` : instructions,
945
969
  { speech_id: speechHandle.id, message: forwardedText },
946
970
  "playout completed with interrupt"
947
971
  );
948
- speechHandle._markPlayoutDone();
972
+ speechHandle._markGenerationDone();
949
973
  await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
950
974
  return;
951
975
  }
@@ -970,11 +994,11 @@ ${instructions}` : instructions,
970
994
  } else if (this.agentSession.agentState === "speaking") {
971
995
  this.agentSession._updateAgentState("listening");
972
996
  }
973
- speechHandle._markPlayoutDone();
997
+ speechHandle._markGenerationDone();
974
998
  await executeToolsTask.result;
975
999
  if (toolOutput.output.length === 0) return;
976
1000
  const { maxToolSteps } = this.agentSession.options;
977
- if (speechHandle.stepIndex >= maxToolSteps) {
1001
+ if (speechHandle.numSteps >= maxToolSteps) {
978
1002
  this.logger.warn(
979
1003
  { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
980
1004
  "maximum number of function calls steps reached"
@@ -1029,7 +1053,7 @@ ${instructions}` : instructions,
1029
1053
  chatCtx.insert(toolMessages);
1030
1054
  const handle = SpeechHandle.create({
1031
1055
  allowInterruptions: speechHandle.allowInterruptions,
1032
- stepIndex: speechHandle.stepIndex + 1,
1056
+ stepIndex: speechHandle._stepIndex + 1,
1033
1057
  parent: speechHandle
1034
1058
  });
1035
1059
  this.agentSession.emit(
@@ -1042,14 +1066,17 @@ ${instructions}` : instructions,
1042
1066
  );
1043
1067
  const respondToolChoice = draining || modelSettings.toolChoice === "none" ? "none" : "auto";
1044
1068
  const toolResponseTask = this.createSpeechTask({
1045
- promise: this.pipelineReplyTask(
1046
- handle,
1047
- chatCtx,
1048
- toolCtx,
1049
- { toolChoice: respondToolChoice },
1050
- instructions,
1051
- void 0,
1052
- toolMessages
1069
+ task: Task.from(
1070
+ () => this.pipelineReplyTask(
1071
+ handle,
1072
+ chatCtx,
1073
+ toolCtx,
1074
+ { toolChoice: respondToolChoice },
1075
+ replyAbortController,
1076
+ instructions,
1077
+ void 0,
1078
+ toolMessages
1079
+ )
1053
1080
  ),
1054
1081
  ownedSpeechHandle: handle,
1055
1082
  name: "AgentActivity.pipelineReply"
@@ -1063,7 +1090,7 @@ ${instructions}` : instructions,
1063
1090
  this.agent._chatCtx.insert(toolMessages);
1064
1091
  }
1065
1092
  }
1066
- async realtimeGenerationTask(speechHandle, ev, modelSettings) {
1093
+ async realtimeGenerationTask(speechHandle, ev, modelSettings, replyAbortController) {
1067
1094
  var _a, _b, _c;
1068
1095
  speechHandleStorage.enterWith(speechHandle);
1069
1096
  if (!this.realtimeSession) {
@@ -1073,20 +1100,20 @@ ${instructions}` : instructions,
1073
1100
  throw new Error("llm is not a realtime model");
1074
1101
  }
1075
1102
  this.logger.debug(
1076
- { speech_id: speechHandle.id, stepIndex: speechHandle.stepIndex },
1103
+ { speech_id: speechHandle.id, stepIndex: speechHandle.numSteps },
1077
1104
  "realtime generation started"
1078
1105
  );
1079
1106
  const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
1080
1107
  const textOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
1081
1108
  const toolCtx = this.realtimeSession.tools;
1082
1109
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
1110
+ speechHandle._clearAuthorization();
1083
1111
  if (speechHandle.interrupted) {
1084
1112
  return;
1085
1113
  }
1086
1114
  const onFirstFrame = () => {
1087
1115
  this.agentSession._updateAgentState("speaking");
1088
1116
  };
1089
- const replyAbortController = new AbortController();
1090
1117
  const readMessages = async (abortController, outputs) => {
1091
1118
  const forwardTasks = [];
1092
1119
  try {
@@ -1170,9 +1197,13 @@ ${instructions}` : instructions,
1170
1197
  "AgentActivity.realtime_generation.read_tool_stream"
1171
1198
  )
1172
1199
  );
1173
- const onToolExecutionStarted = (_) => {
1200
+ const onToolExecutionStarted = (f) => {
1201
+ speechHandle._itemAdded([f]);
1174
1202
  };
1175
- const onToolExecutionCompleted = (_) => {
1203
+ const onToolExecutionCompleted = (out) => {
1204
+ if (out.toolCallOutput) {
1205
+ speechHandle._itemAdded([out.toolCallOutput]);
1206
+ }
1176
1207
  };
1177
1208
  const [executeToolsTask, toolOutput] = performToolExecutions({
1178
1209
  session: this.agentSession,
@@ -1228,7 +1259,7 @@ ${instructions}` : instructions,
1228
1259
  interrupted: true
1229
1260
  });
1230
1261
  this.agent._chatCtx.insert(message);
1231
- speechHandle._setChatMessage(message);
1262
+ speechHandle._itemAdded([message]);
1232
1263
  this.agentSession._conversationItemAdded(message);
1233
1264
  }
1234
1265
  this.logger.info(
@@ -1236,7 +1267,7 @@ ${instructions}` : instructions,
1236
1267
  "playout completed with interrupt"
1237
1268
  );
1238
1269
  }
1239
- speechHandle._markPlayoutDone();
1270
+ speechHandle._markGenerationDone();
1240
1271
  await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1241
1272
  return;
1242
1273
  }
@@ -1249,17 +1280,17 @@ ${instructions}` : instructions,
1249
1280
  interrupted: false
1250
1281
  });
1251
1282
  this.agent._chatCtx.insert(message);
1252
- speechHandle._setChatMessage(message);
1283
+ speechHandle._itemAdded([message]);
1253
1284
  this.agentSession._conversationItemAdded(message);
1254
1285
  }
1255
- speechHandle._markPlayoutDone();
1286
+ speechHandle._markGenerationDone();
1256
1287
  toolOutput.firstToolStartedFuture.await.finally(() => {
1257
1288
  this.agentSession._updateAgentState("thinking");
1258
1289
  });
1259
1290
  await executeToolsTask.result;
1260
1291
  if (toolOutput.output.length === 0) return;
1261
1292
  const { maxToolSteps } = this.agentSession.options;
1262
- if (speechHandle.stepIndex >= maxToolSteps) {
1293
+ if (speechHandle.numSteps >= maxToolSteps) {
1263
1294
  this.logger.warn(
1264
1295
  { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
1265
1296
  "maximum number of function calls steps reached"
@@ -1323,7 +1354,7 @@ ${instructions}` : instructions,
1323
1354
  this.realtimeSession.interrupt();
1324
1355
  const replySpeechHandle = SpeechHandle.create({
1325
1356
  allowInterruptions: speechHandle.allowInterruptions,
1326
- stepIndex: speechHandle.stepIndex + 1,
1357
+ stepIndex: speechHandle.numSteps + 1,
1327
1358
  parent: speechHandle
1328
1359
  });
1329
1360
  this.agentSession.emit(
@@ -1336,10 +1367,13 @@ ${instructions}` : instructions,
1336
1367
  );
1337
1368
  const toolChoice = draining || modelSettings.toolChoice === "none" ? "none" : "auto";
1338
1369
  this.createSpeechTask({
1339
- promise: this.realtimeReplyTask({
1340
- speechHandle: replySpeechHandle,
1341
- modelSettings: { toolChoice }
1342
- }),
1370
+ task: Task.from(
1371
+ (abortController) => this.realtimeReplyTask({
1372
+ speechHandle: replySpeechHandle,
1373
+ modelSettings: { toolChoice },
1374
+ abortController
1375
+ })
1376
+ ),
1343
1377
  ownedSpeechHandle: replySpeechHandle,
1344
1378
  name: "AgentActivity.realtime_reply"
1345
1379
  });
@@ -1349,7 +1383,8 @@ ${instructions}` : instructions,
1349
1383
  speechHandle,
1350
1384
  modelSettings: { toolChoice },
1351
1385
  userInput,
1352
- instructions
1386
+ instructions,
1387
+ abortController
1353
1388
  }) {
1354
1389
  speechHandleStorage.enterWith(speechHandle);
1355
1390
  if (!this.realtimeSession) {
@@ -1372,18 +1407,24 @@ ${instructions}` : instructions,
1372
1407
  }
1373
1408
  try {
1374
1409
  const generationEvent = await this.realtimeSession.generateReply(instructions);
1375
- await this.realtimeGenerationTask(speechHandle, generationEvent, { toolChoice });
1410
+ await this.realtimeGenerationTask(
1411
+ speechHandle,
1412
+ generationEvent,
1413
+ { toolChoice },
1414
+ abortController
1415
+ );
1376
1416
  } finally {
1377
1417
  if (toolChoice !== void 0 && toolChoice !== originalToolChoice) {
1378
1418
  this.realtimeSession.updateOptions({ toolChoice: originalToolChoice });
1379
1419
  }
1380
1420
  }
1381
1421
  }
1382
- scheduleSpeech(speechHandle, priority, bypassDraining = false) {
1383
- if (this.draining && !bypassDraining) {
1422
+ scheduleSpeech(speechHandle, priority, force = false) {
1423
+ if (this.draining && !force) {
1384
1424
  throw new Error("cannot schedule new speech, the agent is draining");
1385
1425
  }
1386
1426
  this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
1427
+ speechHandle._markScheduled();
1387
1428
  this.wakeupMainTask();
1388
1429
  }
1389
1430
  async drain() {
@@ -1392,7 +1433,7 @@ ${instructions}` : instructions,
1392
1433
  try {
1393
1434
  if (this._draining) return;
1394
1435
  this.createSpeechTask({
1395
- promise: this.agent.onExit(),
1436
+ task: Task.from(() => this.agent.onExit()),
1396
1437
  name: "AgentActivity_onExit"
1397
1438
  });
1398
1439
  this.wakeupMainTask();