@livekit/agents 1.0.3 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. package/dist/index.cjs +2 -5
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.cts +2 -3
  4. package/dist/index.d.ts +2 -3
  5. package/dist/index.d.ts.map +1 -1
  6. package/dist/index.js +1 -3
  7. package/dist/index.js.map +1 -1
  8. package/dist/tokenize/basic/hyphenator.cjs.map +1 -1
  9. package/dist/tokenize/basic/hyphenator.js.map +1 -1
  10. package/dist/utils.cjs +77 -0
  11. package/dist/utils.cjs.map +1 -1
  12. package/dist/utils.d.cts +21 -0
  13. package/dist/utils.d.ts +21 -0
  14. package/dist/utils.d.ts.map +1 -1
  15. package/dist/utils.js +76 -1
  16. package/dist/utils.js.map +1 -1
  17. package/dist/voice/agent_activity.cjs +107 -71
  18. package/dist/voice/agent_activity.cjs.map +1 -1
  19. package/dist/voice/agent_activity.d.ts.map +1 -1
  20. package/dist/voice/agent_activity.js +107 -71
  21. package/dist/voice/agent_activity.js.map +1 -1
  22. package/dist/voice/avatar/datastream_io.cjs +204 -0
  23. package/dist/voice/avatar/datastream_io.cjs.map +1 -0
  24. package/dist/voice/avatar/datastream_io.d.cts +37 -0
  25. package/dist/voice/avatar/datastream_io.d.ts +37 -0
  26. package/dist/voice/avatar/datastream_io.d.ts.map +1 -0
  27. package/dist/voice/avatar/datastream_io.js +188 -0
  28. package/dist/voice/avatar/datastream_io.js.map +1 -0
  29. package/dist/{multimodal → voice/avatar}/index.cjs +4 -4
  30. package/dist/voice/avatar/index.cjs.map +1 -0
  31. package/dist/voice/avatar/index.d.cts +2 -0
  32. package/dist/voice/avatar/index.d.ts +2 -0
  33. package/dist/voice/avatar/index.d.ts.map +1 -0
  34. package/dist/voice/avatar/index.js +2 -0
  35. package/dist/voice/avatar/index.js.map +1 -0
  36. package/dist/voice/index.cjs +2 -0
  37. package/dist/voice/index.cjs.map +1 -1
  38. package/dist/voice/index.d.cts +1 -0
  39. package/dist/voice/index.d.ts +1 -0
  40. package/dist/voice/index.d.ts.map +1 -1
  41. package/dist/voice/index.js +1 -0
  42. package/dist/voice/index.js.map +1 -1
  43. package/dist/voice/io.cjs.map +1 -1
  44. package/dist/voice/io.d.cts +1 -1
  45. package/dist/voice/io.d.ts +1 -1
  46. package/dist/voice/io.d.ts.map +1 -1
  47. package/dist/voice/io.js.map +1 -1
  48. package/dist/voice/room_io/_input.cjs +3 -1
  49. package/dist/voice/room_io/_input.cjs.map +1 -1
  50. package/dist/voice/room_io/_input.d.ts.map +1 -1
  51. package/dist/voice/room_io/_input.js +3 -1
  52. package/dist/voice/room_io/_input.js.map +1 -1
  53. package/dist/voice/run_context.cjs +13 -0
  54. package/dist/voice/run_context.cjs.map +1 -1
  55. package/dist/voice/run_context.d.cts +10 -0
  56. package/dist/voice/run_context.d.ts +10 -0
  57. package/dist/voice/run_context.d.ts.map +1 -1
  58. package/dist/voice/run_context.js +13 -0
  59. package/dist/voice/run_context.js.map +1 -1
  60. package/dist/voice/speech_handle.cjs +152 -30
  61. package/dist/voice/speech_handle.cjs.map +1 -1
  62. package/dist/voice/speech_handle.d.cts +67 -16
  63. package/dist/voice/speech_handle.d.ts +67 -16
  64. package/dist/voice/speech_handle.d.ts.map +1 -1
  65. package/dist/voice/speech_handle.js +153 -31
  66. package/dist/voice/speech_handle.js.map +1 -1
  67. package/dist/worker.cjs +4 -1
  68. package/dist/worker.cjs.map +1 -1
  69. package/dist/worker.d.ts.map +1 -1
  70. package/dist/worker.js +4 -1
  71. package/dist/worker.js.map +1 -1
  72. package/package.json +2 -2
  73. package/src/index.ts +2 -3
  74. package/src/tokenize/basic/hyphenator.ts +1 -1
  75. package/src/utils.ts +121 -1
  76. package/src/voice/agent_activity.ts +122 -78
  77. package/src/voice/avatar/datastream_io.ts +247 -0
  78. package/src/voice/avatar/index.ts +4 -0
  79. package/src/voice/index.ts +2 -0
  80. package/src/voice/io.ts +1 -1
  81. package/src/voice/room_io/_input.ts +9 -3
  82. package/src/voice/run_context.ts +16 -2
  83. package/src/voice/speech_handle.ts +183 -38
  84. package/src/worker.ts +5 -1
  85. package/dist/multimodal/agent_playout.cjs +0 -233
  86. package/dist/multimodal/agent_playout.cjs.map +0 -1
  87. package/dist/multimodal/agent_playout.d.cts +0 -34
  88. package/dist/multimodal/agent_playout.d.ts +0 -34
  89. package/dist/multimodal/agent_playout.d.ts.map +0 -1
  90. package/dist/multimodal/agent_playout.js +0 -207
  91. package/dist/multimodal/agent_playout.js.map +0 -1
  92. package/dist/multimodal/index.cjs.map +0 -1
  93. package/dist/multimodal/index.d.cts +0 -2
  94. package/dist/multimodal/index.d.ts +0 -2
  95. package/dist/multimodal/index.d.ts.map +0 -1
  96. package/dist/multimodal/index.js +0 -2
  97. package/dist/multimodal/index.js.map +0 -1
  98. package/src/multimodal/agent_playout.ts +0 -266
  99. package/src/multimodal/index.ts +0 -4
@@ -82,7 +82,7 @@ export class AgentActivity implements RecognitionHooks {
82
82
  private _currentSpeech?: SpeechHandle;
83
83
  private speechQueue: Heap<[number, number, SpeechHandle]>; // [priority, timestamp, speechHandle]
84
84
  private q_updated: Future;
85
- private speechTasks: Set<Promise<unknown>> = new Set();
85
+ private speechTasks: Set<Task<void>> = new Set();
86
86
  private lock = new Mutex();
87
87
  private audioStream = new DeferredReadableStream<AudioFrame>();
88
88
  // default to null as None, which maps to the default provider tool choice value
@@ -269,7 +269,7 @@ export class AgentActivity implements RecognitionHooks {
269
269
 
270
270
  this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
271
271
  this.createSpeechTask({
272
- promise: this.agent.onEnter(),
272
+ task: Task.from(() => this.agent.onEnter()),
273
273
  name: 'AgentActivity_onEnter',
274
274
  });
275
275
  } finally {
@@ -441,9 +441,10 @@ export class AgentActivity implements RecognitionHooks {
441
441
  speechHandle: handle,
442
442
  }),
443
443
  );
444
-
445
444
  const task = this.createSpeechTask({
446
- promise: this.ttsTask(handle, text, addToChatCtx, {}, audio),
445
+ task: Task.from((abortController: AbortController) =>
446
+ this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
447
+ ),
447
448
  ownedSpeechHandle: handle,
448
449
  name: 'AgentActivity.say_tts',
449
450
  });
@@ -572,7 +573,9 @@ export class AgentActivity implements RecognitionHooks {
572
573
  this.logger.info({ speech_id: handle.id }, 'Creating speech handle');
573
574
 
574
575
  this.createSpeechTask({
575
- promise: this.realtimeGenerationTask(handle, ev, {}),
576
+ task: Task.from((abortController: AbortController) =>
577
+ this.realtimeGenerationTask(handle, ev, {}, abortController),
578
+ ),
576
579
  ownedSpeechHandle: handle,
577
580
  name: 'AgentActivity.realtimeGeneration',
578
581
  });
@@ -659,26 +662,32 @@ export class AgentActivity implements RecognitionHooks {
659
662
  );
660
663
  }
661
664
 
662
- private createSpeechTask<T>(options: {
663
- promise: Promise<T>;
665
+ private createSpeechTask(options: {
666
+ task: Task<void>;
664
667
  ownedSpeechHandle?: SpeechHandle;
665
668
  name?: string;
666
- }): Promise<T> {
667
- const { promise, ownedSpeechHandle } = options;
668
-
669
- this.speechTasks.add(promise);
669
+ }): Promise<void> {
670
+ const { task, ownedSpeechHandle } = options;
670
671
 
671
- promise.finally(() => {
672
- this.speechTasks.delete(promise);
672
+ this.speechTasks.add(task);
673
+ task.addDoneCallback(() => {
674
+ this.speechTasks.delete(task);
675
+ });
673
676
 
674
- if (ownedSpeechHandle) {
675
- ownedSpeechHandle._markPlayoutDone();
676
- }
677
+ if (ownedSpeechHandle) {
678
+ ownedSpeechHandle._tasks.push(task);
679
+ task.addDoneCallback(() => {
680
+ if (ownedSpeechHandle._tasks.every((t) => t.done)) {
681
+ ownedSpeechHandle._markDone();
682
+ }
683
+ });
684
+ }
677
685
 
686
+ task.addDoneCallback(() => {
678
687
  this.wakeupMainTask();
679
688
  });
680
689
 
681
- return promise;
690
+ return task.result;
682
691
  }
683
692
 
684
693
  async onEndOfTurn(info: EndOfTurnInfo): Promise<boolean> {
@@ -705,7 +714,7 @@ export class AgentActivity implements RecognitionHooks {
705
714
 
706
715
  const oldTask = this._userTurnCompletedTask;
707
716
  this._userTurnCompletedTask = this.createSpeechTask({
708
- promise: this.userTurnCompleted(info, oldTask),
717
+ task: Task.from(() => this.userTurnCompleted(info, oldTask)),
709
718
  name: 'AgentActivity.userTurnCompleted',
710
719
  });
711
720
  return true;
@@ -736,8 +745,8 @@ export class AgentActivity implements RecognitionHooks {
736
745
  }
737
746
  const speechHandle = heapItem[2];
738
747
  this._currentSpeech = speechHandle;
739
- speechHandle._authorizePlayout();
740
- await speechHandle.waitForPlayout();
748
+ speechHandle._authorizeGeneration();
749
+ await speechHandle._waitForGeneration();
741
750
  this._currentSpeech = undefined;
742
751
  }
743
752
 
@@ -815,16 +824,19 @@ export class AgentActivity implements RecognitionHooks {
815
824
 
816
825
  if (this.llm instanceof RealtimeModel) {
817
826
  this.createSpeechTask({
818
- promise: this.realtimeReplyTask({
819
- speechHandle: handle,
820
- // TODO(brian): support llm.ChatMessage for the realtime model
821
- userInput: userMessage?.textContent,
822
- instructions,
823
- modelSettings: {
824
- // isGiven(toolChoice) = toolChoice !== undefined
825
- toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
826
- },
827
- }),
827
+ task: Task.from((abortController: AbortController) =>
828
+ this.realtimeReplyTask({
829
+ speechHandle: handle,
830
+ // TODO(brian): support llm.ChatMessage for the realtime model
831
+ userInput: userMessage?.textContent,
832
+ instructions,
833
+ modelSettings: {
834
+ // isGiven(toolChoice) = toolChoice !== undefined
835
+ toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
836
+ },
837
+ abortController,
838
+ }),
839
+ ),
828
840
  ownedSpeechHandle: handle,
829
841
  name: 'AgentActivity.realtimeReply',
830
842
  });
@@ -837,13 +849,18 @@ export class AgentActivity implements RecognitionHooks {
837
849
  }
838
850
 
839
851
  const task = this.createSpeechTask({
840
- promise: this.pipelineReplyTask(
841
- handle,
842
- chatCtx ?? this.agent.chatCtx,
843
- this.agent.toolCtx,
844
- { toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice) },
845
- instructions ? `${this.agent.instructions}\n${instructions}` : instructions,
846
- userMessage,
852
+ task: Task.from((abortController: AbortController) =>
853
+ this.pipelineReplyTask(
854
+ handle,
855
+ chatCtx ?? this.agent.chatCtx,
856
+ this.agent.toolCtx,
857
+ {
858
+ toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
859
+ },
860
+ abortController,
861
+ instructions ? `${this.agent.instructions}\n${instructions}` : instructions,
862
+ userMessage,
863
+ ),
847
864
  ),
848
865
  ownedSpeechHandle: handle,
849
866
  name: 'AgentActivity.pipelineReply',
@@ -860,6 +877,8 @@ export class AgentActivity implements RecognitionHooks {
860
877
  const future = new Future<void>();
861
878
  const currentSpeech = this._currentSpeech;
862
879
 
880
+ //TODO(AJS-273): add interrupt for background speeches
881
+
863
882
  currentSpeech?.interrupt();
864
883
 
865
884
  for (const [_, __, speech] of this.speechQueue) {
@@ -871,7 +890,7 @@ export class AgentActivity implements RecognitionHooks {
871
890
  if (currentSpeech === undefined) {
872
891
  future.resolve();
873
892
  } else {
874
- currentSpeech.then(() => {
893
+ currentSpeech.addDoneCallback(() => {
875
894
  if (future.done) return;
876
895
  future.resolve();
877
896
  });
@@ -881,7 +900,7 @@ export class AgentActivity implements RecognitionHooks {
881
900
  }
882
901
 
883
902
  private onPipelineReplyDone(): void {
884
- if (!this.speechQueue.peek() && (!this._currentSpeech || this._currentSpeech.done)) {
903
+ if (!this.speechQueue.peek() && (!this._currentSpeech || this._currentSpeech.done())) {
885
904
  this.agentSession._updateAgentState('listening');
886
905
  }
887
906
  }
@@ -980,6 +999,7 @@ export class AgentActivity implements RecognitionHooks {
980
999
  text: string | ReadableStream<string>,
981
1000
  addToChatCtx: boolean,
982
1001
  modelSettings: ModelSettings,
1002
+ replyAbortController: AbortController,
983
1003
  audio?: ReadableStream<AudioFrame> | null,
984
1004
  ): Promise<void> {
985
1005
  speechHandleStorage.enterWith(speechHandle);
@@ -992,7 +1012,6 @@ export class AgentActivity implements RecognitionHooks {
992
1012
  ? this.agentSession.output.audio
993
1013
  : null;
994
1014
 
995
- const replyAbortController = new AbortController();
996
1015
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
997
1016
 
998
1017
  if (speechHandle.interrupted) {
@@ -1102,14 +1121,13 @@ export class AgentActivity implements RecognitionHooks {
1102
1121
  chatCtx: ChatContext,
1103
1122
  toolCtx: ToolContext,
1104
1123
  modelSettings: ModelSettings,
1124
+ replyAbortController: AbortController,
1105
1125
  instructions?: string,
1106
1126
  newMessage?: ChatMessage,
1107
1127
  toolsMessages?: ChatItem[],
1108
1128
  ): Promise<void> {
1109
1129
  speechHandleStorage.enterWith(speechHandle);
1110
1130
 
1111
- const replyAbortController = new AbortController();
1112
-
1113
1131
  const audioOutput = this.agentSession.output.audioEnabled
1114
1132
  ? this.agentSession.output.audio
1115
1133
  : null;
@@ -1163,13 +1181,19 @@ export class AgentActivity implements RecognitionHooks {
1163
1181
  tasks.push(ttsTask);
1164
1182
  }
1165
1183
 
1166
- await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
1184
+ await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
1185
+
1167
1186
  if (speechHandle.interrupted) {
1168
1187
  replyAbortController.abort();
1169
1188
  await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1170
1189
  return;
1171
1190
  }
1172
1191
 
1192
+ this.agentSession._updateAgentState('thinking');
1193
+
1194
+ await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
1195
+ speechHandle._clearAuthorization();
1196
+
1173
1197
  const replyStartedAt = Date.now();
1174
1198
  const trNodeResult = await this.agent.transcriptionNode(llmOutput, modelSettings);
1175
1199
  let textOut: _TextOut | null = null;
@@ -1205,6 +1229,9 @@ export class AgentActivity implements RecognitionHooks {
1205
1229
  textOut?.firstTextFut.await.finally(onFirstFrame);
1206
1230
  }
1207
1231
 
1232
+ //TODO(AJS-272): before executing tools, make sure we generated all the text
1233
+ // (this ensure everything is kept ordered)
1234
+
1208
1235
  const onToolExecutionStarted = (_: FunctionCall) => {
1209
1236
  // TODO(brian): handle speech_handle item_added
1210
1237
  };
@@ -1223,7 +1250,6 @@ export class AgentActivity implements RecognitionHooks {
1223
1250
  onToolExecutionStarted,
1224
1251
  onToolExecutionCompleted,
1225
1252
  });
1226
- tasks.push(executeToolsTask);
1227
1253
 
1228
1254
  await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
1229
1255
 
@@ -1290,7 +1316,7 @@ export class AgentActivity implements RecognitionHooks {
1290
1316
  'playout completed with interrupt',
1291
1317
  );
1292
1318
  // TODO(shubhra) add chat message to speech handle
1293
- speechHandle._markPlayoutDone();
1319
+ speechHandle._markGenerationDone();
1294
1320
  await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1295
1321
  return;
1296
1322
  }
@@ -1318,14 +1344,15 @@ export class AgentActivity implements RecognitionHooks {
1318
1344
  this.agentSession._updateAgentState('listening');
1319
1345
  }
1320
1346
 
1321
- speechHandle._markPlayoutDone();
1347
+ // mark the playout done before waiting for the tool execution
1348
+ speechHandle._markGenerationDone();
1322
1349
  await executeToolsTask.result;
1323
1350
 
1324
1351
  if (toolOutput.output.length === 0) return;
1325
1352
 
1326
1353
  // important: no agent output should be used after this point
1327
1354
  const { maxToolSteps } = this.agentSession.options;
1328
- if (speechHandle.stepIndex >= maxToolSteps) {
1355
+ if (speechHandle.numSteps >= maxToolSteps) {
1329
1356
  this.logger.warn(
1330
1357
  { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
1331
1358
  'maximum number of function calls steps reached',
@@ -1390,7 +1417,7 @@ export class AgentActivity implements RecognitionHooks {
1390
1417
 
1391
1418
  const handle = SpeechHandle.create({
1392
1419
  allowInterruptions: speechHandle.allowInterruptions,
1393
- stepIndex: speechHandle.stepIndex + 1,
1420
+ stepIndex: speechHandle._stepIndex + 1,
1394
1421
  parent: speechHandle,
1395
1422
  });
1396
1423
  this.agentSession.emit(
@@ -1407,14 +1434,17 @@ export class AgentActivity implements RecognitionHooks {
1407
1434
  const respondToolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
1408
1435
 
1409
1436
  const toolResponseTask = this.createSpeechTask({
1410
- promise: this.pipelineReplyTask(
1411
- handle,
1412
- chatCtx,
1413
- toolCtx,
1414
- { toolChoice: respondToolChoice },
1415
- instructions,
1416
- undefined,
1417
- toolMessages,
1437
+ task: Task.from(() =>
1438
+ this.pipelineReplyTask(
1439
+ handle,
1440
+ chatCtx,
1441
+ toolCtx,
1442
+ { toolChoice: respondToolChoice },
1443
+ replyAbortController,
1444
+ instructions,
1445
+ undefined,
1446
+ toolMessages,
1447
+ ),
1418
1448
  ),
1419
1449
  ownedSpeechHandle: handle,
1420
1450
  name: 'AgentActivity.pipelineReply',
@@ -1435,6 +1465,7 @@ export class AgentActivity implements RecognitionHooks {
1435
1465
  speechHandle: SpeechHandle,
1436
1466
  ev: GenerationCreatedEvent,
1437
1467
  modelSettings: ModelSettings,
1468
+ replyAbortController: AbortController,
1438
1469
  ): Promise<void> {
1439
1470
  speechHandleStorage.enterWith(speechHandle);
1440
1471
 
@@ -1446,7 +1477,7 @@ export class AgentActivity implements RecognitionHooks {
1446
1477
  }
1447
1478
 
1448
1479
  this.logger.debug(
1449
- { speech_id: speechHandle.id, stepIndex: speechHandle.stepIndex },
1480
+ { speech_id: speechHandle.id, stepIndex: speechHandle.numSteps },
1450
1481
  'realtime generation started',
1451
1482
  );
1452
1483
 
@@ -1459,6 +1490,7 @@ export class AgentActivity implements RecognitionHooks {
1459
1490
  const toolCtx = this.realtimeSession.tools;
1460
1491
 
1461
1492
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
1493
+ speechHandle._clearAuthorization();
1462
1494
 
1463
1495
  if (speechHandle.interrupted) {
1464
1496
  return;
@@ -1468,8 +1500,6 @@ export class AgentActivity implements RecognitionHooks {
1468
1500
  this.agentSession._updateAgentState('speaking');
1469
1501
  };
1470
1502
 
1471
- const replyAbortController = new AbortController();
1472
-
1473
1503
  const readMessages = async (
1474
1504
  abortController: AbortController,
1475
1505
  outputs: Array<[string, _TextOut | null, _AudioOut | null]>,
@@ -1566,12 +1596,14 @@ export class AgentActivity implements RecognitionHooks {
1566
1596
  ),
1567
1597
  );
1568
1598
 
1569
- const onToolExecutionStarted = (_: FunctionCall) => {
1570
- // TODO(brian): handle speech_handle item_added
1599
+ const onToolExecutionStarted = (f: FunctionCall) => {
1600
+ speechHandle._itemAdded([f]);
1571
1601
  };
1572
1602
 
1573
- const onToolExecutionCompleted = (_: ToolExecutionOutput) => {
1574
- // TODO(brian): handle speech_handle item_added
1603
+ const onToolExecutionCompleted = (out: ToolExecutionOutput) => {
1604
+ if (out.toolCallOutput) {
1605
+ speechHandle._itemAdded([out.toolCallOutput]);
1606
+ }
1575
1607
  };
1576
1608
 
1577
1609
  const [executeToolsTask, toolOutput] = performToolExecutions({
@@ -1640,7 +1672,7 @@ export class AgentActivity implements RecognitionHooks {
1640
1672
  interrupted: true,
1641
1673
  });
1642
1674
  this.agent._chatCtx.insert(message);
1643
- speechHandle._setChatMessage(message);
1675
+ speechHandle._itemAdded([message]);
1644
1676
  this.agentSession._conversationItemAdded(message);
1645
1677
 
1646
1678
  // TODO(brian): add tracing span
@@ -1650,8 +1682,7 @@ export class AgentActivity implements RecognitionHooks {
1650
1682
  'playout completed with interrupt',
1651
1683
  );
1652
1684
  }
1653
- // TODO(shubhra) add chat message to speech handle
1654
- speechHandle._markPlayoutDone();
1685
+ speechHandle._markGenerationDone();
1655
1686
  await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1656
1687
 
1657
1688
  // TODO(brian): close tees
@@ -1668,13 +1699,13 @@ export class AgentActivity implements RecognitionHooks {
1668
1699
  interrupted: false,
1669
1700
  });
1670
1701
  this.agent._chatCtx.insert(message);
1671
- speechHandle._setChatMessage(message);
1702
+ speechHandle._itemAdded([message]);
1672
1703
  this.agentSession._conversationItemAdded(message); // mark the playout done before waiting for the tool execution\
1673
1704
  // TODO(brian): add tracing span
1674
1705
  }
1675
1706
 
1676
1707
  // mark the playout done before waiting for the tool execution
1677
- speechHandle._markPlayoutDone();
1708
+ speechHandle._markGenerationDone();
1678
1709
  // TODO(brian): close tees
1679
1710
 
1680
1711
  toolOutput.firstToolStartedFuture.await.finally(() => {
@@ -1687,7 +1718,7 @@ export class AgentActivity implements RecognitionHooks {
1687
1718
 
1688
1719
  // important: no agent ouput should be used after this point
1689
1720
  const { maxToolSteps } = this.agentSession.options;
1690
- if (speechHandle.stepIndex >= maxToolSteps) {
1721
+ if (speechHandle.numSteps >= maxToolSteps) {
1691
1722
  this.logger.warn(
1692
1723
  { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
1693
1724
  'maximum number of function calls steps reached',
@@ -1763,7 +1794,7 @@ export class AgentActivity implements RecognitionHooks {
1763
1794
 
1764
1795
  const replySpeechHandle = SpeechHandle.create({
1765
1796
  allowInterruptions: speechHandle.allowInterruptions,
1766
- stepIndex: speechHandle.stepIndex + 1,
1797
+ stepIndex: speechHandle.numSteps + 1,
1767
1798
  parent: speechHandle,
1768
1799
  });
1769
1800
  this.agentSession.emit(
@@ -1777,10 +1808,13 @@ export class AgentActivity implements RecognitionHooks {
1777
1808
 
1778
1809
  const toolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
1779
1810
  this.createSpeechTask({
1780
- promise: this.realtimeReplyTask({
1781
- speechHandle: replySpeechHandle,
1782
- modelSettings: { toolChoice },
1783
- }),
1811
+ task: Task.from((abortController: AbortController) =>
1812
+ this.realtimeReplyTask({
1813
+ speechHandle: replySpeechHandle,
1814
+ modelSettings: { toolChoice },
1815
+ abortController,
1816
+ }),
1817
+ ),
1784
1818
  ownedSpeechHandle: replySpeechHandle,
1785
1819
  name: 'AgentActivity.realtime_reply',
1786
1820
  });
@@ -1793,9 +1827,11 @@ export class AgentActivity implements RecognitionHooks {
1793
1827
  modelSettings: { toolChoice },
1794
1828
  userInput,
1795
1829
  instructions,
1830
+ abortController,
1796
1831
  }: {
1797
1832
  speechHandle: SpeechHandle;
1798
1833
  modelSettings: ModelSettings;
1834
+ abortController: AbortController;
1799
1835
  userInput?: string;
1800
1836
  instructions?: string;
1801
1837
  }): Promise<void> {
@@ -1825,7 +1861,12 @@ export class AgentActivity implements RecognitionHooks {
1825
1861
 
1826
1862
  try {
1827
1863
  const generationEvent = await this.realtimeSession.generateReply(instructions);
1828
- await this.realtimeGenerationTask(speechHandle, generationEvent, { toolChoice });
1864
+ await this.realtimeGenerationTask(
1865
+ speechHandle,
1866
+ generationEvent,
1867
+ { toolChoice },
1868
+ abortController,
1869
+ );
1829
1870
  } finally {
1830
1871
  // reset toolChoice value
1831
1872
  if (toolChoice !== undefined && toolChoice !== originalToolChoice) {
@@ -1837,14 +1878,17 @@ export class AgentActivity implements RecognitionHooks {
1837
1878
  private scheduleSpeech(
1838
1879
  speechHandle: SpeechHandle,
1839
1880
  priority: number,
1840
- bypassDraining: boolean = false,
1881
+ force: boolean = false,
1841
1882
  ): void {
1842
- if (this.draining && !bypassDraining) {
1883
+ // when force=true, we allow tool responses to bypass draining
1884
+ // This allows for tool responses to be generated before the AgentActivity is finalized
1885
+ if (this.draining && !force) {
1843
1886
  throw new Error('cannot schedule new speech, the agent is draining');
1844
1887
  }
1845
1888
 
1846
1889
  // Monotonic time to avoid near 0 collisions
1847
1890
  this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
1891
+ speechHandle._markScheduled();
1848
1892
  this.wakeupMainTask();
1849
1893
  }
1850
1894
 
@@ -1854,7 +1898,7 @@ export class AgentActivity implements RecognitionHooks {
1854
1898
  if (this._draining) return;
1855
1899
 
1856
1900
  this.createSpeechTask({
1857
- promise: this.agent.onExit(),
1901
+ task: Task.from(() => this.agent.onExit()),
1858
1902
  name: 'AgentActivity_onExit',
1859
1903
  });
1860
1904