@livekit/agents 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. package/dist/index.cjs +2 -5
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.cts +2 -3
  4. package/dist/index.d.ts +2 -3
  5. package/dist/index.d.ts.map +1 -1
  6. package/dist/index.js +1 -3
  7. package/dist/index.js.map +1 -1
  8. package/dist/ipc/job_proc_lazy_main.cjs +3 -2
  9. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  10. package/dist/ipc/job_proc_lazy_main.js +4 -3
  11. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  12. package/dist/job.cjs +20 -14
  13. package/dist/job.cjs.map +1 -1
  14. package/dist/job.d.cts +11 -5
  15. package/dist/job.d.ts +11 -5
  16. package/dist/job.d.ts.map +1 -1
  17. package/dist/job.js +17 -12
  18. package/dist/job.js.map +1 -1
  19. package/dist/tokenize/basic/hyphenator.cjs.map +1 -1
  20. package/dist/tokenize/basic/hyphenator.js.map +1 -1
  21. package/dist/utils.cjs +77 -0
  22. package/dist/utils.cjs.map +1 -1
  23. package/dist/utils.d.cts +21 -0
  24. package/dist/utils.d.ts +21 -0
  25. package/dist/utils.d.ts.map +1 -1
  26. package/dist/utils.js +76 -1
  27. package/dist/utils.js.map +1 -1
  28. package/dist/voice/agent_activity.cjs +112 -71
  29. package/dist/voice/agent_activity.cjs.map +1 -1
  30. package/dist/voice/agent_activity.d.ts.map +1 -1
  31. package/dist/voice/agent_activity.js +112 -71
  32. package/dist/voice/agent_activity.js.map +1 -1
  33. package/dist/voice/agent_session.cjs +9 -2
  34. package/dist/voice/agent_session.cjs.map +1 -1
  35. package/dist/voice/agent_session.d.ts.map +1 -1
  36. package/dist/voice/agent_session.js +9 -2
  37. package/dist/voice/agent_session.js.map +1 -1
  38. package/dist/voice/avatar/datastream_io.cjs +204 -0
  39. package/dist/voice/avatar/datastream_io.cjs.map +1 -0
  40. package/dist/voice/avatar/datastream_io.d.cts +37 -0
  41. package/dist/voice/avatar/datastream_io.d.ts +37 -0
  42. package/dist/voice/avatar/datastream_io.d.ts.map +1 -0
  43. package/dist/voice/avatar/datastream_io.js +188 -0
  44. package/dist/voice/avatar/datastream_io.js.map +1 -0
  45. package/dist/{multimodal → voice/avatar}/index.cjs +4 -4
  46. package/dist/voice/avatar/index.cjs.map +1 -0
  47. package/dist/voice/avatar/index.d.cts +2 -0
  48. package/dist/voice/avatar/index.d.ts +2 -0
  49. package/dist/voice/avatar/index.d.ts.map +1 -0
  50. package/dist/voice/avatar/index.js +2 -0
  51. package/dist/voice/avatar/index.js.map +1 -0
  52. package/dist/voice/index.cjs +2 -0
  53. package/dist/voice/index.cjs.map +1 -1
  54. package/dist/voice/index.d.cts +1 -0
  55. package/dist/voice/index.d.ts +1 -0
  56. package/dist/voice/index.d.ts.map +1 -1
  57. package/dist/voice/index.js +1 -0
  58. package/dist/voice/index.js.map +1 -1
  59. package/dist/voice/io.cjs.map +1 -1
  60. package/dist/voice/io.d.cts +1 -1
  61. package/dist/voice/io.d.ts +1 -1
  62. package/dist/voice/io.d.ts.map +1 -1
  63. package/dist/voice/io.js.map +1 -1
  64. package/dist/voice/room_io/_input.cjs +2 -1
  65. package/dist/voice/room_io/_input.cjs.map +1 -1
  66. package/dist/voice/room_io/_input.d.ts.map +1 -1
  67. package/dist/voice/room_io/_input.js +2 -1
  68. package/dist/voice/room_io/_input.js.map +1 -1
  69. package/dist/voice/run_context.cjs +13 -0
  70. package/dist/voice/run_context.cjs.map +1 -1
  71. package/dist/voice/run_context.d.cts +10 -0
  72. package/dist/voice/run_context.d.ts +10 -0
  73. package/dist/voice/run_context.d.ts.map +1 -1
  74. package/dist/voice/run_context.js +13 -0
  75. package/dist/voice/run_context.js.map +1 -1
  76. package/dist/voice/speech_handle.cjs +152 -30
  77. package/dist/voice/speech_handle.cjs.map +1 -1
  78. package/dist/voice/speech_handle.d.cts +67 -16
  79. package/dist/voice/speech_handle.d.ts +67 -16
  80. package/dist/voice/speech_handle.d.ts.map +1 -1
  81. package/dist/voice/speech_handle.js +153 -31
  82. package/dist/voice/speech_handle.js.map +1 -1
  83. package/dist/worker.cjs +4 -1
  84. package/dist/worker.cjs.map +1 -1
  85. package/dist/worker.d.ts.map +1 -1
  86. package/dist/worker.js +4 -1
  87. package/dist/worker.js.map +1 -1
  88. package/package.json +2 -2
  89. package/src/index.ts +2 -3
  90. package/src/ipc/job_proc_lazy_main.ts +6 -3
  91. package/src/job.ts +27 -12
  92. package/src/tokenize/basic/hyphenator.ts +1 -1
  93. package/src/utils.ts +121 -1
  94. package/src/voice/agent_activity.ts +128 -78
  95. package/src/voice/agent_session.ts +11 -2
  96. package/src/voice/avatar/datastream_io.ts +247 -0
  97. package/src/voice/avatar/index.ts +4 -0
  98. package/src/voice/index.ts +2 -0
  99. package/src/voice/io.ts +1 -1
  100. package/src/voice/room_io/_input.ts +8 -3
  101. package/src/voice/run_context.ts +16 -2
  102. package/src/voice/speech_handle.ts +183 -38
  103. package/src/worker.ts +5 -1
  104. package/dist/multimodal/agent_playout.cjs +0 -233
  105. package/dist/multimodal/agent_playout.cjs.map +0 -1
  106. package/dist/multimodal/agent_playout.d.cts +0 -34
  107. package/dist/multimodal/agent_playout.d.ts +0 -34
  108. package/dist/multimodal/agent_playout.d.ts.map +0 -1
  109. package/dist/multimodal/agent_playout.js +0 -207
  110. package/dist/multimodal/agent_playout.js.map +0 -1
  111. package/dist/multimodal/index.cjs.map +0 -1
  112. package/dist/multimodal/index.d.cts +0 -2
  113. package/dist/multimodal/index.d.ts +0 -2
  114. package/dist/multimodal/index.d.ts.map +0 -1
  115. package/dist/multimodal/index.js +0 -2
  116. package/dist/multimodal/index.js.map +0 -1
  117. package/src/multimodal/agent_playout.ts +0 -266
  118. package/src/multimodal/index.ts +0 -4
@@ -82,7 +82,7 @@ export class AgentActivity implements RecognitionHooks {
82
82
  private _currentSpeech?: SpeechHandle;
83
83
  private speechQueue: Heap<[number, number, SpeechHandle]>; // [priority, timestamp, speechHandle]
84
84
  private q_updated: Future;
85
- private speechTasks: Set<Promise<unknown>> = new Set();
85
+ private speechTasks: Set<Task<void>> = new Set();
86
86
  private lock = new Mutex();
87
87
  private audioStream = new DeferredReadableStream<AudioFrame>();
88
88
  // default to null as None, which maps to the default provider tool choice value
@@ -269,7 +269,7 @@ export class AgentActivity implements RecognitionHooks {
269
269
 
270
270
  this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
271
271
  this.createSpeechTask({
272
- promise: this.agent.onEnter(),
272
+ task: Task.from(() => this.agent.onEnter()),
273
273
  name: 'AgentActivity_onEnter',
274
274
  });
275
275
  } finally {
@@ -441,9 +441,10 @@ export class AgentActivity implements RecognitionHooks {
441
441
  speechHandle: handle,
442
442
  }),
443
443
  );
444
-
445
444
  const task = this.createSpeechTask({
446
- promise: this.ttsTask(handle, text, addToChatCtx, {}, audio),
445
+ task: Task.from((abortController: AbortController) =>
446
+ this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
447
+ ),
447
448
  ownedSpeechHandle: handle,
448
449
  name: 'AgentActivity.say_tts',
449
450
  });
@@ -572,7 +573,9 @@ export class AgentActivity implements RecognitionHooks {
572
573
  this.logger.info({ speech_id: handle.id }, 'Creating speech handle');
573
574
 
574
575
  this.createSpeechTask({
575
- promise: this.realtimeGenerationTask(handle, ev, {}),
576
+ task: Task.from((abortController: AbortController) =>
577
+ this.realtimeGenerationTask(handle, ev, {}, abortController),
578
+ ),
576
579
  ownedSpeechHandle: handle,
577
580
  name: 'AgentActivity.realtimeGeneration',
578
581
  });
@@ -659,26 +662,32 @@ export class AgentActivity implements RecognitionHooks {
659
662
  );
660
663
  }
661
664
 
662
- private createSpeechTask<T>(options: {
663
- promise: Promise<T>;
665
+ private createSpeechTask(options: {
666
+ task: Task<void>;
664
667
  ownedSpeechHandle?: SpeechHandle;
665
668
  name?: string;
666
- }): Promise<T> {
667
- const { promise, ownedSpeechHandle } = options;
668
-
669
- this.speechTasks.add(promise);
669
+ }): Promise<void> {
670
+ const { task, ownedSpeechHandle } = options;
670
671
 
671
- promise.finally(() => {
672
- this.speechTasks.delete(promise);
672
+ this.speechTasks.add(task);
673
+ task.addDoneCallback(() => {
674
+ this.speechTasks.delete(task);
675
+ });
673
676
 
674
- if (ownedSpeechHandle) {
675
- ownedSpeechHandle._markPlayoutDone();
676
- }
677
+ if (ownedSpeechHandle) {
678
+ ownedSpeechHandle._tasks.push(task);
679
+ task.addDoneCallback(() => {
680
+ if (ownedSpeechHandle._tasks.every((t) => t.done)) {
681
+ ownedSpeechHandle._markDone();
682
+ }
683
+ });
684
+ }
677
685
 
686
+ task.addDoneCallback(() => {
678
687
  this.wakeupMainTask();
679
688
  });
680
689
 
681
- return promise;
690
+ return task.result;
682
691
  }
683
692
 
684
693
  async onEndOfTurn(info: EndOfTurnInfo): Promise<boolean> {
@@ -705,7 +714,7 @@ export class AgentActivity implements RecognitionHooks {
705
714
 
706
715
  const oldTask = this._userTurnCompletedTask;
707
716
  this._userTurnCompletedTask = this.createSpeechTask({
708
- promise: this.userTurnCompleted(info, oldTask),
717
+ task: Task.from(() => this.userTurnCompleted(info, oldTask)),
709
718
  name: 'AgentActivity.userTurnCompleted',
710
719
  });
711
720
  return true;
@@ -736,8 +745,8 @@ export class AgentActivity implements RecognitionHooks {
736
745
  }
737
746
  const speechHandle = heapItem[2];
738
747
  this._currentSpeech = speechHandle;
739
- speechHandle._authorizePlayout();
740
- await speechHandle.waitForPlayout();
748
+ speechHandle._authorizeGeneration();
749
+ await speechHandle._waitForGeneration();
741
750
  this._currentSpeech = undefined;
742
751
  }
743
752
 
@@ -815,16 +824,19 @@ export class AgentActivity implements RecognitionHooks {
815
824
 
816
825
  if (this.llm instanceof RealtimeModel) {
817
826
  this.createSpeechTask({
818
- promise: this.realtimeReplyTask({
819
- speechHandle: handle,
820
- // TODO(brian): support llm.ChatMessage for the realtime model
821
- userInput: userMessage?.textContent,
822
- instructions,
823
- modelSettings: {
824
- // isGiven(toolChoice) = toolChoice !== undefined
825
- toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
826
- },
827
- }),
827
+ task: Task.from((abortController: AbortController) =>
828
+ this.realtimeReplyTask({
829
+ speechHandle: handle,
830
+ // TODO(brian): support llm.ChatMessage for the realtime model
831
+ userInput: userMessage?.textContent,
832
+ instructions,
833
+ modelSettings: {
834
+ // isGiven(toolChoice) = toolChoice !== undefined
835
+ toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
836
+ },
837
+ abortController,
838
+ }),
839
+ ),
828
840
  ownedSpeechHandle: handle,
829
841
  name: 'AgentActivity.realtimeReply',
830
842
  });
@@ -837,13 +849,18 @@ export class AgentActivity implements RecognitionHooks {
837
849
  }
838
850
 
839
851
  const task = this.createSpeechTask({
840
- promise: this.pipelineReplyTask(
841
- handle,
842
- chatCtx ?? this.agent.chatCtx,
843
- this.agent.toolCtx,
844
- { toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice) },
845
- instructions ? `${this.agent.instructions}\n${instructions}` : instructions,
846
- userMessage,
852
+ task: Task.from((abortController: AbortController) =>
853
+ this.pipelineReplyTask(
854
+ handle,
855
+ chatCtx ?? this.agent.chatCtx,
856
+ this.agent.toolCtx,
857
+ {
858
+ toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
859
+ },
860
+ abortController,
861
+ instructions ? `${this.agent.instructions}\n${instructions}` : instructions,
862
+ userMessage,
863
+ ),
847
864
  ),
848
865
  ownedSpeechHandle: handle,
849
866
  name: 'AgentActivity.pipelineReply',
@@ -860,6 +877,8 @@ export class AgentActivity implements RecognitionHooks {
860
877
  const future = new Future<void>();
861
878
  const currentSpeech = this._currentSpeech;
862
879
 
880
+ //TODO(AJS-273): add interrupt for background speeches
881
+
863
882
  currentSpeech?.interrupt();
864
883
 
865
884
  for (const [_, __, speech] of this.speechQueue) {
@@ -871,7 +890,7 @@ export class AgentActivity implements RecognitionHooks {
871
890
  if (currentSpeech === undefined) {
872
891
  future.resolve();
873
892
  } else {
874
- currentSpeech.then(() => {
893
+ currentSpeech.addDoneCallback(() => {
875
894
  if (future.done) return;
876
895
  future.resolve();
877
896
  });
@@ -881,7 +900,7 @@ export class AgentActivity implements RecognitionHooks {
881
900
  }
882
901
 
883
902
  private onPipelineReplyDone(): void {
884
- if (!this.speechQueue.peek() && (!this._currentSpeech || this._currentSpeech.done)) {
903
+ if (!this.speechQueue.peek() && (!this._currentSpeech || this._currentSpeech.done())) {
885
904
  this.agentSession._updateAgentState('listening');
886
905
  }
887
906
  }
@@ -980,6 +999,7 @@ export class AgentActivity implements RecognitionHooks {
980
999
  text: string | ReadableStream<string>,
981
1000
  addToChatCtx: boolean,
982
1001
  modelSettings: ModelSettings,
1002
+ replyAbortController: AbortController,
983
1003
  audio?: ReadableStream<AudioFrame> | null,
984
1004
  ): Promise<void> {
985
1005
  speechHandleStorage.enterWith(speechHandle);
@@ -992,7 +1012,6 @@ export class AgentActivity implements RecognitionHooks {
992
1012
  ? this.agentSession.output.audio
993
1013
  : null;
994
1014
 
995
- const replyAbortController = new AbortController();
996
1015
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
997
1016
 
998
1017
  if (speechHandle.interrupted) {
@@ -1102,14 +1121,13 @@ export class AgentActivity implements RecognitionHooks {
1102
1121
  chatCtx: ChatContext,
1103
1122
  toolCtx: ToolContext,
1104
1123
  modelSettings: ModelSettings,
1124
+ replyAbortController: AbortController,
1105
1125
  instructions?: string,
1106
1126
  newMessage?: ChatMessage,
1107
1127
  toolsMessages?: ChatItem[],
1108
1128
  ): Promise<void> {
1109
1129
  speechHandleStorage.enterWith(speechHandle);
1110
1130
 
1111
- const replyAbortController = new AbortController();
1112
-
1113
1131
  const audioOutput = this.agentSession.output.audioEnabled
1114
1132
  ? this.agentSession.output.audio
1115
1133
  : null;
@@ -1163,13 +1181,25 @@ export class AgentActivity implements RecognitionHooks {
1163
1181
  tasks.push(ttsTask);
1164
1182
  }
1165
1183
 
1166
- await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
1184
+ await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
1185
+
1186
+ if (newMessage && speechHandle.scheduled) {
1187
+ chatCtx.insert(newMessage);
1188
+ this.agent._chatCtx.insert(newMessage);
1189
+ this.agentSession._conversationItemAdded(newMessage);
1190
+ }
1191
+
1167
1192
  if (speechHandle.interrupted) {
1168
1193
  replyAbortController.abort();
1169
1194
  await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1170
1195
  return;
1171
1196
  }
1172
1197
 
1198
+ this.agentSession._updateAgentState('thinking');
1199
+
1200
+ await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
1201
+ speechHandle._clearAuthorization();
1202
+
1173
1203
  const replyStartedAt = Date.now();
1174
1204
  const trNodeResult = await this.agent.transcriptionNode(llmOutput, modelSettings);
1175
1205
  let textOut: _TextOut | null = null;
@@ -1205,6 +1235,9 @@ export class AgentActivity implements RecognitionHooks {
1205
1235
  textOut?.firstTextFut.await.finally(onFirstFrame);
1206
1236
  }
1207
1237
 
1238
+ //TODO(AJS-272): before executing tools, make sure we generated all the text
1239
+ // (this ensure everything is kept ordered)
1240
+
1208
1241
  const onToolExecutionStarted = (_: FunctionCall) => {
1209
1242
  // TODO(brian): handle speech_handle item_added
1210
1243
  };
@@ -1223,7 +1256,6 @@ export class AgentActivity implements RecognitionHooks {
1223
1256
  onToolExecutionStarted,
1224
1257
  onToolExecutionCompleted,
1225
1258
  });
1226
- tasks.push(executeToolsTask);
1227
1259
 
1228
1260
  await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
1229
1261
 
@@ -1290,7 +1322,7 @@ export class AgentActivity implements RecognitionHooks {
1290
1322
  'playout completed with interrupt',
1291
1323
  );
1292
1324
  // TODO(shubhra) add chat message to speech handle
1293
- speechHandle._markPlayoutDone();
1325
+ speechHandle._markGenerationDone();
1294
1326
  await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1295
1327
  return;
1296
1328
  }
@@ -1318,14 +1350,15 @@ export class AgentActivity implements RecognitionHooks {
1318
1350
  this.agentSession._updateAgentState('listening');
1319
1351
  }
1320
1352
 
1321
- speechHandle._markPlayoutDone();
1353
+ // mark the playout done before waiting for the tool execution
1354
+ speechHandle._markGenerationDone();
1322
1355
  await executeToolsTask.result;
1323
1356
 
1324
1357
  if (toolOutput.output.length === 0) return;
1325
1358
 
1326
1359
  // important: no agent output should be used after this point
1327
1360
  const { maxToolSteps } = this.agentSession.options;
1328
- if (speechHandle.stepIndex >= maxToolSteps) {
1361
+ if (speechHandle.numSteps >= maxToolSteps) {
1329
1362
  this.logger.warn(
1330
1363
  { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
1331
1364
  'maximum number of function calls steps reached',
@@ -1390,7 +1423,7 @@ export class AgentActivity implements RecognitionHooks {
1390
1423
 
1391
1424
  const handle = SpeechHandle.create({
1392
1425
  allowInterruptions: speechHandle.allowInterruptions,
1393
- stepIndex: speechHandle.stepIndex + 1,
1426
+ stepIndex: speechHandle._stepIndex + 1,
1394
1427
  parent: speechHandle,
1395
1428
  });
1396
1429
  this.agentSession.emit(
@@ -1407,14 +1440,17 @@ export class AgentActivity implements RecognitionHooks {
1407
1440
  const respondToolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
1408
1441
 
1409
1442
  const toolResponseTask = this.createSpeechTask({
1410
- promise: this.pipelineReplyTask(
1411
- handle,
1412
- chatCtx,
1413
- toolCtx,
1414
- { toolChoice: respondToolChoice },
1415
- instructions,
1416
- undefined,
1417
- toolMessages,
1443
+ task: Task.from(() =>
1444
+ this.pipelineReplyTask(
1445
+ handle,
1446
+ chatCtx,
1447
+ toolCtx,
1448
+ { toolChoice: respondToolChoice },
1449
+ replyAbortController,
1450
+ instructions,
1451
+ undefined,
1452
+ toolMessages,
1453
+ ),
1418
1454
  ),
1419
1455
  ownedSpeechHandle: handle,
1420
1456
  name: 'AgentActivity.pipelineReply',
@@ -1435,6 +1471,7 @@ export class AgentActivity implements RecognitionHooks {
1435
1471
  speechHandle: SpeechHandle,
1436
1472
  ev: GenerationCreatedEvent,
1437
1473
  modelSettings: ModelSettings,
1474
+ replyAbortController: AbortController,
1438
1475
  ): Promise<void> {
1439
1476
  speechHandleStorage.enterWith(speechHandle);
1440
1477
 
@@ -1446,7 +1483,7 @@ export class AgentActivity implements RecognitionHooks {
1446
1483
  }
1447
1484
 
1448
1485
  this.logger.debug(
1449
- { speech_id: speechHandle.id, stepIndex: speechHandle.stepIndex },
1486
+ { speech_id: speechHandle.id, stepIndex: speechHandle.numSteps },
1450
1487
  'realtime generation started',
1451
1488
  );
1452
1489
 
@@ -1459,6 +1496,7 @@ export class AgentActivity implements RecognitionHooks {
1459
1496
  const toolCtx = this.realtimeSession.tools;
1460
1497
 
1461
1498
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
1499
+ speechHandle._clearAuthorization();
1462
1500
 
1463
1501
  if (speechHandle.interrupted) {
1464
1502
  return;
@@ -1468,8 +1506,6 @@ export class AgentActivity implements RecognitionHooks {
1468
1506
  this.agentSession._updateAgentState('speaking');
1469
1507
  };
1470
1508
 
1471
- const replyAbortController = new AbortController();
1472
-
1473
1509
  const readMessages = async (
1474
1510
  abortController: AbortController,
1475
1511
  outputs: Array<[string, _TextOut | null, _AudioOut | null]>,
@@ -1566,12 +1602,14 @@ export class AgentActivity implements RecognitionHooks {
1566
1602
  ),
1567
1603
  );
1568
1604
 
1569
- const onToolExecutionStarted = (_: FunctionCall) => {
1570
- // TODO(brian): handle speech_handle item_added
1605
+ const onToolExecutionStarted = (f: FunctionCall) => {
1606
+ speechHandle._itemAdded([f]);
1571
1607
  };
1572
1608
 
1573
- const onToolExecutionCompleted = (_: ToolExecutionOutput) => {
1574
- // TODO(brian): handle speech_handle item_added
1609
+ const onToolExecutionCompleted = (out: ToolExecutionOutput) => {
1610
+ if (out.toolCallOutput) {
1611
+ speechHandle._itemAdded([out.toolCallOutput]);
1612
+ }
1575
1613
  };
1576
1614
 
1577
1615
  const [executeToolsTask, toolOutput] = performToolExecutions({
@@ -1640,7 +1678,7 @@ export class AgentActivity implements RecognitionHooks {
1640
1678
  interrupted: true,
1641
1679
  });
1642
1680
  this.agent._chatCtx.insert(message);
1643
- speechHandle._setChatMessage(message);
1681
+ speechHandle._itemAdded([message]);
1644
1682
  this.agentSession._conversationItemAdded(message);
1645
1683
 
1646
1684
  // TODO(brian): add tracing span
@@ -1650,8 +1688,7 @@ export class AgentActivity implements RecognitionHooks {
1650
1688
  'playout completed with interrupt',
1651
1689
  );
1652
1690
  }
1653
- // TODO(shubhra) add chat message to speech handle
1654
- speechHandle._markPlayoutDone();
1691
+ speechHandle._markGenerationDone();
1655
1692
  await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1656
1693
 
1657
1694
  // TODO(brian): close tees
@@ -1668,13 +1705,13 @@ export class AgentActivity implements RecognitionHooks {
1668
1705
  interrupted: false,
1669
1706
  });
1670
1707
  this.agent._chatCtx.insert(message);
1671
- speechHandle._setChatMessage(message);
1708
+ speechHandle._itemAdded([message]);
1672
1709
  this.agentSession._conversationItemAdded(message); // mark the playout done before waiting for the tool execution\
1673
1710
  // TODO(brian): add tracing span
1674
1711
  }
1675
1712
 
1676
1713
  // mark the playout done before waiting for the tool execution
1677
- speechHandle._markPlayoutDone();
1714
+ speechHandle._markGenerationDone();
1678
1715
  // TODO(brian): close tees
1679
1716
 
1680
1717
  toolOutput.firstToolStartedFuture.await.finally(() => {
@@ -1687,7 +1724,7 @@ export class AgentActivity implements RecognitionHooks {
1687
1724
 
1688
1725
  // important: no agent output should be used after this point
1689
1726
  const { maxToolSteps } = this.agentSession.options;
1690
- if (speechHandle.stepIndex >= maxToolSteps) {
1727
+ if (speechHandle.numSteps >= maxToolSteps) {
1691
1728
  this.logger.warn(
1692
1729
  { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
1693
1730
  'maximum number of function calls steps reached',
@@ -1763,7 +1800,7 @@ export class AgentActivity implements RecognitionHooks {
1763
1800
 
1764
1801
  const replySpeechHandle = SpeechHandle.create({
1765
1802
  allowInterruptions: speechHandle.allowInterruptions,
1766
- stepIndex: speechHandle.stepIndex + 1,
1803
+ stepIndex: speechHandle.numSteps + 1,
1767
1804
  parent: speechHandle,
1768
1805
  });
1769
1806
  this.agentSession.emit(
@@ -1777,10 +1814,13 @@ export class AgentActivity implements RecognitionHooks {
1777
1814
 
1778
1815
  const toolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
1779
1816
  this.createSpeechTask({
1780
- promise: this.realtimeReplyTask({
1781
- speechHandle: replySpeechHandle,
1782
- modelSettings: { toolChoice },
1783
- }),
1817
+ task: Task.from((abortController: AbortController) =>
1818
+ this.realtimeReplyTask({
1819
+ speechHandle: replySpeechHandle,
1820
+ modelSettings: { toolChoice },
1821
+ abortController,
1822
+ }),
1823
+ ),
1784
1824
  ownedSpeechHandle: replySpeechHandle,
1785
1825
  name: 'AgentActivity.realtime_reply',
1786
1826
  });
@@ -1793,9 +1833,11 @@ export class AgentActivity implements RecognitionHooks {
1793
1833
  modelSettings: { toolChoice },
1794
1834
  userInput,
1795
1835
  instructions,
1836
+ abortController,
1796
1837
  }: {
1797
1838
  speechHandle: SpeechHandle;
1798
1839
  modelSettings: ModelSettings;
1840
+ abortController: AbortController;
1799
1841
  userInput?: string;
1800
1842
  instructions?: string;
1801
1843
  }): Promise<void> {
@@ -1825,7 +1867,12 @@ export class AgentActivity implements RecognitionHooks {
1825
1867
 
1826
1868
  try {
1827
1869
  const generationEvent = await this.realtimeSession.generateReply(instructions);
1828
- await this.realtimeGenerationTask(speechHandle, generationEvent, { toolChoice });
1870
+ await this.realtimeGenerationTask(
1871
+ speechHandle,
1872
+ generationEvent,
1873
+ { toolChoice },
1874
+ abortController,
1875
+ );
1829
1876
  } finally {
1830
1877
  // reset toolChoice value
1831
1878
  if (toolChoice !== undefined && toolChoice !== originalToolChoice) {
@@ -1837,14 +1884,17 @@ export class AgentActivity implements RecognitionHooks {
1837
1884
  private scheduleSpeech(
1838
1885
  speechHandle: SpeechHandle,
1839
1886
  priority: number,
1840
- bypassDraining: boolean = false,
1887
+ force: boolean = false,
1841
1888
  ): void {
1842
- if (this.draining && !bypassDraining) {
1889
+ // when force=true, we allow tool responses to bypass draining
1890
+ // This allows for tool responses to be generated before the AgentActivity is finalized
1891
+ if (this.draining && !force) {
1843
1892
  throw new Error('cannot schedule new speech, the agent is draining');
1844
1893
  }
1845
1894
 
1846
1895
  // Monotonic time to avoid near 0 collisions
1847
1896
  this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
1897
+ speechHandle._markScheduled();
1848
1898
  this.wakeupMainTask();
1849
1899
  }
1850
1900
 
@@ -1854,7 +1904,7 @@ export class AgentActivity implements RecognitionHooks {
1854
1904
  if (this._draining) return;
1855
1905
 
1856
1906
  this.createSpeechTask({
1857
- promise: this.agent.onExit(),
1907
+ task: Task.from(() => this.agent.onExit()),
1858
1908
  name: 'AgentActivity_onExit',
1859
1909
  });
1860
1910
 
@@ -5,6 +5,7 @@ import type { AudioFrame, Room } from '@livekit/rtc-node';
5
5
  import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
6
6
  import { EventEmitter } from 'node:events';
7
7
  import type { ReadableStream } from 'node:stream/web';
8
+ import { getJobContext } from '../job.js';
8
9
  import { ChatContext, ChatMessage } from '../llm/chat_context.js';
9
10
  import type { LLM, RealtimeModel, RealtimeModelError, ToolChoice } from '../llm/index.js';
10
11
  import type { LLMError } from '../llm/llm.js';
@@ -184,6 +185,7 @@ export class AgentSession<
184
185
  this.agent = agent;
185
186
  this._updateAgentState('initializing');
186
187
 
188
+ const tasks: Promise<void>[] = [];
187
189
  // Check for existing input/output configuration and warn if needed
188
190
  if (this.input.audio && inputOptions?.audioEnabled !== false) {
189
191
  this.logger.warn('RoomIO audio input is enabled but input.audio is already set, ignoring..');
@@ -209,7 +211,15 @@ export class AgentSession<
209
211
  });
210
212
  this.roomIO.start();
211
213
 
212
- this.updateActivity(this.agent);
214
+ const ctx = getJobContext();
215
+ if (ctx && ctx.room === room && !room.isConnected) {
216
+ this.logger.debug('Auto-connecting to room via job context');
217
+ tasks.push(ctx.connect());
218
+ }
219
+ // TODO(AJS-265): add shutdown callback to job context
220
+ tasks.push(this.updateActivity(this.agent));
221
+
222
+ await Promise.allSettled(tasks);
213
223
 
214
224
  // Log used IO configuration
215
225
  this.logger.debug(
@@ -220,7 +230,6 @@ export class AgentSession<
220
230
  `using transcript io: \`AgentSession\` -> ${this.output.transcription ? '`' + this.output.transcription.constructor.name + '`' : '(none)'}`,
221
231
  );
222
232
 
223
- this.logger.debug('AgentSession started');
224
233
  this.started = true;
225
234
  this._updateAgentState('listening');
226
235
  }