@livekit/agents 1.0.2 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +2 -5
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -3
- package/dist/index.d.ts +2 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -3
- package/dist/index.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +3 -2
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +4 -3
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/job.cjs +20 -14
- package/dist/job.cjs.map +1 -1
- package/dist/job.d.cts +11 -5
- package/dist/job.d.ts +11 -5
- package/dist/job.d.ts.map +1 -1
- package/dist/job.js +17 -12
- package/dist/job.js.map +1 -1
- package/dist/tokenize/basic/hyphenator.cjs.map +1 -1
- package/dist/tokenize/basic/hyphenator.js.map +1 -1
- package/dist/utils.cjs +77 -0
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +21 -0
- package/dist/utils.d.ts +21 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +76 -1
- package/dist/utils.js.map +1 -1
- package/dist/voice/agent_activity.cjs +112 -71
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +112 -71
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +9 -2
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +9 -2
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/avatar/datastream_io.cjs +204 -0
- package/dist/voice/avatar/datastream_io.cjs.map +1 -0
- package/dist/voice/avatar/datastream_io.d.cts +37 -0
- package/dist/voice/avatar/datastream_io.d.ts +37 -0
- package/dist/voice/avatar/datastream_io.d.ts.map +1 -0
- package/dist/voice/avatar/datastream_io.js +188 -0
- package/dist/voice/avatar/datastream_io.js.map +1 -0
- package/dist/{multimodal → voice/avatar}/index.cjs +4 -4
- package/dist/voice/avatar/index.cjs.map +1 -0
- package/dist/voice/avatar/index.d.cts +2 -0
- package/dist/voice/avatar/index.d.ts +2 -0
- package/dist/voice/avatar/index.d.ts.map +1 -0
- package/dist/voice/avatar/index.js +2 -0
- package/dist/voice/avatar/index.js.map +1 -0
- package/dist/voice/index.cjs +2 -0
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -0
- package/dist/voice/index.d.ts +1 -0
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +1 -0
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +1 -1
- package/dist/voice/io.d.ts +1 -1
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/room_io/_input.cjs +2 -1
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +2 -1
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/run_context.cjs +13 -0
- package/dist/voice/run_context.cjs.map +1 -1
- package/dist/voice/run_context.d.cts +10 -0
- package/dist/voice/run_context.d.ts +10 -0
- package/dist/voice/run_context.d.ts.map +1 -1
- package/dist/voice/run_context.js +13 -0
- package/dist/voice/run_context.js.map +1 -1
- package/dist/voice/speech_handle.cjs +152 -30
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +67 -16
- package/dist/voice/speech_handle.d.ts +67 -16
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +153 -31
- package/dist/voice/speech_handle.js.map +1 -1
- package/dist/worker.cjs +4 -1
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +4 -1
- package/dist/worker.js.map +1 -1
- package/package.json +2 -2
- package/src/index.ts +2 -3
- package/src/ipc/job_proc_lazy_main.ts +6 -3
- package/src/job.ts +27 -12
- package/src/tokenize/basic/hyphenator.ts +1 -1
- package/src/utils.ts +121 -1
- package/src/voice/agent_activity.ts +128 -78
- package/src/voice/agent_session.ts +11 -2
- package/src/voice/avatar/datastream_io.ts +247 -0
- package/src/voice/avatar/index.ts +4 -0
- package/src/voice/index.ts +2 -0
- package/src/voice/io.ts +1 -1
- package/src/voice/room_io/_input.ts +8 -3
- package/src/voice/run_context.ts +16 -2
- package/src/voice/speech_handle.ts +183 -38
- package/src/worker.ts +5 -1
- package/dist/multimodal/agent_playout.cjs +0 -233
- package/dist/multimodal/agent_playout.cjs.map +0 -1
- package/dist/multimodal/agent_playout.d.cts +0 -34
- package/dist/multimodal/agent_playout.d.ts +0 -34
- package/dist/multimodal/agent_playout.d.ts.map +0 -1
- package/dist/multimodal/agent_playout.js +0 -207
- package/dist/multimodal/agent_playout.js.map +0 -1
- package/dist/multimodal/index.cjs.map +0 -1
- package/dist/multimodal/index.d.cts +0 -2
- package/dist/multimodal/index.d.ts +0 -2
- package/dist/multimodal/index.d.ts.map +0 -1
- package/dist/multimodal/index.js +0 -2
- package/dist/multimodal/index.js.map +0 -1
- package/src/multimodal/agent_playout.ts +0 -266
- package/src/multimodal/index.ts +0 -4
|
@@ -82,7 +82,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
82
82
|
private _currentSpeech?: SpeechHandle;
|
|
83
83
|
private speechQueue: Heap<[number, number, SpeechHandle]>; // [priority, timestamp, speechHandle]
|
|
84
84
|
private q_updated: Future;
|
|
85
|
-
private speechTasks: Set<
|
|
85
|
+
private speechTasks: Set<Task<void>> = new Set();
|
|
86
86
|
private lock = new Mutex();
|
|
87
87
|
private audioStream = new DeferredReadableStream<AudioFrame>();
|
|
88
88
|
// default to null as None, which maps to the default provider tool choice value
|
|
@@ -269,7 +269,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
269
269
|
|
|
270
270
|
this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
|
|
271
271
|
this.createSpeechTask({
|
|
272
|
-
|
|
272
|
+
task: Task.from(() => this.agent.onEnter()),
|
|
273
273
|
name: 'AgentActivity_onEnter',
|
|
274
274
|
});
|
|
275
275
|
} finally {
|
|
@@ -441,9 +441,10 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
441
441
|
speechHandle: handle,
|
|
442
442
|
}),
|
|
443
443
|
);
|
|
444
|
-
|
|
445
444
|
const task = this.createSpeechTask({
|
|
446
|
-
|
|
445
|
+
task: Task.from((abortController: AbortController) =>
|
|
446
|
+
this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
|
|
447
|
+
),
|
|
447
448
|
ownedSpeechHandle: handle,
|
|
448
449
|
name: 'AgentActivity.say_tts',
|
|
449
450
|
});
|
|
@@ -572,7 +573,9 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
572
573
|
this.logger.info({ speech_id: handle.id }, 'Creating speech handle');
|
|
573
574
|
|
|
574
575
|
this.createSpeechTask({
|
|
575
|
-
|
|
576
|
+
task: Task.from((abortController: AbortController) =>
|
|
577
|
+
this.realtimeGenerationTask(handle, ev, {}, abortController),
|
|
578
|
+
),
|
|
576
579
|
ownedSpeechHandle: handle,
|
|
577
580
|
name: 'AgentActivity.realtimeGeneration',
|
|
578
581
|
});
|
|
@@ -659,26 +662,32 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
659
662
|
);
|
|
660
663
|
}
|
|
661
664
|
|
|
662
|
-
private createSpeechTask
|
|
663
|
-
|
|
665
|
+
private createSpeechTask(options: {
|
|
666
|
+
task: Task<void>;
|
|
664
667
|
ownedSpeechHandle?: SpeechHandle;
|
|
665
668
|
name?: string;
|
|
666
|
-
}): Promise<
|
|
667
|
-
const {
|
|
668
|
-
|
|
669
|
-
this.speechTasks.add(promise);
|
|
669
|
+
}): Promise<void> {
|
|
670
|
+
const { task, ownedSpeechHandle } = options;
|
|
670
671
|
|
|
671
|
-
|
|
672
|
-
|
|
672
|
+
this.speechTasks.add(task);
|
|
673
|
+
task.addDoneCallback(() => {
|
|
674
|
+
this.speechTasks.delete(task);
|
|
675
|
+
});
|
|
673
676
|
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
+
if (ownedSpeechHandle) {
|
|
678
|
+
ownedSpeechHandle._tasks.push(task);
|
|
679
|
+
task.addDoneCallback(() => {
|
|
680
|
+
if (ownedSpeechHandle._tasks.every((t) => t.done)) {
|
|
681
|
+
ownedSpeechHandle._markDone();
|
|
682
|
+
}
|
|
683
|
+
});
|
|
684
|
+
}
|
|
677
685
|
|
|
686
|
+
task.addDoneCallback(() => {
|
|
678
687
|
this.wakeupMainTask();
|
|
679
688
|
});
|
|
680
689
|
|
|
681
|
-
return
|
|
690
|
+
return task.result;
|
|
682
691
|
}
|
|
683
692
|
|
|
684
693
|
async onEndOfTurn(info: EndOfTurnInfo): Promise<boolean> {
|
|
@@ -705,7 +714,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
705
714
|
|
|
706
715
|
const oldTask = this._userTurnCompletedTask;
|
|
707
716
|
this._userTurnCompletedTask = this.createSpeechTask({
|
|
708
|
-
|
|
717
|
+
task: Task.from(() => this.userTurnCompleted(info, oldTask)),
|
|
709
718
|
name: 'AgentActivity.userTurnCompleted',
|
|
710
719
|
});
|
|
711
720
|
return true;
|
|
@@ -736,8 +745,8 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
736
745
|
}
|
|
737
746
|
const speechHandle = heapItem[2];
|
|
738
747
|
this._currentSpeech = speechHandle;
|
|
739
|
-
speechHandle.
|
|
740
|
-
await speechHandle.
|
|
748
|
+
speechHandle._authorizeGeneration();
|
|
749
|
+
await speechHandle._waitForGeneration();
|
|
741
750
|
this._currentSpeech = undefined;
|
|
742
751
|
}
|
|
743
752
|
|
|
@@ -815,16 +824,19 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
815
824
|
|
|
816
825
|
if (this.llm instanceof RealtimeModel) {
|
|
817
826
|
this.createSpeechTask({
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
827
|
+
task: Task.from((abortController: AbortController) =>
|
|
828
|
+
this.realtimeReplyTask({
|
|
829
|
+
speechHandle: handle,
|
|
830
|
+
// TODO(brian): support llm.ChatMessage for the realtime model
|
|
831
|
+
userInput: userMessage?.textContent,
|
|
832
|
+
instructions,
|
|
833
|
+
modelSettings: {
|
|
834
|
+
// isGiven(toolChoice) = toolChoice !== undefined
|
|
835
|
+
toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
|
|
836
|
+
},
|
|
837
|
+
abortController,
|
|
838
|
+
}),
|
|
839
|
+
),
|
|
828
840
|
ownedSpeechHandle: handle,
|
|
829
841
|
name: 'AgentActivity.realtimeReply',
|
|
830
842
|
});
|
|
@@ -837,13 +849,18 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
837
849
|
}
|
|
838
850
|
|
|
839
851
|
const task = this.createSpeechTask({
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
852
|
+
task: Task.from((abortController: AbortController) =>
|
|
853
|
+
this.pipelineReplyTask(
|
|
854
|
+
handle,
|
|
855
|
+
chatCtx ?? this.agent.chatCtx,
|
|
856
|
+
this.agent.toolCtx,
|
|
857
|
+
{
|
|
858
|
+
toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
|
|
859
|
+
},
|
|
860
|
+
abortController,
|
|
861
|
+
instructions ? `${this.agent.instructions}\n${instructions}` : instructions,
|
|
862
|
+
userMessage,
|
|
863
|
+
),
|
|
847
864
|
),
|
|
848
865
|
ownedSpeechHandle: handle,
|
|
849
866
|
name: 'AgentActivity.pipelineReply',
|
|
@@ -860,6 +877,8 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
860
877
|
const future = new Future<void>();
|
|
861
878
|
const currentSpeech = this._currentSpeech;
|
|
862
879
|
|
|
880
|
+
//TODO(AJS-273): add interrupt for background speeches
|
|
881
|
+
|
|
863
882
|
currentSpeech?.interrupt();
|
|
864
883
|
|
|
865
884
|
for (const [_, __, speech] of this.speechQueue) {
|
|
@@ -871,7 +890,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
871
890
|
if (currentSpeech === undefined) {
|
|
872
891
|
future.resolve();
|
|
873
892
|
} else {
|
|
874
|
-
currentSpeech.
|
|
893
|
+
currentSpeech.addDoneCallback(() => {
|
|
875
894
|
if (future.done) return;
|
|
876
895
|
future.resolve();
|
|
877
896
|
});
|
|
@@ -881,7 +900,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
881
900
|
}
|
|
882
901
|
|
|
883
902
|
private onPipelineReplyDone(): void {
|
|
884
|
-
if (!this.speechQueue.peek() && (!this._currentSpeech || this._currentSpeech.done)) {
|
|
903
|
+
if (!this.speechQueue.peek() && (!this._currentSpeech || this._currentSpeech.done())) {
|
|
885
904
|
this.agentSession._updateAgentState('listening');
|
|
886
905
|
}
|
|
887
906
|
}
|
|
@@ -980,6 +999,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
980
999
|
text: string | ReadableStream<string>,
|
|
981
1000
|
addToChatCtx: boolean,
|
|
982
1001
|
modelSettings: ModelSettings,
|
|
1002
|
+
replyAbortController: AbortController,
|
|
983
1003
|
audio?: ReadableStream<AudioFrame> | null,
|
|
984
1004
|
): Promise<void> {
|
|
985
1005
|
speechHandleStorage.enterWith(speechHandle);
|
|
@@ -992,7 +1012,6 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
992
1012
|
? this.agentSession.output.audio
|
|
993
1013
|
: null;
|
|
994
1014
|
|
|
995
|
-
const replyAbortController = new AbortController();
|
|
996
1015
|
await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
|
|
997
1016
|
|
|
998
1017
|
if (speechHandle.interrupted) {
|
|
@@ -1102,14 +1121,13 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1102
1121
|
chatCtx: ChatContext,
|
|
1103
1122
|
toolCtx: ToolContext,
|
|
1104
1123
|
modelSettings: ModelSettings,
|
|
1124
|
+
replyAbortController: AbortController,
|
|
1105
1125
|
instructions?: string,
|
|
1106
1126
|
newMessage?: ChatMessage,
|
|
1107
1127
|
toolsMessages?: ChatItem[],
|
|
1108
1128
|
): Promise<void> {
|
|
1109
1129
|
speechHandleStorage.enterWith(speechHandle);
|
|
1110
1130
|
|
|
1111
|
-
const replyAbortController = new AbortController();
|
|
1112
|
-
|
|
1113
1131
|
const audioOutput = this.agentSession.output.audioEnabled
|
|
1114
1132
|
? this.agentSession.output.audio
|
|
1115
1133
|
: null;
|
|
@@ -1163,13 +1181,25 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1163
1181
|
tasks.push(ttsTask);
|
|
1164
1182
|
}
|
|
1165
1183
|
|
|
1166
|
-
await speechHandle.waitIfNotInterrupted([speechHandle.
|
|
1184
|
+
await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
|
|
1185
|
+
|
|
1186
|
+
if (newMessage && speechHandle.scheduled) {
|
|
1187
|
+
chatCtx.insert(newMessage);
|
|
1188
|
+
this.agent._chatCtx.insert(newMessage);
|
|
1189
|
+
this.agentSession._conversationItemAdded(newMessage);
|
|
1190
|
+
}
|
|
1191
|
+
|
|
1167
1192
|
if (speechHandle.interrupted) {
|
|
1168
1193
|
replyAbortController.abort();
|
|
1169
1194
|
await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
1170
1195
|
return;
|
|
1171
1196
|
}
|
|
1172
1197
|
|
|
1198
|
+
this.agentSession._updateAgentState('thinking');
|
|
1199
|
+
|
|
1200
|
+
await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
|
|
1201
|
+
speechHandle._clearAuthorization();
|
|
1202
|
+
|
|
1173
1203
|
const replyStartedAt = Date.now();
|
|
1174
1204
|
const trNodeResult = await this.agent.transcriptionNode(llmOutput, modelSettings);
|
|
1175
1205
|
let textOut: _TextOut | null = null;
|
|
@@ -1205,6 +1235,9 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1205
1235
|
textOut?.firstTextFut.await.finally(onFirstFrame);
|
|
1206
1236
|
}
|
|
1207
1237
|
|
|
1238
|
+
//TODO(AJS-272): before executing tools, make sure we generated all the text
|
|
1239
|
+
// (this ensures everything is kept ordered)
|
|
1240
|
+
|
|
1208
1241
|
const onToolExecutionStarted = (_: FunctionCall) => {
|
|
1209
1242
|
// TODO(brian): handle speech_handle item_added
|
|
1210
1243
|
};
|
|
@@ -1223,7 +1256,6 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1223
1256
|
onToolExecutionStarted,
|
|
1224
1257
|
onToolExecutionCompleted,
|
|
1225
1258
|
});
|
|
1226
|
-
tasks.push(executeToolsTask);
|
|
1227
1259
|
|
|
1228
1260
|
await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
|
|
1229
1261
|
|
|
@@ -1290,7 +1322,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1290
1322
|
'playout completed with interrupt',
|
|
1291
1323
|
);
|
|
1292
1324
|
// TODO(shubhra) add chat message to speech handle
|
|
1293
|
-
speechHandle.
|
|
1325
|
+
speechHandle._markGenerationDone();
|
|
1294
1326
|
await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
1295
1327
|
return;
|
|
1296
1328
|
}
|
|
@@ -1318,14 +1350,15 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1318
1350
|
this.agentSession._updateAgentState('listening');
|
|
1319
1351
|
}
|
|
1320
1352
|
|
|
1321
|
-
|
|
1353
|
+
// mark the playout done before waiting for the tool execution
|
|
1354
|
+
speechHandle._markGenerationDone();
|
|
1322
1355
|
await executeToolsTask.result;
|
|
1323
1356
|
|
|
1324
1357
|
if (toolOutput.output.length === 0) return;
|
|
1325
1358
|
|
|
1326
1359
|
// important: no agent output should be used after this point
|
|
1327
1360
|
const { maxToolSteps } = this.agentSession.options;
|
|
1328
|
-
if (speechHandle.
|
|
1361
|
+
if (speechHandle.numSteps >= maxToolSteps) {
|
|
1329
1362
|
this.logger.warn(
|
|
1330
1363
|
{ speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
|
|
1331
1364
|
'maximum number of function calls steps reached',
|
|
@@ -1390,7 +1423,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1390
1423
|
|
|
1391
1424
|
const handle = SpeechHandle.create({
|
|
1392
1425
|
allowInterruptions: speechHandle.allowInterruptions,
|
|
1393
|
-
stepIndex: speechHandle.
|
|
1426
|
+
stepIndex: speechHandle._stepIndex + 1,
|
|
1394
1427
|
parent: speechHandle,
|
|
1395
1428
|
});
|
|
1396
1429
|
this.agentSession.emit(
|
|
@@ -1407,14 +1440,17 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1407
1440
|
const respondToolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
|
|
1408
1441
|
|
|
1409
1442
|
const toolResponseTask = this.createSpeechTask({
|
|
1410
|
-
|
|
1411
|
-
|
|
1412
|
-
|
|
1413
|
-
|
|
1414
|
-
|
|
1415
|
-
|
|
1416
|
-
|
|
1417
|
-
|
|
1443
|
+
task: Task.from(() =>
|
|
1444
|
+
this.pipelineReplyTask(
|
|
1445
|
+
handle,
|
|
1446
|
+
chatCtx,
|
|
1447
|
+
toolCtx,
|
|
1448
|
+
{ toolChoice: respondToolChoice },
|
|
1449
|
+
replyAbortController,
|
|
1450
|
+
instructions,
|
|
1451
|
+
undefined,
|
|
1452
|
+
toolMessages,
|
|
1453
|
+
),
|
|
1418
1454
|
),
|
|
1419
1455
|
ownedSpeechHandle: handle,
|
|
1420
1456
|
name: 'AgentActivity.pipelineReply',
|
|
@@ -1435,6 +1471,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1435
1471
|
speechHandle: SpeechHandle,
|
|
1436
1472
|
ev: GenerationCreatedEvent,
|
|
1437
1473
|
modelSettings: ModelSettings,
|
|
1474
|
+
replyAbortController: AbortController,
|
|
1438
1475
|
): Promise<void> {
|
|
1439
1476
|
speechHandleStorage.enterWith(speechHandle);
|
|
1440
1477
|
|
|
@@ -1446,7 +1483,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1446
1483
|
}
|
|
1447
1484
|
|
|
1448
1485
|
this.logger.debug(
|
|
1449
|
-
{ speech_id: speechHandle.id, stepIndex: speechHandle.
|
|
1486
|
+
{ speech_id: speechHandle.id, stepIndex: speechHandle.numSteps },
|
|
1450
1487
|
'realtime generation started',
|
|
1451
1488
|
);
|
|
1452
1489
|
|
|
@@ -1459,6 +1496,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1459
1496
|
const toolCtx = this.realtimeSession.tools;
|
|
1460
1497
|
|
|
1461
1498
|
await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
|
|
1499
|
+
speechHandle._clearAuthorization();
|
|
1462
1500
|
|
|
1463
1501
|
if (speechHandle.interrupted) {
|
|
1464
1502
|
return;
|
|
@@ -1468,8 +1506,6 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1468
1506
|
this.agentSession._updateAgentState('speaking');
|
|
1469
1507
|
};
|
|
1470
1508
|
|
|
1471
|
-
const replyAbortController = new AbortController();
|
|
1472
|
-
|
|
1473
1509
|
const readMessages = async (
|
|
1474
1510
|
abortController: AbortController,
|
|
1475
1511
|
outputs: Array<[string, _TextOut | null, _AudioOut | null]>,
|
|
@@ -1566,12 +1602,14 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1566
1602
|
),
|
|
1567
1603
|
);
|
|
1568
1604
|
|
|
1569
|
-
const onToolExecutionStarted = (
|
|
1570
|
-
|
|
1605
|
+
const onToolExecutionStarted = (f: FunctionCall) => {
|
|
1606
|
+
speechHandle._itemAdded([f]);
|
|
1571
1607
|
};
|
|
1572
1608
|
|
|
1573
|
-
const onToolExecutionCompleted = (
|
|
1574
|
-
|
|
1609
|
+
const onToolExecutionCompleted = (out: ToolExecutionOutput) => {
|
|
1610
|
+
if (out.toolCallOutput) {
|
|
1611
|
+
speechHandle._itemAdded([out.toolCallOutput]);
|
|
1612
|
+
}
|
|
1575
1613
|
};
|
|
1576
1614
|
|
|
1577
1615
|
const [executeToolsTask, toolOutput] = performToolExecutions({
|
|
@@ -1640,7 +1678,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1640
1678
|
interrupted: true,
|
|
1641
1679
|
});
|
|
1642
1680
|
this.agent._chatCtx.insert(message);
|
|
1643
|
-
speechHandle.
|
|
1681
|
+
speechHandle._itemAdded([message]);
|
|
1644
1682
|
this.agentSession._conversationItemAdded(message);
|
|
1645
1683
|
|
|
1646
1684
|
// TODO(brian): add tracing span
|
|
@@ -1650,8 +1688,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1650
1688
|
'playout completed with interrupt',
|
|
1651
1689
|
);
|
|
1652
1690
|
}
|
|
1653
|
-
|
|
1654
|
-
speechHandle._markPlayoutDone();
|
|
1691
|
+
speechHandle._markGenerationDone();
|
|
1655
1692
|
await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
1656
1693
|
|
|
1657
1694
|
// TODO(brian): close tees
|
|
@@ -1668,13 +1705,13 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1668
1705
|
interrupted: false,
|
|
1669
1706
|
});
|
|
1670
1707
|
this.agent._chatCtx.insert(message);
|
|
1671
|
-
speechHandle.
|
|
1708
|
+
speechHandle._itemAdded([message]);
|
|
1672
1709
|
this.agentSession._conversationItemAdded(message); // mark the playout done before waiting for the tool execution\
|
|
1673
1710
|
// TODO(brian): add tracing span
|
|
1674
1711
|
}
|
|
1675
1712
|
|
|
1676
1713
|
// mark the playout done before waiting for the tool execution
|
|
1677
|
-
speechHandle.
|
|
1714
|
+
speechHandle._markGenerationDone();
|
|
1678
1715
|
// TODO(brian): close tees
|
|
1679
1716
|
|
|
1680
1717
|
toolOutput.firstToolStartedFuture.await.finally(() => {
|
|
@@ -1687,7 +1724,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1687
1724
|
|
|
1688
1725
|
// important: no agent output should be used after this point
|
|
1689
1726
|
const { maxToolSteps } = this.agentSession.options;
|
|
1690
|
-
if (speechHandle.
|
|
1727
|
+
if (speechHandle.numSteps >= maxToolSteps) {
|
|
1691
1728
|
this.logger.warn(
|
|
1692
1729
|
{ speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
|
|
1693
1730
|
'maximum number of function calls steps reached',
|
|
@@ -1763,7 +1800,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1763
1800
|
|
|
1764
1801
|
const replySpeechHandle = SpeechHandle.create({
|
|
1765
1802
|
allowInterruptions: speechHandle.allowInterruptions,
|
|
1766
|
-
stepIndex: speechHandle.
|
|
1803
|
+
stepIndex: speechHandle.numSteps + 1,
|
|
1767
1804
|
parent: speechHandle,
|
|
1768
1805
|
});
|
|
1769
1806
|
this.agentSession.emit(
|
|
@@ -1777,10 +1814,13 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1777
1814
|
|
|
1778
1815
|
const toolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
|
|
1779
1816
|
this.createSpeechTask({
|
|
1780
|
-
|
|
1781
|
-
|
|
1782
|
-
|
|
1783
|
-
|
|
1817
|
+
task: Task.from((abortController: AbortController) =>
|
|
1818
|
+
this.realtimeReplyTask({
|
|
1819
|
+
speechHandle: replySpeechHandle,
|
|
1820
|
+
modelSettings: { toolChoice },
|
|
1821
|
+
abortController,
|
|
1822
|
+
}),
|
|
1823
|
+
),
|
|
1784
1824
|
ownedSpeechHandle: replySpeechHandle,
|
|
1785
1825
|
name: 'AgentActivity.realtime_reply',
|
|
1786
1826
|
});
|
|
@@ -1793,9 +1833,11 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1793
1833
|
modelSettings: { toolChoice },
|
|
1794
1834
|
userInput,
|
|
1795
1835
|
instructions,
|
|
1836
|
+
abortController,
|
|
1796
1837
|
}: {
|
|
1797
1838
|
speechHandle: SpeechHandle;
|
|
1798
1839
|
modelSettings: ModelSettings;
|
|
1840
|
+
abortController: AbortController;
|
|
1799
1841
|
userInput?: string;
|
|
1800
1842
|
instructions?: string;
|
|
1801
1843
|
}): Promise<void> {
|
|
@@ -1825,7 +1867,12 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1825
1867
|
|
|
1826
1868
|
try {
|
|
1827
1869
|
const generationEvent = await this.realtimeSession.generateReply(instructions);
|
|
1828
|
-
await this.realtimeGenerationTask(
|
|
1870
|
+
await this.realtimeGenerationTask(
|
|
1871
|
+
speechHandle,
|
|
1872
|
+
generationEvent,
|
|
1873
|
+
{ toolChoice },
|
|
1874
|
+
abortController,
|
|
1875
|
+
);
|
|
1829
1876
|
} finally {
|
|
1830
1877
|
// reset toolChoice value
|
|
1831
1878
|
if (toolChoice !== undefined && toolChoice !== originalToolChoice) {
|
|
@@ -1837,14 +1884,17 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1837
1884
|
private scheduleSpeech(
|
|
1838
1885
|
speechHandle: SpeechHandle,
|
|
1839
1886
|
priority: number,
|
|
1840
|
-
|
|
1887
|
+
force: boolean = false,
|
|
1841
1888
|
): void {
|
|
1842
|
-
|
|
1889
|
+
// when force=true, we allow tool responses to bypass draining
|
|
1890
|
+
// This allows for tool responses to be generated before the AgentActivity is finalized
|
|
1891
|
+
if (this.draining && !force) {
|
|
1843
1892
|
throw new Error('cannot schedule new speech, the agent is draining');
|
|
1844
1893
|
}
|
|
1845
1894
|
|
|
1846
1895
|
// Monotonic time to avoid near 0 collisions
|
|
1847
1896
|
this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
|
|
1897
|
+
speechHandle._markScheduled();
|
|
1848
1898
|
this.wakeupMainTask();
|
|
1849
1899
|
}
|
|
1850
1900
|
|
|
@@ -1854,7 +1904,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1854
1904
|
if (this._draining) return;
|
|
1855
1905
|
|
|
1856
1906
|
this.createSpeechTask({
|
|
1857
|
-
|
|
1907
|
+
task: Task.from(() => this.agent.onExit()),
|
|
1858
1908
|
name: 'AgentActivity_onExit',
|
|
1859
1909
|
});
|
|
1860
1910
|
|
|
@@ -5,6 +5,7 @@ import type { AudioFrame, Room } from '@livekit/rtc-node';
|
|
|
5
5
|
import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
|
|
6
6
|
import { EventEmitter } from 'node:events';
|
|
7
7
|
import type { ReadableStream } from 'node:stream/web';
|
|
8
|
+
import { getJobContext } from '../job.js';
|
|
8
9
|
import { ChatContext, ChatMessage } from '../llm/chat_context.js';
|
|
9
10
|
import type { LLM, RealtimeModel, RealtimeModelError, ToolChoice } from '../llm/index.js';
|
|
10
11
|
import type { LLMError } from '../llm/llm.js';
|
|
@@ -184,6 +185,7 @@ export class AgentSession<
|
|
|
184
185
|
this.agent = agent;
|
|
185
186
|
this._updateAgentState('initializing');
|
|
186
187
|
|
|
188
|
+
const tasks: Promise<void>[] = [];
|
|
187
189
|
// Check for existing input/output configuration and warn if needed
|
|
188
190
|
if (this.input.audio && inputOptions?.audioEnabled !== false) {
|
|
189
191
|
this.logger.warn('RoomIO audio input is enabled but input.audio is already set, ignoring..');
|
|
@@ -209,7 +211,15 @@ export class AgentSession<
|
|
|
209
211
|
});
|
|
210
212
|
this.roomIO.start();
|
|
211
213
|
|
|
212
|
-
|
|
214
|
+
const ctx = getJobContext();
|
|
215
|
+
if (ctx && ctx.room === room && !room.isConnected) {
|
|
216
|
+
this.logger.debug('Auto-connecting to room via job context');
|
|
217
|
+
tasks.push(ctx.connect());
|
|
218
|
+
}
|
|
219
|
+
// TODO(AJS-265): add shutdown callback to job context
|
|
220
|
+
tasks.push(this.updateActivity(this.agent));
|
|
221
|
+
|
|
222
|
+
await Promise.allSettled(tasks);
|
|
213
223
|
|
|
214
224
|
// Log used IO configuration
|
|
215
225
|
this.logger.debug(
|
|
@@ -220,7 +230,6 @@ export class AgentSession<
|
|
|
220
230
|
`using transcript io: \`AgentSession\` -> ${this.output.transcription ? '`' + this.output.transcription.constructor.name + '`' : '(none)'}`,
|
|
221
231
|
);
|
|
222
232
|
|
|
223
|
-
this.logger.debug('AgentSession started');
|
|
224
233
|
this.started = true;
|
|
225
234
|
this._updateAgentState('listening');
|
|
226
235
|
}
|