@livekit/agents 1.0.3 → 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +2 -5
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -3
- package/dist/index.d.ts +2 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -3
- package/dist/index.js.map +1 -1
- package/dist/tokenize/basic/hyphenator.cjs.map +1 -1
- package/dist/tokenize/basic/hyphenator.js.map +1 -1
- package/dist/utils.cjs +77 -0
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +21 -0
- package/dist/utils.d.ts +21 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +76 -1
- package/dist/utils.js.map +1 -1
- package/dist/voice/agent_activity.cjs +107 -71
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +107 -71
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/avatar/datastream_io.cjs +204 -0
- package/dist/voice/avatar/datastream_io.cjs.map +1 -0
- package/dist/voice/avatar/datastream_io.d.cts +37 -0
- package/dist/voice/avatar/datastream_io.d.ts +37 -0
- package/dist/voice/avatar/datastream_io.d.ts.map +1 -0
- package/dist/voice/avatar/datastream_io.js +188 -0
- package/dist/voice/avatar/datastream_io.js.map +1 -0
- package/dist/{multimodal → voice/avatar}/index.cjs +4 -4
- package/dist/voice/avatar/index.cjs.map +1 -0
- package/dist/voice/avatar/index.d.cts +2 -0
- package/dist/voice/avatar/index.d.ts +2 -0
- package/dist/voice/avatar/index.d.ts.map +1 -0
- package/dist/voice/avatar/index.js +2 -0
- package/dist/voice/avatar/index.js.map +1 -0
- package/dist/voice/index.cjs +2 -0
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -0
- package/dist/voice/index.d.ts +1 -0
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +1 -0
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +1 -1
- package/dist/voice/io.d.ts +1 -1
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/room_io/_input.cjs +3 -1
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +3 -1
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/run_context.cjs +13 -0
- package/dist/voice/run_context.cjs.map +1 -1
- package/dist/voice/run_context.d.cts +10 -0
- package/dist/voice/run_context.d.ts +10 -0
- package/dist/voice/run_context.d.ts.map +1 -1
- package/dist/voice/run_context.js +13 -0
- package/dist/voice/run_context.js.map +1 -1
- package/dist/voice/speech_handle.cjs +152 -30
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +67 -16
- package/dist/voice/speech_handle.d.ts +67 -16
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +153 -31
- package/dist/voice/speech_handle.js.map +1 -1
- package/dist/worker.cjs +4 -1
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +4 -1
- package/dist/worker.js.map +1 -1
- package/package.json +2 -2
- package/src/index.ts +2 -3
- package/src/tokenize/basic/hyphenator.ts +1 -1
- package/src/utils.ts +121 -1
- package/src/voice/agent_activity.ts +122 -78
- package/src/voice/avatar/datastream_io.ts +247 -0
- package/src/voice/avatar/index.ts +4 -0
- package/src/voice/index.ts +2 -0
- package/src/voice/io.ts +1 -1
- package/src/voice/room_io/_input.ts +9 -3
- package/src/voice/run_context.ts +16 -2
- package/src/voice/speech_handle.ts +183 -38
- package/src/worker.ts +5 -1
- package/dist/multimodal/agent_playout.cjs +0 -233
- package/dist/multimodal/agent_playout.cjs.map +0 -1
- package/dist/multimodal/agent_playout.d.cts +0 -34
- package/dist/multimodal/agent_playout.d.ts +0 -34
- package/dist/multimodal/agent_playout.d.ts.map +0 -1
- package/dist/multimodal/agent_playout.js +0 -207
- package/dist/multimodal/agent_playout.js.map +0 -1
- package/dist/multimodal/index.cjs.map +0 -1
- package/dist/multimodal/index.d.cts +0 -2
- package/dist/multimodal/index.d.ts +0 -2
- package/dist/multimodal/index.d.ts.map +0 -1
- package/dist/multimodal/index.js +0 -2
- package/dist/multimodal/index.js.map +0 -1
- package/src/multimodal/agent_playout.ts +0 -266
- package/src/multimodal/index.ts +0 -4
|
@@ -82,7 +82,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
82
82
|
private _currentSpeech?: SpeechHandle;
|
|
83
83
|
private speechQueue: Heap<[number, number, SpeechHandle]>; // [priority, timestamp, speechHandle]
|
|
84
84
|
private q_updated: Future;
|
|
85
|
-
private speechTasks: Set<
|
|
85
|
+
private speechTasks: Set<Task<void>> = new Set();
|
|
86
86
|
private lock = new Mutex();
|
|
87
87
|
private audioStream = new DeferredReadableStream<AudioFrame>();
|
|
88
88
|
// default to null as None, which maps to the default provider tool choice value
|
|
@@ -269,7 +269,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
269
269
|
|
|
270
270
|
this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
|
|
271
271
|
this.createSpeechTask({
|
|
272
|
-
|
|
272
|
+
task: Task.from(() => this.agent.onEnter()),
|
|
273
273
|
name: 'AgentActivity_onEnter',
|
|
274
274
|
});
|
|
275
275
|
} finally {
|
|
@@ -441,9 +441,10 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
441
441
|
speechHandle: handle,
|
|
442
442
|
}),
|
|
443
443
|
);
|
|
444
|
-
|
|
445
444
|
const task = this.createSpeechTask({
|
|
446
|
-
|
|
445
|
+
task: Task.from((abortController: AbortController) =>
|
|
446
|
+
this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
|
|
447
|
+
),
|
|
447
448
|
ownedSpeechHandle: handle,
|
|
448
449
|
name: 'AgentActivity.say_tts',
|
|
449
450
|
});
|
|
@@ -572,7 +573,9 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
572
573
|
this.logger.info({ speech_id: handle.id }, 'Creating speech handle');
|
|
573
574
|
|
|
574
575
|
this.createSpeechTask({
|
|
575
|
-
|
|
576
|
+
task: Task.from((abortController: AbortController) =>
|
|
577
|
+
this.realtimeGenerationTask(handle, ev, {}, abortController),
|
|
578
|
+
),
|
|
576
579
|
ownedSpeechHandle: handle,
|
|
577
580
|
name: 'AgentActivity.realtimeGeneration',
|
|
578
581
|
});
|
|
@@ -659,26 +662,32 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
659
662
|
);
|
|
660
663
|
}
|
|
661
664
|
|
|
662
|
-
private createSpeechTask
|
|
663
|
-
|
|
665
|
+
private createSpeechTask(options: {
|
|
666
|
+
task: Task<void>;
|
|
664
667
|
ownedSpeechHandle?: SpeechHandle;
|
|
665
668
|
name?: string;
|
|
666
|
-
}): Promise<
|
|
667
|
-
const {
|
|
668
|
-
|
|
669
|
-
this.speechTasks.add(promise);
|
|
669
|
+
}): Promise<void> {
|
|
670
|
+
const { task, ownedSpeechHandle } = options;
|
|
670
671
|
|
|
671
|
-
|
|
672
|
-
|
|
672
|
+
this.speechTasks.add(task);
|
|
673
|
+
task.addDoneCallback(() => {
|
|
674
|
+
this.speechTasks.delete(task);
|
|
675
|
+
});
|
|
673
676
|
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
+
if (ownedSpeechHandle) {
|
|
678
|
+
ownedSpeechHandle._tasks.push(task);
|
|
679
|
+
task.addDoneCallback(() => {
|
|
680
|
+
if (ownedSpeechHandle._tasks.every((t) => t.done)) {
|
|
681
|
+
ownedSpeechHandle._markDone();
|
|
682
|
+
}
|
|
683
|
+
});
|
|
684
|
+
}
|
|
677
685
|
|
|
686
|
+
task.addDoneCallback(() => {
|
|
678
687
|
this.wakeupMainTask();
|
|
679
688
|
});
|
|
680
689
|
|
|
681
|
-
return
|
|
690
|
+
return task.result;
|
|
682
691
|
}
|
|
683
692
|
|
|
684
693
|
async onEndOfTurn(info: EndOfTurnInfo): Promise<boolean> {
|
|
@@ -705,7 +714,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
705
714
|
|
|
706
715
|
const oldTask = this._userTurnCompletedTask;
|
|
707
716
|
this._userTurnCompletedTask = this.createSpeechTask({
|
|
708
|
-
|
|
717
|
+
task: Task.from(() => this.userTurnCompleted(info, oldTask)),
|
|
709
718
|
name: 'AgentActivity.userTurnCompleted',
|
|
710
719
|
});
|
|
711
720
|
return true;
|
|
@@ -736,8 +745,8 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
736
745
|
}
|
|
737
746
|
const speechHandle = heapItem[2];
|
|
738
747
|
this._currentSpeech = speechHandle;
|
|
739
|
-
speechHandle.
|
|
740
|
-
await speechHandle.
|
|
748
|
+
speechHandle._authorizeGeneration();
|
|
749
|
+
await speechHandle._waitForGeneration();
|
|
741
750
|
this._currentSpeech = undefined;
|
|
742
751
|
}
|
|
743
752
|
|
|
@@ -815,16 +824,19 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
815
824
|
|
|
816
825
|
if (this.llm instanceof RealtimeModel) {
|
|
817
826
|
this.createSpeechTask({
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
827
|
+
task: Task.from((abortController: AbortController) =>
|
|
828
|
+
this.realtimeReplyTask({
|
|
829
|
+
speechHandle: handle,
|
|
830
|
+
// TODO(brian): support llm.ChatMessage for the realtime model
|
|
831
|
+
userInput: userMessage?.textContent,
|
|
832
|
+
instructions,
|
|
833
|
+
modelSettings: {
|
|
834
|
+
// isGiven(toolChoice) = toolChoice !== undefined
|
|
835
|
+
toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
|
|
836
|
+
},
|
|
837
|
+
abortController,
|
|
838
|
+
}),
|
|
839
|
+
),
|
|
828
840
|
ownedSpeechHandle: handle,
|
|
829
841
|
name: 'AgentActivity.realtimeReply',
|
|
830
842
|
});
|
|
@@ -837,13 +849,18 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
837
849
|
}
|
|
838
850
|
|
|
839
851
|
const task = this.createSpeechTask({
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
852
|
+
task: Task.from((abortController: AbortController) =>
|
|
853
|
+
this.pipelineReplyTask(
|
|
854
|
+
handle,
|
|
855
|
+
chatCtx ?? this.agent.chatCtx,
|
|
856
|
+
this.agent.toolCtx,
|
|
857
|
+
{
|
|
858
|
+
toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
|
|
859
|
+
},
|
|
860
|
+
abortController,
|
|
861
|
+
instructions ? `${this.agent.instructions}\n${instructions}` : instructions,
|
|
862
|
+
userMessage,
|
|
863
|
+
),
|
|
847
864
|
),
|
|
848
865
|
ownedSpeechHandle: handle,
|
|
849
866
|
name: 'AgentActivity.pipelineReply',
|
|
@@ -860,6 +877,8 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
860
877
|
const future = new Future<void>();
|
|
861
878
|
const currentSpeech = this._currentSpeech;
|
|
862
879
|
|
|
880
|
+
//TODO(AJS-273): add interrupt for background speeches
|
|
881
|
+
|
|
863
882
|
currentSpeech?.interrupt();
|
|
864
883
|
|
|
865
884
|
for (const [_, __, speech] of this.speechQueue) {
|
|
@@ -871,7 +890,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
871
890
|
if (currentSpeech === undefined) {
|
|
872
891
|
future.resolve();
|
|
873
892
|
} else {
|
|
874
|
-
currentSpeech.
|
|
893
|
+
currentSpeech.addDoneCallback(() => {
|
|
875
894
|
if (future.done) return;
|
|
876
895
|
future.resolve();
|
|
877
896
|
});
|
|
@@ -881,7 +900,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
881
900
|
}
|
|
882
901
|
|
|
883
902
|
private onPipelineReplyDone(): void {
|
|
884
|
-
if (!this.speechQueue.peek() && (!this._currentSpeech || this._currentSpeech.done)) {
|
|
903
|
+
if (!this.speechQueue.peek() && (!this._currentSpeech || this._currentSpeech.done())) {
|
|
885
904
|
this.agentSession._updateAgentState('listening');
|
|
886
905
|
}
|
|
887
906
|
}
|
|
@@ -980,6 +999,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
980
999
|
text: string | ReadableStream<string>,
|
|
981
1000
|
addToChatCtx: boolean,
|
|
982
1001
|
modelSettings: ModelSettings,
|
|
1002
|
+
replyAbortController: AbortController,
|
|
983
1003
|
audio?: ReadableStream<AudioFrame> | null,
|
|
984
1004
|
): Promise<void> {
|
|
985
1005
|
speechHandleStorage.enterWith(speechHandle);
|
|
@@ -992,7 +1012,6 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
992
1012
|
? this.agentSession.output.audio
|
|
993
1013
|
: null;
|
|
994
1014
|
|
|
995
|
-
const replyAbortController = new AbortController();
|
|
996
1015
|
await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
|
|
997
1016
|
|
|
998
1017
|
if (speechHandle.interrupted) {
|
|
@@ -1102,14 +1121,13 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1102
1121
|
chatCtx: ChatContext,
|
|
1103
1122
|
toolCtx: ToolContext,
|
|
1104
1123
|
modelSettings: ModelSettings,
|
|
1124
|
+
replyAbortController: AbortController,
|
|
1105
1125
|
instructions?: string,
|
|
1106
1126
|
newMessage?: ChatMessage,
|
|
1107
1127
|
toolsMessages?: ChatItem[],
|
|
1108
1128
|
): Promise<void> {
|
|
1109
1129
|
speechHandleStorage.enterWith(speechHandle);
|
|
1110
1130
|
|
|
1111
|
-
const replyAbortController = new AbortController();
|
|
1112
|
-
|
|
1113
1131
|
const audioOutput = this.agentSession.output.audioEnabled
|
|
1114
1132
|
? this.agentSession.output.audio
|
|
1115
1133
|
: null;
|
|
@@ -1163,13 +1181,19 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1163
1181
|
tasks.push(ttsTask);
|
|
1164
1182
|
}
|
|
1165
1183
|
|
|
1166
|
-
await speechHandle.waitIfNotInterrupted([speechHandle.
|
|
1184
|
+
await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
|
|
1185
|
+
|
|
1167
1186
|
if (speechHandle.interrupted) {
|
|
1168
1187
|
replyAbortController.abort();
|
|
1169
1188
|
await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
1170
1189
|
return;
|
|
1171
1190
|
}
|
|
1172
1191
|
|
|
1192
|
+
this.agentSession._updateAgentState('thinking');
|
|
1193
|
+
|
|
1194
|
+
await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
|
|
1195
|
+
speechHandle._clearAuthorization();
|
|
1196
|
+
|
|
1173
1197
|
const replyStartedAt = Date.now();
|
|
1174
1198
|
const trNodeResult = await this.agent.transcriptionNode(llmOutput, modelSettings);
|
|
1175
1199
|
let textOut: _TextOut | null = null;
|
|
@@ -1205,6 +1229,9 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1205
1229
|
textOut?.firstTextFut.await.finally(onFirstFrame);
|
|
1206
1230
|
}
|
|
1207
1231
|
|
|
1232
|
+
//TODO(AJS-272): before executing tools, make sure we generated all the text
|
|
1233
|
+
// (this ensure everything is kept ordered)
|
|
1234
|
+
|
|
1208
1235
|
const onToolExecutionStarted = (_: FunctionCall) => {
|
|
1209
1236
|
// TODO(brian): handle speech_handle item_added
|
|
1210
1237
|
};
|
|
@@ -1223,7 +1250,6 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1223
1250
|
onToolExecutionStarted,
|
|
1224
1251
|
onToolExecutionCompleted,
|
|
1225
1252
|
});
|
|
1226
|
-
tasks.push(executeToolsTask);
|
|
1227
1253
|
|
|
1228
1254
|
await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
|
|
1229
1255
|
|
|
@@ -1290,7 +1316,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1290
1316
|
'playout completed with interrupt',
|
|
1291
1317
|
);
|
|
1292
1318
|
// TODO(shubhra) add chat message to speech handle
|
|
1293
|
-
speechHandle.
|
|
1319
|
+
speechHandle._markGenerationDone();
|
|
1294
1320
|
await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
1295
1321
|
return;
|
|
1296
1322
|
}
|
|
@@ -1318,14 +1344,15 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1318
1344
|
this.agentSession._updateAgentState('listening');
|
|
1319
1345
|
}
|
|
1320
1346
|
|
|
1321
|
-
|
|
1347
|
+
// mark the playout done before waiting for the tool execution
|
|
1348
|
+
speechHandle._markGenerationDone();
|
|
1322
1349
|
await executeToolsTask.result;
|
|
1323
1350
|
|
|
1324
1351
|
if (toolOutput.output.length === 0) return;
|
|
1325
1352
|
|
|
1326
1353
|
// important: no agent output should be used after this point
|
|
1327
1354
|
const { maxToolSteps } = this.agentSession.options;
|
|
1328
|
-
if (speechHandle.
|
|
1355
|
+
if (speechHandle.numSteps >= maxToolSteps) {
|
|
1329
1356
|
this.logger.warn(
|
|
1330
1357
|
{ speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
|
|
1331
1358
|
'maximum number of function calls steps reached',
|
|
@@ -1390,7 +1417,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1390
1417
|
|
|
1391
1418
|
const handle = SpeechHandle.create({
|
|
1392
1419
|
allowInterruptions: speechHandle.allowInterruptions,
|
|
1393
|
-
stepIndex: speechHandle.
|
|
1420
|
+
stepIndex: speechHandle._stepIndex + 1,
|
|
1394
1421
|
parent: speechHandle,
|
|
1395
1422
|
});
|
|
1396
1423
|
this.agentSession.emit(
|
|
@@ -1407,14 +1434,17 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1407
1434
|
const respondToolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
|
|
1408
1435
|
|
|
1409
1436
|
const toolResponseTask = this.createSpeechTask({
|
|
1410
|
-
|
|
1411
|
-
|
|
1412
|
-
|
|
1413
|
-
|
|
1414
|
-
|
|
1415
|
-
|
|
1416
|
-
|
|
1417
|
-
|
|
1437
|
+
task: Task.from(() =>
|
|
1438
|
+
this.pipelineReplyTask(
|
|
1439
|
+
handle,
|
|
1440
|
+
chatCtx,
|
|
1441
|
+
toolCtx,
|
|
1442
|
+
{ toolChoice: respondToolChoice },
|
|
1443
|
+
replyAbortController,
|
|
1444
|
+
instructions,
|
|
1445
|
+
undefined,
|
|
1446
|
+
toolMessages,
|
|
1447
|
+
),
|
|
1418
1448
|
),
|
|
1419
1449
|
ownedSpeechHandle: handle,
|
|
1420
1450
|
name: 'AgentActivity.pipelineReply',
|
|
@@ -1435,6 +1465,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1435
1465
|
speechHandle: SpeechHandle,
|
|
1436
1466
|
ev: GenerationCreatedEvent,
|
|
1437
1467
|
modelSettings: ModelSettings,
|
|
1468
|
+
replyAbortController: AbortController,
|
|
1438
1469
|
): Promise<void> {
|
|
1439
1470
|
speechHandleStorage.enterWith(speechHandle);
|
|
1440
1471
|
|
|
@@ -1446,7 +1477,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1446
1477
|
}
|
|
1447
1478
|
|
|
1448
1479
|
this.logger.debug(
|
|
1449
|
-
{ speech_id: speechHandle.id, stepIndex: speechHandle.
|
|
1480
|
+
{ speech_id: speechHandle.id, stepIndex: speechHandle.numSteps },
|
|
1450
1481
|
'realtime generation started',
|
|
1451
1482
|
);
|
|
1452
1483
|
|
|
@@ -1459,6 +1490,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1459
1490
|
const toolCtx = this.realtimeSession.tools;
|
|
1460
1491
|
|
|
1461
1492
|
await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
|
|
1493
|
+
speechHandle._clearAuthorization();
|
|
1462
1494
|
|
|
1463
1495
|
if (speechHandle.interrupted) {
|
|
1464
1496
|
return;
|
|
@@ -1468,8 +1500,6 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1468
1500
|
this.agentSession._updateAgentState('speaking');
|
|
1469
1501
|
};
|
|
1470
1502
|
|
|
1471
|
-
const replyAbortController = new AbortController();
|
|
1472
|
-
|
|
1473
1503
|
const readMessages = async (
|
|
1474
1504
|
abortController: AbortController,
|
|
1475
1505
|
outputs: Array<[string, _TextOut | null, _AudioOut | null]>,
|
|
@@ -1566,12 +1596,14 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1566
1596
|
),
|
|
1567
1597
|
);
|
|
1568
1598
|
|
|
1569
|
-
const onToolExecutionStarted = (
|
|
1570
|
-
|
|
1599
|
+
const onToolExecutionStarted = (f: FunctionCall) => {
|
|
1600
|
+
speechHandle._itemAdded([f]);
|
|
1571
1601
|
};
|
|
1572
1602
|
|
|
1573
|
-
const onToolExecutionCompleted = (
|
|
1574
|
-
|
|
1603
|
+
const onToolExecutionCompleted = (out: ToolExecutionOutput) => {
|
|
1604
|
+
if (out.toolCallOutput) {
|
|
1605
|
+
speechHandle._itemAdded([out.toolCallOutput]);
|
|
1606
|
+
}
|
|
1575
1607
|
};
|
|
1576
1608
|
|
|
1577
1609
|
const [executeToolsTask, toolOutput] = performToolExecutions({
|
|
@@ -1640,7 +1672,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1640
1672
|
interrupted: true,
|
|
1641
1673
|
});
|
|
1642
1674
|
this.agent._chatCtx.insert(message);
|
|
1643
|
-
speechHandle.
|
|
1675
|
+
speechHandle._itemAdded([message]);
|
|
1644
1676
|
this.agentSession._conversationItemAdded(message);
|
|
1645
1677
|
|
|
1646
1678
|
// TODO(brian): add tracing span
|
|
@@ -1650,8 +1682,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1650
1682
|
'playout completed with interrupt',
|
|
1651
1683
|
);
|
|
1652
1684
|
}
|
|
1653
|
-
|
|
1654
|
-
speechHandle._markPlayoutDone();
|
|
1685
|
+
speechHandle._markGenerationDone();
|
|
1655
1686
|
await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
1656
1687
|
|
|
1657
1688
|
// TODO(brian): close tees
|
|
@@ -1668,13 +1699,13 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1668
1699
|
interrupted: false,
|
|
1669
1700
|
});
|
|
1670
1701
|
this.agent._chatCtx.insert(message);
|
|
1671
|
-
speechHandle.
|
|
1702
|
+
speechHandle._itemAdded([message]);
|
|
1672
1703
|
this.agentSession._conversationItemAdded(message); // mark the playout done before waiting for the tool execution\
|
|
1673
1704
|
// TODO(brian): add tracing span
|
|
1674
1705
|
}
|
|
1675
1706
|
|
|
1676
1707
|
// mark the playout done before waiting for the tool execution
|
|
1677
|
-
speechHandle.
|
|
1708
|
+
speechHandle._markGenerationDone();
|
|
1678
1709
|
// TODO(brian): close tees
|
|
1679
1710
|
|
|
1680
1711
|
toolOutput.firstToolStartedFuture.await.finally(() => {
|
|
@@ -1687,7 +1718,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1687
1718
|
|
|
1688
1719
|
// important: no agent ouput should be used after this point
|
|
1689
1720
|
const { maxToolSteps } = this.agentSession.options;
|
|
1690
|
-
if (speechHandle.
|
|
1721
|
+
if (speechHandle.numSteps >= maxToolSteps) {
|
|
1691
1722
|
this.logger.warn(
|
|
1692
1723
|
{ speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
|
|
1693
1724
|
'maximum number of function calls steps reached',
|
|
@@ -1763,7 +1794,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1763
1794
|
|
|
1764
1795
|
const replySpeechHandle = SpeechHandle.create({
|
|
1765
1796
|
allowInterruptions: speechHandle.allowInterruptions,
|
|
1766
|
-
stepIndex: speechHandle.
|
|
1797
|
+
stepIndex: speechHandle.numSteps + 1,
|
|
1767
1798
|
parent: speechHandle,
|
|
1768
1799
|
});
|
|
1769
1800
|
this.agentSession.emit(
|
|
@@ -1777,10 +1808,13 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1777
1808
|
|
|
1778
1809
|
const toolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
|
|
1779
1810
|
this.createSpeechTask({
|
|
1780
|
-
|
|
1781
|
-
|
|
1782
|
-
|
|
1783
|
-
|
|
1811
|
+
task: Task.from((abortController: AbortController) =>
|
|
1812
|
+
this.realtimeReplyTask({
|
|
1813
|
+
speechHandle: replySpeechHandle,
|
|
1814
|
+
modelSettings: { toolChoice },
|
|
1815
|
+
abortController,
|
|
1816
|
+
}),
|
|
1817
|
+
),
|
|
1784
1818
|
ownedSpeechHandle: replySpeechHandle,
|
|
1785
1819
|
name: 'AgentActivity.realtime_reply',
|
|
1786
1820
|
});
|
|
@@ -1793,9 +1827,11 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1793
1827
|
modelSettings: { toolChoice },
|
|
1794
1828
|
userInput,
|
|
1795
1829
|
instructions,
|
|
1830
|
+
abortController,
|
|
1796
1831
|
}: {
|
|
1797
1832
|
speechHandle: SpeechHandle;
|
|
1798
1833
|
modelSettings: ModelSettings;
|
|
1834
|
+
abortController: AbortController;
|
|
1799
1835
|
userInput?: string;
|
|
1800
1836
|
instructions?: string;
|
|
1801
1837
|
}): Promise<void> {
|
|
@@ -1825,7 +1861,12 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1825
1861
|
|
|
1826
1862
|
try {
|
|
1827
1863
|
const generationEvent = await this.realtimeSession.generateReply(instructions);
|
|
1828
|
-
await this.realtimeGenerationTask(
|
|
1864
|
+
await this.realtimeGenerationTask(
|
|
1865
|
+
speechHandle,
|
|
1866
|
+
generationEvent,
|
|
1867
|
+
{ toolChoice },
|
|
1868
|
+
abortController,
|
|
1869
|
+
);
|
|
1829
1870
|
} finally {
|
|
1830
1871
|
// reset toolChoice value
|
|
1831
1872
|
if (toolChoice !== undefined && toolChoice !== originalToolChoice) {
|
|
@@ -1837,14 +1878,17 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1837
1878
|
private scheduleSpeech(
|
|
1838
1879
|
speechHandle: SpeechHandle,
|
|
1839
1880
|
priority: number,
|
|
1840
|
-
|
|
1881
|
+
force: boolean = false,
|
|
1841
1882
|
): void {
|
|
1842
|
-
|
|
1883
|
+
// when force=true, we allow tool responses to bypass draining
|
|
1884
|
+
// This allows for tool responses to be generated before the AgentActivity is finalized
|
|
1885
|
+
if (this.draining && !force) {
|
|
1843
1886
|
throw new Error('cannot schedule new speech, the agent is draining');
|
|
1844
1887
|
}
|
|
1845
1888
|
|
|
1846
1889
|
// Monotonic time to avoid near 0 collisions
|
|
1847
1890
|
this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
|
|
1891
|
+
speechHandle._markScheduled();
|
|
1848
1892
|
this.wakeupMainTask();
|
|
1849
1893
|
}
|
|
1850
1894
|
|
|
@@ -1854,7 +1898,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1854
1898
|
if (this._draining) return;
|
|
1855
1899
|
|
|
1856
1900
|
this.createSpeechTask({
|
|
1857
|
-
|
|
1901
|
+
task: Task.from(() => this.agent.onExit()),
|
|
1858
1902
|
name: 'AgentActivity_onExit',
|
|
1859
1903
|
});
|
|
1860
1904
|
|