@livekit/agents 1.0.37 → 1.0.39
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs.map +1 -1
- package/dist/inference/api_protos.cjs +68 -0
- package/dist/inference/api_protos.cjs.map +1 -1
- package/dist/inference/api_protos.d.cts +345 -4
- package/dist/inference/api_protos.d.ts +345 -4
- package/dist/inference/api_protos.d.ts.map +1 -1
- package/dist/inference/api_protos.js +60 -0
- package/dist/inference/api_protos.js.map +1 -1
- package/dist/inference/llm.cjs +7 -3
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.cts +5 -6
- package/dist/inference/llm.d.ts +5 -6
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js +7 -3
- package/dist/inference/llm.js.map +1 -1
- package/dist/inference/stt.cjs +32 -21
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.cts +5 -4
- package/dist/inference/stt.d.ts +5 -4
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +34 -21
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +10 -7
- package/dist/inference/tts.d.ts +10 -7
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js.map +1 -1
- package/dist/ipc/inference_proc_executor.cjs.map +1 -1
- package/dist/ipc/job_proc_executor.cjs.map +1 -1
- package/dist/stt/stream_adapter.cjs +9 -1
- package/dist/stt/stream_adapter.cjs.map +1 -1
- package/dist/stt/stream_adapter.d.ts.map +1 -1
- package/dist/stt/stream_adapter.js +9 -1
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/stt/stt.cjs +10 -0
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +12 -0
- package/dist/stt/stt.d.ts +12 -0
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +10 -0
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/traces.cjs +4 -3
- package/dist/telemetry/traces.cjs.map +1 -1
- package/dist/telemetry/traces.d.cts +2 -0
- package/dist/telemetry/traces.d.ts +2 -0
- package/dist/telemetry/traces.d.ts.map +1 -1
- package/dist/telemetry/traces.js +4 -3
- package/dist/telemetry/traces.js.map +1 -1
- package/dist/utils.cjs +11 -0
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +10 -0
- package/dist/utils.d.ts +10 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +10 -0
- package/dist/utils.js.map +1 -1
- package/dist/voice/agent.cjs +6 -2
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +6 -2
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent_activity.cjs +72 -37
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +2 -1
- package/dist/voice/agent_activity.d.ts +2 -1
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +73 -38
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +7 -5
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +5 -2
- package/dist/voice/agent_session.d.ts +5 -2
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +7 -5
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +3 -1
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +3 -1
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/avatar/datastream_io.cjs +6 -0
- package/dist/voice/avatar/datastream_io.cjs.map +1 -1
- package/dist/voice/avatar/datastream_io.d.cts +1 -0
- package/dist/voice/avatar/datastream_io.d.ts +1 -0
- package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
- package/dist/voice/avatar/datastream_io.js +6 -0
- package/dist/voice/avatar/datastream_io.js.map +1 -1
- package/dist/voice/background_audio.cjs.map +1 -1
- package/dist/voice/generation.cjs +14 -5
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.cts +3 -2
- package/dist/voice/generation.d.ts +3 -2
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +14 -5
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/io.cjs +12 -0
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +19 -1
- package/dist/voice/io.d.ts +19 -1
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +12 -0
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/recorder_io/recorder_io.cjs +91 -28
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
- package/dist/voice/recorder_io/recorder_io.d.cts +7 -1
- package/dist/voice/recorder_io/recorder_io.d.ts +7 -1
- package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
- package/dist/voice/recorder_io/recorder_io.js +91 -28
- package/dist/voice/recorder_io/recorder_io.js.map +1 -1
- package/dist/voice/room_io/_input.cjs +40 -11
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.cts +4 -1
- package/dist/voice/room_io/_input.d.ts +4 -1
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +31 -2
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/room_io/_output.cjs +6 -0
- package/dist/voice/room_io/_output.cjs.map +1 -1
- package/dist/voice/room_io/_output.d.cts +1 -0
- package/dist/voice/room_io/_output.d.ts +1 -0
- package/dist/voice/room_io/_output.d.ts.map +1 -1
- package/dist/voice/room_io/_output.js +6 -0
- package/dist/voice/room_io/_output.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +2 -2
- package/dist/voice/room_io/room_io.d.ts +2 -2
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/speech_handle.cjs +2 -0
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +3 -0
- package/dist/voice/speech_handle.d.ts +3 -0
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +2 -0
- package/dist/voice/speech_handle.js.map +1 -1
- package/package.json +2 -2
- package/src/inference/api_protos.ts +83 -0
- package/src/inference/llm.ts +20 -15
- package/src/inference/stt.ts +48 -29
- package/src/inference/tts.ts +36 -16
- package/src/stt/stream_adapter.ts +12 -1
- package/src/stt/stt.ts +21 -0
- package/src/telemetry/traces.ts +6 -2
- package/src/utils.ts +21 -0
- package/src/voice/agent.ts +11 -2
- package/src/voice/agent_activity.ts +108 -41
- package/src/voice/agent_session.ts +6 -5
- package/src/voice/audio_recognition.ts +2 -0
- package/src/voice/avatar/datastream_io.ts +8 -0
- package/src/voice/generation.ts +24 -12
- package/src/voice/io.ts +27 -5
- package/src/voice/recorder_io/recorder_io.ts +123 -31
- package/src/voice/room_io/_input.ts +32 -4
- package/src/voice/room_io/_output.ts +8 -0
- package/src/voice/room_io/room_io.ts +3 -1
- package/src/voice/speech_handle.ts +4 -0
|
@@ -122,9 +122,9 @@ class AgentActivity {
|
|
|
122
122
|
);
|
|
123
123
|
this.turnDetectionMode = void 0;
|
|
124
124
|
}
|
|
125
|
-
if (!this.vad && this.stt && this.llm instanceof import_llm.LLM && this.allowInterruptions && this.turnDetectionMode === void 0) {
|
|
125
|
+
if (!this.vad && this.stt && !this.stt.capabilities.streaming && this.llm instanceof import_llm.LLM && this.allowInterruptions && this.turnDetectionMode === void 0) {
|
|
126
126
|
this.logger.warn(
|
|
127
|
-
"VAD is not set. Enabling VAD is recommended when using LLM and STT for more responsive interruption handling."
|
|
127
|
+
"VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT for more responsive interruption handling."
|
|
128
128
|
);
|
|
129
129
|
}
|
|
130
130
|
}
|
|
@@ -458,8 +458,12 @@ class AgentActivity {
|
|
|
458
458
|
this.scheduleSpeech(handle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
459
459
|
}
|
|
460
460
|
// recognition hooks
|
|
461
|
-
onStartOfSpeech(
|
|
462
|
-
|
|
461
|
+
onStartOfSpeech(ev) {
|
|
462
|
+
let speechStartTime = Date.now();
|
|
463
|
+
if (ev) {
|
|
464
|
+
speechStartTime = speechStartTime - ev.speechDuration;
|
|
465
|
+
}
|
|
466
|
+
this.agentSession._updateUserState("speaking", speechStartTime);
|
|
463
467
|
}
|
|
464
468
|
onEndOfSpeech(ev) {
|
|
465
469
|
let speechEndTime = Date.now();
|
|
@@ -469,14 +473,16 @@ class AgentActivity {
|
|
|
469
473
|
this.agentSession._updateUserState("listening", speechEndTime);
|
|
470
474
|
}
|
|
471
475
|
onVADInferenceDone(ev) {
|
|
472
|
-
var _a, _b;
|
|
473
476
|
if (this.turnDetection === "manual" || this.turnDetection === "realtime_llm") {
|
|
474
477
|
return;
|
|
475
478
|
}
|
|
476
|
-
if (
|
|
477
|
-
|
|
479
|
+
if (ev.speechDuration >= this.agentSession.options.minInterruptionDuration) {
|
|
480
|
+
this.interruptByAudioActivity();
|
|
478
481
|
}
|
|
479
|
-
|
|
482
|
+
}
|
|
483
|
+
interruptByAudioActivity() {
|
|
484
|
+
var _a, _b;
|
|
485
|
+
if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.turnDetection) {
|
|
480
486
|
return;
|
|
481
487
|
}
|
|
482
488
|
if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
|
|
@@ -489,7 +495,10 @@ class AgentActivity {
|
|
|
489
495
|
}
|
|
490
496
|
(_a = this.realtimeSession) == null ? void 0 : _a.startUserActivity();
|
|
491
497
|
if (this._currentSpeech && !this._currentSpeech.interrupted && this._currentSpeech.allowInterruptions) {
|
|
492
|
-
this.logger.info(
|
|
498
|
+
this.logger.info(
|
|
499
|
+
{ "speech id": this._currentSpeech.id },
|
|
500
|
+
"speech interrupted by audio activity"
|
|
501
|
+
);
|
|
493
502
|
(_b = this.realtimeSession) == null ? void 0 : _b.interrupt();
|
|
494
503
|
this._currentSpeech.interrupt();
|
|
495
504
|
}
|
|
@@ -507,6 +516,9 @@ class AgentActivity {
|
|
|
507
516
|
// TODO(AJS-106): add multi participant support
|
|
508
517
|
})
|
|
509
518
|
);
|
|
519
|
+
if (ev.alternatives[0].text) {
|
|
520
|
+
this.interruptByAudioActivity();
|
|
521
|
+
}
|
|
510
522
|
}
|
|
511
523
|
onFinalTranscript(ev) {
|
|
512
524
|
if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.userTranscription) {
|
|
@@ -521,6 +533,9 @@ class AgentActivity {
|
|
|
521
533
|
// TODO(AJS-106): add multi participant support
|
|
522
534
|
})
|
|
523
535
|
);
|
|
536
|
+
if (this.audioRecognition && this.turnDetection !== "manual" && this.turnDetection !== "realtime_llm") {
|
|
537
|
+
this.interruptByAudioActivity();
|
|
538
|
+
}
|
|
524
539
|
}
|
|
525
540
|
onPreemptiveGeneration(info) {
|
|
526
541
|
if (!this.agentSession.options.preemptiveGeneration || this.draining || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof import_llm.LLM)) {
|
|
@@ -836,6 +851,7 @@ ${instructions}` : instructions,
|
|
|
836
851
|
);
|
|
837
852
|
}
|
|
838
853
|
async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
|
|
854
|
+
speechHandle._agentTurnContext = import_api.context.active();
|
|
839
855
|
speechHandleStorage.enterWith(speechHandle);
|
|
840
856
|
const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
|
|
841
857
|
const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
|
|
@@ -867,12 +883,15 @@ ${instructions}` : instructions,
|
|
|
867
883
|
textOut = _textOut;
|
|
868
884
|
tasks.push(textForwardTask);
|
|
869
885
|
}
|
|
870
|
-
const onFirstFrame = () => {
|
|
871
|
-
this.agentSession._updateAgentState("speaking"
|
|
886
|
+
const onFirstFrame = (startedSpeakingAt) => {
|
|
887
|
+
this.agentSession._updateAgentState("speaking", {
|
|
888
|
+
startTime: startedSpeakingAt,
|
|
889
|
+
otelContext: speechHandle._agentTurnContext
|
|
890
|
+
});
|
|
872
891
|
};
|
|
873
892
|
if (!audioOutput) {
|
|
874
893
|
if (textOut) {
|
|
875
|
-
textOut.firstTextFut.await.
|
|
894
|
+
textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
|
|
876
895
|
}
|
|
877
896
|
} else {
|
|
878
897
|
let audioOut = null;
|
|
@@ -900,7 +919,7 @@ ${instructions}` : instructions,
|
|
|
900
919
|
tasks.push(forwardTask);
|
|
901
920
|
audioOut = _audioOut;
|
|
902
921
|
}
|
|
903
|
-
audioOut.firstFrameFut.await.
|
|
922
|
+
audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
|
|
904
923
|
}
|
|
905
924
|
await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
|
|
906
925
|
if (audioOutput) {
|
|
@@ -939,6 +958,7 @@ ${instructions}` : instructions,
|
|
|
939
958
|
span
|
|
940
959
|
}) => {
|
|
941
960
|
var _a, _b, _c;
|
|
961
|
+
speechHandle._agentTurnContext = import_api.context.active();
|
|
942
962
|
span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
943
963
|
if (instructions) {
|
|
944
964
|
span.setAttribute(import_telemetry.traceTypes.ATTR_INSTRUCTIONS, instructions);
|
|
@@ -1015,8 +1035,11 @@ ${instructions}` : instructions,
|
|
|
1015
1035
|
tasks.push(textForwardTask);
|
|
1016
1036
|
textOut = _textOut;
|
|
1017
1037
|
}
|
|
1018
|
-
const onFirstFrame = () => {
|
|
1019
|
-
this.agentSession._updateAgentState("speaking"
|
|
1038
|
+
const onFirstFrame = (startedSpeakingAt) => {
|
|
1039
|
+
this.agentSession._updateAgentState("speaking", {
|
|
1040
|
+
startTime: startedSpeakingAt,
|
|
1041
|
+
otelContext: speechHandle._agentTurnContext
|
|
1042
|
+
});
|
|
1020
1043
|
};
|
|
1021
1044
|
let audioOut = null;
|
|
1022
1045
|
if (audioOutput) {
|
|
@@ -1028,12 +1051,12 @@ ${instructions}` : instructions,
|
|
|
1028
1051
|
);
|
|
1029
1052
|
audioOut = _audioOut;
|
|
1030
1053
|
tasks.push(forwardTask);
|
|
1031
|
-
audioOut.firstFrameFut.await.
|
|
1054
|
+
audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
|
|
1032
1055
|
} else {
|
|
1033
1056
|
throw Error("ttsStream is null when audioOutput is enabled");
|
|
1034
1057
|
}
|
|
1035
1058
|
} else {
|
|
1036
|
-
textOut == null ? void 0 : textOut.firstTextFut.await.
|
|
1059
|
+
textOut == null ? void 0 : textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
|
|
1037
1060
|
}
|
|
1038
1061
|
const onToolExecutionStarted = (f) => {
|
|
1039
1062
|
speechHandle._itemAdded([f]);
|
|
@@ -1064,7 +1087,12 @@ ${instructions}` : instructions,
|
|
|
1064
1087
|
msg.createdAt = replyStartedAt;
|
|
1065
1088
|
}
|
|
1066
1089
|
this.agent._chatCtx.insert(toolsMessages);
|
|
1067
|
-
|
|
1090
|
+
const toolCallOutputs = toolsMessages.filter(
|
|
1091
|
+
(m) => m.type === "function_call_output"
|
|
1092
|
+
);
|
|
1093
|
+
if (toolCallOutputs.length > 0) {
|
|
1094
|
+
this.agentSession._toolItemsAdded(toolCallOutputs);
|
|
1095
|
+
}
|
|
1068
1096
|
}
|
|
1069
1097
|
if (speechHandle.interrupted) {
|
|
1070
1098
|
this.logger.debug(
|
|
@@ -1081,9 +1109,9 @@ ${instructions}` : instructions,
|
|
|
1081
1109
|
let forwardedText = (textOut == null ? void 0 : textOut.text) || "";
|
|
1082
1110
|
if (audioOutput) {
|
|
1083
1111
|
const playbackEv = await audioOutput.waitForPlayout();
|
|
1084
|
-
if (audioOut == null ? void 0 : audioOut.firstFrameFut.done) {
|
|
1112
|
+
if ((audioOut == null ? void 0 : audioOut.firstFrameFut.done) && !audioOut.firstFrameFut.rejected) {
|
|
1085
1113
|
this.logger.info(
|
|
1086
|
-
{ speech_id: speechHandle.id,
|
|
1114
|
+
{ speech_id: speechHandle.id, playbackPositionInS: playbackEv.playbackPosition },
|
|
1087
1115
|
"playout interrupted"
|
|
1088
1116
|
);
|
|
1089
1117
|
if (playbackEv.synchronizedTranscript) {
|
|
@@ -1221,7 +1249,12 @@ ${instructions}` : instructions,
|
|
|
1221
1249
|
msg.createdAt = replyStartedAt;
|
|
1222
1250
|
}
|
|
1223
1251
|
this.agent._chatCtx.insert(toolMessages);
|
|
1224
|
-
|
|
1252
|
+
const toolCallOutputs = toolMessages.filter(
|
|
1253
|
+
(m) => m.type === "function_call_output"
|
|
1254
|
+
);
|
|
1255
|
+
if (toolCallOutputs.length > 0) {
|
|
1256
|
+
this.agentSession._toolItemsAdded(toolCallOutputs);
|
|
1257
|
+
}
|
|
1225
1258
|
}
|
|
1226
1259
|
};
|
|
1227
1260
|
pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages) => import_telemetry.tracer.startActiveSpan(
|
|
@@ -1264,6 +1297,7 @@ ${instructions}` : instructions,
|
|
|
1264
1297
|
span
|
|
1265
1298
|
}) {
|
|
1266
1299
|
var _a, _b, _c;
|
|
1300
|
+
speechHandle._agentTurnContext = import_api.context.active();
|
|
1267
1301
|
span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
1268
1302
|
speechHandleStorage.enterWith(speechHandle);
|
|
1269
1303
|
if (!this.realtimeSession) {
|
|
@@ -1288,8 +1322,11 @@ ${instructions}` : instructions,
|
|
|
1288
1322
|
if (speechHandle.interrupted) {
|
|
1289
1323
|
return;
|
|
1290
1324
|
}
|
|
1291
|
-
const onFirstFrame = () => {
|
|
1292
|
-
this.agentSession._updateAgentState("speaking"
|
|
1325
|
+
const onFirstFrame = (startedSpeakingAt) => {
|
|
1326
|
+
this.agentSession._updateAgentState("speaking", {
|
|
1327
|
+
startTime: startedSpeakingAt,
|
|
1328
|
+
otelContext: speechHandle._agentTurnContext
|
|
1329
|
+
});
|
|
1293
1330
|
};
|
|
1294
1331
|
const readMessages = async (abortController, outputs) => {
|
|
1295
1332
|
replyAbortController.signal.addEventListener("abort", () => abortController.abort(), {
|
|
@@ -1364,10 +1401,10 @@ ${instructions}` : instructions,
|
|
|
1364
1401
|
);
|
|
1365
1402
|
forwardTasks.push(forwardTask);
|
|
1366
1403
|
audioOut = _audioOut;
|
|
1367
|
-
audioOut.firstFrameFut.await.
|
|
1404
|
+
audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
|
|
1368
1405
|
}
|
|
1369
1406
|
} else if (textOut) {
|
|
1370
|
-
textOut.firstTextFut.await.
|
|
1407
|
+
textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
|
|
1371
1408
|
}
|
|
1372
1409
|
outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
|
|
1373
1410
|
}
|
|
@@ -1431,7 +1468,6 @@ ${instructions}` : instructions,
|
|
|
1431
1468
|
await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
|
|
1432
1469
|
if (audioOutput) {
|
|
1433
1470
|
await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
|
|
1434
|
-
this.agentSession._updateAgentState("listening");
|
|
1435
1471
|
}
|
|
1436
1472
|
if (speechHandle.interrupted) {
|
|
1437
1473
|
this.logger.debug(
|
|
@@ -1446,10 +1482,10 @@ ${instructions}` : instructions,
|
|
|
1446
1482
|
if (audioOutput) {
|
|
1447
1483
|
audioOutput.clearBuffer();
|
|
1448
1484
|
const playbackEv = await audioOutput.waitForPlayout();
|
|
1449
|
-
let
|
|
1450
|
-
if (audioOut == null ? void 0 : audioOut.firstFrameFut.done) {
|
|
1485
|
+
let playbackPositionInS = playbackEv.playbackPosition;
|
|
1486
|
+
if ((audioOut == null ? void 0 : audioOut.firstFrameFut.done) && !audioOut.firstFrameFut.rejected) {
|
|
1451
1487
|
this.logger.info(
|
|
1452
|
-
{ speech_id: speechHandle.id,
|
|
1488
|
+
{ speech_id: speechHandle.id, playbackPositionInS },
|
|
1453
1489
|
"playout interrupted"
|
|
1454
1490
|
);
|
|
1455
1491
|
if (playbackEv.synchronizedTranscript) {
|
|
@@ -1457,11 +1493,11 @@ ${instructions}` : instructions,
|
|
|
1457
1493
|
}
|
|
1458
1494
|
} else {
|
|
1459
1495
|
forwardedText = "";
|
|
1460
|
-
|
|
1496
|
+
playbackPositionInS = 0;
|
|
1461
1497
|
}
|
|
1462
1498
|
this.realtimeSession.truncate({
|
|
1463
1499
|
messageId: msgId,
|
|
1464
|
-
audioEndMs: Math.floor(
|
|
1500
|
+
audioEndMs: Math.floor(playbackPositionInS * 1e3),
|
|
1465
1501
|
modalities: msgModalities,
|
|
1466
1502
|
audioTranscript: forwardedText
|
|
1467
1503
|
});
|
|
@@ -1499,14 +1535,13 @@ ${instructions}` : instructions,
|
|
|
1499
1535
|
this.agentSession._conversationItemAdded(message);
|
|
1500
1536
|
}
|
|
1501
1537
|
speechHandle._markGenerationDone();
|
|
1502
|
-
toolOutput.firstToolStartedFuture.await.finally(() => {
|
|
1503
|
-
this.agentSession._updateAgentState("thinking");
|
|
1504
|
-
});
|
|
1505
1538
|
await executeToolsTask.result;
|
|
1539
|
+
if (toolOutput.output.length > 0) {
|
|
1540
|
+
this.agentSession._updateAgentState("thinking");
|
|
1541
|
+
} else if (this.agentSession.agentState === "speaking") {
|
|
1542
|
+
this.agentSession._updateAgentState("listening");
|
|
1543
|
+
}
|
|
1506
1544
|
if (toolOutput.output.length === 0) {
|
|
1507
|
-
if (!speechHandle.interrupted) {
|
|
1508
|
-
this.agentSession._updateAgentState("listening");
|
|
1509
|
-
}
|
|
1510
1545
|
return;
|
|
1511
1546
|
}
|
|
1512
1547
|
const { maxToolSteps } = this.agentSession.options;
|