@livekit/agents 1.0.37 → 1.0.38
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs.map +1 -1
- package/dist/inference/api_protos.cjs +68 -0
- package/dist/inference/api_protos.cjs.map +1 -1
- package/dist/inference/api_protos.d.cts +345 -4
- package/dist/inference/api_protos.d.ts +345 -4
- package/dist/inference/api_protos.d.ts.map +1 -1
- package/dist/inference/api_protos.js +60 -0
- package/dist/inference/api_protos.js.map +1 -1
- package/dist/inference/stt.cjs +32 -21
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +34 -21
- package/dist/inference/stt.js.map +1 -1
- package/dist/ipc/inference_proc_executor.cjs.map +1 -1
- package/dist/ipc/job_proc_executor.cjs.map +1 -1
- package/dist/stt/stt.cjs +10 -0
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +12 -0
- package/dist/stt/stt.d.ts +12 -0
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +10 -0
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/traces.cjs +4 -3
- package/dist/telemetry/traces.cjs.map +1 -1
- package/dist/telemetry/traces.d.cts +2 -0
- package/dist/telemetry/traces.d.ts +2 -0
- package/dist/telemetry/traces.d.ts.map +1 -1
- package/dist/telemetry/traces.js +4 -3
- package/dist/telemetry/traces.js.map +1 -1
- package/dist/utils.cjs +6 -0
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +2 -0
- package/dist/utils.d.ts +2 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +6 -0
- package/dist/utils.js.map +1 -1
- package/dist/voice/agent.cjs +5 -0
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +5 -0
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent_activity.cjs +49 -23
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +1 -1
- package/dist/voice/agent_activity.d.ts +1 -1
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +50 -24
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +7 -5
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +5 -2
- package/dist/voice/agent_session.d.ts +5 -2
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +7 -5
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +3 -1
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +3 -1
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/avatar/datastream_io.cjs +6 -0
- package/dist/voice/avatar/datastream_io.cjs.map +1 -1
- package/dist/voice/avatar/datastream_io.d.cts +1 -0
- package/dist/voice/avatar/datastream_io.d.ts +1 -0
- package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
- package/dist/voice/avatar/datastream_io.js +6 -0
- package/dist/voice/avatar/datastream_io.js.map +1 -1
- package/dist/voice/background_audio.cjs.map +1 -1
- package/dist/voice/generation.cjs +14 -5
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.cts +3 -2
- package/dist/voice/generation.d.ts +3 -2
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +14 -5
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/io.cjs +12 -0
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +19 -1
- package/dist/voice/io.d.ts +19 -1
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +12 -0
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/recorder_io/recorder_io.cjs +91 -28
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
- package/dist/voice/recorder_io/recorder_io.d.cts +7 -1
- package/dist/voice/recorder_io/recorder_io.d.ts +7 -1
- package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
- package/dist/voice/recorder_io/recorder_io.js +91 -28
- package/dist/voice/recorder_io/recorder_io.js.map +1 -1
- package/dist/voice/room_io/_input.cjs +40 -11
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.cts +4 -1
- package/dist/voice/room_io/_input.d.ts +4 -1
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +31 -2
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/room_io/_output.cjs +6 -0
- package/dist/voice/room_io/_output.cjs.map +1 -1
- package/dist/voice/room_io/_output.d.cts +1 -0
- package/dist/voice/room_io/_output.d.ts +1 -0
- package/dist/voice/room_io/_output.d.ts.map +1 -1
- package/dist/voice/room_io/_output.js +6 -0
- package/dist/voice/room_io/_output.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +2 -2
- package/dist/voice/room_io/room_io.d.ts +2 -2
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/speech_handle.cjs +2 -0
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +3 -0
- package/dist/voice/speech_handle.d.ts +3 -0
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +2 -0
- package/dist/voice/speech_handle.js.map +1 -1
- package/package.json +1 -1
- package/src/inference/api_protos.ts +83 -0
- package/src/inference/stt.ts +39 -22
- package/src/stt/stt.ts +21 -0
- package/src/telemetry/traces.ts +6 -2
- package/src/utils.ts +7 -0
- package/src/voice/agent.ts +9 -0
- package/src/voice/agent_activity.ts +72 -26
- package/src/voice/agent_session.ts +6 -5
- package/src/voice/audio_recognition.ts +2 -0
- package/src/voice/avatar/datastream_io.ts +8 -0
- package/src/voice/generation.ts +24 -12
- package/src/voice/io.ts +27 -5
- package/src/voice/recorder_io/recorder_io.ts +123 -31
- package/src/voice/room_io/_input.ts +32 -4
- package/src/voice/room_io/_output.ts +8 -0
- package/src/voice/room_io/room_io.ts +3 -1
- package/src/voice/speech_handle.ts +4 -0
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { Mutex } from "@livekit/mutex";
|
|
2
|
-
import { ROOT_CONTEXT, trace } from "@opentelemetry/api";
|
|
2
|
+
import { ROOT_CONTEXT, context as otelContext, trace } from "@opentelemetry/api";
|
|
3
3
|
import { Heap } from "heap-js";
|
|
4
4
|
import { AsyncLocalStorage } from "node:async_hooks";
|
|
5
5
|
import { ReadableStream } from "node:stream/web";
|
|
@@ -455,8 +455,12 @@ class AgentActivity {
|
|
|
455
455
|
this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
456
456
|
}
|
|
457
457
|
// recognition hooks
|
|
458
|
-
onStartOfSpeech(
|
|
459
|
-
|
|
458
|
+
onStartOfSpeech(ev) {
|
|
459
|
+
let speechStartTime = Date.now();
|
|
460
|
+
if (ev) {
|
|
461
|
+
speechStartTime = speechStartTime - ev.speechDuration;
|
|
462
|
+
}
|
|
463
|
+
this.agentSession._updateUserState("speaking", speechStartTime);
|
|
460
464
|
}
|
|
461
465
|
onEndOfSpeech(ev) {
|
|
462
466
|
let speechEndTime = Date.now();
|
|
@@ -833,6 +837,7 @@ ${instructions}` : instructions,
|
|
|
833
837
|
);
|
|
834
838
|
}
|
|
835
839
|
async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
|
|
840
|
+
speechHandle._agentTurnContext = otelContext.active();
|
|
836
841
|
speechHandleStorage.enterWith(speechHandle);
|
|
837
842
|
const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
|
|
838
843
|
const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
|
|
@@ -864,12 +869,15 @@ ${instructions}` : instructions,
|
|
|
864
869
|
textOut = _textOut;
|
|
865
870
|
tasks.push(textForwardTask);
|
|
866
871
|
}
|
|
867
|
-
const onFirstFrame = () => {
|
|
868
|
-
this.agentSession._updateAgentState("speaking"
|
|
872
|
+
const onFirstFrame = (startedSpeakingAt) => {
|
|
873
|
+
this.agentSession._updateAgentState("speaking", {
|
|
874
|
+
startTime: startedSpeakingAt,
|
|
875
|
+
otelContext: speechHandle._agentTurnContext
|
|
876
|
+
});
|
|
869
877
|
};
|
|
870
878
|
if (!audioOutput) {
|
|
871
879
|
if (textOut) {
|
|
872
|
-
textOut.firstTextFut.await.
|
|
880
|
+
textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
|
|
873
881
|
}
|
|
874
882
|
} else {
|
|
875
883
|
let audioOut = null;
|
|
@@ -897,7 +905,7 @@ ${instructions}` : instructions,
|
|
|
897
905
|
tasks.push(forwardTask);
|
|
898
906
|
audioOut = _audioOut;
|
|
899
907
|
}
|
|
900
|
-
audioOut.firstFrameFut.await.
|
|
908
|
+
audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
|
|
901
909
|
}
|
|
902
910
|
await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
|
|
903
911
|
if (audioOutput) {
|
|
@@ -936,6 +944,7 @@ ${instructions}` : instructions,
|
|
|
936
944
|
span
|
|
937
945
|
}) => {
|
|
938
946
|
var _a, _b, _c;
|
|
947
|
+
speechHandle._agentTurnContext = otelContext.active();
|
|
939
948
|
span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
940
949
|
if (instructions) {
|
|
941
950
|
span.setAttribute(traceTypes.ATTR_INSTRUCTIONS, instructions);
|
|
@@ -1012,8 +1021,11 @@ ${instructions}` : instructions,
|
|
|
1012
1021
|
tasks.push(textForwardTask);
|
|
1013
1022
|
textOut = _textOut;
|
|
1014
1023
|
}
|
|
1015
|
-
const onFirstFrame = () => {
|
|
1016
|
-
this.agentSession._updateAgentState("speaking"
|
|
1024
|
+
const onFirstFrame = (startedSpeakingAt) => {
|
|
1025
|
+
this.agentSession._updateAgentState("speaking", {
|
|
1026
|
+
startTime: startedSpeakingAt,
|
|
1027
|
+
otelContext: speechHandle._agentTurnContext
|
|
1028
|
+
});
|
|
1017
1029
|
};
|
|
1018
1030
|
let audioOut = null;
|
|
1019
1031
|
if (audioOutput) {
|
|
@@ -1025,12 +1037,12 @@ ${instructions}` : instructions,
|
|
|
1025
1037
|
);
|
|
1026
1038
|
audioOut = _audioOut;
|
|
1027
1039
|
tasks.push(forwardTask);
|
|
1028
|
-
audioOut.firstFrameFut.await.
|
|
1040
|
+
audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
|
|
1029
1041
|
} else {
|
|
1030
1042
|
throw Error("ttsStream is null when audioOutput is enabled");
|
|
1031
1043
|
}
|
|
1032
1044
|
} else {
|
|
1033
|
-
textOut == null ? void 0 : textOut.firstTextFut.await.
|
|
1045
|
+
textOut == null ? void 0 : textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
|
|
1034
1046
|
}
|
|
1035
1047
|
const onToolExecutionStarted = (f) => {
|
|
1036
1048
|
speechHandle._itemAdded([f]);
|
|
@@ -1061,7 +1073,12 @@ ${instructions}` : instructions,
|
|
|
1061
1073
|
msg.createdAt = replyStartedAt;
|
|
1062
1074
|
}
|
|
1063
1075
|
this.agent._chatCtx.insert(toolsMessages);
|
|
1064
|
-
|
|
1076
|
+
const toolCallOutputs = toolsMessages.filter(
|
|
1077
|
+
(m) => m.type === "function_call_output"
|
|
1078
|
+
);
|
|
1079
|
+
if (toolCallOutputs.length > 0) {
|
|
1080
|
+
this.agentSession._toolItemsAdded(toolCallOutputs);
|
|
1081
|
+
}
|
|
1065
1082
|
}
|
|
1066
1083
|
if (speechHandle.interrupted) {
|
|
1067
1084
|
this.logger.debug(
|
|
@@ -1078,9 +1095,9 @@ ${instructions}` : instructions,
|
|
|
1078
1095
|
let forwardedText = (textOut == null ? void 0 : textOut.text) || "";
|
|
1079
1096
|
if (audioOutput) {
|
|
1080
1097
|
const playbackEv = await audioOutput.waitForPlayout();
|
|
1081
|
-
if (audioOut == null ? void 0 : audioOut.firstFrameFut.done) {
|
|
1098
|
+
if ((audioOut == null ? void 0 : audioOut.firstFrameFut.done) && !audioOut.firstFrameFut.rejected) {
|
|
1082
1099
|
this.logger.info(
|
|
1083
|
-
{ speech_id: speechHandle.id,
|
|
1100
|
+
{ speech_id: speechHandle.id, playbackPositionInS: playbackEv.playbackPosition },
|
|
1084
1101
|
"playout interrupted"
|
|
1085
1102
|
);
|
|
1086
1103
|
if (playbackEv.synchronizedTranscript) {
|
|
@@ -1218,7 +1235,12 @@ ${instructions}` : instructions,
|
|
|
1218
1235
|
msg.createdAt = replyStartedAt;
|
|
1219
1236
|
}
|
|
1220
1237
|
this.agent._chatCtx.insert(toolMessages);
|
|
1221
|
-
|
|
1238
|
+
const toolCallOutputs = toolMessages.filter(
|
|
1239
|
+
(m) => m.type === "function_call_output"
|
|
1240
|
+
);
|
|
1241
|
+
if (toolCallOutputs.length > 0) {
|
|
1242
|
+
this.agentSession._toolItemsAdded(toolCallOutputs);
|
|
1243
|
+
}
|
|
1222
1244
|
}
|
|
1223
1245
|
};
|
|
1224
1246
|
pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages) => tracer.startActiveSpan(
|
|
@@ -1261,6 +1283,7 @@ ${instructions}` : instructions,
|
|
|
1261
1283
|
span
|
|
1262
1284
|
}) {
|
|
1263
1285
|
var _a, _b, _c;
|
|
1286
|
+
speechHandle._agentTurnContext = otelContext.active();
|
|
1264
1287
|
span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
1265
1288
|
speechHandleStorage.enterWith(speechHandle);
|
|
1266
1289
|
if (!this.realtimeSession) {
|
|
@@ -1285,8 +1308,11 @@ ${instructions}` : instructions,
|
|
|
1285
1308
|
if (speechHandle.interrupted) {
|
|
1286
1309
|
return;
|
|
1287
1310
|
}
|
|
1288
|
-
const onFirstFrame = () => {
|
|
1289
|
-
this.agentSession._updateAgentState("speaking"
|
|
1311
|
+
const onFirstFrame = (startedSpeakingAt) => {
|
|
1312
|
+
this.agentSession._updateAgentState("speaking", {
|
|
1313
|
+
startTime: startedSpeakingAt,
|
|
1314
|
+
otelContext: speechHandle._agentTurnContext
|
|
1315
|
+
});
|
|
1290
1316
|
};
|
|
1291
1317
|
const readMessages = async (abortController, outputs) => {
|
|
1292
1318
|
replyAbortController.signal.addEventListener("abort", () => abortController.abort(), {
|
|
@@ -1361,10 +1387,10 @@ ${instructions}` : instructions,
|
|
|
1361
1387
|
);
|
|
1362
1388
|
forwardTasks.push(forwardTask);
|
|
1363
1389
|
audioOut = _audioOut;
|
|
1364
|
-
audioOut.firstFrameFut.await.
|
|
1390
|
+
audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
|
|
1365
1391
|
}
|
|
1366
1392
|
} else if (textOut) {
|
|
1367
|
-
textOut.firstTextFut.await.
|
|
1393
|
+
textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
|
|
1368
1394
|
}
|
|
1369
1395
|
outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
|
|
1370
1396
|
}
|
|
@@ -1443,10 +1469,10 @@ ${instructions}` : instructions,
|
|
|
1443
1469
|
if (audioOutput) {
|
|
1444
1470
|
audioOutput.clearBuffer();
|
|
1445
1471
|
const playbackEv = await audioOutput.waitForPlayout();
|
|
1446
|
-
let
|
|
1447
|
-
if (audioOut == null ? void 0 : audioOut.firstFrameFut.done) {
|
|
1472
|
+
let playbackPositionInS = playbackEv.playbackPosition;
|
|
1473
|
+
if ((audioOut == null ? void 0 : audioOut.firstFrameFut.done) && !audioOut.firstFrameFut.rejected) {
|
|
1448
1474
|
this.logger.info(
|
|
1449
|
-
{ speech_id: speechHandle.id,
|
|
1475
|
+
{ speech_id: speechHandle.id, playbackPositionInS },
|
|
1450
1476
|
"playout interrupted"
|
|
1451
1477
|
);
|
|
1452
1478
|
if (playbackEv.synchronizedTranscript) {
|
|
@@ -1454,11 +1480,11 @@ ${instructions}` : instructions,
|
|
|
1454
1480
|
}
|
|
1455
1481
|
} else {
|
|
1456
1482
|
forwardedText = "";
|
|
1457
|
-
|
|
1483
|
+
playbackPositionInS = 0;
|
|
1458
1484
|
}
|
|
1459
1485
|
this.realtimeSession.truncate({
|
|
1460
1486
|
messageId: msgId,
|
|
1461
|
-
audioEndMs: Math.floor(
|
|
1487
|
+
audioEndMs: Math.floor(playbackPositionInS * 1e3),
|
|
1462
1488
|
modalities: msgModalities,
|
|
1463
1489
|
audioTranscript: forwardedText
|
|
1464
1490
|
});
|