@livekit/agents 1.0.37 → 1.0.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133):
  1. package/dist/cli.cjs.map +1 -1
  2. package/dist/inference/api_protos.cjs +68 -0
  3. package/dist/inference/api_protos.cjs.map +1 -1
  4. package/dist/inference/api_protos.d.cts +345 -4
  5. package/dist/inference/api_protos.d.ts +345 -4
  6. package/dist/inference/api_protos.d.ts.map +1 -1
  7. package/dist/inference/api_protos.js +60 -0
  8. package/dist/inference/api_protos.js.map +1 -1
  9. package/dist/inference/stt.cjs +32 -21
  10. package/dist/inference/stt.cjs.map +1 -1
  11. package/dist/inference/stt.d.ts.map +1 -1
  12. package/dist/inference/stt.js +34 -21
  13. package/dist/inference/stt.js.map +1 -1
  14. package/dist/ipc/inference_proc_executor.cjs.map +1 -1
  15. package/dist/ipc/job_proc_executor.cjs.map +1 -1
  16. package/dist/stt/stt.cjs +10 -0
  17. package/dist/stt/stt.cjs.map +1 -1
  18. package/dist/stt/stt.d.cts +12 -0
  19. package/dist/stt/stt.d.ts +12 -0
  20. package/dist/stt/stt.d.ts.map +1 -1
  21. package/dist/stt/stt.js +10 -0
  22. package/dist/stt/stt.js.map +1 -1
  23. package/dist/telemetry/traces.cjs +4 -3
  24. package/dist/telemetry/traces.cjs.map +1 -1
  25. package/dist/telemetry/traces.d.cts +2 -0
  26. package/dist/telemetry/traces.d.ts +2 -0
  27. package/dist/telemetry/traces.d.ts.map +1 -1
  28. package/dist/telemetry/traces.js +4 -3
  29. package/dist/telemetry/traces.js.map +1 -1
  30. package/dist/utils.cjs +6 -0
  31. package/dist/utils.cjs.map +1 -1
  32. package/dist/utils.d.cts +2 -0
  33. package/dist/utils.d.ts +2 -0
  34. package/dist/utils.d.ts.map +1 -1
  35. package/dist/utils.js +6 -0
  36. package/dist/utils.js.map +1 -1
  37. package/dist/voice/agent.cjs +5 -0
  38. package/dist/voice/agent.cjs.map +1 -1
  39. package/dist/voice/agent.d.ts.map +1 -1
  40. package/dist/voice/agent.js +5 -0
  41. package/dist/voice/agent.js.map +1 -1
  42. package/dist/voice/agent_activity.cjs +49 -23
  43. package/dist/voice/agent_activity.cjs.map +1 -1
  44. package/dist/voice/agent_activity.d.cts +1 -1
  45. package/dist/voice/agent_activity.d.ts +1 -1
  46. package/dist/voice/agent_activity.d.ts.map +1 -1
  47. package/dist/voice/agent_activity.js +50 -24
  48. package/dist/voice/agent_activity.js.map +1 -1
  49. package/dist/voice/agent_session.cjs +7 -5
  50. package/dist/voice/agent_session.cjs.map +1 -1
  51. package/dist/voice/agent_session.d.cts +5 -2
  52. package/dist/voice/agent_session.d.ts +5 -2
  53. package/dist/voice/agent_session.d.ts.map +1 -1
  54. package/dist/voice/agent_session.js +7 -5
  55. package/dist/voice/agent_session.js.map +1 -1
  56. package/dist/voice/audio_recognition.cjs +3 -1
  57. package/dist/voice/audio_recognition.cjs.map +1 -1
  58. package/dist/voice/audio_recognition.d.ts.map +1 -1
  59. package/dist/voice/audio_recognition.js +3 -1
  60. package/dist/voice/audio_recognition.js.map +1 -1
  61. package/dist/voice/avatar/datastream_io.cjs +6 -0
  62. package/dist/voice/avatar/datastream_io.cjs.map +1 -1
  63. package/dist/voice/avatar/datastream_io.d.cts +1 -0
  64. package/dist/voice/avatar/datastream_io.d.ts +1 -0
  65. package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
  66. package/dist/voice/avatar/datastream_io.js +6 -0
  67. package/dist/voice/avatar/datastream_io.js.map +1 -1
  68. package/dist/voice/background_audio.cjs.map +1 -1
  69. package/dist/voice/generation.cjs +14 -5
  70. package/dist/voice/generation.cjs.map +1 -1
  71. package/dist/voice/generation.d.cts +3 -2
  72. package/dist/voice/generation.d.ts +3 -2
  73. package/dist/voice/generation.d.ts.map +1 -1
  74. package/dist/voice/generation.js +14 -5
  75. package/dist/voice/generation.js.map +1 -1
  76. package/dist/voice/io.cjs +12 -0
  77. package/dist/voice/io.cjs.map +1 -1
  78. package/dist/voice/io.d.cts +19 -1
  79. package/dist/voice/io.d.ts +19 -1
  80. package/dist/voice/io.d.ts.map +1 -1
  81. package/dist/voice/io.js +12 -0
  82. package/dist/voice/io.js.map +1 -1
  83. package/dist/voice/recorder_io/recorder_io.cjs +91 -28
  84. package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
  85. package/dist/voice/recorder_io/recorder_io.d.cts +7 -1
  86. package/dist/voice/recorder_io/recorder_io.d.ts +7 -1
  87. package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
  88. package/dist/voice/recorder_io/recorder_io.js +91 -28
  89. package/dist/voice/recorder_io/recorder_io.js.map +1 -1
  90. package/dist/voice/room_io/_input.cjs +40 -11
  91. package/dist/voice/room_io/_input.cjs.map +1 -1
  92. package/dist/voice/room_io/_input.d.cts +4 -1
  93. package/dist/voice/room_io/_input.d.ts +4 -1
  94. package/dist/voice/room_io/_input.d.ts.map +1 -1
  95. package/dist/voice/room_io/_input.js +31 -2
  96. package/dist/voice/room_io/_input.js.map +1 -1
  97. package/dist/voice/room_io/_output.cjs +6 -0
  98. package/dist/voice/room_io/_output.cjs.map +1 -1
  99. package/dist/voice/room_io/_output.d.cts +1 -0
  100. package/dist/voice/room_io/_output.d.ts +1 -0
  101. package/dist/voice/room_io/_output.d.ts.map +1 -1
  102. package/dist/voice/room_io/_output.js +6 -0
  103. package/dist/voice/room_io/_output.js.map +1 -1
  104. package/dist/voice/room_io/room_io.cjs.map +1 -1
  105. package/dist/voice/room_io/room_io.d.cts +2 -2
  106. package/dist/voice/room_io/room_io.d.ts +2 -2
  107. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  108. package/dist/voice/room_io/room_io.js.map +1 -1
  109. package/dist/voice/speech_handle.cjs +2 -0
  110. package/dist/voice/speech_handle.cjs.map +1 -1
  111. package/dist/voice/speech_handle.d.cts +3 -0
  112. package/dist/voice/speech_handle.d.ts +3 -0
  113. package/dist/voice/speech_handle.d.ts.map +1 -1
  114. package/dist/voice/speech_handle.js +2 -0
  115. package/dist/voice/speech_handle.js.map +1 -1
  116. package/package.json +1 -1
  117. package/src/inference/api_protos.ts +83 -0
  118. package/src/inference/stt.ts +39 -22
  119. package/src/stt/stt.ts +21 -0
  120. package/src/telemetry/traces.ts +6 -2
  121. package/src/utils.ts +7 -0
  122. package/src/voice/agent.ts +9 -0
  123. package/src/voice/agent_activity.ts +72 -26
  124. package/src/voice/agent_session.ts +6 -5
  125. package/src/voice/audio_recognition.ts +2 -0
  126. package/src/voice/avatar/datastream_io.ts +8 -0
  127. package/src/voice/generation.ts +24 -12
  128. package/src/voice/io.ts +27 -5
  129. package/src/voice/recorder_io/recorder_io.ts +123 -31
  130. package/src/voice/room_io/_input.ts +32 -4
  131. package/src/voice/room_io/_output.ts +8 -0
  132. package/src/voice/room_io/room_io.ts +3 -1
  133. package/src/voice/speech_handle.ts +4 -0
@@ -1,5 +1,5 @@
1
1
  import { Mutex } from "@livekit/mutex";
2
- import { ROOT_CONTEXT, trace } from "@opentelemetry/api";
2
+ import { ROOT_CONTEXT, context as otelContext, trace } from "@opentelemetry/api";
3
3
  import { Heap } from "heap-js";
4
4
  import { AsyncLocalStorage } from "node:async_hooks";
5
5
  import { ReadableStream } from "node:stream/web";
@@ -455,8 +455,12 @@ class AgentActivity {
455
455
  this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
456
456
  }
457
457
  // recognition hooks
458
- onStartOfSpeech(_ev) {
459
- this.agentSession._updateUserState("speaking");
458
+ onStartOfSpeech(ev) {
459
+ let speechStartTime = Date.now();
460
+ if (ev) {
461
+ speechStartTime = speechStartTime - ev.speechDuration;
462
+ }
463
+ this.agentSession._updateUserState("speaking", speechStartTime);
460
464
  }
461
465
  onEndOfSpeech(ev) {
462
466
  let speechEndTime = Date.now();
@@ -833,6 +837,7 @@ ${instructions}` : instructions,
833
837
  );
834
838
  }
835
839
  async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
840
+ speechHandle._agentTurnContext = otelContext.active();
836
841
  speechHandleStorage.enterWith(speechHandle);
837
842
  const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
838
843
  const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
@@ -864,12 +869,15 @@ ${instructions}` : instructions,
864
869
  textOut = _textOut;
865
870
  tasks.push(textForwardTask);
866
871
  }
867
- const onFirstFrame = () => {
868
- this.agentSession._updateAgentState("speaking");
872
+ const onFirstFrame = (startedSpeakingAt) => {
873
+ this.agentSession._updateAgentState("speaking", {
874
+ startTime: startedSpeakingAt,
875
+ otelContext: speechHandle._agentTurnContext
876
+ });
869
877
  };
870
878
  if (!audioOutput) {
871
879
  if (textOut) {
872
- textOut.firstTextFut.await.finally(onFirstFrame);
880
+ textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
873
881
  }
874
882
  } else {
875
883
  let audioOut = null;
@@ -897,7 +905,7 @@ ${instructions}` : instructions,
897
905
  tasks.push(forwardTask);
898
906
  audioOut = _audioOut;
899
907
  }
900
- audioOut.firstFrameFut.await.finally(onFirstFrame);
908
+ audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
901
909
  }
902
910
  await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
903
911
  if (audioOutput) {
@@ -936,6 +944,7 @@ ${instructions}` : instructions,
936
944
  span
937
945
  }) => {
938
946
  var _a, _b, _c;
947
+ speechHandle._agentTurnContext = otelContext.active();
939
948
  span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
940
949
  if (instructions) {
941
950
  span.setAttribute(traceTypes.ATTR_INSTRUCTIONS, instructions);
@@ -1012,8 +1021,11 @@ ${instructions}` : instructions,
1012
1021
  tasks.push(textForwardTask);
1013
1022
  textOut = _textOut;
1014
1023
  }
1015
- const onFirstFrame = () => {
1016
- this.agentSession._updateAgentState("speaking");
1024
+ const onFirstFrame = (startedSpeakingAt) => {
1025
+ this.agentSession._updateAgentState("speaking", {
1026
+ startTime: startedSpeakingAt,
1027
+ otelContext: speechHandle._agentTurnContext
1028
+ });
1017
1029
  };
1018
1030
  let audioOut = null;
1019
1031
  if (audioOutput) {
@@ -1025,12 +1037,12 @@ ${instructions}` : instructions,
1025
1037
  );
1026
1038
  audioOut = _audioOut;
1027
1039
  tasks.push(forwardTask);
1028
- audioOut.firstFrameFut.await.finally(onFirstFrame);
1040
+ audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
1029
1041
  } else {
1030
1042
  throw Error("ttsStream is null when audioOutput is enabled");
1031
1043
  }
1032
1044
  } else {
1033
- textOut == null ? void 0 : textOut.firstTextFut.await.finally(onFirstFrame);
1045
+ textOut == null ? void 0 : textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
1034
1046
  }
1035
1047
  const onToolExecutionStarted = (f) => {
1036
1048
  speechHandle._itemAdded([f]);
@@ -1061,7 +1073,12 @@ ${instructions}` : instructions,
1061
1073
  msg.createdAt = replyStartedAt;
1062
1074
  }
1063
1075
  this.agent._chatCtx.insert(toolsMessages);
1064
- this.agentSession._toolItemsAdded(toolsMessages);
1076
+ const toolCallOutputs = toolsMessages.filter(
1077
+ (m) => m.type === "function_call_output"
1078
+ );
1079
+ if (toolCallOutputs.length > 0) {
1080
+ this.agentSession._toolItemsAdded(toolCallOutputs);
1081
+ }
1065
1082
  }
1066
1083
  if (speechHandle.interrupted) {
1067
1084
  this.logger.debug(
@@ -1078,9 +1095,9 @@ ${instructions}` : instructions,
1078
1095
  let forwardedText = (textOut == null ? void 0 : textOut.text) || "";
1079
1096
  if (audioOutput) {
1080
1097
  const playbackEv = await audioOutput.waitForPlayout();
1081
- if (audioOut == null ? void 0 : audioOut.firstFrameFut.done) {
1098
+ if ((audioOut == null ? void 0 : audioOut.firstFrameFut.done) && !audioOut.firstFrameFut.rejected) {
1082
1099
  this.logger.info(
1083
- { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
1100
+ { speech_id: speechHandle.id, playbackPositionInS: playbackEv.playbackPosition },
1084
1101
  "playout interrupted"
1085
1102
  );
1086
1103
  if (playbackEv.synchronizedTranscript) {
@@ -1218,7 +1235,12 @@ ${instructions}` : instructions,
1218
1235
  msg.createdAt = replyStartedAt;
1219
1236
  }
1220
1237
  this.agent._chatCtx.insert(toolMessages);
1221
- this.agentSession._toolItemsAdded(toolMessages);
1238
+ const toolCallOutputs = toolMessages.filter(
1239
+ (m) => m.type === "function_call_output"
1240
+ );
1241
+ if (toolCallOutputs.length > 0) {
1242
+ this.agentSession._toolItemsAdded(toolCallOutputs);
1243
+ }
1222
1244
  }
1223
1245
  };
1224
1246
  pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages) => tracer.startActiveSpan(
@@ -1261,6 +1283,7 @@ ${instructions}` : instructions,
1261
1283
  span
1262
1284
  }) {
1263
1285
  var _a, _b, _c;
1286
+ speechHandle._agentTurnContext = otelContext.active();
1264
1287
  span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
1265
1288
  speechHandleStorage.enterWith(speechHandle);
1266
1289
  if (!this.realtimeSession) {
@@ -1285,8 +1308,11 @@ ${instructions}` : instructions,
1285
1308
  if (speechHandle.interrupted) {
1286
1309
  return;
1287
1310
  }
1288
- const onFirstFrame = () => {
1289
- this.agentSession._updateAgentState("speaking");
1311
+ const onFirstFrame = (startedSpeakingAt) => {
1312
+ this.agentSession._updateAgentState("speaking", {
1313
+ startTime: startedSpeakingAt,
1314
+ otelContext: speechHandle._agentTurnContext
1315
+ });
1290
1316
  };
1291
1317
  const readMessages = async (abortController, outputs) => {
1292
1318
  replyAbortController.signal.addEventListener("abort", () => abortController.abort(), {
@@ -1361,10 +1387,10 @@ ${instructions}` : instructions,
1361
1387
  );
1362
1388
  forwardTasks.push(forwardTask);
1363
1389
  audioOut = _audioOut;
1364
- audioOut.firstFrameFut.await.finally(onFirstFrame);
1390
+ audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
1365
1391
  }
1366
1392
  } else if (textOut) {
1367
- textOut.firstTextFut.await.finally(onFirstFrame);
1393
+ textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
1368
1394
  }
1369
1395
  outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
1370
1396
  }
@@ -1443,10 +1469,10 @@ ${instructions}` : instructions,
1443
1469
  if (audioOutput) {
1444
1470
  audioOutput.clearBuffer();
1445
1471
  const playbackEv = await audioOutput.waitForPlayout();
1446
- let playbackPosition = playbackEv.playbackPosition;
1447
- if (audioOut == null ? void 0 : audioOut.firstFrameFut.done) {
1472
+ let playbackPositionInS = playbackEv.playbackPosition;
1473
+ if ((audioOut == null ? void 0 : audioOut.firstFrameFut.done) && !audioOut.firstFrameFut.rejected) {
1448
1474
  this.logger.info(
1449
- { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
1475
+ { speech_id: speechHandle.id, playbackPositionInS },
1450
1476
  "playout interrupted"
1451
1477
  );
1452
1478
  if (playbackEv.synchronizedTranscript) {
@@ -1454,11 +1480,11 @@ ${instructions}` : instructions,
1454
1480
  }
1455
1481
  } else {
1456
1482
  forwardedText = "";
1457
- playbackPosition = 0;
1483
+ playbackPositionInS = 0;
1458
1484
  }
1459
1485
  this.realtimeSession.truncate({
1460
1486
  messageId: msgId,
1461
- audioEndMs: Math.floor(playbackPosition),
1487
+ audioEndMs: Math.floor(playbackPositionInS * 1e3),
1462
1488
  modalities: msgModalities,
1463
1489
  audioTranscript: forwardedText
1464
1490
  });