@livekit/agents 1.0.16 → 1.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. package/dist/inference/llm.cjs +35 -13
  2. package/dist/inference/llm.cjs.map +1 -1
  3. package/dist/inference/llm.d.cts +10 -5
  4. package/dist/inference/llm.d.ts +10 -5
  5. package/dist/inference/llm.d.ts.map +1 -1
  6. package/dist/inference/llm.js +35 -13
  7. package/dist/inference/llm.js.map +1 -1
  8. package/dist/llm/chat_context.d.cts +1 -1
  9. package/dist/llm/chat_context.d.ts +1 -1
  10. package/dist/llm/llm.cjs.map +1 -1
  11. package/dist/llm/llm.d.cts +1 -1
  12. package/dist/llm/llm.d.ts +1 -1
  13. package/dist/llm/llm.d.ts.map +1 -1
  14. package/dist/llm/llm.js.map +1 -1
  15. package/dist/llm/provider_format/google.cjs.map +1 -1
  16. package/dist/llm/provider_format/google.d.cts +1 -1
  17. package/dist/llm/provider_format/google.d.ts +1 -1
  18. package/dist/llm/provider_format/google.d.ts.map +1 -1
  19. package/dist/llm/provider_format/google.js.map +1 -1
  20. package/dist/llm/provider_format/index.d.cts +1 -1
  21. package/dist/llm/provider_format/index.d.ts +1 -1
  22. package/dist/llm/provider_format/index.d.ts.map +1 -1
  23. package/dist/llm/realtime.cjs.map +1 -1
  24. package/dist/llm/realtime.d.cts +4 -0
  25. package/dist/llm/realtime.d.ts +4 -0
  26. package/dist/llm/realtime.d.ts.map +1 -1
  27. package/dist/llm/realtime.js.map +1 -1
  28. package/dist/llm/utils.cjs +2 -2
  29. package/dist/llm/utils.cjs.map +1 -1
  30. package/dist/llm/utils.d.cts +1 -1
  31. package/dist/llm/utils.d.ts +1 -1
  32. package/dist/llm/utils.d.ts.map +1 -1
  33. package/dist/llm/utils.js +2 -2
  34. package/dist/llm/utils.js.map +1 -1
  35. package/dist/llm/zod-utils.cjs +6 -3
  36. package/dist/llm/zod-utils.cjs.map +1 -1
  37. package/dist/llm/zod-utils.d.cts +1 -1
  38. package/dist/llm/zod-utils.d.ts +1 -1
  39. package/dist/llm/zod-utils.d.ts.map +1 -1
  40. package/dist/llm/zod-utils.js +6 -3
  41. package/dist/llm/zod-utils.js.map +1 -1
  42. package/dist/llm/zod-utils.test.cjs +83 -0
  43. package/dist/llm/zod-utils.test.cjs.map +1 -1
  44. package/dist/llm/zod-utils.test.js +83 -0
  45. package/dist/llm/zod-utils.test.js.map +1 -1
  46. package/dist/stt/stt.cjs +0 -1
  47. package/dist/stt/stt.cjs.map +1 -1
  48. package/dist/stt/stt.d.ts.map +1 -1
  49. package/dist/stt/stt.js +0 -1
  50. package/dist/stt/stt.js.map +1 -1
  51. package/dist/tts/tts.cjs +2 -4
  52. package/dist/tts/tts.cjs.map +1 -1
  53. package/dist/tts/tts.d.ts.map +1 -1
  54. package/dist/tts/tts.js +3 -5
  55. package/dist/tts/tts.js.map +1 -1
  56. package/dist/utils.cjs.map +1 -1
  57. package/dist/utils.d.cts +7 -0
  58. package/dist/utils.d.ts +7 -0
  59. package/dist/utils.d.ts.map +1 -1
  60. package/dist/utils.js.map +1 -1
  61. package/dist/voice/agent_activity.cjs +69 -20
  62. package/dist/voice/agent_activity.cjs.map +1 -1
  63. package/dist/voice/agent_activity.d.ts.map +1 -1
  64. package/dist/voice/agent_activity.js +69 -20
  65. package/dist/voice/agent_activity.js.map +1 -1
  66. package/dist/voice/agent_session.cjs +40 -1
  67. package/dist/voice/agent_session.cjs.map +1 -1
  68. package/dist/voice/agent_session.d.cts +5 -0
  69. package/dist/voice/agent_session.d.ts +5 -0
  70. package/dist/voice/agent_session.d.ts.map +1 -1
  71. package/dist/voice/agent_session.js +40 -1
  72. package/dist/voice/agent_session.js.map +1 -1
  73. package/dist/voice/interruption_detection.test.cjs +114 -0
  74. package/dist/voice/interruption_detection.test.cjs.map +1 -0
  75. package/dist/voice/interruption_detection.test.js +113 -0
  76. package/dist/voice/interruption_detection.test.js.map +1 -0
  77. package/dist/voice/room_io/room_io.cjs +3 -0
  78. package/dist/voice/room_io/room_io.cjs.map +1 -1
  79. package/dist/voice/room_io/room_io.d.cts +1 -0
  80. package/dist/voice/room_io/room_io.d.ts +1 -0
  81. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  82. package/dist/voice/room_io/room_io.js +3 -0
  83. package/dist/voice/room_io/room_io.js.map +1 -1
  84. package/package.json +3 -3
  85. package/src/inference/llm.ts +53 -21
  86. package/src/llm/__snapshots__/zod-utils.test.ts.snap +218 -0
  87. package/src/llm/llm.ts +1 -1
  88. package/src/llm/provider_format/google.ts +4 -4
  89. package/src/llm/realtime.ts +8 -1
  90. package/src/llm/utils.ts +7 -2
  91. package/src/llm/zod-utils.test.ts +101 -0
  92. package/src/llm/zod-utils.ts +12 -3
  93. package/src/stt/stt.ts +2 -1
  94. package/src/tts/tts.ts +7 -5
  95. package/src/utils.ts +17 -0
  96. package/src/voice/agent_activity.ts +96 -24
  97. package/src/voice/agent_session.ts +54 -0
  98. package/src/voice/interruption_detection.test.ts +151 -0
  99. package/src/voice/room_io/room_io.ts +4 -0
@@ -152,6 +152,11 @@ class AgentActivity {
152
152
  } catch (error) {
153
153
  this.logger.error(error, "failed to update the tools");
154
154
  }
155
+ if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
156
+ this.logger.error(
157
+ "audio output is enabled but RealtimeModel has no audio modality and no TTS is set. Either enable audio modality in the RealtimeModel or set a TTS model."
158
+ );
159
+ }
155
160
  } else if (this.llm instanceof LLM) {
156
161
  try {
157
162
  updateInstructions({
@@ -449,7 +454,9 @@ class AgentActivity {
449
454
  }
450
455
  if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
451
456
  const text = this.audioRecognition.currentTranscript;
452
- if (text && splitWords(text, true).length < this.agentSession.options.minInterruptionWords) {
457
+ const normalizedText = text ?? "";
458
+ const wordCount = splitWords(normalizedText, true).length;
459
+ if (wordCount < this.agentSession.options.minInterruptionWords) {
453
460
  return;
454
461
  }
455
462
  }
@@ -551,10 +558,19 @@ class AgentActivity {
551
558
  this.logger.warn({ user_input: info.newTranscript }, "skipping user input, task is draining");
552
559
  return true;
553
560
  }
554
- if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0 && info.newTranscript.split(" ").length < this.agentSession.options.minInterruptionWords) {
555
- this.cancelPreemptiveGeneration();
556
- this.logger.info("skipping user input, new_transcript is too short");
557
- return false;
561
+ if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0) {
562
+ const wordCount = splitWords(info.newTranscript, true).length;
563
+ if (wordCount < this.agentSession.options.minInterruptionWords) {
564
+ this.cancelPreemptiveGeneration();
565
+ this.logger.info(
566
+ {
567
+ wordCount,
568
+ minInterruptionWords: this.agentSession.options.minInterruptionWords
569
+ },
570
+ "skipping user input, word count below minimum interruption threshold"
571
+ );
572
+ return false;
573
+ }
558
574
  }
559
575
  const oldTask = this._userTurnCompletedTask;
560
576
  this._userTurnCompletedTask = this.createSpeechTask({
@@ -1197,7 +1213,22 @@ ${instructions}` : instructions,
1197
1213
  );
1198
1214
  break;
1199
1215
  }
1200
- const trNodeResult = await this.agent.transcriptionNode(msg.textStream, modelSettings);
1216
+ const msgModalities = msg.modalities ? await msg.modalities : void 0;
1217
+ let ttsTextInput = null;
1218
+ let trTextInput;
1219
+ if (msgModalities && !msgModalities.includes("audio") && this.tts) {
1220
+ if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
1221
+ this.logger.warn(
1222
+ "text response received from realtime API, falling back to use a TTS model."
1223
+ );
1224
+ }
1225
+ const [_ttsTextInput, _trTextInput] = msg.textStream.tee();
1226
+ ttsTextInput = _ttsTextInput;
1227
+ trTextInput = _trTextInput;
1228
+ } else {
1229
+ trTextInput = msg.textStream;
1230
+ }
1231
+ const trNodeResult = await this.agent.transcriptionNode(trTextInput, modelSettings);
1201
1232
  let textOut = null;
1202
1233
  if (trNodeResult) {
1203
1234
  const [textForwardTask, _textOut] = performTextForwarding(
@@ -1210,28 +1241,44 @@ ${instructions}` : instructions,
1210
1241
  }
1211
1242
  let audioOut = null;
1212
1243
  if (audioOutput) {
1213
- const realtimeAudio = await this.agent.realtimeAudioOutputNode(
1214
- msg.audioStream,
1215
- modelSettings
1216
- );
1217
- if (realtimeAudio) {
1244
+ let realtimeAudioResult = null;
1245
+ if (ttsTextInput) {
1246
+ const [ttsTask, ttsStream] = performTTSInference(
1247
+ (...args) => this.agent.ttsNode(...args),
1248
+ ttsTextInput,
1249
+ modelSettings,
1250
+ abortController
1251
+ );
1252
+ tasks.push(ttsTask);
1253
+ realtimeAudioResult = ttsStream;
1254
+ } else if (msgModalities && msgModalities.includes("audio")) {
1255
+ realtimeAudioResult = await this.agent.realtimeAudioOutputNode(
1256
+ msg.audioStream,
1257
+ modelSettings
1258
+ );
1259
+ } else if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
1260
+ this.logger.error(
1261
+ "Text message received from Realtime API with audio modality. This usually happens when text chat context is synced to the API. Try to add a TTS model as fallback or use text modality with TTS instead."
1262
+ );
1263
+ } else {
1264
+ this.logger.warn(
1265
+ "audio output is enabled but neither tts nor realtime audio is available"
1266
+ );
1267
+ }
1268
+ if (realtimeAudioResult) {
1218
1269
  const [forwardTask, _audioOut] = performAudioForwarding(
1219
- realtimeAudio,
1270
+ realtimeAudioResult,
1220
1271
  audioOutput,
1221
1272
  abortController
1222
1273
  );
1223
1274
  forwardTasks.push(forwardTask);
1224
1275
  audioOut = _audioOut;
1225
1276
  audioOut.firstFrameFut.await.finally(onFirstFrame);
1226
- } else {
1227
- this.logger.warn(
1228
- "audio output is enabled but neither tts nor realtime audio is available"
1229
- );
1230
1277
  }
1231
1278
  } else if (textOut) {
1232
1279
  textOut.firstTextFut.await.finally(onFirstFrame);
1233
1280
  }
1234
- outputs.push([msg.messageId, textOut, audioOut]);
1281
+ outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
1235
1282
  }
1236
1283
  await waitFor(forwardTasks);
1237
1284
  } catch (error) {
@@ -1301,7 +1348,7 @@ ${instructions}` : instructions,
1301
1348
  replyAbortController.abort();
1302
1349
  await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1303
1350
  if (messageOutputs.length > 0) {
1304
- const [msgId, textOut, audioOut] = messageOutputs[0];
1351
+ const [msgId, textOut, audioOut, msgModalities] = messageOutputs[0];
1305
1352
  let forwardedText = (textOut == null ? void 0 : textOut.text) || "";
1306
1353
  if (audioOutput) {
1307
1354
  audioOutput.clearBuffer();
@@ -1321,7 +1368,9 @@ ${instructions}` : instructions,
1321
1368
  }
1322
1369
  this.realtimeSession.truncate({
1323
1370
  messageId: msgId,
1324
- audioEndMs: Math.floor(playbackPosition)
1371
+ audioEndMs: Math.floor(playbackPosition),
1372
+ modalities: msgModalities,
1373
+ audioTranscript: forwardedText
1325
1374
  });
1326
1375
  }
1327
1376
  if (forwardedText) {
@@ -1345,7 +1394,7 @@ ${instructions}` : instructions,
1345
1394
  return;
1346
1395
  }
1347
1396
  if (messageOutputs.length > 0) {
1348
- const [msgId, textOut, _] = messageOutputs[0];
1397
+ const [msgId, textOut, _, __] = messageOutputs[0];
1349
1398
  const message = ChatMessage.create({
1350
1399
  role: "assistant",
1351
1400
  content: (textOut == null ? void 0 : textOut.text) || "",