@livekit/agents 1.0.16 → 1.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/inference/llm.cjs +35 -13
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.cts +10 -5
- package/dist/inference/llm.d.ts +10 -5
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js +35 -13
- package/dist/inference/llm.js.map +1 -1
- package/dist/llm/chat_context.d.cts +1 -1
- package/dist/llm/chat_context.d.ts +1 -1
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.cts +1 -1
- package/dist/llm/llm.d.ts +1 -1
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js.map +1 -1
- package/dist/llm/provider_format/google.cjs.map +1 -1
- package/dist/llm/provider_format/google.d.cts +1 -1
- package/dist/llm/provider_format/google.d.ts +1 -1
- package/dist/llm/provider_format/google.d.ts.map +1 -1
- package/dist/llm/provider_format/google.js.map +1 -1
- package/dist/llm/provider_format/index.d.cts +1 -1
- package/dist/llm/provider_format/index.d.ts +1 -1
- package/dist/llm/provider_format/index.d.ts.map +1 -1
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +4 -0
- package/dist/llm/realtime.d.ts +4 -0
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js.map +1 -1
- package/dist/llm/utils.cjs +2 -2
- package/dist/llm/utils.cjs.map +1 -1
- package/dist/llm/utils.d.cts +1 -1
- package/dist/llm/utils.d.ts +1 -1
- package/dist/llm/utils.d.ts.map +1 -1
- package/dist/llm/utils.js +2 -2
- package/dist/llm/utils.js.map +1 -1
- package/dist/llm/zod-utils.cjs +6 -3
- package/dist/llm/zod-utils.cjs.map +1 -1
- package/dist/llm/zod-utils.d.cts +1 -1
- package/dist/llm/zod-utils.d.ts +1 -1
- package/dist/llm/zod-utils.d.ts.map +1 -1
- package/dist/llm/zod-utils.js +6 -3
- package/dist/llm/zod-utils.js.map +1 -1
- package/dist/llm/zod-utils.test.cjs +83 -0
- package/dist/llm/zod-utils.test.cjs.map +1 -1
- package/dist/llm/zod-utils.test.js +83 -0
- package/dist/llm/zod-utils.test.js.map +1 -1
- package/dist/stt/stt.cjs +0 -1
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +0 -1
- package/dist/stt/stt.js.map +1 -1
- package/dist/tts/tts.cjs +2 -4
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +3 -5
- package/dist/tts/tts.js.map +1 -1
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +7 -0
- package/dist/utils.d.ts +7 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js.map +1 -1
- package/dist/voice/agent_activity.cjs +69 -20
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +69 -20
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +40 -1
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +5 -0
- package/dist/voice/agent_session.d.ts +5 -0
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +40 -1
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/interruption_detection.test.cjs +114 -0
- package/dist/voice/interruption_detection.test.cjs.map +1 -0
- package/dist/voice/interruption_detection.test.js +113 -0
- package/dist/voice/interruption_detection.test.js.map +1 -0
- package/dist/voice/room_io/room_io.cjs +3 -0
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +1 -0
- package/dist/voice/room_io/room_io.d.ts +1 -0
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +3 -0
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/package.json +3 -3
- package/src/inference/llm.ts +53 -21
- package/src/llm/__snapshots__/zod-utils.test.ts.snap +218 -0
- package/src/llm/llm.ts +1 -1
- package/src/llm/provider_format/google.ts +4 -4
- package/src/llm/realtime.ts +8 -1
- package/src/llm/utils.ts +7 -2
- package/src/llm/zod-utils.test.ts +101 -0
- package/src/llm/zod-utils.ts +12 -3
- package/src/stt/stt.ts +2 -1
- package/src/tts/tts.ts +7 -5
- package/src/utils.ts +17 -0
- package/src/voice/agent_activity.ts +96 -24
- package/src/voice/agent_session.ts +54 -0
- package/src/voice/interruption_detection.test.ts +151 -0
- package/src/voice/room_io/room_io.ts +4 -0
@@ -152,6 +152,11 @@ class AgentActivity {
       } catch (error) {
         this.logger.error(error, "failed to update the tools");
       }
+      if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
+        this.logger.error(
+          "audio output is enabled but RealtimeModel has no audio modality and no TTS is set. Either enable audio modality in the RealtimeModel or set a TTS model."
+        );
+      }
     } else if (this.llm instanceof LLM) {
       try {
         updateInstructions({
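The new block only logs a misconfiguration; it does not change control flow. A minimal standalone sketch of the same predicate, assuming a boolean audioOutput capability flag as in the diff (the function name hasUsableAudioPath and its options bag are illustrative, not part of the package):

    // Illustrative only: mirrors the guard above. Audio output is only usable
    // when the realtime model itself produces audio or a TTS model is configured.
    function hasUsableAudioPath(opts: {
      realtimeAudioOutput: boolean; // llm.capabilities.audioOutput
      ttsConfigured: boolean;       // a TTS model is set
      audioOutputEnabled: boolean;  // the session has an audio output attached
    }): boolean {
      if (!opts.audioOutputEnabled) return true; // nothing to synthesize
      return opts.realtimeAudioOutput || opts.ttsConfigured;
    }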
@@ -449,7 +454,9 @@ class AgentActivity {
     }
     if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
       const text = this.audioRecognition.currentTranscript;
-
+      const normalizedText = text ?? "";
+      const wordCount = splitWords(normalizedText, true).length;
+      if (wordCount < this.agentSession.options.minInterruptionWords) {
         return;
       }
     }
@@ -551,10 +558,19 @@ class AgentActivity {
       this.logger.warn({ user_input: info.newTranscript }, "skipping user input, task is draining");
       return true;
     }
-    if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0
-
-      this.
-
+    if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0) {
+      const wordCount = splitWords(info.newTranscript, true).length;
+      if (wordCount < this.agentSession.options.minInterruptionWords) {
+        this.cancelPreemptiveGeneration();
+        this.logger.info(
+          {
+            wordCount,
+            minInterruptionWords: this.agentSession.options.minInterruptionWords
+          },
+          "skipping user input, word count below minimum interruption threshold"
+        );
+        return false;
+      }
     }
     const oldTask = this._userTurnCompletedTask;
     this._userTurnCompletedTask = this.createSpeechTask({
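Both interruption-related hunks compute a word count with splitWords(text, true) and ignore the turn when it falls below minInterruptionWords. A minimal sketch of that gate, with a simplified whitespace splitter standing in for the package's splitWords helper (the splitter body and both function names here are assumptions for illustration):

    // Simplified stand-in for the package's splitWords(text, ignorePunctuation) helper.
    function splitWordsApprox(text: string): string[] {
      return text.trim().split(/\s+/).filter((w) => w.length > 0);
    }

    // A user turn only counts as an interruption once it reaches the configured
    // minimum word count; shorter fragments are ignored.
    function passesInterruptionThreshold(
      transcript: string | undefined,
      minInterruptionWords: number,
    ): boolean {
      if (minInterruptionWords <= 0) return true;
      return splitWordsApprox(transcript ?? "").length >= minInterruptionWords;
    }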
@@ -1197,7 +1213,22 @@ ${instructions}` : instructions,
           );
           break;
         }
-        const
+        const msgModalities = msg.modalities ? await msg.modalities : void 0;
+        let ttsTextInput = null;
+        let trTextInput;
+        if (msgModalities && !msgModalities.includes("audio") && this.tts) {
+          if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
+            this.logger.warn(
+              "text response received from realtime API, falling back to use a TTS model."
+            );
+          }
+          const [_ttsTextInput, _trTextInput] = msg.textStream.tee();
+          ttsTextInput = _ttsTextInput;
+          trTextInput = _trTextInput;
+        } else {
+          trTextInput = msg.textStream;
+        }
+        const trNodeResult = await this.agent.transcriptionNode(trTextInput, modelSettings);
         let textOut = null;
         if (trNodeResult) {
           const [textForwardTask, _textOut] = performTextForwarding(
@@ -1210,28 +1241,44 @@ ${instructions}` : instructions,
         }
         let audioOut = null;
         if (audioOutput) {
-
-
-
-
-
+          let realtimeAudioResult = null;
+          if (ttsTextInput) {
+            const [ttsTask, ttsStream] = performTTSInference(
+              (...args) => this.agent.ttsNode(...args),
+              ttsTextInput,
+              modelSettings,
+              abortController
+            );
+            tasks.push(ttsTask);
+            realtimeAudioResult = ttsStream;
+          } else if (msgModalities && msgModalities.includes("audio")) {
+            realtimeAudioResult = await this.agent.realtimeAudioOutputNode(
+              msg.audioStream,
+              modelSettings
+            );
+          } else if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
+            this.logger.error(
+              "Text message received from Realtime API with audio modality. This usually happens when text chat context is synced to the API. Try to add a TTS model as fallback or use text modality with TTS instead."
+            );
+          } else {
+            this.logger.warn(
+              "audio output is enabled but neither tts nor realtime audio is available"
+            );
+          }
+          if (realtimeAudioResult) {
             const [forwardTask, _audioOut] = performAudioForwarding(
-
+              realtimeAudioResult,
               audioOutput,
               abortController
             );
             forwardTasks.push(forwardTask);
             audioOut = _audioOut;
             audioOut.firstFrameFut.await.finally(onFirstFrame);
-          } else {
-            this.logger.warn(
-              "audio output is enabled but neither tts nor realtime audio is available"
-            );
           }
         } else if (textOut) {
           textOut.firstTextFut.await.finally(onFirstFrame);
         }
-        outputs.push([msg.messageId, textOut, audioOut]);
+        outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
       }
       await waitFor(forwardTasks);
     } catch (error) {
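When the realtime model returns a text-only response and a TTS model is configured, the new code splits msg.textStream with tee() so one branch drives TTS and the other drives transcription. A minimal sketch of that pattern using the standard Web Streams API (the consumer callbacks are placeholders, not the package's node functions):

    // tee() produces two independent streams over the same underlying chunks,
    // so TTS and transcription can consume the text concurrently.
    async function drain(
      stream: ReadableStream<string>,
      onChunk: (chunk: string) => void,
    ): Promise<void> {
      const reader = stream.getReader();
      try {
        while (true) {
          const { done, value } = await reader.read();
          if (done) break;
          onChunk(value);
        }
      } finally {
        reader.releaseLock();
      }
    }

    async function consumeTextResponse(textStream: ReadableStream<string>): Promise<void> {
      const [forTts, forTranscript] = textStream.tee();
      await Promise.all([
        drain(forTts, (chunk) => { /* feed the TTS pipeline */ }),
        drain(forTranscript, (chunk) => { /* forward to transcription output */ }),
      ]);
    }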
@@ -1301,7 +1348,7 @@ ${instructions}` : instructions,
       replyAbortController.abort();
       await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
       if (messageOutputs.length > 0) {
-        const [msgId, textOut, audioOut] = messageOutputs[0];
+        const [msgId, textOut, audioOut, msgModalities] = messageOutputs[0];
         let forwardedText = (textOut == null ? void 0 : textOut.text) || "";
         if (audioOutput) {
           audioOutput.clearBuffer();
@@ -1321,7 +1368,9 @@ ${instructions}` : instructions,
           }
           this.realtimeSession.truncate({
             messageId: msgId,
-            audioEndMs: Math.floor(playbackPosition)
+            audioEndMs: Math.floor(playbackPosition),
+            modalities: msgModalities,
+            audioTranscript: forwardedText
           });
         }
         if (forwardedText) {
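The truncate() call now also passes the message modalities and the text that was actually forwarded. Together with the dist/llm/realtime.d.ts +4 entry in the file list, this suggests the truncate options gained two optional fields; a sketch of the assumed shape, inferred from this call site rather than copied from the package's declarations:

    // Assumed shape, inferred from the call site above; the field optionality
    // and the modality union are guesses for illustration.
    interface RealtimeTruncateOptionsSketch {
      messageId: string;
      audioEndMs: number;
      modalities?: ("text" | "audio")[];
      audioTranscript?: string;
    }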
@@ -1345,7 +1394,7 @@ ${instructions}` : instructions,
           return;
         }
         if (messageOutputs.length > 0) {
-          const [msgId, textOut, _] = messageOutputs[0];
+          const [msgId, textOut, _, __] = messageOutputs[0];
           const message = ChatMessage.create({
             role: "assistant",
             content: (textOut == null ? void 0 : textOut.text) || "",