@livekit/agents 0.5.2 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +3 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/job.cjs.map +1 -1
- package/dist/job.js.map +1 -1
- package/dist/llm/index.cjs +2 -0
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.ts +1 -1
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js +2 -0
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/llm.cjs +47 -3
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.ts +15 -2
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js +46 -3
- package/dist/llm/llm.js.map +1 -1
- package/dist/metrics/base.cjs +44 -0
- package/dist/metrics/base.cjs.map +1 -0
- package/dist/metrics/base.d.ts +96 -0
- package/dist/metrics/base.d.ts.map +1 -0
- package/dist/metrics/base.js +20 -0
- package/dist/metrics/base.js.map +1 -0
- package/dist/metrics/index.cjs +35 -0
- package/dist/metrics/index.cjs.map +1 -0
- package/dist/metrics/index.d.ts +5 -0
- package/dist/metrics/index.d.ts.map +1 -0
- package/dist/metrics/index.js +9 -0
- package/dist/metrics/index.js.map +1 -0
- package/dist/metrics/usage_collector.cjs +53 -0
- package/dist/metrics/usage_collector.cjs.map +1 -0
- package/dist/metrics/usage_collector.d.ts +14 -0
- package/dist/metrics/usage_collector.d.ts.map +1 -0
- package/dist/metrics/usage_collector.js +29 -0
- package/dist/metrics/usage_collector.js.map +1 -0
- package/dist/metrics/utils.cjs +104 -0
- package/dist/metrics/utils.cjs.map +1 -0
- package/dist/metrics/utils.d.ts +10 -0
- package/dist/metrics/utils.d.ts.map +1 -0
- package/dist/metrics/utils.js +73 -0
- package/dist/metrics/utils.js.map +1 -0
- package/dist/multimodal/multimodal_agent.cjs +34 -16
- package/dist/multimodal/multimodal_agent.cjs.map +1 -1
- package/dist/multimodal/multimodal_agent.d.ts +4 -5
- package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
- package/dist/multimodal/multimodal_agent.js +34 -16
- package/dist/multimodal/multimodal_agent.js.map +1 -1
- package/dist/pipeline/index.cjs +2 -0
- package/dist/pipeline/index.cjs.map +1 -1
- package/dist/pipeline/index.d.ts +1 -1
- package/dist/pipeline/index.d.ts.map +1 -1
- package/dist/pipeline/index.js +3 -1
- package/dist/pipeline/index.js.map +1 -1
- package/dist/pipeline/pipeline_agent.cjs +166 -66
- package/dist/pipeline/pipeline_agent.cjs.map +1 -1
- package/dist/pipeline/pipeline_agent.d.ts +10 -4
- package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
- package/dist/pipeline/pipeline_agent.js +169 -69
- package/dist/pipeline/pipeline_agent.js.map +1 -1
- package/dist/pipeline/speech_handle.cjs +49 -1
- package/dist/pipeline/speech_handle.cjs.map +1 -1
- package/dist/pipeline/speech_handle.d.ts +12 -2
- package/dist/pipeline/speech_handle.d.ts.map +1 -1
- package/dist/pipeline/speech_handle.js +50 -2
- package/dist/pipeline/speech_handle.js.map +1 -1
- package/dist/stt/index.cjs.map +1 -1
- package/dist/stt/index.d.ts +1 -1
- package/dist/stt/index.d.ts.map +1 -1
- package/dist/stt/index.js.map +1 -1
- package/dist/stt/stream_adapter.cjs +15 -5
- package/dist/stt/stream_adapter.cjs.map +1 -1
- package/dist/stt/stream_adapter.d.ts +4 -1
- package/dist/stt/stream_adapter.d.ts.map +1 -1
- package/dist/stt/stream_adapter.js +15 -5
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/stt/stt.cjs +46 -2
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.ts +25 -3
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +46 -2
- package/dist/stt/stt.js.map +1 -1
- package/dist/tts/index.cjs +4 -2
- package/dist/tts/index.cjs.map +1 -1
- package/dist/tts/index.d.ts +1 -1
- package/dist/tts/index.d.ts.map +1 -1
- package/dist/tts/index.js +3 -1
- package/dist/tts/index.js.map +1 -1
- package/dist/tts/stream_adapter.cjs +14 -3
- package/dist/tts/stream_adapter.cjs.map +1 -1
- package/dist/tts/stream_adapter.d.ts +3 -0
- package/dist/tts/stream_adapter.d.ts.map +1 -1
- package/dist/tts/stream_adapter.js +15 -4
- package/dist/tts/stream_adapter.js.map +1 -1
- package/dist/tts/tts.cjs +109 -6
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.ts +24 -1
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +107 -5
- package/dist/tts/tts.js.map +1 -1
- package/dist/utils.cjs +11 -4
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +11 -4
- package/dist/utils.js.map +1 -1
- package/dist/vad.cjs +43 -2
- package/dist/vad.cjs.map +1 -1
- package/dist/vad.d.ts +21 -4
- package/dist/vad.d.ts.map +1 -1
- package/dist/vad.js +43 -2
- package/dist/vad.js.map +1 -1
- package/dist/worker.cjs +5 -2
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +5 -2
- package/dist/worker.js.map +1 -1
- package/package.json +3 -3
- package/src/index.ts +2 -1
- package/src/job.ts +3 -3
- package/src/llm/index.ts +2 -0
- package/src/llm/llm.ts +55 -3
- package/src/metrics/base.ts +127 -0
- package/src/metrics/index.ts +20 -0
- package/src/metrics/usage_collector.ts +40 -0
- package/src/metrics/utils.ts +100 -0
- package/src/multimodal/multimodal_agent.ts +57 -23
- package/src/pipeline/index.ts +1 -1
- package/src/pipeline/pipeline_agent.ts +208 -89
- package/src/pipeline/speech_handle.ts +67 -2
- package/src/stt/index.ts +2 -0
- package/src/stt/stream_adapter.ts +17 -5
- package/src/stt/stt.ts +67 -3
- package/src/tts/index.ts +2 -0
- package/src/tts/stream_adapter.ts +17 -4
- package/src/tts/tts.ts +127 -4
- package/src/utils.ts +12 -4
- package/src/vad.ts +61 -4
- package/src/worker.ts +7 -3
|
@@ -43,11 +43,13 @@ var import_stt = require("../stt/index.cjs");
|
|
|
43
43
|
var import_basic = require("../tokenize/basic/index.cjs");
|
|
44
44
|
var import_tts = require("../tts/index.cjs");
|
|
45
45
|
var import_utils = require("../utils.cjs");
|
|
46
|
+
var import_vad = require("../vad.cjs");
|
|
46
47
|
var import_agent_output = require("./agent_output.cjs");
|
|
47
48
|
var import_agent_playout = require("./agent_playout.cjs");
|
|
48
49
|
var import_human_input = require("./human_input.cjs");
|
|
49
50
|
var import_speech_handle = require("./speech_handle.cjs");
|
|
50
51
|
const AGENT_STATE_ATTRIBUTE = "lk.agent.state";
|
|
52
|
+
let speechData;
|
|
51
53
|
var VPAEvent = /* @__PURE__ */ ((VPAEvent2) => {
|
|
52
54
|
VPAEvent2[VPAEvent2["USER_STARTED_SPEAKING"] = 0] = "USER_STARTED_SPEAKING";
|
|
53
55
|
VPAEvent2[VPAEvent2["USER_STOPPED_SPEAKING"] = 1] = "USER_STOPPED_SPEAKING";
|
|
@@ -58,12 +60,14 @@ var VPAEvent = /* @__PURE__ */ ((VPAEvent2) => {
|
|
|
58
60
|
VPAEvent2[VPAEvent2["AGENT_SPEECH_INTERRUPTED"] = 6] = "AGENT_SPEECH_INTERRUPTED";
|
|
59
61
|
VPAEvent2[VPAEvent2["FUNCTION_CALLS_COLLECTED"] = 7] = "FUNCTION_CALLS_COLLECTED";
|
|
60
62
|
VPAEvent2[VPAEvent2["FUNCTION_CALLS_FINISHED"] = 8] = "FUNCTION_CALLS_FINISHED";
|
|
63
|
+
VPAEvent2[VPAEvent2["METRICS_COLLECTED"] = 9] = "METRICS_COLLECTED";
|
|
61
64
|
return VPAEvent2;
|
|
62
65
|
})(VPAEvent || {});
|
|
63
66
|
class AgentCallContext {
|
|
64
67
|
#agent;
|
|
65
68
|
#llmStream;
|
|
66
69
|
#metadata = /* @__PURE__ */ new Map();
|
|
70
|
+
#extraChatMessages = [];
|
|
67
71
|
static #current;
|
|
68
72
|
constructor(agent, llmStream) {
|
|
69
73
|
this.#agent = agent;
|
|
@@ -85,6 +89,12 @@ class AgentCallContext {
|
|
|
85
89
|
get llmStream() {
|
|
86
90
|
return this.#llmStream;
|
|
87
91
|
}
|
|
92
|
+
get extraChatMessages() {
|
|
93
|
+
return this.#extraChatMessages;
|
|
94
|
+
}
|
|
95
|
+
addExtraChatMessage(message) {
|
|
96
|
+
this.#extraChatMessages.push(message);
|
|
97
|
+
}
|
|
88
98
|
}
|
|
89
99
|
const defaultBeforeLLMCallback = (agent, chatCtx) => {
|
|
90
100
|
return agent.llm.chat({ chatCtx, fncCtx: agent.fncCtx });
|
|
@@ -106,7 +116,7 @@ const defaultVPAOptions = {
|
|
|
106
116
|
interruptSpeechDuration: 50,
|
|
107
117
|
interruptMinWords: 0,
|
|
108
118
|
minEndpointingDelay: 500,
|
|
109
|
-
|
|
119
|
+
maxNestedFncCalls: 1,
|
|
110
120
|
preemptiveSynthesis: false,
|
|
111
121
|
beforeLLMCallback: defaultBeforeLLMCallback,
|
|
112
122
|
beforeTTSCallback: defaultBeforeTTSCallback,
|
|
@@ -131,7 +141,6 @@ class VoicePipelineAgent extends import_node_events.default {
|
|
|
131
141
|
#transcribedInterimText = "";
|
|
132
142
|
#speechQueueOpen = new import_utils.Future();
|
|
133
143
|
#speechQueue = new import_utils.AsyncIterableQueue();
|
|
134
|
-
#lastEndOfSpeechTime;
|
|
135
144
|
#updateStateTask;
|
|
136
145
|
#started = false;
|
|
137
146
|
#room;
|
|
@@ -139,6 +148,8 @@ class VoicePipelineAgent extends import_node_events.default {
|
|
|
139
148
|
#deferredValidation;
|
|
140
149
|
#logger = (0, import_log.log)();
|
|
141
150
|
#agentPublication;
|
|
151
|
+
#lastFinalTranscriptTime;
|
|
152
|
+
#lastSpeechTime;
|
|
142
153
|
constructor(vad, stt, llm, tts, opts = defaultVPAOptions) {
|
|
143
154
|
super();
|
|
144
155
|
this.#opts = { ...defaultVPAOptions, ...opts };
|
|
@@ -183,6 +194,20 @@ class VoicePipelineAgent extends import_node_events.default {
|
|
|
183
194
|
if (this.#started) {
|
|
184
195
|
throw new Error("voice assistant already started");
|
|
185
196
|
}
|
|
197
|
+
this.#stt.on(import_stt.SpeechEventType.METRICS_COLLECTED, (metrics) => {
|
|
198
|
+
this.emit(9 /* METRICS_COLLECTED */, metrics);
|
|
199
|
+
});
|
|
200
|
+
this.#tts.on(import_tts.TTSEvent.METRICS_COLLECTED, (metrics) => {
|
|
201
|
+
if (!speechData) return;
|
|
202
|
+
this.emit(9 /* METRICS_COLLECTED */, { ...metrics, sequenceId: speechData.sequenceId });
|
|
203
|
+
});
|
|
204
|
+
this.#llm.on(import_llm.LLMEvent.METRICS_COLLECTED, (metrics) => {
|
|
205
|
+
if (!speechData) return;
|
|
206
|
+
this.emit(9 /* METRICS_COLLECTED */, { ...metrics, sequenceId: speechData.sequenceId });
|
|
207
|
+
});
|
|
208
|
+
this.#vad.on(import_vad.VADEventType.METRICS_COLLECTED, (metrics) => {
|
|
209
|
+
this.emit(9 /* METRICS_COLLECTED */, metrics);
|
|
210
|
+
});
|
|
186
211
|
room.on(import_rtc_node.RoomEvent.ParticipantConnected, (participant2) => {
|
|
187
212
|
if (this.#participant) {
|
|
188
213
|
return;
|
|
@@ -203,10 +228,43 @@ class VoicePipelineAgent extends import_node_events.default {
|
|
|
203
228
|
/** Play a speech source through the voice assistant. */
|
|
204
229
|
async say(source, allowInterruptions = true, addToChatCtx = true) {
|
|
205
230
|
await this.#trackPublishedFut.await;
|
|
231
|
+
let callContext;
|
|
232
|
+
let fncSource;
|
|
233
|
+
if (addToChatCtx) {
|
|
234
|
+
callContext = AgentCallContext.getCurrent();
|
|
235
|
+
if (source instanceof import_llm.LLMStream) {
|
|
236
|
+
this.#logger.warn("LLMStream will be ignored for function call chat context");
|
|
237
|
+
} else if (typeof source === "string") {
|
|
238
|
+
fncSource = source;
|
|
239
|
+
} else {
|
|
240
|
+
fncSource = source;
|
|
241
|
+
source = new import_utils.AsyncIterableQueue();
|
|
242
|
+
}
|
|
243
|
+
}
|
|
206
244
|
const newHandle = import_speech_handle.SpeechHandle.createAssistantSpeech(allowInterruptions, addToChatCtx);
|
|
207
245
|
const synthesisHandle = this.#synthesizeAgentSpeech(newHandle.id, source);
|
|
208
246
|
newHandle.initialize(source, synthesisHandle);
|
|
209
|
-
this.#
|
|
247
|
+
if (this.#playingSpeech && !this.#playingSpeech.nestedSpeechFinished) {
|
|
248
|
+
this.#playingSpeech.addNestedSpeech(newHandle);
|
|
249
|
+
} else {
|
|
250
|
+
this.#addSpeechForPlayout(newHandle);
|
|
251
|
+
}
|
|
252
|
+
if (callContext && fncSource) {
|
|
253
|
+
let text;
|
|
254
|
+
if (typeof source === "string") {
|
|
255
|
+
text = fncSource;
|
|
256
|
+
} else {
|
|
257
|
+
text = "";
|
|
258
|
+
for await (const chunk of fncSource) {
|
|
259
|
+
source.put(chunk);
|
|
260
|
+
text += chunk;
|
|
261
|
+
}
|
|
262
|
+
source.close();
|
|
263
|
+
}
|
|
264
|
+
callContext.addExtraChatMessage(import_llm2.ChatMessage.create({ text, role: import_llm2.ChatRole.ASSISTANT }));
|
|
265
|
+
this.#logger.child({ text }).debug("added speech to function call chat context");
|
|
266
|
+
}
|
|
267
|
+
return newHandle;
|
|
210
268
|
}
|
|
211
269
|
#updateState(state, delay = 0) {
|
|
212
270
|
const runTask = (delay2) => {
|
|
@@ -260,11 +318,13 @@ class VoicePipelineAgent extends import_node_events.default {
|
|
|
260
318
|
if (event.speechDuration >= this.#opts.interruptSpeechDuration) {
|
|
261
319
|
this.#interruptIfPossible();
|
|
262
320
|
}
|
|
321
|
+
if (event.rawAccumulatedSpeech > 0) {
|
|
322
|
+
this.#lastSpeechTime = Date.now() - event.rawAccumulatedSilence;
|
|
323
|
+
}
|
|
263
324
|
});
|
|
264
325
|
this.#humanInput.on(import_human_input.HumanInputEvent.END_OF_SPEECH, (event) => {
|
|
265
326
|
this.emit(0 /* USER_STARTED_SPEAKING */);
|
|
266
327
|
this.#deferredValidation.onHumanEndOfSpeech(event);
|
|
267
|
-
this.#lastEndOfSpeechTime = Date.now();
|
|
268
328
|
});
|
|
269
329
|
this.#humanInput.on(import_human_input.HumanInputEvent.INTERIM_TRANSCRIPT, (event) => {
|
|
270
330
|
this.#transcribedInterimText = event.alternatives[0].text;
|
|
@@ -272,7 +332,7 @@ class VoicePipelineAgent extends import_node_events.default {
|
|
|
272
332
|
this.#humanInput.on(import_human_input.HumanInputEvent.FINAL_TRANSCRIPT, (event) => {
|
|
273
333
|
const newTranscript = event.alternatives[0].text;
|
|
274
334
|
if (!newTranscript) return;
|
|
275
|
-
this.#
|
|
335
|
+
this.#lastFinalTranscriptTime = Date.now();
|
|
276
336
|
this.#transcribedText += (this.#transcribedText ? " " : "") + newTranscript;
|
|
277
337
|
if (this.#opts.preemptiveSynthesis && (!this.#playingSpeech || this.#playingSpeech.allowInterruptions)) {
|
|
278
338
|
this.#synthesizeAgentReply();
|
|
@@ -356,23 +416,26 @@ class VoicePipelineAgent extends import_node_events.default {
|
|
|
356
416
|
role: import_llm2.ChatRole.USER
|
|
357
417
|
})
|
|
358
418
|
);
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
419
|
+
speechData = { sequenceId: handle.id };
|
|
420
|
+
try {
|
|
421
|
+
if (cancelled) resolve();
|
|
422
|
+
let llmStream = await this.#opts.beforeLLMCallback(this, copiedCtx);
|
|
423
|
+
if (llmStream === false) {
|
|
424
|
+
handle == null ? void 0 : handle.cancel();
|
|
425
|
+
return;
|
|
426
|
+
}
|
|
427
|
+
if (cancelled) resolve();
|
|
428
|
+
if (!(llmStream instanceof import_llm.LLMStream)) {
|
|
429
|
+
llmStream = await defaultBeforeLLMCallback(this, copiedCtx);
|
|
430
|
+
}
|
|
431
|
+
if (handle.interrupted) {
|
|
432
|
+
return;
|
|
433
|
+
}
|
|
434
|
+
const synthesisHandle = this.#synthesizeAgentSpeech(handle.id, llmStream);
|
|
435
|
+
handle.initialize(llmStream, synthesisHandle);
|
|
436
|
+
} finally {
|
|
437
|
+
speechData = void 0;
|
|
371
438
|
}
|
|
372
|
-
const synthesisHandle = this.#synthesizeAgentSpeech(handle.id, llmStream);
|
|
373
|
-
handle.initialize(llmStream, synthesisHandle);
|
|
374
|
-
const elapsed = !!this.#lastEndOfSpeechTime ? Math.round((Date.now() - this.#lastEndOfSpeechTime) * 1e3) / 1e3 : -1;
|
|
375
|
-
this.#logger.child({ speechId: handle.id, elapsed }).debug("synthesizing agent reply");
|
|
376
439
|
resolve();
|
|
377
440
|
});
|
|
378
441
|
}
|
|
@@ -414,59 +477,81 @@ class VoicePipelineAgent extends import_node_events.default {
|
|
|
414
477
|
commitUserQuestionIfNeeded();
|
|
415
478
|
const collectedText = handle.synthesisHandle.text;
|
|
416
479
|
const isUsingTools = handle.source instanceof import_llm.LLMStream && !!handle.source.functionCalls.length;
|
|
417
|
-
const
|
|
418
|
-
|
|
419
|
-
|
|
480
|
+
const interrupted = handle.interrupted;
|
|
481
|
+
const executeFunctionCalls = async () => {
|
|
482
|
+
if (!isUsingTools || interrupted) return;
|
|
483
|
+
if (handle.fncNestedDepth >= this.#opts.maxNestedFncCalls) {
|
|
484
|
+
this.#logger.child({ speechId: handle.id, fncNestedDepth: handle.fncNestedDepth }).warn("max function calls nested depth reached");
|
|
485
|
+
return;
|
|
486
|
+
}
|
|
420
487
|
if (!userQuestion || !handle.userCommitted) {
|
|
421
488
|
throw new Error("user speech should have been committed before using tools");
|
|
422
489
|
}
|
|
423
490
|
const llmStream = handle.source;
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
}
|
|
440
|
-
}
|
|
441
|
-
const toolCallsInfo = [];
|
|
442
|
-
const toolCallsResults = [];
|
|
443
|
-
for (const fnc of calledFuncs) {
|
|
444
|
-
const task = await fnc.task;
|
|
445
|
-
if (!task || task.result === void 0) continue;
|
|
446
|
-
toolCallsInfo.push(fnc);
|
|
447
|
-
toolCallsResults.push(import_llm2.ChatMessage.createToolFromFunctionResult(task));
|
|
491
|
+
const newFunctionCalls = llmStream.functionCalls;
|
|
492
|
+
new AgentCallContext(this, llmStream);
|
|
493
|
+
this.emit(7 /* FUNCTION_CALLS_COLLECTED */, newFunctionCalls);
|
|
494
|
+
const calledFuncs = [];
|
|
495
|
+
for (const func of newFunctionCalls) {
|
|
496
|
+
const task2 = func.func.execute(func.params).then(
|
|
497
|
+
(result) => ({ name: func.name, toolCallId: func.toolCallId, result }),
|
|
498
|
+
(error) => ({ name: func.name, toolCallId: func.toolCallId, error })
|
|
499
|
+
);
|
|
500
|
+
calledFuncs.push({ ...func, task: task2 });
|
|
501
|
+
this.#logger.child({ function: func.name, speechId: handle.id }).debug("executing AI function");
|
|
502
|
+
try {
|
|
503
|
+
await task2;
|
|
504
|
+
} catch {
|
|
505
|
+
this.#logger.child({ function: func.name, speechId: handle.id }).error("error executing AI function");
|
|
448
506
|
}
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
507
|
+
}
|
|
508
|
+
const toolCallsInfo = [];
|
|
509
|
+
const toolCallsResults = [];
|
|
510
|
+
for (const fnc of calledFuncs) {
|
|
511
|
+
const task2 = await fnc.task;
|
|
512
|
+
if (!task2 || task2.result === void 0) continue;
|
|
513
|
+
toolCallsInfo.push(fnc);
|
|
514
|
+
toolCallsResults.push(import_llm2.ChatMessage.createToolFromFunctionResult(task2));
|
|
515
|
+
}
|
|
516
|
+
if (!toolCallsInfo.length) return;
|
|
517
|
+
const extraToolsMessages = [import_llm2.ChatMessage.createToolCalls(toolCallsInfo, collectedText)];
|
|
518
|
+
extraToolsMessages.push(...toolCallsResults);
|
|
519
|
+
const newSpeechHandle = import_speech_handle.SpeechHandle.createToolSpeech(
|
|
520
|
+
handle.allowInterruptions,
|
|
521
|
+
handle.addToChatCtx,
|
|
522
|
+
handle.fncNestedDepth + 1,
|
|
523
|
+
extraToolsMessages
|
|
524
|
+
);
|
|
525
|
+
const chatCtx = handle.source.chatCtx.copy();
|
|
526
|
+
chatCtx.messages.push(...extraToolsMessages);
|
|
527
|
+
chatCtx.messages.push(...AgentCallContext.getCurrent().extraChatMessages);
|
|
528
|
+
const answerLLMStream = this.llm.chat({
|
|
529
|
+
chatCtx,
|
|
530
|
+
fncCtx: this.fncCtx
|
|
531
|
+
});
|
|
532
|
+
const answerSynthesis = this.#synthesizeAgentSpeech(newSpeechHandle.id, answerLLMStream);
|
|
533
|
+
newSpeechHandle.initialize(answerLLMStream, answerSynthesis);
|
|
534
|
+
handle.addNestedSpeech(newSpeechHandle);
|
|
535
|
+
this.emit(8 /* FUNCTION_CALLS_FINISHED */, calledFuncs);
|
|
536
|
+
};
|
|
537
|
+
const task = executeFunctionCalls().then(() => {
|
|
538
|
+
handle.markNestedSpeechFinished();
|
|
539
|
+
});
|
|
540
|
+
while (!handle.nestedSpeechFinished) {
|
|
541
|
+
const changed = handle.nestedSpeechChanged();
|
|
542
|
+
await Promise.race([changed, task]);
|
|
543
|
+
while (handle.nestedSpeechHandles.length) {
|
|
544
|
+
const speech = handle.nestedSpeechHandles[0];
|
|
545
|
+
this.#playingSpeech = speech;
|
|
546
|
+
await this.#playSpeech(speech);
|
|
547
|
+
handle.nestedSpeechHandles.shift();
|
|
548
|
+
this.#playingSpeech = handle;
|
|
466
549
|
}
|
|
467
550
|
}
|
|
468
551
|
if (handle.addToChatCtx && (!userQuestion || handle.userCommitted)) {
|
|
469
|
-
|
|
552
|
+
if (handle.extraToolsMessages) {
|
|
553
|
+
this.chatCtx.messages.push(...handle.extraToolsMessages);
|
|
554
|
+
}
|
|
470
555
|
if (interrupted) {
|
|
471
556
|
collectedText + "\u2026";
|
|
472
557
|
}
|
|
@@ -483,6 +568,7 @@ class VoicePipelineAgent extends import_node_events.default {
|
|
|
483
568
|
interrupted,
|
|
484
569
|
speechId: handle.id
|
|
485
570
|
}).debug("committed agent speech");
|
|
571
|
+
handle.setDone();
|
|
486
572
|
}
|
|
487
573
|
}
|
|
488
574
|
#synthesizeAgentSpeech(speechId, source) {
|
|
@@ -523,6 +609,20 @@ class VoicePipelineAgent extends import_node_events.default {
|
|
|
523
609
|
}
|
|
524
610
|
}
|
|
525
611
|
this.#logger.child({ speechId: this.#pendingAgentReply.id }).debug("validated agent reply");
|
|
612
|
+
if (this.#lastSpeechTime) {
|
|
613
|
+
const timeSinceLastSpeech = Date.now() - this.#lastSpeechTime;
|
|
614
|
+
const transcriptionDelay = Math.max(
|
|
615
|
+
(this.#lastFinalTranscriptTime || 0) - this.#lastSpeechTime,
|
|
616
|
+
0
|
|
617
|
+
);
|
|
618
|
+
const metrics = {
|
|
619
|
+
timestamp: Date.now(),
|
|
620
|
+
sequenceId: this.#pendingAgentReply.id,
|
|
621
|
+
endOfUtteranceDelay: timeSinceLastSpeech,
|
|
622
|
+
transcriptionDelay
|
|
623
|
+
};
|
|
624
|
+
this.emit(9 /* METRICS_COLLECTED */, metrics);
|
|
625
|
+
}
|
|
526
626
|
this.#addSpeechForPlayout(this.#pendingAgentReply);
|
|
527
627
|
this.#pendingAgentReply = void 0;
|
|
528
628
|
this.#transcribedInterimText = "";
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/pipeline/pipeline_agent.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { LocalTrackPublication, RemoteParticipant, Room } from '@livekit/rtc-node';\nimport {\n AudioSource,\n LocalAudioTrack,\n RoomEvent,\n TrackPublishOptions,\n TrackSource,\n} from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport EventEmitter from 'node:events';\nimport type {\n CallableFunctionResult,\n FunctionCallInfo,\n FunctionContext,\n LLM,\n} from '../llm/index.js';\nimport { LLMStream } from '../llm/index.js';\nimport { ChatContext, ChatMessage, ChatRole } from '../llm/index.js';\nimport { log } from '../log.js';\nimport { type STT, StreamAdapter as STTStreamAdapter } from '../stt/index.js';\nimport {\n SentenceTokenizer as BasicSentenceTokenizer,\n WordTokenizer as BasicWordTokenizer,\n hyphenateWord,\n} from '../tokenize/basic/index.js';\nimport type { SentenceTokenizer, WordTokenizer } from '../tokenize/tokenizer.js';\nimport type { TTS } from '../tts/index.js';\nimport { StreamAdapter as TTSStreamAdapter } from '../tts/index.js';\nimport { AsyncIterableQueue, CancellablePromise, Future, gracefullyCancel } from '../utils.js';\nimport type { VAD, VADEvent } from '../vad.js';\nimport type { SpeechSource, SynthesisHandle } from './agent_output.js';\nimport { AgentOutput } from './agent_output.js';\nimport { AgentPlayout, AgentPlayoutEvent } from './agent_playout.js';\nimport { HumanInput, HumanInputEvent } from './human_input.js';\nimport { SpeechHandle } from './speech_handle.js';\n\nexport type AgentState = 'initializing' | 'thinking' | 'listening' | 'speaking';\nexport const AGENT_STATE_ATTRIBUTE = 'lk.agent.state';\n\nexport type BeforeLLMCallback = (\n agent: VoicePipelineAgent,\n chatCtx: ChatContext,\n) => LLMStream | false | void | Promise<LLMStream | false | void>;\n\nexport type BeforeTTSCallback = (\n agent: VoicePipelineAgent,\n source: string | AsyncIterable<string>,\n) => SpeechSource;\n\nexport enum VPAEvent {\n USER_STARTED_SPEAKING,\n USER_STOPPED_SPEAKING,\n AGENT_STARTED_SPEAKING,\n AGENT_STOPPED_SPEAKING,\n USER_SPEECH_COMMITTED,\n AGENT_SPEECH_COMMITTED,\n AGENT_SPEECH_INTERRUPTED,\n FUNCTION_CALLS_COLLECTED,\n FUNCTION_CALLS_FINISHED,\n}\n\nexport type VPACallbacks = {\n [VPAEvent.USER_STARTED_SPEAKING]: () => void;\n [VPAEvent.USER_STOPPED_SPEAKING]: () => void;\n [VPAEvent.AGENT_STARTED_SPEAKING]: () => void;\n [VPAEvent.AGENT_STOPPED_SPEAKING]: () => void;\n [VPAEvent.USER_SPEECH_COMMITTED]: (msg: ChatMessage) => void;\n [VPAEvent.AGENT_SPEECH_COMMITTED]: (msg: ChatMessage) => void;\n [VPAEvent.AGENT_SPEECH_INTERRUPTED]: (msg: ChatMessage) => void;\n [VPAEvent.FUNCTION_CALLS_COLLECTED]: (funcs: FunctionCallInfo[]) => void;\n [VPAEvent.FUNCTION_CALLS_FINISHED]: (funcs: CallableFunctionResult[]) => void;\n};\n\nexport class AgentCallContext {\n #agent: VoicePipelineAgent;\n #llmStream: LLMStream;\n #metadata = new Map<string, any>();\n static #current: AgentCallContext;\n\n constructor(agent: VoicePipelineAgent, llmStream: LLMStream) {\n this.#agent = agent;\n this.#llmStream = llmStream;\n AgentCallContext.#current = this;\n }\n\n static getCurrent(): AgentCallContext {\n return AgentCallContext.#current;\n }\n\n get agent(): VoicePipelineAgent {\n return this.#agent;\n }\n\n storeMetadata(key: string, value: any) {\n this.#metadata.set(key, value);\n }\n\n getMetadata(key: string, orDefault: any = undefined) {\n return this.#metadata.get(key) || orDefault;\n }\n\n get llmStream(): LLMStream {\n return this.#llmStream;\n }\n}\n\nconst defaultBeforeLLMCallback: BeforeLLMCallback = (\n agent: VoicePipelineAgent,\n chatCtx: ChatContext,\n): LLMStream => {\n return agent.llm.chat({ chatCtx, fncCtx: agent.fncCtx });\n};\n\nconst defaultBeforeTTSCallback: BeforeTTSCallback = (\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n _: VoicePipelineAgent,\n text: string | AsyncIterable<string>,\n): string | AsyncIterable<string> => {\n return text;\n};\n\nexport interface AgentTranscriptionOptions {\n /** Whether to forward the user transcription to the client */\n userTranscription: boolean;\n /** Whether to forward the agent transcription to the client */\n agentTranscription: boolean;\n /**\n * The speed at which the agent's speech transcription is forwarded to the client.\n * We try to mimic the agent's speech speed by adjusting the transcription speed.\n */\n agentTranscriptionSpeech: number;\n /**\n * The tokenizer used to split the speech into sentences.\n * This is used to decide when to mark a transcript as final for the agent transcription.\n */\n sentenceTokenizer: SentenceTokenizer;\n /**\n * The tokenizer used to split the speech into words.\n * This is used to simulate the \"interim results\" of the agent transcription.\n */\n wordTokenizer: WordTokenizer;\n /**\n * A function that takes a string (word) as input and returns a list of strings,\n * representing the hyphenated parts of the word.\n */\n hyphenateWord: (word: string) => string[];\n}\n\nconst defaultAgentTranscriptionOptions: AgentTranscriptionOptions = {\n userTranscription: true,\n agentTranscription: true,\n agentTranscriptionSpeech: 1,\n sentenceTokenizer: new BasicSentenceTokenizer(),\n wordTokenizer: new BasicWordTokenizer(false),\n hyphenateWord: hyphenateWord,\n};\n\nexport interface VPAOptions {\n /** Chat context for the assistant. */\n chatCtx?: ChatContext;\n /** Function context for the assistant. */\n fncCtx?: FunctionContext;\n /** Whether to allow the user to interrupt the assistant. */\n allowInterruptions: boolean;\n /** Minimum duration of speech to consider for interruption. */\n interruptSpeechDuration: number;\n /** Minimum number of words to consider for interuption. This may increase latency. */\n interruptMinWords: number;\n /** Delay to wait before considering the user speech done. */\n minEndpointingDelay: number;\n maxRecursiveFncCalls: number;\n /* Whether to preemptively synthesize responses. */\n preemptiveSynthesis: boolean;\n /*\n * Callback called when the assistant is about to synthesize a reply.\n *\n * @remarks\n * Returning void will create a default LLM stream.\n * You can also return your own LLM stream by calling `llm.chat()`.\n * Returning `false` ill cancel the synthesis of the reply.\n */\n beforeLLMCallback: BeforeLLMCallback;\n /*\n * Callback called when the assistant is about to synthesize speech.\n *\n * @remarks\n * This can be used to customize text before synthesis\n * (e.g. editing the pronunciation of a word).\n */\n beforeTTSCallback: BeforeTTSCallback;\n /** Options for assistant transcription. */\n transcription: AgentTranscriptionOptions;\n}\n\nconst defaultVPAOptions: VPAOptions = {\n chatCtx: new ChatContext(),\n allowInterruptions: true,\n interruptSpeechDuration: 50,\n interruptMinWords: 0,\n minEndpointingDelay: 500,\n maxRecursiveFncCalls: 1,\n preemptiveSynthesis: false,\n beforeLLMCallback: defaultBeforeLLMCallback,\n beforeTTSCallback: defaultBeforeTTSCallback,\n transcription: defaultAgentTranscriptionOptions,\n};\n\n/** A pipeline agent (VAD + STT + LLM + TTS) implementation. */\nexport class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<VPACallbacks>) {\n /** Minimum time played for the user speech to be committed to the chat context. */\n readonly MIN_TIME_PLAYED_FOR_COMMIT = 1.5;\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n\n #vad: VAD;\n #stt: STT;\n #llm: LLM;\n #tts: TTS;\n #opts: VPAOptions;\n #humanInput?: HumanInput;\n #agentOutput?: AgentOutput;\n #trackPublishedFut = new Future();\n #pendingAgentReply?: SpeechHandle;\n #agentReplyTask?: CancellablePromise<void>;\n #playingSpeech?: SpeechHandle;\n #transcribedText = '';\n #transcribedInterimText = '';\n #speechQueueOpen = new Future();\n #speechQueue = new AsyncIterableQueue<SpeechHandle | typeof VoicePipelineAgent.FLUSH_SENTINEL>();\n #lastEndOfSpeechTime?: number;\n #updateStateTask?: CancellablePromise<void>;\n #started = false;\n #room?: Room;\n #participant: RemoteParticipant | string | null = null;\n #deferredValidation: DeferredReplyValidation;\n #logger = log();\n #agentPublication?: LocalTrackPublication;\n\n constructor(\n /** Voice Activity Detection instance. */\n vad: VAD,\n /** Speech-to-Text instance. */\n stt: STT,\n /** Large Language Model instance. */\n llm: LLM,\n /** Text-to-Speech instance. */\n tts: TTS,\n /** Additional VoicePipelineAgent options. */\n opts: Partial<VPAOptions> = defaultVPAOptions,\n ) {\n super();\n\n this.#opts = { ...defaultVPAOptions, ...opts };\n\n if (!stt.capabilities.streaming) {\n stt = new STTStreamAdapter(stt, vad);\n }\n\n if (!tts.capabilities.streaming) {\n tts = new TTSStreamAdapter(tts, new BasicSentenceTokenizer());\n }\n\n this.#vad = vad;\n this.#stt = stt;\n this.#llm = llm;\n this.#tts = tts;\n\n this.#deferredValidation = new DeferredReplyValidation(\n this.#validateReplyIfPossible.bind(this),\n this.#opts.minEndpointingDelay,\n );\n }\n\n get fncCtx(): FunctionContext | undefined {\n return this.#opts.fncCtx;\n }\n\n set fncCtx(ctx: FunctionContext) {\n this.#opts.fncCtx = ctx;\n }\n\n get chatCtx(): ChatContext {\n return this.#opts.chatCtx!;\n }\n\n get llm(): LLM {\n return this.#llm;\n }\n\n get tts(): TTS {\n return this.#tts;\n }\n\n get stt(): STT {\n return this.#stt;\n }\n\n get vad(): VAD {\n return this.#vad;\n }\n\n /** Start the voice assistant. */\n start(\n /** The room to connect to. */\n room: Room,\n /**\n * The participant to listen to.\n *\n * @remarks\n * Can be a participant or an identity.\n * If omitted, the first participant in the room will be selected.\n */\n participant: RemoteParticipant | string | null = null,\n ) {\n if (this.#started) {\n throw new Error('voice assistant already started');\n }\n room.on(RoomEvent.ParticipantConnected, (participant: RemoteParticipant) => {\n // automatically link to the first participant that connects, if not already linked\n if (this.#participant) {\n return;\n }\n this.#linkParticipant.call(this, participant.identity);\n });\n\n this.#room = room;\n this.#participant = participant;\n\n if (participant) {\n if (typeof participant === 'string') {\n this.#linkParticipant(participant);\n } else {\n this.#linkParticipant(participant.identity);\n }\n }\n\n this.#run();\n }\n\n /** Play a speech source through the voice assistant. */\n async say(\n source: string | LLMStream | AsyncIterable<string>,\n allowInterruptions = true,\n addToChatCtx = true,\n ) {\n await this.#trackPublishedFut.await;\n const newHandle = SpeechHandle.createAssistantSpeech(allowInterruptions, addToChatCtx);\n const synthesisHandle = this.#synthesizeAgentSpeech(newHandle.id, source);\n newHandle.initialize(source, synthesisHandle);\n this.#addSpeechForPlayout(newHandle);\n }\n\n #updateState(state: AgentState, delay = 0) {\n const runTask = (delay: number): CancellablePromise<void> => {\n return new CancellablePromise(async (resolve, _, onCancel) => {\n let cancelled = false;\n onCancel(() => {\n cancelled = true;\n });\n await new Promise((resolve) => setTimeout(resolve, delay));\n if (this.#room?.isConnected) {\n if (!cancelled) {\n await this.#room.localParticipant?.setAttributes({ [AGENT_STATE_ATTRIBUTE]: state });\n }\n }\n resolve();\n });\n };\n\n if (this.#updateStateTask) {\n this.#updateStateTask.cancel();\n }\n\n this.#updateStateTask = runTask(delay);\n }\n\n #linkParticipant(participantIdentity: string): void {\n if (!this.#room) {\n this.#logger.error('Room is not set');\n return;\n }\n\n this.#participant = this.#room.remoteParticipants.get(participantIdentity) || null;\n if (!this.#participant) {\n this.#logger.error(`Participant with identity ${participantIdentity} not found`);\n return;\n }\n\n this.#humanInput = new HumanInput(this.#room, this.#vad, this.#stt, this.#participant);\n this.#humanInput.on(HumanInputEvent.START_OF_SPEECH, (event) => {\n this.emit(VPAEvent.USER_STARTED_SPEAKING);\n this.#deferredValidation.onHumanStartOfSpeech(event);\n });\n this.#humanInput.on(HumanInputEvent.VAD_INFERENCE_DONE, (event) => {\n if (!this.#trackPublishedFut.done) {\n return;\n }\n if (!this.#agentOutput) {\n throw new Error('agent output is undefined');\n }\n\n let tv = 1;\n if (this.#opts.allowInterruptions) {\n tv = Math.max(0, 1 - event.probability);\n this.#agentOutput.playout.targetVolume = tv;\n }\n\n if (event.speechDuration >= this.#opts.interruptSpeechDuration) {\n this.#interruptIfPossible();\n }\n });\n this.#humanInput.on(HumanInputEvent.END_OF_SPEECH, (event) => {\n this.emit(VPAEvent.USER_STARTED_SPEAKING);\n this.#deferredValidation.onHumanEndOfSpeech(event);\n this.#lastEndOfSpeechTime = Date.now();\n });\n this.#humanInput.on(HumanInputEvent.INTERIM_TRANSCRIPT, (event) => {\n this.#transcribedInterimText = event.alternatives![0].text;\n });\n this.#humanInput.on(HumanInputEvent.FINAL_TRANSCRIPT, (event) => {\n const newTranscript = event.alternatives![0].text;\n if (!newTranscript) return;\n\n this.#logger.child({ userTranscript: newTranscript }).debug('received user transcript');\n this.#transcribedText += (this.#transcribedText ? ' ' : '') + newTranscript;\n\n if (\n this.#opts.preemptiveSynthesis &&\n (!this.#playingSpeech || this.#playingSpeech.allowInterruptions)\n ) {\n this.#synthesizeAgentReply();\n }\n\n this.#deferredValidation.onHumanFinalTranscript(newTranscript);\n\n const words = this.#opts.transcription.wordTokenizer.tokenize(newTranscript);\n if (words.length >= 3) {\n // VAD can sometimes not detect that the human is speaking.\n // to make the interruption more reliable, we also interrupt on the final transcript.\n this.#interruptIfPossible();\n }\n });\n }\n\n async #run() {\n this.#updateState('initializing');\n const audioSource = new AudioSource(this.#tts.sampleRate, this.#tts.numChannels);\n const track = LocalAudioTrack.createAudioTrack('assistant_voice', audioSource);\n this.#agentPublication = await this.#room?.localParticipant?.publishTrack(\n track,\n new TrackPublishOptions({ source: TrackSource.SOURCE_MICROPHONE }),\n );\n\n const agentPlayout = new AgentPlayout(audioSource);\n this.#agentOutput = new AgentOutput(agentPlayout, this.#tts);\n\n agentPlayout.on(AgentPlayoutEvent.PLAYOUT_STARTED, () => {\n this.emit(VPAEvent.AGENT_STARTED_SPEAKING);\n this.#updateState('speaking');\n });\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n agentPlayout.on(AgentPlayoutEvent.PLAYOUT_STOPPED, (_) => {\n this.emit(VPAEvent.AGENT_STOPPED_SPEAKING);\n this.#updateState('listening');\n });\n\n this.#trackPublishedFut.resolve();\n\n while (true) {\n await this.#speechQueueOpen.await;\n for await (const speech of this.#speechQueue) {\n if (speech === VoicePipelineAgent.FLUSH_SENTINEL) break;\n this.#playingSpeech = speech;\n await this.#playSpeech(speech);\n this.#playingSpeech = undefined;\n }\n this.#speechQueueOpen = new Future();\n }\n }\n\n #synthesizeAgentReply() {\n this.#pendingAgentReply?.cancel();\n if (this.#humanInput && this.#humanInput.speaking) {\n this.#updateState('thinking', 200);\n }\n\n this.#pendingAgentReply = SpeechHandle.createAssistantReply(\n this.#opts.allowInterruptions,\n true,\n this.#transcribedText,\n );\n const newHandle = this.#pendingAgentReply;\n this.#agentReplyTask = this.#synthesizeAnswerTask(this.#agentReplyTask, newHandle);\n }\n\n #synthesizeAnswerTask(\n oldTask: CancellablePromise<void> | undefined,\n handle?: SpeechHandle,\n ): CancellablePromise<void> {\n return new CancellablePromise(async (resolve, _, onCancel) => {\n let cancelled = false;\n onCancel(() => {\n cancelled = true;\n });\n\n if (oldTask) {\n await gracefullyCancel(oldTask);\n }\n\n const copiedCtx = this.chatCtx.copy();\n const playingSpeech = this.#playingSpeech;\n if (playingSpeech && playingSpeech.initialized) {\n if (\n (!playingSpeech.userQuestion || playingSpeech.userCommitted) &&\n !playingSpeech.speechCommitted\n ) {\n // the speech is playing but not committed yet,\n // add it to the chat context for this new reply synthesis\n copiedCtx.messages.push(\n ChatMessage.create({\n text: playingSpeech.synthesisHandle.text,\n role: ChatRole.ASSISTANT,\n }),\n );\n }\n }\n\n copiedCtx.messages.push(\n ChatMessage.create({\n text: handle?.userQuestion,\n role: ChatRole.USER,\n }),\n );\n\n if (cancelled) resolve();\n let llmStream = await this.#opts.beforeLLMCallback(this, copiedCtx);\n if (llmStream === false) {\n handle?.cancel();\n return;\n }\n\n if (cancelled) resolve();\n // fallback to default impl if no custom/user stream is returned\n if (!(llmStream instanceof LLMStream)) {\n llmStream = (await defaultBeforeLLMCallback(this, copiedCtx)) as LLMStream;\n }\n\n if (handle!.interrupted) {\n return;\n }\n\n const synthesisHandle = this.#synthesizeAgentSpeech(handle!.id, llmStream);\n handle!.initialize(llmStream, synthesisHandle);\n\n // TODO(theomonnom): find a more reliable way to get the elapsed time from the last EOS\n // (VAD could not have detected any speech — maybe unlikely?)\n const elapsed = !!this.#lastEndOfSpeechTime\n ? Math.round((Date.now() - this.#lastEndOfSpeechTime) * 1000) / 1000\n : -1;\n\n this.#logger.child({ speechId: handle!.id, elapsed }).debug('synthesizing agent reply');\n resolve();\n });\n }\n\n async #playSpeech(handle: SpeechHandle) {\n try {\n await handle.waitForInitialization();\n } catch {\n return;\n }\n await this.#agentPublication!.waitForSubscription();\n const synthesisHandle = handle.synthesisHandle;\n if (synthesisHandle.interrupted) return;\n\n const userQuestion = handle.userQuestion;\n const playHandle = synthesisHandle.play();\n const joinFut = playHandle.join();\n\n const commitUserQuestionIfNeeded = () => {\n if (!userQuestion || synthesisHandle.interrupted || handle.userCommitted) return;\n const isUsingTools =\n handle.source instanceof LLMStream && !!handle.source.functionCalls.length;\n\n // make sure at least some speech was played before committing the user message\n // since we try to validate as fast as possible it is possible the agent gets interrupted\n // really quickly (barely audible), we don't want to mark this question as \"answered\".\n if (\n handle.allowInterruptions &&\n !isUsingTools &&\n playHandle.timePlayed < this.MIN_TIME_PLAYED_FOR_COMMIT &&\n !joinFut.done\n ) {\n return;\n }\n\n this.#logger.child({ userTranscript: userQuestion }).debug('committed user transcript');\n const userMsg = ChatMessage.create({ text: userQuestion, role: ChatRole.USER });\n this.chatCtx.messages.push(userMsg);\n this.emit(VPAEvent.USER_SPEECH_COMMITTED, userMsg);\n\n this.#transcribedText = this.#transcribedText.slice(userQuestion.length);\n handle.markUserCommitted();\n };\n\n // wait for the playHandle to finish and check every 1s if user question should be committed\n commitUserQuestionIfNeeded();\n\n while (!joinFut.done) {\n await new Promise<void>(async (resolve) => {\n setTimeout(resolve, 500);\n await joinFut.await;\n resolve();\n });\n commitUserQuestionIfNeeded();\n if (handle.interrupted) break;\n }\n commitUserQuestionIfNeeded();\n\n const collectedText = handle.synthesisHandle.text;\n const isUsingTools = handle.source instanceof LLMStream && !!handle.source.functionCalls.length;\n const extraToolsMessages = []; // additional messages from the functions to add to the context\n let interrupted = handle.interrupted;\n\n // if the answer is using tools, execute the functions and automatically generate\n // a response to the user question from the returned values\n if (isUsingTools && !interrupted) {\n if (!userQuestion || !handle.userCommitted) {\n throw new Error('user speech should have been committed before using tools');\n }\n const llmStream = handle.source;\n let newFunctionCalls = llmStream.functionCalls;\n\n for (let i = 0; i < this.#opts.maxRecursiveFncCalls; i++) {\n this.emit(VPAEvent.FUNCTION_CALLS_COLLECTED, newFunctionCalls);\n const calledFuncs: FunctionCallInfo[] = [];\n for (const func of newFunctionCalls) {\n const task = func.func.execute(func.params).then(\n (result) => ({ name: func.name, toolCallId: func.toolCallId, result }),\n (error) => ({ name: func.name, toolCallId: func.toolCallId, error }),\n );\n calledFuncs.push({ ...func, task });\n this.#logger\n .child({ function: func.name, speechId: handle.id })\n .debug('executing AI function');\n try {\n await task;\n } catch {\n this.#logger\n .child({ function: func.name, speechId: handle.id })\n .error('error executing AI function');\n }\n }\n\n const toolCallsInfo = [];\n const toolCallsResults = [];\n for (const fnc of calledFuncs) {\n // ignore the function calls that return void\n const task = await fnc.task;\n if (!task || task.result === undefined) continue;\n toolCallsInfo.push(fnc);\n toolCallsResults.push(ChatMessage.createToolFromFunctionResult(task));\n }\n\n if (!toolCallsInfo.length) break;\n\n // generate an answer from the tool calls\n extraToolsMessages.push(ChatMessage.createToolCalls(toolCallsInfo, collectedText));\n extraToolsMessages.push(...toolCallsResults);\n\n const chatCtx = handle.source.chatCtx.copy();\n chatCtx.messages.push(...extraToolsMessages);\n\n const answerLLMStream = this.llm.chat({\n chatCtx,\n fncCtx: this.fncCtx,\n });\n const answerSynthesis = this.#synthesizeAgentSpeech(handle.id, answerLLMStream);\n // replace the synthesis handle with the new one to allow interruption\n handle.synthesisHandle = answerSynthesis;\n const playHandle = answerSynthesis.play();\n await playHandle.join().await;\n\n interrupted = answerSynthesis.interrupted;\n newFunctionCalls = answerLLMStream.functionCalls;\n\n this.emit(VPAEvent.FUNCTION_CALLS_FINISHED, calledFuncs);\n if (!newFunctionCalls) break;\n }\n }\n\n if (handle.addToChatCtx && (!userQuestion || handle.userCommitted)) {\n this.chatCtx.messages.push(...extraToolsMessages);\n if (interrupted) {\n collectedText + '…';\n }\n\n const msg = ChatMessage.create({ text: collectedText, role: ChatRole.ASSISTANT });\n this.chatCtx.messages.push(msg);\n\n handle.markSpeechCommitted();\n if (interrupted) {\n this.emit(VPAEvent.AGENT_SPEECH_INTERRUPTED, msg);\n } else {\n this.emit(VPAEvent.AGENT_SPEECH_COMMITTED, msg);\n }\n\n this.#logger\n .child({\n agentTranscript: collectedText,\n interrupted,\n speechId: handle.id,\n })\n .debug('committed agent speech');\n }\n }\n\n #synthesizeAgentSpeech(\n speechId: string,\n source: string | LLMStream | AsyncIterable<string>,\n ): SynthesisHandle {\n if (!this.#agentOutput) {\n throw new Error('agent output should be initialized when ready');\n }\n\n if (source instanceof LLMStream) {\n source = llmStreamToStringIterable(speechId, source);\n }\n\n const ogSource = source;\n if (!(typeof source === 'string')) {\n // TODO(nbsp): itertools.tee\n }\n\n const ttsSource = this.#opts.beforeTTSCallback(this, ogSource);\n if (!ttsSource) {\n throw new Error('beforeTTSCallback must return string or AsyncIterable<string>');\n }\n\n return this.#agentOutput.synthesize(speechId, ttsSource);\n }\n\n async #validateReplyIfPossible() {\n if (this.#playingSpeech && !this.#playingSpeech.allowInterruptions) {\n this.#logger\n .child({ speechId: this.#playingSpeech.id })\n .debug('skipping validation, agent is speaking and does not allow interruptions');\n return;\n }\n\n if (!this.#pendingAgentReply) {\n if (this.#opts.preemptiveSynthesis || !this.#transcribedText) {\n return;\n }\n this.#synthesizeAgentReply();\n }\n\n if (!this.#pendingAgentReply) {\n throw new Error('pending agent reply is undefined');\n }\n\n // in some bad timimg, we could end up with two pushed agent replies inside the speech queue.\n // so make sure we directly interrupt every reply when validating a new one\n if (this.#speechQueueOpen.done) {\n for await (const speech of this.#speechQueue) {\n if (speech === VoicePipelineAgent.FLUSH_SENTINEL) break;\n if (!speech.isReply) continue;\n if (speech.allowInterruptions) speech.interrupt();\n }\n }\n\n this.#logger.child({ speechId: this.#pendingAgentReply.id }).debug('validated agent reply');\n\n this.#addSpeechForPlayout(this.#pendingAgentReply);\n this.#pendingAgentReply = undefined;\n this.#transcribedInterimText = '';\n }\n\n #interruptIfPossible() {\n if (\n !this.#playingSpeech ||\n !this.#playingSpeech.allowInterruptions ||\n this.#playingSpeech.interrupted\n ) {\n return;\n }\n\n if (this.#opts.interruptMinWords !== 0) {\n // check the final/interim transcribed text for the minimum word count\n // to interrupt the agent speech\n const interimWords = this.#opts.transcription.wordTokenizer.tokenize(\n this.#transcribedInterimText,\n );\n if (interimWords.length < this.#opts.interruptMinWords) {\n return;\n }\n }\n this.#playingSpeech.interrupt();\n }\n\n #addSpeechForPlayout(handle: SpeechHandle) {\n this.#speechQueue.put(handle);\n this.#speechQueue.put(VoicePipelineAgent.FLUSH_SENTINEL);\n this.#speechQueueOpen.resolve();\n }\n\n /** Close the voice assistant. */\n async close() {\n if (!this.#started) {\n return;\n }\n\n this.#room?.removeAllListeners(RoomEvent.ParticipantConnected);\n // TODO(nbsp): await this.#deferredValidation.close()\n }\n}\n\nasync function* llmStreamToStringIterable(\n speechId: string,\n stream: LLMStream,\n): AsyncIterable<string> {\n const startTime = Date.now();\n let firstFrame = true;\n for await (const chunk of stream) {\n const content = chunk.choices[0]?.delta.content;\n if (!content) continue;\n\n if (firstFrame) {\n firstFrame = false;\n log()\n .child({ speechId, elapsed: Math.round(Date.now() - startTime) })\n .debug('received first LLM token');\n }\n yield content;\n }\n}\n\n/** This class is used to try to find the best time to validate the agent reply. */\nclass DeferredReplyValidation {\n // if the STT gives us punctuation, we can try to validate the reply faster.\n readonly PUNCTUATION = '.!?';\n readonly PUNCTUATION_REDUCE_FACTOR = 0.75;\n readonly LATE_TRANSCRIPT_TOLERANCE = 1.5; // late compared to end of speech\n\n #validateFunc: () => Promise<void>;\n #validatingPromise?: Promise<void>;\n #validatingFuture = new Future();\n #lastFinalTranscript = '';\n #lastRecvEndOfSpeechTime = 0;\n #speaking = false;\n #endOfSpeechDelay: number;\n #finalTranscriptDelay: number;\n\n constructor(validateFunc: () => Promise<void>, minEndpointingDelay: number) {\n this.#validateFunc = validateFunc;\n this.#endOfSpeechDelay = minEndpointingDelay;\n this.#finalTranscriptDelay = minEndpointingDelay;\n }\n\n get validating(): boolean {\n return !this.#validatingFuture.done;\n }\n\n onHumanFinalTranscript(transcript: string) {\n this.#lastFinalTranscript = transcript.trim();\n if (this.#speaking) return;\n\n const hasRecentEndOfSpeech =\n Date.now() - this.#lastRecvEndOfSpeechTime < this.LATE_TRANSCRIPT_TOLERANCE;\n let delay = hasRecentEndOfSpeech ? this.#endOfSpeechDelay : this.#finalTranscriptDelay;\n delay = this.#endWithPunctuation() ? delay * this.PUNCTUATION_REDUCE_FACTOR : 1;\n\n this.#run(delay);\n }\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n onHumanStartOfSpeech(_: VADEvent) {\n this.#speaking = true;\n // TODO(nbsp):\n // if (this.validating) {\n // this.#validatingPromise.cancel()\n // }\n }\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n onHumanEndOfSpeech(_: VADEvent) {\n this.#speaking = false;\n this.#lastRecvEndOfSpeechTime = Date.now();\n\n if (this.#lastFinalTranscript) {\n const delay = this.#endWithPunctuation()\n ? this.#endOfSpeechDelay * this.PUNCTUATION_REDUCE_FACTOR\n : 1;\n this.#run(delay);\n }\n }\n\n // TODO(nbsp): aclose\n\n #endWithPunctuation(): boolean {\n return (\n this.#lastFinalTranscript.length > 0 &&\n this.PUNCTUATION.includes(this.#lastFinalTranscript[this.#lastFinalTranscript.length - 1]!)\n );\n }\n\n #resetStates() {\n this.#lastFinalTranscript = '';\n this.#lastRecvEndOfSpeechTime = 0;\n }\n\n #run(delay: number) {\n const runTask = async (delay: number) => {\n await new Promise((resolve) => setTimeout(resolve, delay));\n this.#resetStates();\n await this.#validateFunc();\n };\n\n this.#validatingFuture = new Future();\n this.#validatingPromise = runTask(delay);\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAIA,sBAMO;AAEP,yBAAyB;AAOzB,iBAA0B;AAC1B,IAAAA,cAAmD;AACnD,iBAAoB;AACpB,iBAA4D;AAC5D,mBAIO;AAGP,iBAAkD;AAClD,mBAAiF;AAGjF,0BAA4B;AAC5B,2BAAgD;AAChD,yBAA4C;AAC5C,2BAA6B;AAGtB,MAAM,wBAAwB;AAY9B,IAAK,WAAL,kBAAKC,cAAL;AACL,EAAAA,oBAAA;AACA,EAAAA,oBAAA;AACA,EAAAA,oBAAA;AACA,EAAAA,oBAAA;AACA,EAAAA,oBAAA;AACA,EAAAA,oBAAA;AACA,EAAAA,oBAAA;AACA,EAAAA,oBAAA;AACA,EAAAA,oBAAA;AATU,SAAAA;AAAA,GAAA;AAwBL,MAAM,iBAAiB;AAAA,EAC5B;AAAA,EACA;AAAA,EACA,YAAY,oBAAI,IAAiB;AAAA,EACjC,OAAO;AAAA,EAEP,YAAY,OAA2B,WAAsB;AAC3D,SAAK,SAAS;AACd,SAAK,aAAa;AAClB,qBAAiB,WAAW;AAAA,EAC9B;AAAA,EAEA,OAAO,aAA+B;AACpC,WAAO,iBAAiB;AAAA,EAC1B;AAAA,EAEA,IAAI,QAA4B;AAC9B,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,cAAc,KAAa,OAAY;AACrC,SAAK,UAAU,IAAI,KAAK,KAAK;AAAA,EAC/B;AAAA,EAEA,YAAY,KAAa,YAAiB,QAAW;AACnD,WAAO,KAAK,UAAU,IAAI,GAAG,KAAK;AAAA,EACpC;AAAA,EAEA,IAAI,YAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AACF;AAEA,MAAM,2BAA8C,CAClD,OACA,YACc;AACd,SAAO,MAAM,IAAI,KAAK,EAAE,SAAS,QAAQ,MAAM,OAAO,CAAC;AACzD;AAEA,MAAM,2BAA8C,CAElD,GACA,SACmC;AACnC,SAAO;AACT;AA6BA,MAAM,mCAA8D;AAAA,EAClE,mBAAmB;AAAA,EACnB,oBAAoB;AAAA,EACpB,0BAA0B;AAAA,EAC1B,mBAAmB,IAAI,aAAAC,kBAAuB;AAAA,EAC9C,eAAe,IAAI,aAAAC,cAAmB,KAAK;AAAA,EAC3C,eAAe;AACjB;AAuCA,MAAM,oBAAgC;AAAA,EACpC,SAAS,IAAI,wBAAY;AAAA,EACzB,oBAAoB;AAAA,EACpB,yBAAyB;AAAA,EACzB,mBAAmB;AAAA,EACnB,qBAAqB;AAAA,EACrB,sBAAsB;AAAA,EACtB,qBAAqB;AAAA,EACrB,mBAAmB;AAAA,EACnB,mBAAmB;AAAA,EACnB,eAAe;AACjB;AAGO,MAAM,2BAA4B,mBAAAC,QAAsD;AAAA;AAAA,EAEpF,6BAA6B;AAAA,EACtC,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EAElE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,qBAAqB,IAAI,oBAAO;AAAA,EAChC;AAAA,EACA;AAAA,EACA;AAAA,EACA,mBAAmB;AAAA,EACnB,0BAA0B;AAAA,EAC1B,mBAAmB,IAAI,oBAAO;AAAA,EAC9B,eAAe,IAAI,gCAA4E;AAAA,EAC/F;AAAA,EACA;AAAA,EACA,WAAW;AAAA,EACX;AAAA,EACA,eAAkD;AAAA,EAClD;AAAA,EACA,cAAU,gBAAI;AAAA,EACd;AAAA,EAEA,YAEE,KAEA,KAEA,KAEA,KAEA,OAA4B,mBAC5B;AACA,UAAM;AAEN,SAAK,QAAQ,EAAE,GAAG,mBAAmB,GAAG,KAAK;AAE7C,QAAI,CAAC,IAAI,aAAa,WAAW;AAC/B,YAAM,IAAI,WAAAC,cAAiB,KAAK,GAAG;AAAA,IACrC;AAEA,QAAI,CAAC,IAAI,aAAa,WAAW;AAC/B,YAAM,IAAI,WAAAC,cAAiB,KAAK,IAAI,aAAAJ,kBAAuB,CAAC;AAAA,IAC9D;AAEA,SAAK,OAAO;AACZ,SAAK,OAAO;AACZ,SAAK,OAAO;AACZ,SAAK,OAAO;AAEZ,SAAK,sBAAsB,IAAI;AAAA,MAC7B,KAAK,yBAAyB,KAAK,IAAI;AAAA,MACvC,KAAK,MAAM;AAAA,IACb;AAAA,EACF;AAAA,EAEA,IAAI,SAAsC;AACxC,WAAO,KAAK,MAAM;AAAA,EACpB;AAAA,EAEA,IAAI,OAAO,KAAsB;AAC/B,SAAK,MAAM,SAAS;AAAA,EACtB;AAAA,EAEA,IAAI,UAAuB;AACzB,WAAO,KAAK,MAAM;AAAA,EACpB;AAAA,EAEA,IAAI,MAAW;AACb,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,MAAW;AACb,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,MAAW;AACb,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,MAAW;AACb,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,MAEE,MAQA,cAAiD,MACjD;AACA,QAAI,KAAK,UAAU;AACjB,YAAM,IAAI,MAAM,iCAAiC;AAAA,IACnD;AACA,SAAK,GAAG,0BAAU,sBAAsB,CAACK,iBAAmC;AAE1E,UAAI,KAAK,cAAc;AACrB;AAAA,MACF;AACA,WAAK,iBAAiB,KAAK,MAAMA,aAAY,QAAQ;AAAA,IACvD,CAAC;AAED,SAAK,QAAQ;AACb,SAAK,eAAe;AAEpB,QAAI,aAAa;AACf,UAAI,OAAO,gBAAgB,UAAU;AACnC,aAAK,iBAAiB,WAAW;AAAA,MACnC,OAAO;AACL,aAAK,iBAAiB,YAAY,QAAQ;AAAA,MAC5C;AAAA,IACF;AAEA,SAAK,KAAK;AAAA,EACZ;AAAA;AAAA,EAGA,MAAM,IACJ,QACA,qBAAqB,MACrB,eAAe,MACf;AACA,UAAM,KAAK,mBAAmB;AAC9B,UAAM,YAAY,kCAAa,sBAAsB,oBAAoB,YAAY;AACrF,UAAM,kBAAkB,KAAK,uBAAuB,UAAU,IAAI,MAAM;AACxE,cAAU,WAAW,QAAQ,eAAe;AAC5C,SAAK,qBAAqB,SAAS;AAAA,EACrC;AAAA,EAEA,aAAa,OAAmB,QAAQ,GAAG;AACzC,UAAM,UAAU,CAACC,WAA4C;AAC3D,aAAO,IAAI,gCAAmB,OAAO,SAAS,GAAG,aAAa;AApWpE;AAqWQ,YAAI,YAAY;AAChB,iBAAS,MAAM;AACb,sBAAY;AAAA,QACd,CAAC;AACD,cAAM,IAAI,QAAQ,CAACC,aAAY,WAAWA,UAASD,MAAK,CAAC;AACzD,aAAI,UAAK,UAAL,mBAAY,aAAa;AAC3B,cAAI,CAAC,WAAW;AACd,oBAAM,UAAK,MAAM,qBAAX,mBAA6B,cAAc,EAAE,CAAC,qBAAqB,GAAG,MAAM;AAAA,UACpF;AAAA,QACF;AACA,gBAAQ;AAAA,MACV,CAAC;AAAA,IACH;AAEA,QAAI,KAAK,kBAAkB;AACzB,WAAK,iBAAiB,OAAO;AAAA,IAC/B;AAEA,SAAK,mBAAmB,QAAQ,KAAK;AAAA,EACvC;AAAA,EAEA,iBAAiB,qBAAmC;AAClD,QAAI,CAAC,KAAK,OAAO;AACf,WAAK,QAAQ,MAAM,iBAAiB;AACpC;AAAA,IACF;AAEA,SAAK,eAAe,KAAK,MAAM,mBAAmB,IAAI,mBAAmB,KAAK;AAC9E,QAAI,CAAC,KAAK,cAAc;AACtB,WAAK,QAAQ,MAAM,6BAA6B,mBAAmB,YAAY;AAC/E;AAAA,IACF;AAEA,SAAK,cAAc,IAAI,8BAAW,KAAK,OAAO,KAAK,MAAM,KAAK,MAAM,KAAK,YAAY;AACrF,SAAK,YAAY,GAAG,mCAAgB,iBAAiB,CAAC,UAAU;AAC9D,WAAK,KAAK,6BAA8B;AACxC,WAAK,oBAAoB,qBAAqB,KAAK;AAAA,IACrD,CAAC;AACD,SAAK,YAAY,GAAG,mCAAgB,oBAAoB,CAAC,UAAU;AACjE,UAAI,CAAC,KAAK,mBAAmB,MAAM;AACjC;AAAA,MACF;AACA,UAAI,CAAC,KAAK,cAAc;AACtB,cAAM,IAAI,MAAM,2BAA2B;AAAA,MAC7C;AAEA,UAAI,KAAK;AACT,UAAI,KAAK,MAAM,oBAAoB;AACjC,aAAK,KAAK,IAAI,GAAG,IAAI,MAAM,WAAW;AACtC,aAAK,aAAa,QAAQ,eAAe;AAAA,MAC3C;AAEA,UAAI,MAAM,kBAAkB,KAAK,MAAM,yBAAyB;AAC9D,aAAK,qBAAqB;AAAA,MAC5B;AAAA,IACF,CAAC;AACD,SAAK,YAAY,GAAG,mCAAgB,eAAe,CAAC,UAAU;AAC5D,WAAK,KAAK,6BAA8B;AACxC,WAAK,oBAAoB,mBAAmB,KAAK;AACjD,WAAK,uBAAuB,KAAK,IAAI;AAAA,IACvC,CAAC;AACD,SAAK,YAAY,GAAG,mCAAgB,oBAAoB,CAAC,UAAU;AACjE,WAAK,0BAA0B,MAAM,aAAc,CAAC,EAAE;AAAA,IACxD,CAAC;AACD,SAAK,YAAY,GAAG,mCAAgB,kBAAkB,CAAC,UAAU;AAC/D,YAAM,gBAAgB,MAAM,aAAc,CAAC,EAAE;AAC7C,UAAI,CAAC,cAAe;AAEpB,WAAK,QAAQ,MAAM,EAAE,gBAAgB,cAAc,CAAC,EAAE,MAAM,0BAA0B;AACtF,WAAK,qBAAqB,KAAK,mBAAmB,MAAM,MAAM;AAE9D,UACE,KAAK,MAAM,wBACV,CAAC,KAAK,kBAAkB,KAAK,eAAe,qBAC7C;AACA,aAAK,sBAAsB;AAAA,MAC7B;AAEA,WAAK,oBAAoB,uBAAuB,aAAa;AAE7D,YAAM,QAAQ,KAAK,MAAM,cAAc,cAAc,SAAS,aAAa;AAC3E,UAAI,MAAM,UAAU,GAAG;AAGrB,aAAK,qBAAqB;AAAA,MAC5B;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEA,MAAM,OAAO;AA9bf;AA+bI,SAAK,aAAa,cAAc;AAChC,UAAM,cAAc,IAAI,4BAAY,KAAK,KAAK,YAAY,KAAK,KAAK,WAAW;AAC/E,UAAM,QAAQ,gCAAgB,iBAAiB,mBAAmB,WAAW;AAC7E,SAAK,oBAAoB,QAAM,gBAAK,UAAL,mBAAY,qBAAZ,mBAA8B;AAAA,MAC3D;AAAA,MACA,IAAI,oCAAoB,EAAE,QAAQ,4BAAY,kBAAkB,CAAC;AAAA;AAGnE,UAAM,eAAe,IAAI,kCAAa,WAAW;AACjD,SAAK,eAAe,IAAI,gCAAY,cAAc,KAAK,IAAI;AAE3D,iBAAa,GAAG,uCAAkB,iBAAiB,MAAM;AACvD,WAAK,KAAK,8BAA+B;AACzC,WAAK,aAAa,UAAU;AAAA,IAC9B,CAAC;AAED,iBAAa,GAAG,uCAAkB,iBAAiB,CAAC,MAAM;AACxD,WAAK,KAAK,8BAA+B;AACzC,WAAK,aAAa,WAAW;AAAA,IAC/B,CAAC;AAED,SAAK,mBAAmB,QAAQ;AAEhC,WAAO,MAAM;AACX,YAAM,KAAK,iBAAiB;AAC5B,uBAAiB,UAAU,KAAK,cAAc;AAC5C,YAAI,WAAW,mBAAmB,eAAgB;AAClD,aAAK,iBAAiB;AACtB,cAAM,KAAK,YAAY,MAAM;AAC7B,aAAK,iBAAiB;AAAA,MACxB;AACA,WAAK,mBAAmB,IAAI,oBAAO;AAAA,IACrC;AAAA,EACF;AAAA,EAEA,wBAAwB;AAle1B;AAmeI,eAAK,uBAAL,mBAAyB;AACzB,QAAI,KAAK,eAAe,KAAK,YAAY,UAAU;AACjD,WAAK,aAAa,YAAY,GAAG;AAAA,IACnC;AAEA,SAAK,qBAAqB,kCAAa;AAAA,MACrC,KAAK,MAAM;AAAA,MACX;AAAA,MACA,KAAK;AAAA,IACP;AACA,UAAM,YAAY,KAAK;AACvB,SAAK,kBAAkB,KAAK,sBAAsB,KAAK,iBAAiB,SAAS;AAAA,EACnF;AAAA,EAEA,sBACE,SACA,QAC0B;AAC1B,WAAO,IAAI,gCAAmB,OAAO,SAAS,GAAG,aAAa;AAC5D,UAAI,YAAY;AAChB,eAAS,MAAM;AACb,oBAAY;AAAA,MACd,CAAC;AAED,UAAI,SAAS;AACX,kBAAM,+BAAiB,OAAO;AAAA,MAChC;AAEA,YAAM,YAAY,KAAK,QAAQ,KAAK;AACpC,YAAM,gBAAgB,KAAK;AAC3B,UAAI,iBAAiB,cAAc,aAAa;AAC9C,aACG,CAAC,cAAc,gBAAgB,cAAc,kBAC9C,CAAC,cAAc,iBACf;AAGA,oBAAU,SAAS;AAAA,YACjB,wBAAY,OAAO;AAAA,cACjB,MAAM,cAAc,gBAAgB;AAAA,cACpC,MAAM,qBAAS;AAAA,YACjB,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF;AAEA,gBAAU,SAAS;AAAA,QACjB,wBAAY,OAAO;AAAA,UACjB,MAAM,iCAAQ;AAAA,UACd,MAAM,qBAAS;AAAA,QACjB,CAAC;AAAA,MACH;AAEA,UAAI,UAAW,SAAQ;AACvB,UAAI,YAAY,MAAM,KAAK,MAAM,kBAAkB,MAAM,SAAS;AAClE,UAAI,cAAc,OAAO;AACvB,yCAAQ;AACR;AAAA,MACF;AAEA,UAAI,UAAW,SAAQ;AAEvB,UAAI,EAAE,qBAAqB,uBAAY;AACrC,oBAAa,MAAM,yBAAyB,MAAM,SAAS;AAAA,MAC7D;AAEA,UAAI,OAAQ,aAAa;AACvB;AAAA,MACF;AAEA,YAAM,kBAAkB,KAAK,uBAAuB,OAAQ,IAAI,SAAS;AACzE,aAAQ,WAAW,WAAW,eAAe;AAI7C,YAAM,UAAU,CAAC,CAAC,KAAK,uBACnB,KAAK,OAAO,KAAK,IAAI,IAAI,KAAK,wBAAwB,GAAI,IAAI,MAC9D;AAEJ,WAAK,QAAQ,MAAM,EAAE,UAAU,OAAQ,IAAI,QAAQ,CAAC,EAAE,MAAM,0BAA0B;AACtF,cAAQ;AAAA,IACV,CAAC;AAAA,EACH;AAAA,EAEA,MAAM,YAAY,QAAsB;AACtC,QAAI;AACF,YAAM,OAAO,sBAAsB;AAAA,IACrC,QAAQ;AACN;AAAA,IACF;AACA,UAAM,KAAK,kBAAmB,oBAAoB;AAClD,UAAM,kBAAkB,OAAO;AAC/B,QAAI,gBAAgB,YAAa;AAEjC,UAAM,eAAe,OAAO;AAC5B,UAAM,aAAa,gBAAgB,KAAK;AACxC,UAAM,UAAU,WAAW,KAAK;AAEhC,UAAM,6BAA6B,MAAM;AACvC,UAAI,CAAC,gBAAgB,gBAAgB,eAAe,OAAO,cAAe;AAC1E,YAAME,gBACJ,OAAO,kBAAkB,wBAAa,CAAC,CAAC,OAAO,OAAO,cAAc;AAKtE,UACE,OAAO,sBACP,CAACA,iBACD,WAAW,aAAa,KAAK,8BAC7B,CAAC,QAAQ,MACT;AACA;AAAA,MACF;AAEA,WAAK,QAAQ,MAAM,EAAE,gBAAgB,aAAa,CAAC,EAAE,MAAM,2BAA2B;AACtF,YAAM,UAAU,wBAAY,OAAO,EAAE,MAAM,cAAc,MAAM,qBAAS,KAAK,CAAC;AAC9E,WAAK,QAAQ,SAAS,KAAK,OAAO;AAClC,WAAK,KAAK,+BAAgC,OAAO;AAEjD,WAAK,mBAAmB,KAAK,iBAAiB,MAAM,aAAa,MAAM;AACvE,aAAO,kBAAkB;AAAA,IAC3B;AAGA,+BAA2B;AAE3B,WAAO,CAAC,QAAQ,MAAM;AACpB,YAAM,IAAI,QAAc,OAAO,YAAY;AACzC,mBAAW,SAAS,GAAG;AACvB,cAAM,QAAQ;AACd,gBAAQ;AAAA,MACV,CAAC;AACD,iCAA2B;AAC3B,UAAI,OAAO,YAAa;AAAA,IAC1B;AACA,+BAA2B;AAE3B,UAAM,gBAAgB,OAAO,gBAAgB;AAC7C,UAAM,eAAe,OAAO,kBAAkB,wBAAa,CAAC,CAAC,OAAO,OAAO,cAAc;AACzF,UAAM,qBAAqB,CAAC;AAC5B,QAAI,cAAc,OAAO;AAIzB,QAAI,gBAAgB,CAAC,aAAa;AAChC,UAAI,CAAC,gBAAgB,CAAC,OAAO,eAAe;AAC1C,cAAM,IAAI,MAAM,2DAA2D;AAAA,MAC7E;AACA,YAAM,YAAY,OAAO;AACzB,UAAI,mBAAmB,UAAU;AAEjC,eAAS,IAAI,GAAG,IAAI,KAAK,MAAM,sBAAsB,KAAK;AACxD,aAAK,KAAK,kCAAmC,gBAAgB;AAC7D,cAAM,cAAkC,CAAC;AACzC,mBAAW,QAAQ,kBAAkB;AACnC,gBAAM,OAAO,KAAK,KAAK,QAAQ,KAAK,MAAM,EAAE;AAAA,YAC1C,CAAC,YAAY,EAAE,MAAM,KAAK,MAAM,YAAY,KAAK,YAAY,OAAO;AAAA,YACpE,CAAC,WAAW,EAAE,MAAM,KAAK,MAAM,YAAY,KAAK,YAAY,MAAM;AAAA,UACpE;AACA,sBAAY,KAAK,EAAE,GAAG,MAAM,KAAK,CAAC;AAClC,eAAK,QACF,MAAM,EAAE,UAAU,KAAK,MAAM,UAAU,OAAO,GAAG,CAAC,EAClD,MAAM,uBAAuB;AAChC,cAAI;AACF,kBAAM;AAAA,UACR,QAAQ;AACN,iBAAK,QACF,MAAM,EAAE,UAAU,KAAK,MAAM,UAAU,OAAO,GAAG,CAAC,EAClD,MAAM,6BAA6B;AAAA,UACxC;AAAA,QACF;AAEA,cAAM,gBAAgB,CAAC;AACvB,cAAM,mBAAmB,CAAC;AAC1B,mBAAW,OAAO,aAAa;AAE7B,gBAAM,OAAO,MAAM,IAAI;AACvB,cAAI,CAAC,QAAQ,KAAK,WAAW,OAAW;AACxC,wBAAc,KAAK,GAAG;AACtB,2BAAiB,KAAK,wBAAY,6BAA6B,IAAI,CAAC;AAAA,QACtE;AAEA,YAAI,CAAC,cAAc,OAAQ;AAG3B,2BAAmB,KAAK,wBAAY,gBAAgB,eAAe,aAAa,CAAC;AACjF,2BAAmB,KAAK,GAAG,gBAAgB;AAE3C,cAAM,UAAU,OAAO,OAAO,QAAQ,KAAK;AAC3C,gBAAQ,SAAS,KAAK,GAAG,kBAAkB;AAE3C,cAAM,kBAAkB,KAAK,IAAI,KAAK;AAAA,UACpC;AAAA,UACA,QAAQ,KAAK;AAAA,QACf,CAAC;AACD,cAAM,kBAAkB,KAAK,uBAAuB,OAAO,IAAI,eAAe;AAE9E,eAAO,kBAAkB;AACzB,cAAMC,cAAa,gBAAgB,KAAK;AACxC,cAAMA,YAAW,KAAK,EAAE;AAExB,sBAAc,gBAAgB;AAC9B,2BAAmB,gBAAgB;AAEnC,aAAK,KAAK,iCAAkC,WAAW;AACvD,YAAI,CAAC,iBAAkB;AAAA,MACzB;AAAA,IACF;AAEA,QAAI,OAAO,iBAAiB,CAAC,gBAAgB,OAAO,gBAAgB;AAClE,WAAK,QAAQ,SAAS,KAAK,GAAG,kBAAkB;AAChD,UAAI,aAAa;AACf,wBAAgB;AAAA,MAClB;AAEA,YAAM,MAAM,wBAAY,OAAO,EAAE,MAAM,eAAe,MAAM,qBAAS,UAAU,CAAC;AAChF,WAAK,QAAQ,SAAS,KAAK,GAAG;AAE9B,aAAO,oBAAoB;AAC3B,UAAI,aAAa;AACf,aAAK,KAAK,kCAAmC,GAAG;AAAA,MAClD,OAAO;AACL,aAAK,KAAK,gCAAiC,GAAG;AAAA,MAChD;AAEA,WAAK,QACF,MAAM;AAAA,QACL,iBAAiB;AAAA,QACjB;AAAA,QACA,UAAU,OAAO;AAAA,MACnB,CAAC,EACA,MAAM,wBAAwB;AAAA,IACnC;AAAA,EACF;AAAA,EAEA,uBACE,UACA,QACiB;AACjB,QAAI,CAAC,KAAK,cAAc;AACtB,YAAM,IAAI,MAAM,+CAA+C;AAAA,IACjE;AAEA,QAAI,kBAAkB,sBAAW;AAC/B,eAAS,0BAA0B,UAAU,MAAM;AAAA,IACrD;AAEA,UAAM,WAAW;AACjB,QAAI,EAAE,OAAO,WAAW,WAAW;AAAA,IAEnC;AAEA,UAAM,YAAY,KAAK,MAAM,kBAAkB,MAAM,QAAQ;AAC7D,QAAI,CAAC,WAAW;AACd,YAAM,IAAI,MAAM,+DAA+D;AAAA,IACjF;AAEA,WAAO,KAAK,aAAa,WAAW,UAAU,SAAS;AAAA,EACzD;AAAA,EAEA,MAAM,2BAA2B;AAC/B,QAAI,KAAK,kBAAkB,CAAC,KAAK,eAAe,oBAAoB;AAClE,WAAK,QACF,MAAM,EAAE,UAAU,KAAK,eAAe,GAAG,CAAC,EAC1C,MAAM,yEAAyE;AAClF;AAAA,IACF;AAEA,QAAI,CAAC,KAAK,oBAAoB;AAC5B,UAAI,KAAK,MAAM,uBAAuB,CAAC,KAAK,kBAAkB;AAC5D;AAAA,MACF;AACA,WAAK,sBAAsB;AAAA,IAC7B;AAEA,QAAI,CAAC,KAAK,oBAAoB;AAC5B,YAAM,IAAI,MAAM,kCAAkC;AAAA,IACpD;AAIA,QAAI,KAAK,iBAAiB,MAAM;AAC9B,uBAAiB,UAAU,KAAK,cAAc;AAC5C,YAAI,WAAW,mBAAmB,eAAgB;AAClD,YAAI,CAAC,OAAO,QAAS;AACrB,YAAI,OAAO,mBAAoB,QAAO,UAAU;AAAA,MAClD;AAAA,IACF;AAEA,SAAK,QAAQ,MAAM,EAAE,UAAU,KAAK,mBAAmB,GAAG,CAAC,EAAE,MAAM,uBAAuB;AAE1F,SAAK,qBAAqB,KAAK,kBAAkB;AACjD,SAAK,qBAAqB;AAC1B,SAAK,0BAA0B;AAAA,EACjC;AAAA,EAEA,uBAAuB;AACrB,QACE,CAAC,KAAK,kBACN,CAAC,KAAK,eAAe,sBACrB,KAAK,eAAe,aACpB;AACA;AAAA,IACF;AAEA,QAAI,KAAK,MAAM,sBAAsB,GAAG;AAGtC,YAAM,eAAe,KAAK,MAAM,cAAc,cAAc;AAAA,QAC1D,KAAK;AAAA,MACP;AACA,UAAI,aAAa,SAAS,KAAK,MAAM,mBAAmB;AACtD;AAAA,MACF;AAAA,IACF;AACA,SAAK,eAAe,UAAU;AAAA,EAChC;AAAA,EAEA,qBAAqB,QAAsB;AACzC,SAAK,aAAa,IAAI,MAAM;AAC5B,SAAK,aAAa,IAAI,mBAAmB,cAAc;AACvD,SAAK,iBAAiB,QAAQ;AAAA,EAChC;AAAA;AAAA,EAGA,MAAM,QAAQ;AAzyBhB;AA0yBI,QAAI,CAAC,KAAK,UAAU;AAClB;AAAA,IACF;AAEA,eAAK,UAAL,mBAAY,mBAAmB,0BAAU;AAAA,EAE3C;AACF;AAEA,gBAAgB,0BACd,UACA,QACuB;AAtzBzB;AAuzBE,QAAM,YAAY,KAAK,IAAI;AAC3B,MAAI,aAAa;AACjB,mBAAiB,SAAS,QAAQ;AAChC,UAAM,WAAU,WAAM,QAAQ,CAAC,MAAf,mBAAkB,MAAM;AACxC,QAAI,CAAC,QAAS;AAEd,QAAI,YAAY;AACd,mBAAa;AACb,0BAAI,EACD,MAAM,EAAE,UAAU,SAAS,KAAK,MAAM,KAAK,IAAI,IAAI,SAAS,EAAE,CAAC,EAC/D,MAAM,0BAA0B;AAAA,IACrC;AACA,UAAM;AAAA,EACR;AACF;AAGA,MAAM,wBAAwB;AAAA;AAAA,EAEnB,cAAc;AAAA,EACd,4BAA4B;AAAA,EAC5B,4BAA4B;AAAA;AAAA,EAErC;AAAA,EACA;AAAA,EACA,oBAAoB,IAAI,oBAAO;AAAA,EAC/B,uBAAuB;AAAA,EACvB,2BAA2B;AAAA,EAC3B,YAAY;AAAA,EACZ;AAAA,EACA;AAAA,EAEA,YAAY,cAAmC,qBAA6B;AAC1E,SAAK,gBAAgB;AACrB,SAAK,oBAAoB;AACzB,SAAK,wBAAwB;AAAA,EAC/B;AAAA,EAEA,IAAI,aAAsB;AACxB,WAAO,CAAC,KAAK,kBAAkB;AAAA,EACjC;AAAA,EAEA,uBAAuB,YAAoB;AACzC,SAAK,uBAAuB,WAAW,KAAK;AAC5C,QAAI,KAAK,UAAW;AAEpB,UAAM,uBACJ,KAAK,IAAI,IAAI,KAAK,2BAA2B,KAAK;AACpD,QAAI,QAAQ,uBAAuB,KAAK,oBAAoB,KAAK;AACjE,YAAQ,KAAK,oBAAoB,IAAI,QAAQ,KAAK,4BAA4B;AAE9E,SAAK,KAAK,KAAK;AAAA,EACjB;AAAA;AAAA,EAGA,qBAAqB,GAAa;AAChC,SAAK,YAAY;AAAA,EAKnB;AAAA;AAAA,EAGA,mBAAmB,GAAa;AAC9B,SAAK,YAAY;AACjB,SAAK,2BAA2B,KAAK,IAAI;AAEzC,QAAI,KAAK,sBAAsB;AAC7B,YAAM,QAAQ,KAAK,oBAAoB,IACnC,KAAK,oBAAoB,KAAK,4BAC9B;AACJ,WAAK,KAAK,KAAK;AAAA,IACjB;AAAA,EACF;AAAA;AAAA,EAIA,sBAA+B;AAC7B,WACE,KAAK,qBAAqB,SAAS,KACnC,KAAK,YAAY,SAAS,KAAK,qBAAqB,KAAK,qBAAqB,SAAS,CAAC,CAAE;AAAA,EAE9F;AAAA,EAEA,eAAe;AACb,SAAK,uBAAuB;AAC5B,SAAK,2BAA2B;AAAA,EAClC;AAAA,EAEA,KAAK,OAAe;AAClB,UAAM,UAAU,OAAOH,WAAkB;AACvC,YAAM,IAAI,QAAQ,CAAC,YAAY,WAAW,SAASA,MAAK,CAAC;AACzD,WAAK,aAAa;AAClB,YAAM,KAAK,cAAc;AAAA,IAC3B;AAEA,SAAK,oBAAoB,IAAI,oBAAO;AACpC,SAAK,qBAAqB,QAAQ,KAAK;AAAA,EACzC;AACF;","names":["import_llm","VPAEvent","BasicSentenceTokenizer","BasicWordTokenizer","EventEmitter","STTStreamAdapter","TTSStreamAdapter","participant","delay","resolve","isUsingTools","playHandle"]}
|
|
1
|
+
{"version":3,"sources":["../../src/pipeline/pipeline_agent.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { LocalTrackPublication, RemoteParticipant, Room } from '@livekit/rtc-node';\nimport {\n AudioSource,\n LocalAudioTrack,\n RoomEvent,\n TrackPublishOptions,\n TrackSource,\n} from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport EventEmitter from 'node:events';\nimport type {\n CallableFunctionResult,\n FunctionCallInfo,\n FunctionContext,\n LLM,\n} from '../llm/index.js';\nimport { LLMEvent, LLMStream } from '../llm/index.js';\nimport { ChatContext, ChatMessage, ChatRole } from '../llm/index.js';\nimport { log } from '../log.js';\nimport type { AgentMetrics, PipelineEOUMetrics } from '../metrics/base.js';\nimport { type STT, StreamAdapter as STTStreamAdapter, SpeechEventType } from '../stt/index.js';\nimport {\n SentenceTokenizer as BasicSentenceTokenizer,\n WordTokenizer as BasicWordTokenizer,\n hyphenateWord,\n} from '../tokenize/basic/index.js';\nimport type { SentenceTokenizer, WordTokenizer } from '../tokenize/tokenizer.js';\nimport type { TTS } from '../tts/index.js';\nimport { TTSEvent, StreamAdapter as TTSStreamAdapter } from '../tts/index.js';\nimport { AsyncIterableQueue, CancellablePromise, Future, gracefullyCancel } from '../utils.js';\nimport { type VAD, type VADEvent, VADEventType } from '../vad.js';\nimport type { SpeechSource, SynthesisHandle } from './agent_output.js';\nimport { AgentOutput } from './agent_output.js';\nimport { AgentPlayout, AgentPlayoutEvent } from './agent_playout.js';\nimport { HumanInput, HumanInputEvent } from './human_input.js';\nimport { SpeechHandle } from './speech_handle.js';\n\nexport type AgentState = 'initializing' | 'thinking' | 'listening' | 'speaking';\nexport const AGENT_STATE_ATTRIBUTE = 'lk.agent.state';\nlet speechData: { sequenceId: string } | undefined;\n\nexport type BeforeLLMCallback = (\n agent: VoicePipelineAgent,\n chatCtx: ChatContext,\n) => LLMStream | false | void | Promise<LLMStream | false | void>;\n\nexport type BeforeTTSCallback = (\n agent: VoicePipelineAgent,\n source: string | AsyncIterable<string>,\n) => SpeechSource;\n\nexport enum VPAEvent {\n USER_STARTED_SPEAKING,\n USER_STOPPED_SPEAKING,\n AGENT_STARTED_SPEAKING,\n AGENT_STOPPED_SPEAKING,\n USER_SPEECH_COMMITTED,\n AGENT_SPEECH_COMMITTED,\n AGENT_SPEECH_INTERRUPTED,\n FUNCTION_CALLS_COLLECTED,\n FUNCTION_CALLS_FINISHED,\n METRICS_COLLECTED,\n}\n\nexport type VPACallbacks = {\n [VPAEvent.USER_STARTED_SPEAKING]: () => void;\n [VPAEvent.USER_STOPPED_SPEAKING]: () => void;\n [VPAEvent.AGENT_STARTED_SPEAKING]: () => void;\n [VPAEvent.AGENT_STOPPED_SPEAKING]: () => void;\n [VPAEvent.USER_SPEECH_COMMITTED]: (msg: ChatMessage) => void;\n [VPAEvent.AGENT_SPEECH_COMMITTED]: (msg: ChatMessage) => void;\n [VPAEvent.AGENT_SPEECH_INTERRUPTED]: (msg: ChatMessage) => void;\n [VPAEvent.FUNCTION_CALLS_COLLECTED]: (funcs: FunctionCallInfo[]) => void;\n [VPAEvent.FUNCTION_CALLS_FINISHED]: (funcs: CallableFunctionResult[]) => void;\n [VPAEvent.METRICS_COLLECTED]: (metrics: AgentMetrics) => void;\n};\n\nexport class AgentCallContext {\n #agent: VoicePipelineAgent;\n #llmStream: LLMStream;\n #metadata = new Map<string, any>();\n #extraChatMessages: ChatMessage[] = [];\n static #current: AgentCallContext;\n\n constructor(agent: VoicePipelineAgent, llmStream: LLMStream) {\n this.#agent = agent;\n this.#llmStream = llmStream;\n AgentCallContext.#current = this;\n }\n\n static getCurrent(): AgentCallContext {\n return AgentCallContext.#current;\n }\n\n get agent(): VoicePipelineAgent {\n return this.#agent;\n }\n\n storeMetadata(key: string, value: any) {\n this.#metadata.set(key, value);\n }\n\n getMetadata(key: string, orDefault: any = undefined) {\n return this.#metadata.get(key) || orDefault;\n }\n\n get llmStream(): LLMStream {\n return this.#llmStream;\n }\n\n get extraChatMessages() {\n return this.#extraChatMessages;\n }\n\n addExtraChatMessage(message: ChatMessage) {\n this.#extraChatMessages.push(message);\n }\n}\n\nconst defaultBeforeLLMCallback: BeforeLLMCallback = (\n agent: VoicePipelineAgent,\n chatCtx: ChatContext,\n): LLMStream => {\n return agent.llm.chat({ chatCtx, fncCtx: agent.fncCtx });\n};\n\nconst defaultBeforeTTSCallback: BeforeTTSCallback = (\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n _: VoicePipelineAgent,\n text: string | AsyncIterable<string>,\n): string | AsyncIterable<string> => {\n return text;\n};\n\nexport interface AgentTranscriptionOptions {\n /** Whether to forward the user transcription to the client */\n userTranscription: boolean;\n /** Whether to forward the agent transcription to the client */\n agentTranscription: boolean;\n /**\n * The speed at which the agent's speech transcription is forwarded to the client.\n * We try to mimic the agent's speech speed by adjusting the transcription speed.\n */\n agentTranscriptionSpeech: number;\n /**\n * The tokenizer used to split the speech into sentences.\n * This is used to decide when to mark a transcript as final for the agent transcription.\n */\n sentenceTokenizer: SentenceTokenizer;\n /**\n * The tokenizer used to split the speech into words.\n * This is used to simulate the \"interim results\" of the agent transcription.\n */\n wordTokenizer: WordTokenizer;\n /**\n * A function that takes a string (word) as input and returns a list of strings,\n * representing the hyphenated parts of the word.\n */\n hyphenateWord: (word: string) => string[];\n}\n\nconst defaultAgentTranscriptionOptions: AgentTranscriptionOptions = {\n userTranscription: true,\n agentTranscription: true,\n agentTranscriptionSpeech: 1,\n sentenceTokenizer: new BasicSentenceTokenizer(),\n wordTokenizer: new BasicWordTokenizer(false),\n hyphenateWord: hyphenateWord,\n};\n\nexport interface VPAOptions {\n /** Chat context for the assistant. */\n chatCtx?: ChatContext;\n /** Function context for the assistant. */\n fncCtx?: FunctionContext;\n /** Whether to allow the user to interrupt the assistant. */\n allowInterruptions: boolean;\n /** Minimum duration of speech to consider for interruption. */\n interruptSpeechDuration: number;\n /** Minimum number of words to consider for interuption. This may increase latency. */\n interruptMinWords: number;\n /** Delay to wait before considering the user speech done. */\n minEndpointingDelay: number;\n maxNestedFncCalls: number;\n /* Whether to preemptively synthesize responses. */\n preemptiveSynthesis: boolean;\n /*\n * Callback called when the assistant is about to synthesize a reply.\n *\n * @remarks\n * Returning void will create a default LLM stream.\n * You can also return your own LLM stream by calling `llm.chat()`.\n * Returning `false` ill cancel the synthesis of the reply.\n */\n beforeLLMCallback: BeforeLLMCallback;\n /*\n * Callback called when the assistant is about to synthesize speech.\n *\n * @remarks\n * This can be used to customize text before synthesis\n * (e.g. editing the pronunciation of a word).\n */\n beforeTTSCallback: BeforeTTSCallback;\n /** Options for assistant transcription. */\n transcription: AgentTranscriptionOptions;\n}\n\nconst defaultVPAOptions: VPAOptions = {\n chatCtx: new ChatContext(),\n allowInterruptions: true,\n interruptSpeechDuration: 50,\n interruptMinWords: 0,\n minEndpointingDelay: 500,\n maxNestedFncCalls: 1,\n preemptiveSynthesis: false,\n beforeLLMCallback: defaultBeforeLLMCallback,\n beforeTTSCallback: defaultBeforeTTSCallback,\n transcription: defaultAgentTranscriptionOptions,\n};\n\n/** A pipeline agent (VAD + STT + LLM + TTS) implementation. */\nexport class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<VPACallbacks>) {\n /** Minimum time played for the user speech to be committed to the chat context. */\n readonly MIN_TIME_PLAYED_FOR_COMMIT = 1.5;\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n\n #vad: VAD;\n #stt: STT;\n #llm: LLM;\n #tts: TTS;\n #opts: VPAOptions;\n #humanInput?: HumanInput;\n #agentOutput?: AgentOutput;\n #trackPublishedFut = new Future();\n #pendingAgentReply?: SpeechHandle;\n #agentReplyTask?: CancellablePromise<void>;\n #playingSpeech?: SpeechHandle;\n #transcribedText = '';\n #transcribedInterimText = '';\n #speechQueueOpen = new Future();\n #speechQueue = new AsyncIterableQueue<SpeechHandle | typeof VoicePipelineAgent.FLUSH_SENTINEL>();\n #updateStateTask?: CancellablePromise<void>;\n #started = false;\n #room?: Room;\n #participant: RemoteParticipant | string | null = null;\n #deferredValidation: DeferredReplyValidation;\n #logger = log();\n #agentPublication?: LocalTrackPublication;\n #lastFinalTranscriptTime?: number;\n #lastSpeechTime?: number;\n\n constructor(\n /** Voice Activity Detection instance. */\n vad: VAD,\n /** Speech-to-Text instance. */\n stt: STT,\n /** Large Language Model instance. */\n llm: LLM,\n /** Text-to-Speech instance. */\n tts: TTS,\n /** Additional VoicePipelineAgent options. */\n opts: Partial<VPAOptions> = defaultVPAOptions,\n ) {\n super();\n\n this.#opts = { ...defaultVPAOptions, ...opts };\n\n if (!stt.capabilities.streaming) {\n stt = new STTStreamAdapter(stt, vad);\n }\n\n if (!tts.capabilities.streaming) {\n tts = new TTSStreamAdapter(tts, new BasicSentenceTokenizer());\n }\n\n this.#vad = vad;\n this.#stt = stt;\n this.#llm = llm;\n this.#tts = tts;\n\n this.#deferredValidation = new DeferredReplyValidation(\n this.#validateReplyIfPossible.bind(this),\n this.#opts.minEndpointingDelay,\n );\n }\n\n get fncCtx(): FunctionContext | undefined {\n return this.#opts.fncCtx;\n }\n\n set fncCtx(ctx: FunctionContext) {\n this.#opts.fncCtx = ctx;\n }\n\n get chatCtx(): ChatContext {\n return this.#opts.chatCtx!;\n }\n\n get llm(): LLM {\n return this.#llm;\n }\n\n get tts(): TTS {\n return this.#tts;\n }\n\n get stt(): STT {\n return this.#stt;\n }\n\n get vad(): VAD {\n return this.#vad;\n }\n\n /** Start the voice assistant. */\n start(\n /** The room to connect to. */\n room: Room,\n /**\n * The participant to listen to.\n *\n * @remarks\n * Can be a participant or an identity.\n * If omitted, the first participant in the room will be selected.\n */\n participant: RemoteParticipant | string | null = null,\n ) {\n if (this.#started) {\n throw new Error('voice assistant already started');\n }\n\n this.#stt.on(SpeechEventType.METRICS_COLLECTED, (metrics) => {\n this.emit(VPAEvent.METRICS_COLLECTED, metrics);\n });\n\n this.#tts.on(TTSEvent.METRICS_COLLECTED, (metrics) => {\n if (!speechData) return;\n this.emit(VPAEvent.METRICS_COLLECTED, { ...metrics, sequenceId: speechData.sequenceId });\n });\n\n this.#llm.on(LLMEvent.METRICS_COLLECTED, (metrics) => {\n if (!speechData) return;\n this.emit(VPAEvent.METRICS_COLLECTED, { ...metrics, sequenceId: speechData.sequenceId });\n });\n\n this.#vad.on(VADEventType.METRICS_COLLECTED, (metrics) => {\n this.emit(VPAEvent.METRICS_COLLECTED, metrics);\n });\n\n room.on(RoomEvent.ParticipantConnected, (participant: RemoteParticipant) => {\n // automatically link to the first participant that connects, if not already linked\n if (this.#participant) {\n return;\n }\n this.#linkParticipant.call(this, participant.identity!);\n });\n\n this.#room = room;\n this.#participant = participant;\n\n if (participant) {\n if (typeof participant === 'string') {\n this.#linkParticipant(participant);\n } else {\n this.#linkParticipant(participant.identity!);\n }\n }\n\n this.#run();\n }\n\n /** Play a speech source through the voice assistant. */\n async say(\n source: string | LLMStream | AsyncIterable<string>,\n allowInterruptions = true,\n addToChatCtx = true,\n ): Promise<SpeechHandle> {\n await this.#trackPublishedFut.await;\n\n let callContext: AgentCallContext | undefined;\n let fncSource: string | AsyncIterable<string> | undefined;\n if (addToChatCtx) {\n callContext = AgentCallContext.getCurrent();\n if (source instanceof LLMStream) {\n this.#logger.warn('LLMStream will be ignored for function call chat context');\n } else if (typeof source === 'string') {\n fncSource = source;\n } else {\n fncSource = source;\n source = new AsyncIterableQueue<string>();\n }\n }\n\n const newHandle = SpeechHandle.createAssistantSpeech(allowInterruptions, addToChatCtx);\n const synthesisHandle = this.#synthesizeAgentSpeech(newHandle.id, source);\n newHandle.initialize(source, synthesisHandle);\n\n if (this.#playingSpeech && !this.#playingSpeech.nestedSpeechFinished) {\n this.#playingSpeech.addNestedSpeech(newHandle);\n } else {\n this.#addSpeechForPlayout(newHandle);\n }\n\n if (callContext && fncSource) {\n let text: string;\n if (typeof source === 'string') {\n text = fncSource as string;\n } else {\n text = '';\n for await (const chunk of fncSource) {\n (source as AsyncIterableQueue<string>).put(chunk);\n text += chunk;\n }\n (source as AsyncIterableQueue<string>).close();\n }\n\n callContext.addExtraChatMessage(ChatMessage.create({ text, role: ChatRole.ASSISTANT }));\n this.#logger.child({ text }).debug('added speech to function call chat context');\n }\n\n return newHandle;\n }\n\n #updateState(state: AgentState, delay = 0) {\n const runTask = (delay: number): CancellablePromise<void> => {\n return new CancellablePromise(async (resolve, _, onCancel) => {\n let cancelled = false;\n onCancel(() => {\n cancelled = true;\n });\n await new Promise((resolve) => setTimeout(resolve, delay));\n if (this.#room?.isConnected) {\n if (!cancelled) {\n await this.#room.localParticipant?.setAttributes({ [AGENT_STATE_ATTRIBUTE]: state });\n }\n }\n resolve();\n });\n };\n\n if (this.#updateStateTask) {\n this.#updateStateTask.cancel();\n }\n\n this.#updateStateTask = runTask(delay);\n }\n\n #linkParticipant(participantIdentity: string): void {\n if (!this.#room) {\n this.#logger.error('Room is not set');\n return;\n }\n\n this.#participant = this.#room.remoteParticipants.get(participantIdentity) || null;\n if (!this.#participant) {\n this.#logger.error(`Participant with identity ${participantIdentity} not found`);\n return;\n }\n\n this.#humanInput = new HumanInput(this.#room, this.#vad, this.#stt, this.#participant);\n this.#humanInput.on(HumanInputEvent.START_OF_SPEECH, (event) => {\n this.emit(VPAEvent.USER_STARTED_SPEAKING);\n this.#deferredValidation.onHumanStartOfSpeech(event);\n });\n this.#humanInput.on(HumanInputEvent.VAD_INFERENCE_DONE, (event) => {\n if (!this.#trackPublishedFut.done) {\n return;\n }\n if (!this.#agentOutput) {\n throw new Error('agent output is undefined');\n }\n\n let tv = 1;\n if (this.#opts.allowInterruptions) {\n tv = Math.max(0, 1 - event.probability);\n this.#agentOutput.playout.targetVolume = tv;\n }\n\n if (event.speechDuration >= this.#opts.interruptSpeechDuration) {\n this.#interruptIfPossible();\n }\n\n if (event.rawAccumulatedSpeech > 0) {\n this.#lastSpeechTime = Date.now() - event.rawAccumulatedSilence;\n }\n });\n this.#humanInput.on(HumanInputEvent.END_OF_SPEECH, (event) => {\n this.emit(VPAEvent.USER_STARTED_SPEAKING);\n this.#deferredValidation.onHumanEndOfSpeech(event);\n });\n this.#humanInput.on(HumanInputEvent.INTERIM_TRANSCRIPT, (event) => {\n this.#transcribedInterimText = event.alternatives![0].text;\n });\n this.#humanInput.on(HumanInputEvent.FINAL_TRANSCRIPT, (event) => {\n const newTranscript = event.alternatives![0].text;\n if (!newTranscript) return;\n\n this.#lastFinalTranscriptTime = Date.now();\n this.#transcribedText += (this.#transcribedText ? ' ' : '') + newTranscript;\n\n if (\n this.#opts.preemptiveSynthesis &&\n (!this.#playingSpeech || this.#playingSpeech.allowInterruptions)\n ) {\n this.#synthesizeAgentReply();\n }\n\n this.#deferredValidation.onHumanFinalTranscript(newTranscript);\n\n const words = this.#opts.transcription.wordTokenizer.tokenize(newTranscript);\n if (words.length >= 3) {\n // VAD can sometimes not detect that the human is speaking.\n // to make the interruption more reliable, we also interrupt on the final transcript.\n this.#interruptIfPossible();\n }\n });\n }\n\n async #run() {\n this.#updateState('initializing');\n const audioSource = new AudioSource(this.#tts.sampleRate, this.#tts.numChannels);\n const track = LocalAudioTrack.createAudioTrack('assistant_voice', audioSource);\n this.#agentPublication = await this.#room?.localParticipant?.publishTrack(\n track,\n new TrackPublishOptions({ source: TrackSource.SOURCE_MICROPHONE }),\n );\n\n const agentPlayout = new AgentPlayout(audioSource);\n this.#agentOutput = new AgentOutput(agentPlayout, this.#tts);\n\n agentPlayout.on(AgentPlayoutEvent.PLAYOUT_STARTED, () => {\n this.emit(VPAEvent.AGENT_STARTED_SPEAKING);\n this.#updateState('speaking');\n });\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n agentPlayout.on(AgentPlayoutEvent.PLAYOUT_STOPPED, (_) => {\n this.emit(VPAEvent.AGENT_STOPPED_SPEAKING);\n this.#updateState('listening');\n });\n\n this.#trackPublishedFut.resolve();\n\n while (true) {\n await this.#speechQueueOpen.await;\n for await (const speech of this.#speechQueue) {\n if (speech === VoicePipelineAgent.FLUSH_SENTINEL) break;\n this.#playingSpeech = speech;\n await this.#playSpeech(speech);\n this.#playingSpeech = undefined;\n }\n this.#speechQueueOpen = new Future();\n }\n }\n\n #synthesizeAgentReply() {\n this.#pendingAgentReply?.cancel();\n if (this.#humanInput && this.#humanInput.speaking) {\n this.#updateState('thinking', 200);\n }\n\n this.#pendingAgentReply = SpeechHandle.createAssistantReply(\n this.#opts.allowInterruptions,\n true,\n this.#transcribedText,\n );\n const newHandle = this.#pendingAgentReply;\n this.#agentReplyTask = this.#synthesizeAnswerTask(this.#agentReplyTask, newHandle);\n }\n\n #synthesizeAnswerTask(\n oldTask: CancellablePromise<void> | undefined,\n handle?: SpeechHandle,\n ): CancellablePromise<void> {\n return new CancellablePromise(async (resolve, _, onCancel) => {\n let cancelled = false;\n onCancel(() => {\n cancelled = true;\n });\n\n if (oldTask) {\n await gracefullyCancel(oldTask);\n }\n\n const copiedCtx = this.chatCtx.copy();\n const playingSpeech = this.#playingSpeech;\n if (playingSpeech && playingSpeech.initialized) {\n if (\n (!playingSpeech.userQuestion || playingSpeech.userCommitted) &&\n !playingSpeech.speechCommitted\n ) {\n // the speech is playing but not committed yet,\n // add it to the chat context for this new reply synthesis\n copiedCtx.messages.push(\n ChatMessage.create({\n text: playingSpeech.synthesisHandle.text,\n role: ChatRole.ASSISTANT,\n }),\n );\n }\n }\n\n copiedCtx.messages.push(\n ChatMessage.create({\n text: handle?.userQuestion,\n role: ChatRole.USER,\n }),\n );\n\n speechData = { sequenceId: handle!.id };\n\n try {\n if (cancelled) resolve();\n let llmStream = await this.#opts.beforeLLMCallback(this, copiedCtx);\n if (llmStream === false) {\n handle?.cancel();\n return;\n }\n\n if (cancelled) resolve();\n // fallback to default impl if no custom/user stream is returned\n if (!(llmStream instanceof LLMStream)) {\n llmStream = (await defaultBeforeLLMCallback(this, copiedCtx)) as LLMStream;\n }\n\n if (handle!.interrupted) {\n return;\n }\n\n const synthesisHandle = this.#synthesizeAgentSpeech(handle!.id, llmStream);\n handle!.initialize(llmStream, synthesisHandle);\n } finally {\n speechData = undefined;\n }\n resolve();\n });\n }\n\n async #playSpeech(handle: SpeechHandle) {\n try {\n await handle.waitForInitialization();\n } catch {\n return;\n }\n await this.#agentPublication!.waitForSubscription();\n const synthesisHandle = handle.synthesisHandle;\n if (synthesisHandle.interrupted) return;\n\n const userQuestion = handle.userQuestion;\n const playHandle = synthesisHandle.play();\n const joinFut = playHandle.join();\n\n const commitUserQuestionIfNeeded = () => {\n if (!userQuestion || synthesisHandle.interrupted || handle.userCommitted) return;\n const isUsingTools =\n handle.source instanceof LLMStream && !!handle.source.functionCalls.length;\n\n // make sure at least some speech was played before committing the user message\n // since we try to validate as fast as possible it is possible the agent gets interrupted\n // really quickly (barely audible), we don't want to mark this question as \"answered\".\n if (\n handle.allowInterruptions &&\n !isUsingTools &&\n playHandle.timePlayed < this.MIN_TIME_PLAYED_FOR_COMMIT &&\n !joinFut.done\n ) {\n return;\n }\n\n this.#logger.child({ userTranscript: userQuestion }).debug('committed user transcript');\n const userMsg = ChatMessage.create({ text: userQuestion, role: ChatRole.USER });\n this.chatCtx.messages.push(userMsg);\n this.emit(VPAEvent.USER_SPEECH_COMMITTED, userMsg);\n\n this.#transcribedText = this.#transcribedText.slice(userQuestion.length);\n handle.markUserCommitted();\n };\n\n // wait for the playHandle to finish and check every 1s if user question should be committed\n commitUserQuestionIfNeeded();\n\n while (!joinFut.done) {\n await new Promise<void>(async (resolve) => {\n setTimeout(resolve, 500);\n await joinFut.await;\n resolve();\n });\n commitUserQuestionIfNeeded();\n if (handle.interrupted) break;\n }\n commitUserQuestionIfNeeded();\n\n const collectedText = handle.synthesisHandle.text;\n const isUsingTools = handle.source instanceof LLMStream && !!handle.source.functionCalls.length;\n const interrupted = handle.interrupted;\n\n const executeFunctionCalls = async () => {\n // if the answer is using tools, execute the functions and automatically generate\n // a response to the user question from the returned values\n if (!isUsingTools || interrupted) return;\n\n if (handle.fncNestedDepth >= this.#opts.maxNestedFncCalls) {\n this.#logger\n .child({ speechId: handle.id, fncNestedDepth: handle.fncNestedDepth })\n .warn('max function calls nested depth reached');\n return;\n }\n\n if (!userQuestion || !handle.userCommitted) {\n throw new Error('user speech should have been committed before using tools');\n }\n const llmStream = handle.source;\n const newFunctionCalls = llmStream.functionCalls;\n\n new AgentCallContext(this, llmStream);\n\n this.emit(VPAEvent.FUNCTION_CALLS_COLLECTED, newFunctionCalls);\n const calledFuncs: FunctionCallInfo[] = [];\n for (const func of newFunctionCalls) {\n const task = func.func.execute(func.params).then(\n (result) => ({ name: func.name, toolCallId: func.toolCallId, result }),\n (error) => ({ name: func.name, toolCallId: func.toolCallId, error }),\n );\n calledFuncs.push({ ...func, task });\n this.#logger\n .child({ function: func.name, speechId: handle.id })\n .debug('executing AI function');\n try {\n await task;\n } catch {\n this.#logger\n .child({ function: func.name, speechId: handle.id })\n .error('error executing AI function');\n }\n }\n\n const toolCallsInfo = [];\n const toolCallsResults = [];\n for (const fnc of calledFuncs) {\n // ignore the function calls that return void\n const task = await fnc.task;\n if (!task || task.result === undefined) continue;\n toolCallsInfo.push(fnc);\n toolCallsResults.push(ChatMessage.createToolFromFunctionResult(task));\n }\n\n if (!toolCallsInfo.length) return;\n\n // generate an answer from the tool calls\n const extraToolsMessages = [ChatMessage.createToolCalls(toolCallsInfo, collectedText)];\n extraToolsMessages.push(...toolCallsResults);\n\n // create a nested speech handle\n const newSpeechHandle = SpeechHandle.createToolSpeech(\n handle.allowInterruptions,\n handle.addToChatCtx,\n handle.fncNestedDepth + 1,\n extraToolsMessages,\n );\n\n // synthesize the tool speech with the chat ctx from llmStream\n const chatCtx = handle.source.chatCtx.copy();\n chatCtx.messages.push(...extraToolsMessages);\n chatCtx.messages.push(...AgentCallContext.getCurrent().extraChatMessages);\n\n const answerLLMStream = this.llm.chat({\n chatCtx,\n fncCtx: this.fncCtx,\n });\n const answerSynthesis = this.#synthesizeAgentSpeech(newSpeechHandle.id, answerLLMStream);\n newSpeechHandle.initialize(answerLLMStream, answerSynthesis);\n handle.addNestedSpeech(newSpeechHandle);\n\n this.emit(VPAEvent.FUNCTION_CALLS_FINISHED, calledFuncs);\n };\n\n const task = executeFunctionCalls().then(() => {\n handle.markNestedSpeechFinished();\n });\n while (!handle.nestedSpeechFinished) {\n const changed = handle.nestedSpeechChanged();\n await Promise.race([changed, task]);\n while (handle.nestedSpeechHandles.length) {\n const speech = handle.nestedSpeechHandles[0]!;\n this.#playingSpeech = speech;\n await this.#playSpeech(speech);\n handle.nestedSpeechHandles.shift();\n this.#playingSpeech = handle;\n }\n }\n\n if (handle.addToChatCtx && (!userQuestion || handle.userCommitted)) {\n if (handle.extraToolsMessages) {\n this.chatCtx.messages.push(...handle.extraToolsMessages);\n }\n if (interrupted) {\n collectedText + '…';\n }\n\n const msg = ChatMessage.create({ text: collectedText, role: ChatRole.ASSISTANT });\n this.chatCtx.messages.push(msg);\n\n handle.markSpeechCommitted();\n if (interrupted) {\n this.emit(VPAEvent.AGENT_SPEECH_INTERRUPTED, msg);\n } else {\n this.emit(VPAEvent.AGENT_SPEECH_COMMITTED, msg);\n }\n\n this.#logger\n .child({\n agentTranscript: collectedText,\n interrupted,\n speechId: handle.id,\n })\n .debug('committed agent speech');\n\n handle.setDone();\n }\n }\n\n #synthesizeAgentSpeech(\n speechId: string,\n source: string | LLMStream | AsyncIterable<string>,\n ): SynthesisHandle {\n if (!this.#agentOutput) {\n throw new Error('agent output should be initialized when ready');\n }\n\n if (source instanceof LLMStream) {\n source = llmStreamToStringIterable(speechId, source);\n }\n\n const ogSource = source;\n if (!(typeof source === 'string')) {\n // TODO(nbsp): itertools.tee\n }\n\n const ttsSource = this.#opts.beforeTTSCallback(this, ogSource);\n if (!ttsSource) {\n throw new Error('beforeTTSCallback must return string or AsyncIterable<string>');\n }\n\n return this.#agentOutput.synthesize(speechId, ttsSource);\n }\n\n async #validateReplyIfPossible() {\n if (this.#playingSpeech && !this.#playingSpeech.allowInterruptions) {\n this.#logger\n .child({ speechId: this.#playingSpeech.id })\n .debug('skipping validation, agent is speaking and does not allow interruptions');\n return;\n }\n\n if (!this.#pendingAgentReply) {\n if (this.#opts.preemptiveSynthesis || !this.#transcribedText) {\n return;\n }\n this.#synthesizeAgentReply();\n }\n\n if (!this.#pendingAgentReply) {\n throw new Error('pending agent reply is undefined');\n }\n\n // in some bad timimg, we could end up with two pushed agent replies inside the speech queue.\n // so make sure we directly interrupt every reply when validating a new one\n if (this.#speechQueueOpen.done) {\n for await (const speech of this.#speechQueue) {\n if (speech === VoicePipelineAgent.FLUSH_SENTINEL) break;\n if (!speech.isReply) continue;\n if (speech.allowInterruptions) speech.interrupt();\n }\n }\n\n this.#logger.child({ speechId: this.#pendingAgentReply.id }).debug('validated agent reply');\n\n if (this.#lastSpeechTime) {\n const timeSinceLastSpeech = Date.now() - this.#lastSpeechTime;\n const transcriptionDelay = Math.max(\n (this.#lastFinalTranscriptTime || 0) - this.#lastSpeechTime,\n 0,\n );\n const metrics: PipelineEOUMetrics = {\n timestamp: Date.now(),\n sequenceId: this.#pendingAgentReply.id,\n endOfUtteranceDelay: timeSinceLastSpeech,\n transcriptionDelay,\n };\n this.emit(VPAEvent.METRICS_COLLECTED, metrics);\n }\n\n this.#addSpeechForPlayout(this.#pendingAgentReply);\n this.#pendingAgentReply = undefined;\n this.#transcribedInterimText = '';\n }\n\n #interruptIfPossible() {\n if (\n !this.#playingSpeech ||\n !this.#playingSpeech.allowInterruptions ||\n this.#playingSpeech.interrupted\n ) {\n return;\n }\n\n if (this.#opts.interruptMinWords !== 0) {\n // check the final/interim transcribed text for the minimum word count\n // to interrupt the agent speech\n const interimWords = this.#opts.transcription.wordTokenizer.tokenize(\n this.#transcribedInterimText,\n );\n if (interimWords.length < this.#opts.interruptMinWords) {\n return;\n }\n }\n this.#playingSpeech.interrupt();\n }\n\n #addSpeechForPlayout(handle: SpeechHandle) {\n this.#speechQueue.put(handle);\n this.#speechQueue.put(VoicePipelineAgent.FLUSH_SENTINEL);\n this.#speechQueueOpen.resolve();\n }\n\n /** Close the voice assistant. */\n async close() {\n if (!this.#started) {\n return;\n }\n\n this.#room?.removeAllListeners(RoomEvent.ParticipantConnected);\n // TODO(nbsp): await this.#deferredValidation.close()\n }\n}\n\nasync function* llmStreamToStringIterable(\n speechId: string,\n stream: LLMStream,\n): AsyncIterable<string> {\n const startTime = Date.now();\n let firstFrame = true;\n for await (const chunk of stream) {\n const content = chunk.choices[0]?.delta.content;\n if (!content) continue;\n\n if (firstFrame) {\n firstFrame = false;\n log()\n .child({ speechId, elapsed: Math.round(Date.now() - startTime) })\n .debug('received first LLM token');\n }\n yield content;\n }\n}\n\n/** This class is used to try to find the best time to validate the agent reply. */\nclass DeferredReplyValidation {\n // if the STT gives us punctuation, we can try to validate the reply faster.\n readonly PUNCTUATION = '.!?';\n readonly PUNCTUATION_REDUCE_FACTOR = 0.75;\n readonly LATE_TRANSCRIPT_TOLERANCE = 1.5; // late compared to end of speech\n\n #validateFunc: () => Promise<void>;\n #validatingPromise?: Promise<void>;\n #validatingFuture = new Future();\n #lastFinalTranscript = '';\n #lastRecvEndOfSpeechTime = 0;\n #speaking = false;\n #endOfSpeechDelay: number;\n #finalTranscriptDelay: number;\n\n constructor(validateFunc: () => Promise<void>, minEndpointingDelay: number) {\n this.#validateFunc = validateFunc;\n this.#endOfSpeechDelay = minEndpointingDelay;\n this.#finalTranscriptDelay = minEndpointingDelay;\n }\n\n get validating(): boolean {\n return !this.#validatingFuture.done;\n }\n\n onHumanFinalTranscript(transcript: string) {\n this.#lastFinalTranscript = transcript.trim();\n if (this.#speaking) return;\n\n const hasRecentEndOfSpeech =\n Date.now() - this.#lastRecvEndOfSpeechTime < this.LATE_TRANSCRIPT_TOLERANCE;\n let delay = hasRecentEndOfSpeech ? this.#endOfSpeechDelay : this.#finalTranscriptDelay;\n delay = this.#endWithPunctuation() ? delay * this.PUNCTUATION_REDUCE_FACTOR : 1;\n\n this.#run(delay);\n }\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n onHumanStartOfSpeech(_: VADEvent) {\n this.#speaking = true;\n // TODO(nbsp):\n // if (this.validating) {\n // this.#validatingPromise.cancel()\n // }\n }\n\n // eslint-disable-next-line @typescript-eslint/no-unused-vars\n onHumanEndOfSpeech(_: VADEvent) {\n this.#speaking = false;\n this.#lastRecvEndOfSpeechTime = Date.now();\n\n if (this.#lastFinalTranscript) {\n const delay = this.#endWithPunctuation()\n ? this.#endOfSpeechDelay * this.PUNCTUATION_REDUCE_FACTOR\n : 1;\n this.#run(delay);\n }\n }\n\n // TODO(nbsp): aclose\n\n #endWithPunctuation(): boolean {\n return (\n this.#lastFinalTranscript.length > 0 &&\n this.PUNCTUATION.includes(this.#lastFinalTranscript[this.#lastFinalTranscript.length - 1]!)\n );\n }\n\n #resetStates() {\n this.#lastFinalTranscript = '';\n this.#lastRecvEndOfSpeechTime = 0;\n }\n\n #run(delay: number) {\n const runTask = async (delay: number) => {\n await new Promise((resolve) => setTimeout(resolve, delay));\n this.#resetStates();\n await this.#validateFunc();\n };\n\n this.#validatingFuture = new Future();\n this.#validatingPromise = runTask(delay);\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAIA,sBAMO;AAEP,yBAAyB;AAOzB,iBAAoC;AACpC,IAAAA,cAAmD;AACnD,iBAAoB;AAEpB,iBAA6E;AAC7E,mBAIO;AAGP,iBAA4D;AAC5D,mBAAiF;AACjF,iBAAsD;AAEtD,0BAA4B;AAC5B,2BAAgD;AAChD,yBAA4C;AAC5C,2BAA6B;AAGtB,MAAM,wBAAwB;AACrC,IAAI;AAYG,IAAK,WAAL,kBAAKC,cAAL;AACL,EAAAA,oBAAA;AACA,EAAAA,oBAAA;AACA,EAAAA,oBAAA;AACA,EAAAA,oBAAA;AACA,EAAAA,oBAAA;AACA,EAAAA,oBAAA;AACA,EAAAA,oBAAA;AACA,EAAAA,oBAAA;AACA,EAAAA,oBAAA;AACA,EAAAA,oBAAA;AAVU,SAAAA;AAAA,GAAA;AA0BL,MAAM,iBAAiB;AAAA,EAC5B;AAAA,EACA;AAAA,EACA,YAAY,oBAAI,IAAiB;AAAA,EACjC,qBAAoC,CAAC;AAAA,EACrC,OAAO;AAAA,EAEP,YAAY,OAA2B,WAAsB;AAC3D,SAAK,SAAS;AACd,SAAK,aAAa;AAClB,qBAAiB,WAAW;AAAA,EAC9B;AAAA,EAEA,OAAO,aAA+B;AACpC,WAAO,iBAAiB;AAAA,EAC1B;AAAA,EAEA,IAAI,QAA4B;AAC9B,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,cAAc,KAAa,OAAY;AACrC,SAAK,UAAU,IAAI,KAAK,KAAK;AAAA,EAC/B;AAAA,EAEA,YAAY,KAAa,YAAiB,QAAW;AACnD,WAAO,KAAK,UAAU,IAAI,GAAG,KAAK;AAAA,EACpC;AAAA,EAEA,IAAI,YAAuB;AACzB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,oBAAoB;AACtB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,oBAAoB,SAAsB;AACxC,SAAK,mBAAmB,KAAK,OAAO;AAAA,EACtC;AACF;AAEA,MAAM,2BAA8C,CAClD,OACA,YACc;AACd,SAAO,MAAM,IAAI,KAAK,EAAE,SAAS,QAAQ,MAAM,OAAO,CAAC;AACzD;AAEA,MAAM,2BAA8C,CAElD,GACA,SACmC;AACnC,SAAO;AACT;AA6BA,MAAM,mCAA8D;AAAA,EAClE,mBAAmB;AAAA,EACnB,oBAAoB;AAAA,EACpB,0BAA0B;AAAA,EAC1B,mBAAmB,IAAI,aAAAC,kBAAuB;AAAA,EAC9C,eAAe,IAAI,aAAAC,cAAmB,KAAK;AAAA,EAC3C,eAAe;AACjB;AAuCA,MAAM,oBAAgC;AAAA,EACpC,SAAS,IAAI,wBAAY;AAAA,EACzB,oBAAoB;AAAA,EACpB,yBAAyB;AAAA,EACzB,mBAAmB;AAAA,EACnB,qBAAqB;AAAA,EACrB,mBAAmB;AAAA,EACnB,qBAAqB;AAAA,EACrB,mBAAmB;AAAA,EACnB,mBAAmB;AAAA,EACnB,eAAe;AACjB;AAGO,MAAM,2BAA4B,mBAAAC,QAAsD;AAAA;AAAA,EAEpF,6BAA6B;AAAA,EACtC,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EAElE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,qBAAqB,IAAI,oBAAO;AAAA,EAChC;AAAA,EACA;AAAA,EACA;AAAA,EACA,mBAAmB;AAAA,EACnB,0BAA0B;AAAA,EAC1B,mBAAmB,IAAI,oBAAO;AAAA,EAC9B,eAAe,IAAI,gCAA4E;AAAA,EAC/F;AAAA,EACA,WAAW;AAAA,EACX;AAAA,EACA,eAAkD;AAAA,EAClD;AAAA,EACA,cAAU,gBAAI;AAAA,EACd;AAAA,EACA;AAAA,EACA;AAAA,EAEA,YAEE,KAEA,KAEA,KAEA,KAEA,OAA4B,mBAC5B;AACA,UAAM;AAEN,SAAK,QAAQ,EAAE,GAAG,mBAAmB,GAAG,KAAK;AAE7C,QAAI,CAAC,IAAI,aAAa,WAAW;AAC/B,YAAM,IAAI,WAAAC,cAAiB,KAAK,GAAG;AAAA,IACrC;AAEA,QAAI,CAAC,IAAI,aAAa,WAAW;AAC/B,YAAM,IAAI,WAAAC,cAAiB,KAAK,IAAI,aAAAJ,kBAAuB,CAAC;AAAA,IAC9D;AAEA,SAAK,OAAO;AACZ,SAAK,OAAO;AACZ,SAAK,OAAO;AACZ,SAAK,OAAO;AAEZ,SAAK,sBAAsB,IAAI;AAAA,MAC7B,KAAK,yBAAyB,KAAK,IAAI;AAAA,MACvC,KAAK,MAAM;AAAA,IACb;AAAA,EACF;AAAA,EAEA,IAAI,SAAsC;AACxC,WAAO,KAAK,MAAM;AAAA,EACpB;AAAA,EAEA,IAAI,OAAO,KAAsB;AAC/B,SAAK,MAAM,SAAS;AAAA,EACtB;AAAA,EAEA,IAAI,UAAuB;AACzB,WAAO,KAAK,MAAM;AAAA,EACpB;AAAA,EAEA,IAAI,MAAW;AACb,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,MAAW;AACb,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,MAAW;AACb,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,MAAW;AACb,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,MAEE,MAQA,cAAiD,MACjD;AACA,QAAI,KAAK,UAAU;AACjB,YAAM,IAAI,MAAM,iCAAiC;AAAA,IACnD;AAEA,SAAK,KAAK,GAAG,2BAAgB,mBAAmB,CAAC,YAAY;AAC3D,WAAK,KAAK,2BAA4B,OAAO;AAAA,IAC/C,CAAC;AAED,SAAK,KAAK,GAAG,oBAAS,mBAAmB,CAAC,YAAY;AACpD,UAAI,CAAC,WAAY;AACjB,WAAK,KAAK,2BAA4B,EAAE,GAAG,SAAS,YAAY,WAAW,WAAW,CAAC;AAAA,IACzF,CAAC;AAED,SAAK,KAAK,GAAG,oBAAS,mBAAmB,CAAC,YAAY;AACpD,UAAI,CAAC,WAAY;AACjB,WAAK,KAAK,2BAA4B,EAAE,GAAG,SAAS,YAAY,WAAW,WAAW,CAAC;AAAA,IACzF,CAAC;AAED,SAAK,KAAK,GAAG,wBAAa,mBAAmB,CAAC,YAAY;AACxD,WAAK,KAAK,2BAA4B,OAAO;AAAA,IAC/C,CAAC;AAED,SAAK,GAAG,0BAAU,sBAAsB,CAACK,iBAAmC;AAE1E,UAAI,KAAK,cAAc;AACrB;AAAA,MACF;AACA,WAAK,iBAAiB,KAAK,MAAMA,aAAY,QAAS;AAAA,IACxD,CAAC;AAED,SAAK,QAAQ;AACb,SAAK,eAAe;AAEpB,QAAI,aAAa;AACf,UAAI,OAAO,gBAAgB,UAAU;AACnC,aAAK,iBAAiB,WAAW;AAAA,MACnC,OAAO;AACL,aAAK,iBAAiB,YAAY,QAAS;AAAA,MAC7C;AAAA,IACF;AAEA,SAAK,KAAK;AAAA,EACZ;AAAA;AAAA,EAGA,MAAM,IACJ,QACA,qBAAqB,MACrB,eAAe,MACQ;AACvB,UAAM,KAAK,mBAAmB;AAE9B,QAAI;AACJ,QAAI;AACJ,QAAI,cAAc;AAChB,oBAAc,iBAAiB,WAAW;AAC1C,UAAI,kBAAkB,sBAAW;AAC/B,aAAK,QAAQ,KAAK,0DAA0D;AAAA,MAC9E,WAAW,OAAO,WAAW,UAAU;AACrC,oBAAY;AAAA,MACd,OAAO;AACL,oBAAY;AACZ,iBAAS,IAAI,gCAA2B;AAAA,MAC1C;AAAA,IACF;AAEA,UAAM,YAAY,kCAAa,sBAAsB,oBAAoB,YAAY;AACrF,UAAM,kBAAkB,KAAK,uBAAuB,UAAU,IAAI,MAAM;AACxE,cAAU,WAAW,QAAQ,eAAe;AAE5C,QAAI,KAAK,kBAAkB,CAAC,KAAK,eAAe,sBAAsB;AACpE,WAAK,eAAe,gBAAgB,SAAS;AAAA,IAC/C,OAAO;AACL,WAAK,qBAAqB,SAAS;AAAA,IACrC;AAEA,QAAI,eAAe,WAAW;AAC5B,UAAI;AACJ,UAAI,OAAO,WAAW,UAAU;AAC9B,eAAO;AAAA,MACT,OAAO;AACL,eAAO;AACP,yBAAiB,SAAS,WAAW;AACnC,UAAC,OAAsC,IAAI,KAAK;AAChD,kBAAQ;AAAA,QACV;AACA,QAAC,OAAsC,MAAM;AAAA,MAC/C;AAEA,kBAAY,oBAAoB,wBAAY,OAAO,EAAE,MAAM,MAAM,qBAAS,UAAU,CAAC,CAAC;AACtF,WAAK,QAAQ,MAAM,EAAE,KAAK,CAAC,EAAE,MAAM,4CAA4C;AAAA,IACjF;AAEA,WAAO;AAAA,EACT;AAAA,EAEA,aAAa,OAAmB,QAAQ,GAAG;AACzC,UAAM,UAAU,CAACC,WAA4C;AAC3D,aAAO,IAAI,gCAAmB,OAAO,SAAS,GAAG,aAAa;AA5apE;AA6aQ,YAAI,YAAY;AAChB,iBAAS,MAAM;AACb,sBAAY;AAAA,QACd,CAAC;AACD,cAAM,IAAI,QAAQ,CAACC,aAAY,WAAWA,UAASD,MAAK,CAAC;AACzD,aAAI,UAAK,UAAL,mBAAY,aAAa;AAC3B,cAAI,CAAC,WAAW;AACd,oBAAM,UAAK,MAAM,qBAAX,mBAA6B,cAAc,EAAE,CAAC,qBAAqB,GAAG,MAAM;AAAA,UACpF;AAAA,QACF;AACA,gBAAQ;AAAA,MACV,CAAC;AAAA,IACH;AAEA,QAAI,KAAK,kBAAkB;AACzB,WAAK,iBAAiB,OAAO;AAAA,IAC/B;AAEA,SAAK,mBAAmB,QAAQ,KAAK;AAAA,EACvC;AAAA,EAEA,iBAAiB,qBAAmC;AAClD,QAAI,CAAC,KAAK,OAAO;AACf,WAAK,QAAQ,MAAM,iBAAiB;AACpC;AAAA,IACF;AAEA,SAAK,eAAe,KAAK,MAAM,mBAAmB,IAAI,mBAAmB,KAAK;AAC9E,QAAI,CAAC,KAAK,cAAc;AACtB,WAAK,QAAQ,MAAM,6BAA6B,mBAAmB,YAAY;AAC/E;AAAA,IACF;AAEA,SAAK,cAAc,IAAI,8BAAW,KAAK,OAAO,KAAK,MAAM,KAAK,MAAM,KAAK,YAAY;AACrF,SAAK,YAAY,GAAG,mCAAgB,iBAAiB,CAAC,UAAU;AAC9D,WAAK,KAAK,6BAA8B;AACxC,WAAK,oBAAoB,qBAAqB,KAAK;AAAA,IACrD,CAAC;AACD,SAAK,YAAY,GAAG,mCAAgB,oBAAoB,CAAC,UAAU;AACjE,UAAI,CAAC,KAAK,mBAAmB,MAAM;AACjC;AAAA,MACF;AACA,UAAI,CAAC,KAAK,cAAc;AACtB,cAAM,IAAI,MAAM,2BAA2B;AAAA,MAC7C;AAEA,UAAI,KAAK;AACT,UAAI,KAAK,MAAM,oBAAoB;AACjC,aAAK,KAAK,IAAI,GAAG,IAAI,MAAM,WAAW;AACtC,aAAK,aAAa,QAAQ,eAAe;AAAA,MAC3C;AAEA,UAAI,MAAM,kBAAkB,KAAK,MAAM,yBAAyB;AAC9D,aAAK,qBAAqB;AAAA,MAC5B;AAEA,UAAI,MAAM,uBAAuB,GAAG;AAClC,aAAK,kBAAkB,KAAK,IAAI,IAAI,MAAM;AAAA,MAC5C;AAAA,IACF,CAAC;AACD,SAAK,YAAY,GAAG,mCAAgB,eAAe,CAAC,UAAU;AAC5D,WAAK,KAAK,6BAA8B;AACxC,WAAK,oBAAoB,mBAAmB,KAAK;AAAA,IACnD,CAAC;AACD,SAAK,YAAY,GAAG,mCAAgB,oBAAoB,CAAC,UAAU;AACjE,WAAK,0BAA0B,MAAM,aAAc,CAAC,EAAE;AAAA,IACxD,CAAC;AACD,SAAK,YAAY,GAAG,mCAAgB,kBAAkB,CAAC,UAAU;AAC/D,YAAM,gBAAgB,MAAM,aAAc,CAAC,EAAE;AAC7C,UAAI,CAAC,cAAe;AAEpB,WAAK,2BAA2B,KAAK,IAAI;AACzC,WAAK,qBAAqB,KAAK,mBAAmB,MAAM,MAAM;AAE9D,UACE,KAAK,MAAM,wBACV,CAAC,KAAK,kBAAkB,KAAK,eAAe,qBAC7C;AACA,aAAK,sBAAsB;AAAA,MAC7B;AAEA,WAAK,oBAAoB,uBAAuB,aAAa;AAE7D,YAAM,QAAQ,KAAK,MAAM,cAAc,cAAc,SAAS,aAAa;AAC3E,UAAI,MAAM,UAAU,GAAG;AAGrB,aAAK,qBAAqB;AAAA,MAC5B;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEA,MAAM,OAAO;AAzgBf;AA0gBI,SAAK,aAAa,cAAc;AAChC,UAAM,cAAc,IAAI,4BAAY,KAAK,KAAK,YAAY,KAAK,KAAK,WAAW;AAC/E,UAAM,QAAQ,gCAAgB,iBAAiB,mBAAmB,WAAW;AAC7E,SAAK,oBAAoB,QAAM,gBAAK,UAAL,mBAAY,qBAAZ,mBAA8B;AAAA,MAC3D;AAAA,MACA,IAAI,oCAAoB,EAAE,QAAQ,4BAAY,kBAAkB,CAAC;AAAA;AAGnE,UAAM,eAAe,IAAI,kCAAa,WAAW;AACjD,SAAK,eAAe,IAAI,gCAAY,cAAc,KAAK,IAAI;AAE3D,iBAAa,GAAG,uCAAkB,iBAAiB,MAAM;AACvD,WAAK,KAAK,8BAA+B;AACzC,WAAK,aAAa,UAAU;AAAA,IAC9B,CAAC;AAED,iBAAa,GAAG,uCAAkB,iBAAiB,CAAC,MAAM;AACxD,WAAK,KAAK,8BAA+B;AACzC,WAAK,aAAa,WAAW;AAAA,IAC/B,CAAC;AAED,SAAK,mBAAmB,QAAQ;AAEhC,WAAO,MAAM;AACX,YAAM,KAAK,iBAAiB;AAC5B,uBAAiB,UAAU,KAAK,cAAc;AAC5C,YAAI,WAAW,mBAAmB,eAAgB;AAClD,aAAK,iBAAiB;AACtB,cAAM,KAAK,YAAY,MAAM;AAC7B,aAAK,iBAAiB;AAAA,MACxB;AACA,WAAK,mBAAmB,IAAI,oBAAO;AAAA,IACrC;AAAA,EACF;AAAA,EAEA,wBAAwB;AA7iB1B;AA8iBI,eAAK,uBAAL,mBAAyB;AACzB,QAAI,KAAK,eAAe,KAAK,YAAY,UAAU;AACjD,WAAK,aAAa,YAAY,GAAG;AAAA,IACnC;AAEA,SAAK,qBAAqB,kCAAa;AAAA,MACrC,KAAK,MAAM;AAAA,MACX;AAAA,MACA,KAAK;AAAA,IACP;AACA,UAAM,YAAY,KAAK;AACvB,SAAK,kBAAkB,KAAK,sBAAsB,KAAK,iBAAiB,SAAS;AAAA,EACnF;AAAA,EAEA,sBACE,SACA,QAC0B;AAC1B,WAAO,IAAI,gCAAmB,OAAO,SAAS,GAAG,aAAa;AAC5D,UAAI,YAAY;AAChB,eAAS,MAAM;AACb,oBAAY;AAAA,MACd,CAAC;AAED,UAAI,SAAS;AACX,kBAAM,+BAAiB,OAAO;AAAA,MAChC;AAEA,YAAM,YAAY,KAAK,QAAQ,KAAK;AACpC,YAAM,gBAAgB,KAAK;AAC3B,UAAI,iBAAiB,cAAc,aAAa;AAC9C,aACG,CAAC,cAAc,gBAAgB,cAAc,kBAC9C,CAAC,cAAc,iBACf;AAGA,oBAAU,SAAS;AAAA,YACjB,wBAAY,OAAO;AAAA,cACjB,MAAM,cAAc,gBAAgB;AAAA,cACpC,MAAM,qBAAS;AAAA,YACjB,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF;AAEA,gBAAU,SAAS;AAAA,QACjB,wBAAY,OAAO;AAAA,UACjB,MAAM,iCAAQ;AAAA,UACd,MAAM,qBAAS;AAAA,QACjB,CAAC;AAAA,MACH;AAEA,mBAAa,EAAE,YAAY,OAAQ,GAAG;AAEtC,UAAI;AACF,YAAI,UAAW,SAAQ;AACvB,YAAI,YAAY,MAAM,KAAK,MAAM,kBAAkB,MAAM,SAAS;AAClE,YAAI,cAAc,OAAO;AACvB,2CAAQ;AACR;AAAA,QACF;AAEA,YAAI,UAAW,SAAQ;AAEvB,YAAI,EAAE,qBAAqB,uBAAY;AACrC,sBAAa,MAAM,yBAAyB,MAAM,SAAS;AAAA,QAC7D;AAEA,YAAI,OAAQ,aAAa;AACvB;AAAA,QACF;AAEA,cAAM,kBAAkB,KAAK,uBAAuB,OAAQ,IAAI,SAAS;AACzE,eAAQ,WAAW,WAAW,eAAe;AAAA,MAC/C,UAAE;AACA,qBAAa;AAAA,MACf;AACA,cAAQ;AAAA,IACV,CAAC;AAAA,EACH;AAAA,EAEA,MAAM,YAAY,QAAsB;AACtC,QAAI;AACF,YAAM,OAAO,sBAAsB;AAAA,IACrC,QAAQ;AACN;AAAA,IACF;AACA,UAAM,KAAK,kBAAmB,oBAAoB;AAClD,UAAM,kBAAkB,OAAO;AAC/B,QAAI,gBAAgB,YAAa;AAEjC,UAAM,eAAe,OAAO;AAC5B,UAAM,aAAa,gBAAgB,KAAK;AACxC,UAAM,UAAU,WAAW,KAAK;AAEhC,UAAM,6BAA6B,MAAM;AACvC,UAAI,CAAC,gBAAgB,gBAAgB,eAAe,OAAO,cAAe;AAC1E,YAAME,gBACJ,OAAO,kBAAkB,wBAAa,CAAC,CAAC,OAAO,OAAO,cAAc;AAKtE,UACE,OAAO,sBACP,CAACA,iBACD,WAAW,aAAa,KAAK,8BAC7B,CAAC,QAAQ,MACT;AACA;AAAA,MACF;AAEA,WAAK,QAAQ,MAAM,EAAE,gBAAgB,aAAa,CAAC,EAAE,MAAM,2BAA2B;AACtF,YAAM,UAAU,wBAAY,OAAO,EAAE,MAAM,cAAc,MAAM,qBAAS,KAAK,CAAC;AAC9E,WAAK,QAAQ,SAAS,KAAK,OAAO;AAClC,WAAK,KAAK,+BAAgC,OAAO;AAEjD,WAAK,mBAAmB,KAAK,iBAAiB,MAAM,aAAa,MAAM;AACvE,aAAO,kBAAkB;AAAA,IAC3B;AAGA,+BAA2B;AAE3B,WAAO,CAAC,QAAQ,MAAM;AACpB,YAAM,IAAI,QAAc,OAAO,YAAY;AACzC,mBAAW,SAAS,GAAG;AACvB,cAAM,QAAQ;AACd,gBAAQ;AAAA,MACV,CAAC;AACD,iCAA2B;AAC3B,UAAI,OAAO,YAAa;AAAA,IAC1B;AACA,+BAA2B;AAE3B,UAAM,gBAAgB,OAAO,gBAAgB;AAC7C,UAAM,eAAe,OAAO,kBAAkB,wBAAa,CAAC,CAAC,OAAO,OAAO,cAAc;AACzF,UAAM,cAAc,OAAO;AAE3B,UAAM,uBAAuB,YAAY;AAGvC,UAAI,CAAC,gBAAgB,YAAa;AAElC,UAAI,OAAO,kBAAkB,KAAK,MAAM,mBAAmB;AACzD,aAAK,QACF,MAAM,EAAE,UAAU,OAAO,IAAI,gBAAgB,OAAO,eAAe,CAAC,EACpE,KAAK,yCAAyC;AACjD;AAAA,MACF;AAEA,UAAI,CAAC,gBAAgB,CAAC,OAAO,eAAe;AAC1C,cAAM,IAAI,MAAM,2DAA2D;AAAA,MAC7E;AACA,YAAM,YAAY,OAAO;AACzB,YAAM,mBAAmB,UAAU;AAEnC,UAAI,iBAAiB,MAAM,SAAS;AAEpC,WAAK,KAAK,kCAAmC,gBAAgB;AAC7D,YAAM,cAAkC,CAAC;AACzC,iBAAW,QAAQ,kBAAkB;AACnC,cAAMC,QAAO,KAAK,KAAK,QAAQ,KAAK,MAAM,EAAE;AAAA,UAC1C,CAAC,YAAY,EAAE,MAAM,KAAK,MAAM,YAAY,KAAK,YAAY,OAAO;AAAA,UACpE,CAAC,WAAW,EAAE,MAAM,KAAK,MAAM,YAAY,KAAK,YAAY,MAAM;AAAA,QACpE;AACA,oBAAY,KAAK,EAAE,GAAG,MAAM,MAAAA,MAAK,CAAC;AAClC,aAAK,QACF,MAAM,EAAE,UAAU,KAAK,MAAM,UAAU,OAAO,GAAG,CAAC,EAClD,MAAM,uBAAuB;AAChC,YAAI;AACF,gBAAMA;AAAA,QACR,QAAQ;AACN,eAAK,QACF,MAAM,EAAE,UAAU,KAAK,MAAM,UAAU,OAAO,GAAG,CAAC,EAClD,MAAM,6BAA6B;AAAA,QACxC;AAAA,MACF;AAEA,YAAM,gBAAgB,CAAC;AACvB,YAAM,mBAAmB,CAAC;AAC1B,iBAAW,OAAO,aAAa;AAE7B,cAAMA,QAAO,MAAM,IAAI;AACvB,YAAI,CAACA,SAAQA,MAAK,WAAW,OAAW;AACxC,sBAAc,KAAK,GAAG;AACtB,yBAAiB,KAAK,wBAAY,6BAA6BA,KAAI,CAAC;AAAA,MACtE;AAEA,UAAI,CAAC,cAAc,OAAQ;AAG3B,YAAM,qBAAqB,CAAC,wBAAY,gBAAgB,eAAe,aAAa,CAAC;AACrF,yBAAmB,KAAK,GAAG,gBAAgB;AAG3C,YAAM,kBAAkB,kCAAa;AAAA,QACnC,OAAO;AAAA,QACP,OAAO;AAAA,QACP,OAAO,iBAAiB;AAAA,QACxB;AAAA,MACF;AAGA,YAAM,UAAU,OAAO,OAAO,QAAQ,KAAK;AAC3C,cAAQ,SAAS,KAAK,GAAG,kBAAkB;AAC3C,cAAQ,SAAS,KAAK,GAAG,iBAAiB,WAAW,EAAE,iBAAiB;AAExE,YAAM,kBAAkB,KAAK,IAAI,KAAK;AAAA,QACpC;AAAA,QACA,QAAQ,KAAK;AAAA,MACf,CAAC;AACD,YAAM,kBAAkB,KAAK,uBAAuB,gBAAgB,IAAI,eAAe;AACvF,sBAAgB,WAAW,iBAAiB,eAAe;AAC3D,aAAO,gBAAgB,eAAe;AAEtC,WAAK,KAAK,iCAAkC,WAAW;AAAA,IACzD;AAEA,UAAM,OAAO,qBAAqB,EAAE,KAAK,MAAM;AAC7C,aAAO,yBAAyB;AAAA,IAClC,CAAC;AACD,WAAO,CAAC,OAAO,sBAAsB;AACnC,YAAM,UAAU,OAAO,oBAAoB;AAC3C,YAAM,QAAQ,KAAK,CAAC,SAAS,IAAI,CAAC;AAClC,aAAO,OAAO,oBAAoB,QAAQ;AACxC,cAAM,SAAS,OAAO,oBAAoB,CAAC;AAC3C,aAAK,iBAAiB;AACtB,cAAM,KAAK,YAAY,MAAM;AAC7B,eAAO,oBAAoB,MAAM;AACjC,aAAK,iBAAiB;AAAA,MACxB;AAAA,IACF;AAEA,QAAI,OAAO,iBAAiB,CAAC,gBAAgB,OAAO,gBAAgB;AAClE,UAAI,OAAO,oBAAoB;AAC7B,aAAK,QAAQ,SAAS,KAAK,GAAG,OAAO,kBAAkB;AAAA,MACzD;AACA,UAAI,aAAa;AACf,wBAAgB;AAAA,MAClB;AAEA,YAAM,MAAM,wBAAY,OAAO,EAAE,MAAM,eAAe,MAAM,qBAAS,UAAU,CAAC;AAChF,WAAK,QAAQ,SAAS,KAAK,GAAG;AAE9B,aAAO,oBAAoB;AAC3B,UAAI,aAAa;AACf,aAAK,KAAK,kCAAmC,GAAG;AAAA,MAClD,OAAO;AACL,aAAK,KAAK,gCAAiC,GAAG;AAAA,MAChD;AAEA,WAAK,QACF,MAAM;AAAA,QACL,iBAAiB;AAAA,QACjB;AAAA,QACA,UAAU,OAAO;AAAA,MACnB,CAAC,EACA,MAAM,wBAAwB;AAEjC,aAAO,QAAQ;AAAA,IACjB;AAAA,EACF;AAAA,EAEA,uBACE,UACA,QACiB;AACjB,QAAI,CAAC,KAAK,cAAc;AACtB,YAAM,IAAI,MAAM,+CAA+C;AAAA,IACjE;AAEA,QAAI,kBAAkB,sBAAW;AAC/B,eAAS,0BAA0B,UAAU,MAAM;AAAA,IACrD;AAEA,UAAM,WAAW;AACjB,QAAI,EAAE,OAAO,WAAW,WAAW;AAAA,IAEnC;AAEA,UAAM,YAAY,KAAK,MAAM,kBAAkB,MAAM,QAAQ;AAC7D,QAAI,CAAC,WAAW;AACd,YAAM,IAAI,MAAM,+DAA+D;AAAA,IACjF;AAEA,WAAO,KAAK,aAAa,WAAW,UAAU,SAAS;AAAA,EACzD;AAAA,EAEA,MAAM,2BAA2B;AAC/B,QAAI,KAAK,kBAAkB,CAAC,KAAK,eAAe,oBAAoB;AAClE,WAAK,QACF,MAAM,EAAE,UAAU,KAAK,eAAe,GAAG,CAAC,EAC1C,MAAM,yEAAyE;AAClF;AAAA,IACF;AAEA,QAAI,CAAC,KAAK,oBAAoB;AAC5B,UAAI,KAAK,MAAM,uBAAuB,CAAC,KAAK,kBAAkB;AAC5D;AAAA,MACF;AACA,WAAK,sBAAsB;AAAA,IAC7B;AAEA,QAAI,CAAC,KAAK,oBAAoB;AAC5B,YAAM,IAAI,MAAM,kCAAkC;AAAA,IACpD;AAIA,QAAI,KAAK,iBAAiB,MAAM;AAC9B,uBAAiB,UAAU,KAAK,cAAc;AAC5C,YAAI,WAAW,mBAAmB,eAAgB;AAClD,YAAI,CAAC,OAAO,QAAS;AACrB,YAAI,OAAO,mBAAoB,QAAO,UAAU;AAAA,MAClD;AAAA,IACF;AAEA,SAAK,QAAQ,MAAM,EAAE,UAAU,KAAK,mBAAmB,GAAG,CAAC,EAAE,MAAM,uBAAuB;AAE1F,QAAI,KAAK,iBAAiB;AACxB,YAAM,sBAAsB,KAAK,IAAI,IAAI,KAAK;AAC9C,YAAM,qBAAqB,KAAK;AAAA,SAC7B,KAAK,4BAA4B,KAAK,KAAK;AAAA,QAC5C;AAAA,MACF;AACA,YAAM,UAA8B;AAAA,QAClC,WAAW,KAAK,IAAI;AAAA,QACpB,YAAY,KAAK,mBAAmB;AAAA,QACpC,qBAAqB;AAAA,QACrB;AAAA,MACF;AACA,WAAK,KAAK,2BAA4B,OAAO;AAAA,IAC/C;AAEA,SAAK,qBAAqB,KAAK,kBAAkB;AACjD,SAAK,qBAAqB;AAC1B,SAAK,0BAA0B;AAAA,EACjC;AAAA,EAEA,uBAAuB;AACrB,QACE,CAAC,KAAK,kBACN,CAAC,KAAK,eAAe,sBACrB,KAAK,eAAe,aACpB;AACA;AAAA,IACF;AAEA,QAAI,KAAK,MAAM,sBAAsB,GAAG;AAGtC,YAAM,eAAe,KAAK,MAAM,cAAc,cAAc;AAAA,QAC1D,KAAK;AAAA,MACP;AACA,UAAI,aAAa,SAAS,KAAK,MAAM,mBAAmB;AACtD;AAAA,MACF;AAAA,IACF;AACA,SAAK,eAAe,UAAU;AAAA,EAChC;AAAA,EAEA,qBAAqB,QAAsB;AACzC,SAAK,aAAa,IAAI,MAAM;AAC5B,SAAK,aAAa,IAAI,mBAAmB,cAAc;AACvD,SAAK,iBAAiB,QAAQ;AAAA,EAChC;AAAA;AAAA,EAGA,MAAM,QAAQ;AAh6BhB;AAi6BI,QAAI,CAAC,KAAK,UAAU;AAClB;AAAA,IACF;AAEA,eAAK,UAAL,mBAAY,mBAAmB,0BAAU;AAAA,EAE3C;AACF;AAEA,gBAAgB,0BACd,UACA,QACuB;AA76BzB;AA86BE,QAAM,YAAY,KAAK,IAAI;AAC3B,MAAI,aAAa;AACjB,mBAAiB,SAAS,QAAQ;AAChC,UAAM,WAAU,WAAM,QAAQ,CAAC,MAAf,mBAAkB,MAAM;AACxC,QAAI,CAAC,QAAS;AAEd,QAAI,YAAY;AACd,mBAAa;AACb,0BAAI,EACD,MAAM,EAAE,UAAU,SAAS,KAAK,MAAM,KAAK,IAAI,IAAI,SAAS,EAAE,CAAC,EAC/D,MAAM,0BAA0B;AAAA,IACrC;AACA,UAAM;AAAA,EACR;AACF;AAGA,MAAM,wBAAwB;AAAA;AAAA,EAEnB,cAAc;AAAA,EACd,4BAA4B;AAAA,EAC5B,4BAA4B;AAAA;AAAA,EAErC;AAAA,EACA;AAAA,EACA,oBAAoB,IAAI,oBAAO;AAAA,EAC/B,uBAAuB;AAAA,EACvB,2BAA2B;AAAA,EAC3B,YAAY;AAAA,EACZ;AAAA,EACA;AAAA,EAEA,YAAY,cAAmC,qBAA6B;AAC1E,SAAK,gBAAgB;AACrB,SAAK,oBAAoB;AACzB,SAAK,wBAAwB;AAAA,EAC/B;AAAA,EAEA,IAAI,aAAsB;AACxB,WAAO,CAAC,KAAK,kBAAkB;AAAA,EACjC;AAAA,EAEA,uBAAuB,YAAoB;AACzC,SAAK,uBAAuB,WAAW,KAAK;AAC5C,QAAI,KAAK,UAAW;AAEpB,UAAM,uBACJ,KAAK,IAAI,IAAI,KAAK,2BAA2B,KAAK;AACpD,QAAI,QAAQ,uBAAuB,KAAK,oBAAoB,KAAK;AACjE,YAAQ,KAAK,oBAAoB,IAAI,QAAQ,KAAK,4BAA4B;AAE9E,SAAK,KAAK,KAAK;AAAA,EACjB;AAAA;AAAA,EAGA,qBAAqB,GAAa;AAChC,SAAK,YAAY;AAAA,EAKnB;AAAA;AAAA,EAGA,mBAAmB,GAAa;AAC9B,SAAK,YAAY;AACjB,SAAK,2BAA2B,KAAK,IAAI;AAEzC,QAAI,KAAK,sBAAsB;AAC7B,YAAM,QAAQ,KAAK,oBAAoB,IACnC,KAAK,oBAAoB,KAAK,4BAC9B;AACJ,WAAK,KAAK,KAAK;AAAA,IACjB;AAAA,EACF;AAAA;AAAA,EAIA,sBAA+B;AAC7B,WACE,KAAK,qBAAqB,SAAS,KACnC,KAAK,YAAY,SAAS,KAAK,qBAAqB,KAAK,qBAAqB,SAAS,CAAC,CAAE;AAAA,EAE9F;AAAA,EAEA,eAAe;AACb,SAAK,uBAAuB;AAC5B,SAAK,2BAA2B;AAAA,EAClC;AAAA,EAEA,KAAK,OAAe;AAClB,UAAM,UAAU,OAAOH,WAAkB;AACvC,YAAM,IAAI,QAAQ,CAAC,YAAY,WAAW,SAASA,MAAK,CAAC;AACzD,WAAK,aAAa;AAClB,YAAM,KAAK,cAAc;AAAA,IAC3B;AAEA,SAAK,oBAAoB,IAAI,oBAAO;AACpC,SAAK,qBAAqB,QAAQ,KAAK;AAAA,EACzC;AACF;","names":["import_llm","VPAEvent","BasicSentenceTokenizer","BasicWordTokenizer","EventEmitter","STTStreamAdapter","TTSStreamAdapter","participant","delay","resolve","isUsingTools","task"]}
|
|
@@ -3,11 +3,13 @@ import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
|
|
|
3
3
|
import type { CallableFunctionResult, FunctionCallInfo, FunctionContext, LLM } from '../llm/index.js';
|
|
4
4
|
import { LLMStream } from '../llm/index.js';
|
|
5
5
|
import { ChatContext, ChatMessage } from '../llm/index.js';
|
|
6
|
+
import type { AgentMetrics } from '../metrics/base.js';
|
|
6
7
|
import { type STT } from '../stt/index.js';
|
|
7
8
|
import type { SentenceTokenizer, WordTokenizer } from '../tokenize/tokenizer.js';
|
|
8
9
|
import type { TTS } from '../tts/index.js';
|
|
9
|
-
import type
|
|
10
|
+
import { type VAD } from '../vad.js';
|
|
10
11
|
import type { SpeechSource } from './agent_output.js';
|
|
12
|
+
import { SpeechHandle } from './speech_handle.js';
|
|
11
13
|
export type AgentState = 'initializing' | 'thinking' | 'listening' | 'speaking';
|
|
12
14
|
export declare const AGENT_STATE_ATTRIBUTE = "lk.agent.state";
|
|
13
15
|
export type BeforeLLMCallback = (agent: VoicePipelineAgent, chatCtx: ChatContext) => LLMStream | false | void | Promise<LLMStream | false | void>;
|
|
@@ -21,7 +23,8 @@ export declare enum VPAEvent {
|
|
|
21
23
|
AGENT_SPEECH_COMMITTED = 5,
|
|
22
24
|
AGENT_SPEECH_INTERRUPTED = 6,
|
|
23
25
|
FUNCTION_CALLS_COLLECTED = 7,
|
|
24
|
-
FUNCTION_CALLS_FINISHED = 8
|
|
26
|
+
FUNCTION_CALLS_FINISHED = 8,
|
|
27
|
+
METRICS_COLLECTED = 9
|
|
25
28
|
}
|
|
26
29
|
export type VPACallbacks = {
|
|
27
30
|
[VPAEvent.USER_STARTED_SPEAKING]: () => void;
|
|
@@ -33,6 +36,7 @@ export type VPACallbacks = {
|
|
|
33
36
|
[VPAEvent.AGENT_SPEECH_INTERRUPTED]: (msg: ChatMessage) => void;
|
|
34
37
|
[VPAEvent.FUNCTION_CALLS_COLLECTED]: (funcs: FunctionCallInfo[]) => void;
|
|
35
38
|
[VPAEvent.FUNCTION_CALLS_FINISHED]: (funcs: CallableFunctionResult[]) => void;
|
|
39
|
+
[VPAEvent.METRICS_COLLECTED]: (metrics: AgentMetrics) => void;
|
|
36
40
|
};
|
|
37
41
|
export declare class AgentCallContext {
|
|
38
42
|
#private;
|
|
@@ -42,6 +46,8 @@ export declare class AgentCallContext {
|
|
|
42
46
|
storeMetadata(key: string, value: any): void;
|
|
43
47
|
getMetadata(key: string, orDefault?: any): any;
|
|
44
48
|
get llmStream(): LLMStream;
|
|
49
|
+
get extraChatMessages(): ChatMessage[];
|
|
50
|
+
addExtraChatMessage(message: ChatMessage): void;
|
|
45
51
|
}
|
|
46
52
|
export interface AgentTranscriptionOptions {
|
|
47
53
|
/** Whether to forward the user transcription to the client */
|
|
@@ -82,7 +88,7 @@ export interface VPAOptions {
|
|
|
82
88
|
interruptMinWords: number;
|
|
83
89
|
/** Delay to wait before considering the user speech done. */
|
|
84
90
|
minEndpointingDelay: number;
|
|
85
|
-
|
|
91
|
+
maxNestedFncCalls: number;
|
|
86
92
|
preemptiveSynthesis: boolean;
|
|
87
93
|
beforeLLMCallback: BeforeLLMCallback;
|
|
88
94
|
beforeTTSCallback: BeforeTTSCallback;
|
|
@@ -127,7 +133,7 @@ export declare class VoicePipelineAgent extends VoicePipelineAgent_base {
|
|
|
127
133
|
*/
|
|
128
134
|
participant?: RemoteParticipant | string | null): void;
|
|
129
135
|
/** Play a speech source through the voice assistant. */
|
|
130
|
-
say(source: string | LLMStream | AsyncIterable<string>, allowInterruptions?: boolean, addToChatCtx?: boolean): Promise<
|
|
136
|
+
say(source: string | LLMStream | AsyncIterable<string>, allowInterruptions?: boolean, addToChatCtx?: boolean): Promise<SpeechHandle>;
|
|
131
137
|
/** Close the voice assistant. */
|
|
132
138
|
close(): Promise<void>;
|
|
133
139
|
}
|