@livekit/agents 0.5.2 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +3 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/job.cjs.map +1 -1
- package/dist/job.js.map +1 -1
- package/dist/llm/index.cjs +2 -0
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.ts +1 -1
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js +2 -0
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/llm.cjs +47 -3
- package/dist/llm/llm.cjs.map +1 -1
- package/dist/llm/llm.d.ts +15 -2
- package/dist/llm/llm.d.ts.map +1 -1
- package/dist/llm/llm.js +46 -3
- package/dist/llm/llm.js.map +1 -1
- package/dist/metrics/base.cjs +44 -0
- package/dist/metrics/base.cjs.map +1 -0
- package/dist/metrics/base.d.ts +96 -0
- package/dist/metrics/base.d.ts.map +1 -0
- package/dist/metrics/base.js +20 -0
- package/dist/metrics/base.js.map +1 -0
- package/dist/metrics/index.cjs +35 -0
- package/dist/metrics/index.cjs.map +1 -0
- package/dist/metrics/index.d.ts +5 -0
- package/dist/metrics/index.d.ts.map +1 -0
- package/dist/metrics/index.js +9 -0
- package/dist/metrics/index.js.map +1 -0
- package/dist/metrics/usage_collector.cjs +53 -0
- package/dist/metrics/usage_collector.cjs.map +1 -0
- package/dist/metrics/usage_collector.d.ts +14 -0
- package/dist/metrics/usage_collector.d.ts.map +1 -0
- package/dist/metrics/usage_collector.js +29 -0
- package/dist/metrics/usage_collector.js.map +1 -0
- package/dist/metrics/utils.cjs +104 -0
- package/dist/metrics/utils.cjs.map +1 -0
- package/dist/metrics/utils.d.ts +10 -0
- package/dist/metrics/utils.d.ts.map +1 -0
- package/dist/metrics/utils.js +73 -0
- package/dist/metrics/utils.js.map +1 -0
- package/dist/multimodal/multimodal_agent.cjs +34 -16
- package/dist/multimodal/multimodal_agent.cjs.map +1 -1
- package/dist/multimodal/multimodal_agent.d.ts +4 -5
- package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
- package/dist/multimodal/multimodal_agent.js +34 -16
- package/dist/multimodal/multimodal_agent.js.map +1 -1
- package/dist/pipeline/index.cjs +2 -0
- package/dist/pipeline/index.cjs.map +1 -1
- package/dist/pipeline/index.d.ts +1 -1
- package/dist/pipeline/index.d.ts.map +1 -1
- package/dist/pipeline/index.js +3 -1
- package/dist/pipeline/index.js.map +1 -1
- package/dist/pipeline/pipeline_agent.cjs +166 -66
- package/dist/pipeline/pipeline_agent.cjs.map +1 -1
- package/dist/pipeline/pipeline_agent.d.ts +10 -4
- package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
- package/dist/pipeline/pipeline_agent.js +169 -69
- package/dist/pipeline/pipeline_agent.js.map +1 -1
- package/dist/pipeline/speech_handle.cjs +49 -1
- package/dist/pipeline/speech_handle.cjs.map +1 -1
- package/dist/pipeline/speech_handle.d.ts +12 -2
- package/dist/pipeline/speech_handle.d.ts.map +1 -1
- package/dist/pipeline/speech_handle.js +50 -2
- package/dist/pipeline/speech_handle.js.map +1 -1
- package/dist/stt/index.cjs.map +1 -1
- package/dist/stt/index.d.ts +1 -1
- package/dist/stt/index.d.ts.map +1 -1
- package/dist/stt/index.js.map +1 -1
- package/dist/stt/stream_adapter.cjs +15 -5
- package/dist/stt/stream_adapter.cjs.map +1 -1
- package/dist/stt/stream_adapter.d.ts +4 -1
- package/dist/stt/stream_adapter.d.ts.map +1 -1
- package/dist/stt/stream_adapter.js +15 -5
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/stt/stt.cjs +46 -2
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.ts +25 -3
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +46 -2
- package/dist/stt/stt.js.map +1 -1
- package/dist/tts/index.cjs +4 -2
- package/dist/tts/index.cjs.map +1 -1
- package/dist/tts/index.d.ts +1 -1
- package/dist/tts/index.d.ts.map +1 -1
- package/dist/tts/index.js +3 -1
- package/dist/tts/index.js.map +1 -1
- package/dist/tts/stream_adapter.cjs +14 -3
- package/dist/tts/stream_adapter.cjs.map +1 -1
- package/dist/tts/stream_adapter.d.ts +3 -0
- package/dist/tts/stream_adapter.d.ts.map +1 -1
- package/dist/tts/stream_adapter.js +15 -4
- package/dist/tts/stream_adapter.js.map +1 -1
- package/dist/tts/tts.cjs +109 -6
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.ts +24 -1
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js +107 -5
- package/dist/tts/tts.js.map +1 -1
- package/dist/utils.cjs +11 -4
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +11 -4
- package/dist/utils.js.map +1 -1
- package/dist/vad.cjs +43 -2
- package/dist/vad.cjs.map +1 -1
- package/dist/vad.d.ts +21 -4
- package/dist/vad.d.ts.map +1 -1
- package/dist/vad.js +43 -2
- package/dist/vad.js.map +1 -1
- package/dist/worker.cjs +5 -2
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +5 -2
- package/dist/worker.js.map +1 -1
- package/package.json +3 -3
- package/src/index.ts +2 -1
- package/src/job.ts +3 -3
- package/src/llm/index.ts +2 -0
- package/src/llm/llm.ts +55 -3
- package/src/metrics/base.ts +127 -0
- package/src/metrics/index.ts +20 -0
- package/src/metrics/usage_collector.ts +40 -0
- package/src/metrics/utils.ts +100 -0
- package/src/multimodal/multimodal_agent.ts +57 -23
- package/src/pipeline/index.ts +1 -1
- package/src/pipeline/pipeline_agent.ts +208 -89
- package/src/pipeline/speech_handle.ts +67 -2
- package/src/stt/index.ts +2 -0
- package/src/stt/stream_adapter.ts +17 -5
- package/src/stt/stt.ts +67 -3
- package/src/tts/index.ts +2 -0
- package/src/tts/stream_adapter.ts +17 -4
- package/src/tts/tts.ts +127 -4
- package/src/utils.ts +12 -4
- package/src/vad.ts +61 -4
- package/src/worker.ts +7 -3
package/src/pipeline/pipeline_agent.ts
CHANGED
|
@@ -17,10 +17,11 @@ import type {
|
|
|
17
17
|
FunctionContext,
|
|
18
18
|
LLM,
|
|
19
19
|
} from '../llm/index.js';
|
|
20
|
-
import { LLMStream } from '../llm/index.js';
|
|
20
|
+
import { LLMEvent, LLMStream } from '../llm/index.js';
|
|
21
21
|
import { ChatContext, ChatMessage, ChatRole } from '../llm/index.js';
|
|
22
22
|
import { log } from '../log.js';
|
|
23
|
-
import {
|
|
23
|
+
import type { AgentMetrics, PipelineEOUMetrics } from '../metrics/base.js';
|
|
24
|
+
import { type STT, StreamAdapter as STTStreamAdapter, SpeechEventType } from '../stt/index.js';
|
|
24
25
|
import {
|
|
25
26
|
SentenceTokenizer as BasicSentenceTokenizer,
|
|
26
27
|
WordTokenizer as BasicWordTokenizer,
|
|
@@ -28,9 +29,9 @@ import {
|
|
|
28
29
|
} from '../tokenize/basic/index.js';
|
|
29
30
|
import type { SentenceTokenizer, WordTokenizer } from '../tokenize/tokenizer.js';
|
|
30
31
|
import type { TTS } from '../tts/index.js';
|
|
31
|
-
import { StreamAdapter as TTSStreamAdapter } from '../tts/index.js';
|
|
32
|
+
import { TTSEvent, StreamAdapter as TTSStreamAdapter } from '../tts/index.js';
|
|
32
33
|
import { AsyncIterableQueue, CancellablePromise, Future, gracefullyCancel } from '../utils.js';
|
|
33
|
-
import type
|
|
34
|
+
import { type VAD, type VADEvent, VADEventType } from '../vad.js';
|
|
34
35
|
import type { SpeechSource, SynthesisHandle } from './agent_output.js';
|
|
35
36
|
import { AgentOutput } from './agent_output.js';
|
|
36
37
|
import { AgentPlayout, AgentPlayoutEvent } from './agent_playout.js';
|
|
@@ -39,6 +40,7 @@ import { SpeechHandle } from './speech_handle.js';
|
|
|
39
40
|
|
|
40
41
|
export type AgentState = 'initializing' | 'thinking' | 'listening' | 'speaking';
|
|
41
42
|
export const AGENT_STATE_ATTRIBUTE = 'lk.agent.state';
|
|
43
|
+
let speechData: { sequenceId: string } | undefined;
|
|
42
44
|
|
|
43
45
|
export type BeforeLLMCallback = (
|
|
44
46
|
agent: VoicePipelineAgent,
|
|
@@ -60,6 +62,7 @@ export enum VPAEvent {
|
|
|
60
62
|
AGENT_SPEECH_INTERRUPTED,
|
|
61
63
|
FUNCTION_CALLS_COLLECTED,
|
|
62
64
|
FUNCTION_CALLS_FINISHED,
|
|
65
|
+
METRICS_COLLECTED,
|
|
63
66
|
}
|
|
64
67
|
|
|
65
68
|
export type VPACallbacks = {
|
|
@@ -72,12 +75,14 @@ export type VPACallbacks = {
|
|
|
72
75
|
[VPAEvent.AGENT_SPEECH_INTERRUPTED]: (msg: ChatMessage) => void;
|
|
73
76
|
[VPAEvent.FUNCTION_CALLS_COLLECTED]: (funcs: FunctionCallInfo[]) => void;
|
|
74
77
|
[VPAEvent.FUNCTION_CALLS_FINISHED]: (funcs: CallableFunctionResult[]) => void;
|
|
78
|
+
[VPAEvent.METRICS_COLLECTED]: (metrics: AgentMetrics) => void;
|
|
75
79
|
};
|
|
76
80
|
|
|
77
81
|
export class AgentCallContext {
|
|
78
82
|
#agent: VoicePipelineAgent;
|
|
79
83
|
#llmStream: LLMStream;
|
|
80
84
|
#metadata = new Map<string, any>();
|
|
85
|
+
#extraChatMessages: ChatMessage[] = [];
|
|
81
86
|
static #current: AgentCallContext;
|
|
82
87
|
|
|
83
88
|
constructor(agent: VoicePipelineAgent, llmStream: LLMStream) {
|
|
@@ -105,6 +110,14 @@ export class AgentCallContext {
|
|
|
105
110
|
get llmStream(): LLMStream {
|
|
106
111
|
return this.#llmStream;
|
|
107
112
|
}
|
|
113
|
+
|
|
114
|
+
get extraChatMessages() {
|
|
115
|
+
return this.#extraChatMessages;
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
addExtraChatMessage(message: ChatMessage) {
|
|
119
|
+
this.#extraChatMessages.push(message);
|
|
120
|
+
}
|
|
108
121
|
}
|
|
109
122
|
|
|
110
123
|
const defaultBeforeLLMCallback: BeforeLLMCallback = (
|
|
@@ -171,7 +184,7 @@ export interface VPAOptions {
|
|
|
171
184
|
interruptMinWords: number;
|
|
172
185
|
/** Delay to wait before considering the user speech done. */
|
|
173
186
|
minEndpointingDelay: number;
|
|
174
|
-
|
|
187
|
+
maxNestedFncCalls: number;
|
|
175
188
|
/* Whether to preemptively synthesize responses. */
|
|
176
189
|
preemptiveSynthesis: boolean;
|
|
177
190
|
/*
|
|
@@ -201,7 +214,7 @@ const defaultVPAOptions: VPAOptions = {
|
|
|
201
214
|
interruptSpeechDuration: 50,
|
|
202
215
|
interruptMinWords: 0,
|
|
203
216
|
minEndpointingDelay: 500,
|
|
204
|
-
|
|
217
|
+
maxNestedFncCalls: 1,
|
|
205
218
|
preemptiveSynthesis: false,
|
|
206
219
|
beforeLLMCallback: defaultBeforeLLMCallback,
|
|
207
220
|
beforeTTSCallback: defaultBeforeTTSCallback,
|
|
@@ -229,7 +242,6 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
229
242
|
#transcribedInterimText = '';
|
|
230
243
|
#speechQueueOpen = new Future();
|
|
231
244
|
#speechQueue = new AsyncIterableQueue<SpeechHandle | typeof VoicePipelineAgent.FLUSH_SENTINEL>();
|
|
232
|
-
#lastEndOfSpeechTime?: number;
|
|
233
245
|
#updateStateTask?: CancellablePromise<void>;
|
|
234
246
|
#started = false;
|
|
235
247
|
#room?: Room;
|
|
@@ -237,6 +249,8 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
237
249
|
#deferredValidation: DeferredReplyValidation;
|
|
238
250
|
#logger = log();
|
|
239
251
|
#agentPublication?: LocalTrackPublication;
|
|
252
|
+
#lastFinalTranscriptTime?: number;
|
|
253
|
+
#lastSpeechTime?: number;
|
|
240
254
|
|
|
241
255
|
constructor(
|
|
242
256
|
/** Voice Activity Detection instance. */
|
|
@@ -317,12 +331,31 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
317
331
|
if (this.#started) {
|
|
318
332
|
throw new Error('voice assistant already started');
|
|
319
333
|
}
|
|
334
|
+
|
|
335
|
+
this.#stt.on(SpeechEventType.METRICS_COLLECTED, (metrics) => {
|
|
336
|
+
this.emit(VPAEvent.METRICS_COLLECTED, metrics);
|
|
337
|
+
});
|
|
338
|
+
|
|
339
|
+
this.#tts.on(TTSEvent.METRICS_COLLECTED, (metrics) => {
|
|
340
|
+
if (!speechData) return;
|
|
341
|
+
this.emit(VPAEvent.METRICS_COLLECTED, { ...metrics, sequenceId: speechData.sequenceId });
|
|
342
|
+
});
|
|
343
|
+
|
|
344
|
+
this.#llm.on(LLMEvent.METRICS_COLLECTED, (metrics) => {
|
|
345
|
+
if (!speechData) return;
|
|
346
|
+
this.emit(VPAEvent.METRICS_COLLECTED, { ...metrics, sequenceId: speechData.sequenceId });
|
|
347
|
+
});
|
|
348
|
+
|
|
349
|
+
this.#vad.on(VADEventType.METRICS_COLLECTED, (metrics) => {
|
|
350
|
+
this.emit(VPAEvent.METRICS_COLLECTED, metrics);
|
|
351
|
+
});
|
|
352
|
+
|
|
320
353
|
room.on(RoomEvent.ParticipantConnected, (participant: RemoteParticipant) => {
|
|
321
354
|
// automatically link to the first participant that connects, if not already linked
|
|
322
355
|
if (this.#participant) {
|
|
323
356
|
return;
|
|
324
357
|
}
|
|
325
|
-
this.#linkParticipant.call(this, participant.identity);
|
|
358
|
+
this.#linkParticipant.call(this, participant.identity!);
|
|
326
359
|
});
|
|
327
360
|
|
|
328
361
|
this.#room = room;
|
|
@@ -332,7 +365,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
332
365
|
if (typeof participant === 'string') {
|
|
333
366
|
this.#linkParticipant(participant);
|
|
334
367
|
} else {
|
|
335
|
-
this.#linkParticipant(participant.identity);
|
|
368
|
+
this.#linkParticipant(participant.identity!);
|
|
336
369
|
}
|
|
337
370
|
}
|
|
338
371
|
|
|
@@ -344,12 +377,51 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
344
377
|
source: string | LLMStream | AsyncIterable<string>,
|
|
345
378
|
allowInterruptions = true,
|
|
346
379
|
addToChatCtx = true,
|
|
347
|
-
) {
|
|
380
|
+
): Promise<SpeechHandle> {
|
|
348
381
|
await this.#trackPublishedFut.await;
|
|
382
|
+
|
|
383
|
+
let callContext: AgentCallContext | undefined;
|
|
384
|
+
let fncSource: string | AsyncIterable<string> | undefined;
|
|
385
|
+
if (addToChatCtx) {
|
|
386
|
+
callContext = AgentCallContext.getCurrent();
|
|
387
|
+
if (source instanceof LLMStream) {
|
|
388
|
+
this.#logger.warn('LLMStream will be ignored for function call chat context');
|
|
389
|
+
} else if (typeof source === 'string') {
|
|
390
|
+
fncSource = source;
|
|
391
|
+
} else {
|
|
392
|
+
fncSource = source;
|
|
393
|
+
source = new AsyncIterableQueue<string>();
|
|
394
|
+
}
|
|
395
|
+
}
|
|
396
|
+
|
|
349
397
|
const newHandle = SpeechHandle.createAssistantSpeech(allowInterruptions, addToChatCtx);
|
|
350
398
|
const synthesisHandle = this.#synthesizeAgentSpeech(newHandle.id, source);
|
|
351
399
|
newHandle.initialize(source, synthesisHandle);
|
|
352
|
-
|
|
400
|
+
|
|
401
|
+
if (this.#playingSpeech && !this.#playingSpeech.nestedSpeechFinished) {
|
|
402
|
+
this.#playingSpeech.addNestedSpeech(newHandle);
|
|
403
|
+
} else {
|
|
404
|
+
this.#addSpeechForPlayout(newHandle);
|
|
405
|
+
}
|
|
406
|
+
|
|
407
|
+
if (callContext && fncSource) {
|
|
408
|
+
let text: string;
|
|
409
|
+
if (typeof source === 'string') {
|
|
410
|
+
text = fncSource as string;
|
|
411
|
+
} else {
|
|
412
|
+
text = '';
|
|
413
|
+
for await (const chunk of fncSource) {
|
|
414
|
+
(source as AsyncIterableQueue<string>).put(chunk);
|
|
415
|
+
text += chunk;
|
|
416
|
+
}
|
|
417
|
+
(source as AsyncIterableQueue<string>).close();
|
|
418
|
+
}
|
|
419
|
+
|
|
420
|
+
callContext.addExtraChatMessage(ChatMessage.create({ text, role: ChatRole.ASSISTANT }));
|
|
421
|
+
this.#logger.child({ text }).debug('added speech to function call chat context');
|
|
422
|
+
}
|
|
423
|
+
|
|
424
|
+
return newHandle;
|
|
353
425
|
}
|
|
354
426
|
|
|
355
427
|
#updateState(state: AgentState, delay = 0) {
|
|
@@ -410,11 +482,14 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
410
482
|
if (event.speechDuration >= this.#opts.interruptSpeechDuration) {
|
|
411
483
|
this.#interruptIfPossible();
|
|
412
484
|
}
|
|
485
|
+
|
|
486
|
+
if (event.rawAccumulatedSpeech > 0) {
|
|
487
|
+
this.#lastSpeechTime = Date.now() - event.rawAccumulatedSilence;
|
|
488
|
+
}
|
|
413
489
|
});
|
|
414
490
|
this.#humanInput.on(HumanInputEvent.END_OF_SPEECH, (event) => {
|
|
415
491
|
this.emit(VPAEvent.USER_STARTED_SPEAKING);
|
|
416
492
|
this.#deferredValidation.onHumanEndOfSpeech(event);
|
|
417
|
-
this.#lastEndOfSpeechTime = Date.now();
|
|
418
493
|
});
|
|
419
494
|
this.#humanInput.on(HumanInputEvent.INTERIM_TRANSCRIPT, (event) => {
|
|
420
495
|
this.#transcribedInterimText = event.alternatives![0].text;
|
|
@@ -423,7 +498,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
423
498
|
const newTranscript = event.alternatives![0].text;
|
|
424
499
|
if (!newTranscript) return;
|
|
425
500
|
|
|
426
|
-
this.#
|
|
501
|
+
this.#lastFinalTranscriptTime = Date.now();
|
|
427
502
|
this.#transcribedText += (this.#transcribedText ? ' ' : '') + newTranscript;
|
|
428
503
|
|
|
429
504
|
if (
|
|
@@ -534,33 +609,31 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
534
609
|
}),
|
|
535
610
|
);
|
|
536
611
|
|
|
537
|
-
|
|
538
|
-
let llmStream = await this.#opts.beforeLLMCallback(this, copiedCtx);
|
|
539
|
-
if (llmStream === false) {
|
|
540
|
-
handle?.cancel();
|
|
541
|
-
return;
|
|
542
|
-
}
|
|
543
|
-
|
|
544
|
-
if (cancelled) resolve();
|
|
545
|
-
// fallback to default impl if no custom/user stream is returned
|
|
546
|
-
if (!(llmStream instanceof LLMStream)) {
|
|
547
|
-
llmStream = (await defaultBeforeLLMCallback(this, copiedCtx)) as LLMStream;
|
|
548
|
-
}
|
|
612
|
+
speechData = { sequenceId: handle!.id };
|
|
549
613
|
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
614
|
+
try {
|
|
615
|
+
if (cancelled) resolve();
|
|
616
|
+
let llmStream = await this.#opts.beforeLLMCallback(this, copiedCtx);
|
|
617
|
+
if (llmStream === false) {
|
|
618
|
+
handle?.cancel();
|
|
619
|
+
return;
|
|
620
|
+
}
|
|
553
621
|
|
|
554
|
-
|
|
555
|
-
|
|
622
|
+
if (cancelled) resolve();
|
|
623
|
+
// fallback to default impl if no custom/user stream is returned
|
|
624
|
+
if (!(llmStream instanceof LLMStream)) {
|
|
625
|
+
llmStream = (await defaultBeforeLLMCallback(this, copiedCtx)) as LLMStream;
|
|
626
|
+
}
|
|
556
627
|
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
? Math.round((Date.now() - this.#lastEndOfSpeechTime) * 1000) / 1000
|
|
561
|
-
: -1;
|
|
628
|
+
if (handle!.interrupted) {
|
|
629
|
+
return;
|
|
630
|
+
}
|
|
562
631
|
|
|
563
|
-
|
|
632
|
+
const synthesisHandle = this.#synthesizeAgentSpeech(handle!.id, llmStream);
|
|
633
|
+
handle!.initialize(llmStream, synthesisHandle);
|
|
634
|
+
} finally {
|
|
635
|
+
speechData = undefined;
|
|
636
|
+
}
|
|
564
637
|
resolve();
|
|
565
638
|
});
|
|
566
639
|
}
|
|
@@ -621,78 +694,107 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
621
694
|
|
|
622
695
|
const collectedText = handle.synthesisHandle.text;
|
|
623
696
|
const isUsingTools = handle.source instanceof LLMStream && !!handle.source.functionCalls.length;
|
|
624
|
-
const
|
|
625
|
-
|
|
697
|
+
const interrupted = handle.interrupted;
|
|
698
|
+
|
|
699
|
+
const executeFunctionCalls = async () => {
|
|
700
|
+
// if the answer is using tools, execute the functions and automatically generate
|
|
701
|
+
// a response to the user question from the returned values
|
|
702
|
+
if (!isUsingTools || interrupted) return;
|
|
703
|
+
|
|
704
|
+
if (handle.fncNestedDepth >= this.#opts.maxNestedFncCalls) {
|
|
705
|
+
this.#logger
|
|
706
|
+
.child({ speechId: handle.id, fncNestedDepth: handle.fncNestedDepth })
|
|
707
|
+
.warn('max function calls nested depth reached');
|
|
708
|
+
return;
|
|
709
|
+
}
|
|
626
710
|
|
|
627
|
-
// if the answer is using tools, execute the functions and automatically generate
|
|
628
|
-
// a response to the user question from the returned values
|
|
629
|
-
if (isUsingTools && !interrupted) {
|
|
630
711
|
if (!userQuestion || !handle.userCommitted) {
|
|
631
712
|
throw new Error('user speech should have been committed before using tools');
|
|
632
713
|
}
|
|
633
714
|
const llmStream = handle.source;
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
)
|
|
644
|
-
|
|
715
|
+
const newFunctionCalls = llmStream.functionCalls;
|
|
716
|
+
|
|
717
|
+
new AgentCallContext(this, llmStream);
|
|
718
|
+
|
|
719
|
+
this.emit(VPAEvent.FUNCTION_CALLS_COLLECTED, newFunctionCalls);
|
|
720
|
+
const calledFuncs: FunctionCallInfo[] = [];
|
|
721
|
+
for (const func of newFunctionCalls) {
|
|
722
|
+
const task = func.func.execute(func.params).then(
|
|
723
|
+
(result) => ({ name: func.name, toolCallId: func.toolCallId, result }),
|
|
724
|
+
(error) => ({ name: func.name, toolCallId: func.toolCallId, error }),
|
|
725
|
+
);
|
|
726
|
+
calledFuncs.push({ ...func, task });
|
|
727
|
+
this.#logger
|
|
728
|
+
.child({ function: func.name, speechId: handle.id })
|
|
729
|
+
.debug('executing AI function');
|
|
730
|
+
try {
|
|
731
|
+
await task;
|
|
732
|
+
} catch {
|
|
645
733
|
this.#logger
|
|
646
734
|
.child({ function: func.name, speechId: handle.id })
|
|
647
|
-
.
|
|
648
|
-
try {
|
|
649
|
-
await task;
|
|
650
|
-
} catch {
|
|
651
|
-
this.#logger
|
|
652
|
-
.child({ function: func.name, speechId: handle.id })
|
|
653
|
-
.error('error executing AI function');
|
|
654
|
-
}
|
|
735
|
+
.error('error executing AI function');
|
|
655
736
|
}
|
|
737
|
+
}
|
|
656
738
|
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
739
|
+
const toolCallsInfo = [];
|
|
740
|
+
const toolCallsResults = [];
|
|
741
|
+
for (const fnc of calledFuncs) {
|
|
742
|
+
// ignore the function calls that return void
|
|
743
|
+
const task = await fnc.task;
|
|
744
|
+
if (!task || task.result === undefined) continue;
|
|
745
|
+
toolCallsInfo.push(fnc);
|
|
746
|
+
toolCallsResults.push(ChatMessage.createToolFromFunctionResult(task));
|
|
747
|
+
}
|
|
666
748
|
|
|
667
|
-
|
|
749
|
+
if (!toolCallsInfo.length) return;
|
|
668
750
|
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
751
|
+
// generate an answer from the tool calls
|
|
752
|
+
const extraToolsMessages = [ChatMessage.createToolCalls(toolCallsInfo, collectedText)];
|
|
753
|
+
extraToolsMessages.push(...toolCallsResults);
|
|
672
754
|
|
|
673
|
-
|
|
674
|
-
|
|
755
|
+
// create a nested speech handle
|
|
756
|
+
const newSpeechHandle = SpeechHandle.createToolSpeech(
|
|
757
|
+
handle.allowInterruptions,
|
|
758
|
+
handle.addToChatCtx,
|
|
759
|
+
handle.fncNestedDepth + 1,
|
|
760
|
+
extraToolsMessages,
|
|
761
|
+
);
|
|
675
762
|
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
const answerSynthesis = this.#synthesizeAgentSpeech(handle.id, answerLLMStream);
|
|
681
|
-
// replace the synthesis handle with the new one to allow interruption
|
|
682
|
-
handle.synthesisHandle = answerSynthesis;
|
|
683
|
-
const playHandle = answerSynthesis.play();
|
|
684
|
-
await playHandle.join().await;
|
|
763
|
+
// synthesize the tool speech with the chat ctx from llmStream
|
|
764
|
+
const chatCtx = handle.source.chatCtx.copy();
|
|
765
|
+
chatCtx.messages.push(...extraToolsMessages);
|
|
766
|
+
chatCtx.messages.push(...AgentCallContext.getCurrent().extraChatMessages);
|
|
685
767
|
|
|
686
|
-
|
|
687
|
-
|
|
768
|
+
const answerLLMStream = this.llm.chat({
|
|
769
|
+
chatCtx,
|
|
770
|
+
fncCtx: this.fncCtx,
|
|
771
|
+
});
|
|
772
|
+
const answerSynthesis = this.#synthesizeAgentSpeech(newSpeechHandle.id, answerLLMStream);
|
|
773
|
+
newSpeechHandle.initialize(answerLLMStream, answerSynthesis);
|
|
774
|
+
handle.addNestedSpeech(newSpeechHandle);
|
|
775
|
+
|
|
776
|
+
this.emit(VPAEvent.FUNCTION_CALLS_FINISHED, calledFuncs);
|
|
777
|
+
};
|
|
688
778
|
|
|
689
|
-
|
|
690
|
-
|
|
779
|
+
const task = executeFunctionCalls().then(() => {
|
|
780
|
+
handle.markNestedSpeechFinished();
|
|
781
|
+
});
|
|
782
|
+
while (!handle.nestedSpeechFinished) {
|
|
783
|
+
const changed = handle.nestedSpeechChanged();
|
|
784
|
+
await Promise.race([changed, task]);
|
|
785
|
+
while (handle.nestedSpeechHandles.length) {
|
|
786
|
+
const speech = handle.nestedSpeechHandles[0]!;
|
|
787
|
+
this.#playingSpeech = speech;
|
|
788
|
+
await this.#playSpeech(speech);
|
|
789
|
+
handle.nestedSpeechHandles.shift();
|
|
790
|
+
this.#playingSpeech = handle;
|
|
691
791
|
}
|
|
692
792
|
}
|
|
693
793
|
|
|
694
794
|
if (handle.addToChatCtx && (!userQuestion || handle.userCommitted)) {
|
|
695
|
-
|
|
795
|
+
if (handle.extraToolsMessages) {
|
|
796
|
+
this.chatCtx.messages.push(...handle.extraToolsMessages);
|
|
797
|
+
}
|
|
696
798
|
if (interrupted) {
|
|
697
799
|
collectedText + '…';
|
|
698
800
|
}
|
|
@@ -714,6 +816,8 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
714
816
|
speechId: handle.id,
|
|
715
817
|
})
|
|
716
818
|
.debug('committed agent speech');
|
|
819
|
+
|
|
820
|
+
handle.setDone();
|
|
717
821
|
}
|
|
718
822
|
}
|
|
719
823
|
|
|
@@ -773,6 +877,21 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
|
|
|
773
877
|
|
|
774
878
|
this.#logger.child({ speechId: this.#pendingAgentReply.id }).debug('validated agent reply');
|
|
775
879
|
|
|
880
|
+
if (this.#lastSpeechTime) {
|
|
881
|
+
const timeSinceLastSpeech = Date.now() - this.#lastSpeechTime;
|
|
882
|
+
const transcriptionDelay = Math.max(
|
|
883
|
+
(this.#lastFinalTranscriptTime || 0) - this.#lastSpeechTime,
|
|
884
|
+
0,
|
|
885
|
+
);
|
|
886
|
+
const metrics: PipelineEOUMetrics = {
|
|
887
|
+
timestamp: Date.now(),
|
|
888
|
+
sequenceId: this.#pendingAgentReply.id,
|
|
889
|
+
endOfUtteranceDelay: timeSinceLastSpeech,
|
|
890
|
+
transcriptionDelay,
|
|
891
|
+
};
|
|
892
|
+
this.emit(VPAEvent.METRICS_COLLECTED, metrics);
|
|
893
|
+
}
|
|
894
|
+
|
|
776
895
|
this.#addSpeechForPlayout(this.#pendingAgentReply);
|
|
777
896
|
this.#pendingAgentReply = undefined;
|
|
778
897
|
this.#transcribedInterimText = '';
|
package/src/pipeline/speech_handle.ts
CHANGED
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import { randomUUID } from 'crypto';
|
|
5
|
-
import type { LLMStream } from '../llm/index.js';
|
|
6
|
-
import { Future } from '../utils.js';
|
|
5
|
+
import type { ChatMessage, LLMStream } from '../llm/index.js';
|
|
6
|
+
import { AsyncIterableQueue, Future } from '../utils.js';
|
|
7
7
|
import type { SynthesisHandle } from './agent_output.js';
|
|
8
8
|
|
|
9
9
|
export class SpeechHandle {
|
|
@@ -14,10 +14,16 @@ export class SpeechHandle {
|
|
|
14
14
|
#userQuestion: string;
|
|
15
15
|
#userCommitted = false;
|
|
16
16
|
#initFut = new Future();
|
|
17
|
+
#doneFut = new Future();
|
|
17
18
|
#speechCommitted = false;
|
|
18
19
|
#source?: string | LLMStream | AsyncIterable<string>;
|
|
19
20
|
#synthesisHandle?: SynthesisHandle;
|
|
20
21
|
#initialized = false;
|
|
22
|
+
#fncNestedDepth: number;
|
|
23
|
+
#fncExtraToolsMesages?: ChatMessage[];
|
|
24
|
+
#nestedSpeechHandles: SpeechHandle[] = [];
|
|
25
|
+
#nestedSpeechChanged = new AsyncIterableQueue<void>();
|
|
26
|
+
#nestedSpeechFinished = false;
|
|
21
27
|
|
|
22
28
|
constructor(
|
|
23
29
|
id: string,
|
|
@@ -25,12 +31,16 @@ export class SpeechHandle {
|
|
|
25
31
|
addToChatCtx: boolean,
|
|
26
32
|
isReply: boolean,
|
|
27
33
|
userQuestion: string,
|
|
34
|
+
fncNestedDepth = 0,
|
|
35
|
+
extraToolsMessages: ChatMessage[] | undefined = undefined,
|
|
28
36
|
) {
|
|
29
37
|
this.#id = id;
|
|
30
38
|
this.#allowInterruptions = allowInterruptions;
|
|
31
39
|
this.#addToChatCtx = addToChatCtx;
|
|
32
40
|
this.#isReply = isReply;
|
|
33
41
|
this.#userQuestion = userQuestion;
|
|
42
|
+
this.#fncNestedDepth = fncNestedDepth;
|
|
43
|
+
this.#fncExtraToolsMesages = extraToolsMessages;
|
|
34
44
|
}
|
|
35
45
|
|
|
36
46
|
static createAssistantReply(
|
|
@@ -45,6 +55,23 @@ export class SpeechHandle {
|
|
|
45
55
|
return new SpeechHandle(randomUUID(), allowInterruptions, addToChatCtx, false, '');
|
|
46
56
|
}
|
|
47
57
|
|
|
58
|
+
static createToolSpeech(
|
|
59
|
+
allowInterruptions: boolean,
|
|
60
|
+
addToChatCtx: boolean,
|
|
61
|
+
fncNestedDepth: number,
|
|
62
|
+
extraToolsMessages: ChatMessage[],
|
|
63
|
+
): SpeechHandle {
|
|
64
|
+
return new SpeechHandle(
|
|
65
|
+
randomUUID(),
|
|
66
|
+
allowInterruptions,
|
|
67
|
+
addToChatCtx,
|
|
68
|
+
false,
|
|
69
|
+
'',
|
|
70
|
+
fncNestedDepth,
|
|
71
|
+
extraToolsMessages,
|
|
72
|
+
);
|
|
73
|
+
}
|
|
74
|
+
|
|
48
75
|
async waitForInitialization() {
|
|
49
76
|
await this.#initFut.await;
|
|
50
77
|
}
|
|
@@ -122,6 +149,43 @@ export class SpeechHandle {
|
|
|
122
149
|
return !!this.#synthesisHandle?.interrupted;
|
|
123
150
|
}
|
|
124
151
|
|
|
152
|
+
get fncNestedDepth(): number {
|
|
153
|
+
return this.#fncNestedDepth;
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
get extraToolsMessages(): ChatMessage[] | undefined {
|
|
157
|
+
return this.#fncExtraToolsMesages;
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
addNestedSpeech(handle: SpeechHandle) {
|
|
161
|
+
this.#nestedSpeechHandles.push(handle);
|
|
162
|
+
this.#nestedSpeechChanged.put();
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
get nestedSpeechHandles(): SpeechHandle[] {
|
|
166
|
+
return this.#nestedSpeechHandles;
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
async nestedSpeechChanged() {
|
|
170
|
+
await this.#nestedSpeechChanged.next();
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
get nestedSpeechFinished(): boolean {
|
|
174
|
+
return this.#nestedSpeechFinished;
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
markNestedSpeechFinished() {
|
|
178
|
+
this.#nestedSpeechFinished = true;
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
join() {
|
|
182
|
+
return this.#doneFut.await;
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
setDone() {
|
|
186
|
+
this.#doneFut.resolve();
|
|
187
|
+
}
|
|
188
|
+
|
|
125
189
|
interrupt() {
|
|
126
190
|
if (!this.#allowInterruptions) {
|
|
127
191
|
throw new Error('interruptions are not allowed');
|
|
@@ -131,6 +195,7 @@ export class SpeechHandle {
|
|
|
131
195
|
|
|
132
196
|
cancel() {
|
|
133
197
|
this.#initFut.reject(new Error());
|
|
198
|
+
this.#nestedSpeechChanged.close();
|
|
134
199
|
this.#synthesisHandle?.interrupt();
|
|
135
200
|
}
|
|
136
201
|
}
|
package/src/stt/index.ts
CHANGED
|
package/src/stt/stream_adapter.ts
CHANGED
|
@@ -10,14 +10,20 @@ import { STT, SpeechEventType, SpeechStream } from './stt.js';
|
|
|
10
10
|
export class StreamAdapter extends STT {
|
|
11
11
|
#stt: STT;
|
|
12
12
|
#vad: VAD;
|
|
13
|
+
label: string;
|
|
13
14
|
|
|
14
15
|
constructor(stt: STT, vad: VAD) {
|
|
15
16
|
super({ streaming: true, interimResults: false });
|
|
16
17
|
this.#stt = stt;
|
|
17
18
|
this.#vad = vad;
|
|
19
|
+
this.label = `stt.StreamAdapter<${this.#stt.label}>`;
|
|
20
|
+
|
|
21
|
+
this.#stt.on(SpeechEventType.METRICS_COLLECTED, (metrics) => {
|
|
22
|
+
this.emit(SpeechEventType.METRICS_COLLECTED, metrics);
|
|
23
|
+
});
|
|
18
24
|
}
|
|
19
25
|
|
|
20
|
-
|
|
26
|
+
_recognize(frame: AudioFrame): Promise<SpeechEvent> {
|
|
21
27
|
return this.#stt.recognize(frame);
|
|
22
28
|
}
|
|
23
29
|
|
|
@@ -29,15 +35,21 @@ export class StreamAdapter extends STT {
|
|
|
29
35
|
export class StreamAdapterWrapper extends SpeechStream {
|
|
30
36
|
#stt: STT;
|
|
31
37
|
#vadStream: VADStream;
|
|
38
|
+
label: string;
|
|
32
39
|
|
|
33
40
|
constructor(stt: STT, vad: VAD) {
|
|
34
|
-
super();
|
|
41
|
+
super(stt);
|
|
35
42
|
this.#stt = stt;
|
|
36
43
|
this.#vadStream = vad.stream();
|
|
44
|
+
this.label = `stt.StreamAdapterWrapper<${this.#stt.label}>`;
|
|
37
45
|
|
|
38
46
|
this.#run();
|
|
39
47
|
}
|
|
40
48
|
|
|
49
|
+
async monitorMetrics() {
|
|
50
|
+
return; // do nothing
|
|
51
|
+
}
|
|
52
|
+
|
|
41
53
|
async #run() {
|
|
42
54
|
const forwardInput = async () => {
|
|
43
55
|
for await (const input of this.input) {
|
|
@@ -54,17 +66,17 @@ export class StreamAdapterWrapper extends SpeechStream {
|
|
|
54
66
|
for await (const ev of this.#vadStream) {
|
|
55
67
|
switch (ev.type) {
|
|
56
68
|
case VADEventType.START_OF_SPEECH:
|
|
57
|
-
this.
|
|
69
|
+
this.output.put({ type: SpeechEventType.START_OF_SPEECH });
|
|
58
70
|
break;
|
|
59
71
|
case VADEventType.END_OF_SPEECH:
|
|
60
|
-
this.
|
|
72
|
+
this.output.put({ type: SpeechEventType.END_OF_SPEECH });
|
|
61
73
|
|
|
62
74
|
const event = await this.#stt.recognize(ev.frames);
|
|
63
75
|
if (!event.alternatives![0].text) {
|
|
64
76
|
continue;
|
|
65
77
|
}
|
|
66
78
|
|
|
67
|
-
this.
|
|
79
|
+
this.output.put(event);
|
|
68
80
|
break;
|
|
69
81
|
}
|
|
70
82
|
}
|