@livekit/agents 0.5.1 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. package/dist/index.cjs +3 -0
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.ts +2 -1
  4. package/dist/index.d.ts.map +1 -1
  5. package/dist/index.js +2 -0
  6. package/dist/index.js.map +1 -1
  7. package/dist/llm/index.cjs +2 -0
  8. package/dist/llm/index.cjs.map +1 -1
  9. package/dist/llm/index.d.ts +1 -1
  10. package/dist/llm/index.d.ts.map +1 -1
  11. package/dist/llm/index.js +2 -0
  12. package/dist/llm/index.js.map +1 -1
  13. package/dist/llm/llm.cjs +47 -3
  14. package/dist/llm/llm.cjs.map +1 -1
  15. package/dist/llm/llm.d.ts +15 -2
  16. package/dist/llm/llm.d.ts.map +1 -1
  17. package/dist/llm/llm.js +46 -3
  18. package/dist/llm/llm.js.map +1 -1
  19. package/dist/metrics/base.cjs +44 -0
  20. package/dist/metrics/base.cjs.map +1 -0
  21. package/dist/metrics/base.d.ts +96 -0
  22. package/dist/metrics/base.d.ts.map +1 -0
  23. package/dist/metrics/base.js +20 -0
  24. package/dist/metrics/base.js.map +1 -0
  25. package/dist/metrics/index.cjs +35 -0
  26. package/dist/metrics/index.cjs.map +1 -0
  27. package/dist/metrics/index.d.ts +5 -0
  28. package/dist/metrics/index.d.ts.map +1 -0
  29. package/dist/metrics/index.js +9 -0
  30. package/dist/metrics/index.js.map +1 -0
  31. package/dist/metrics/usage_collector.cjs +53 -0
  32. package/dist/metrics/usage_collector.cjs.map +1 -0
  33. package/dist/metrics/usage_collector.d.ts +14 -0
  34. package/dist/metrics/usage_collector.d.ts.map +1 -0
  35. package/dist/metrics/usage_collector.js +29 -0
  36. package/dist/metrics/usage_collector.js.map +1 -0
  37. package/dist/metrics/utils.cjs +104 -0
  38. package/dist/metrics/utils.cjs.map +1 -0
  39. package/dist/metrics/utils.d.ts +10 -0
  40. package/dist/metrics/utils.d.ts.map +1 -0
  41. package/dist/metrics/utils.js +73 -0
  42. package/dist/metrics/utils.js.map +1 -0
  43. package/dist/multimodal/multimodal_agent.cjs +7 -13
  44. package/dist/multimodal/multimodal_agent.cjs.map +1 -1
  45. package/dist/multimodal/multimodal_agent.d.ts +1 -4
  46. package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
  47. package/dist/multimodal/multimodal_agent.js +7 -13
  48. package/dist/multimodal/multimodal_agent.js.map +1 -1
  49. package/dist/pipeline/agent_output.cjs +9 -2
  50. package/dist/pipeline/agent_output.cjs.map +1 -1
  51. package/dist/pipeline/agent_output.d.ts +1 -0
  52. package/dist/pipeline/agent_output.d.ts.map +1 -1
  53. package/dist/pipeline/agent_output.js +9 -2
  54. package/dist/pipeline/agent_output.js.map +1 -1
  55. package/dist/pipeline/index.cjs +2 -0
  56. package/dist/pipeline/index.cjs.map +1 -1
  57. package/dist/pipeline/index.d.ts +1 -1
  58. package/dist/pipeline/index.d.ts.map +1 -1
  59. package/dist/pipeline/index.js +3 -1
  60. package/dist/pipeline/index.js.map +1 -1
  61. package/dist/pipeline/pipeline_agent.cjs +168 -70
  62. package/dist/pipeline/pipeline_agent.cjs.map +1 -1
  63. package/dist/pipeline/pipeline_agent.d.ts +10 -4
  64. package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
  65. package/dist/pipeline/pipeline_agent.js +171 -73
  66. package/dist/pipeline/pipeline_agent.js.map +1 -1
  67. package/dist/pipeline/speech_handle.cjs +49 -1
  68. package/dist/pipeline/speech_handle.cjs.map +1 -1
  69. package/dist/pipeline/speech_handle.d.ts +12 -2
  70. package/dist/pipeline/speech_handle.d.ts.map +1 -1
  71. package/dist/pipeline/speech_handle.js +50 -2
  72. package/dist/pipeline/speech_handle.js.map +1 -1
  73. package/dist/stt/index.cjs.map +1 -1
  74. package/dist/stt/index.d.ts +1 -1
  75. package/dist/stt/index.d.ts.map +1 -1
  76. package/dist/stt/index.js.map +1 -1
  77. package/dist/stt/stream_adapter.cjs +15 -5
  78. package/dist/stt/stream_adapter.cjs.map +1 -1
  79. package/dist/stt/stream_adapter.d.ts +4 -1
  80. package/dist/stt/stream_adapter.d.ts.map +1 -1
  81. package/dist/stt/stream_adapter.js +15 -5
  82. package/dist/stt/stream_adapter.js.map +1 -1
  83. package/dist/stt/stt.cjs +46 -2
  84. package/dist/stt/stt.cjs.map +1 -1
  85. package/dist/stt/stt.d.ts +25 -3
  86. package/dist/stt/stt.d.ts.map +1 -1
  87. package/dist/stt/stt.js +46 -2
  88. package/dist/stt/stt.js.map +1 -1
  89. package/dist/tts/index.cjs +4 -2
  90. package/dist/tts/index.cjs.map +1 -1
  91. package/dist/tts/index.d.ts +1 -1
  92. package/dist/tts/index.d.ts.map +1 -1
  93. package/dist/tts/index.js +3 -1
  94. package/dist/tts/index.js.map +1 -1
  95. package/dist/tts/stream_adapter.cjs +14 -3
  96. package/dist/tts/stream_adapter.cjs.map +1 -1
  97. package/dist/tts/stream_adapter.d.ts +3 -0
  98. package/dist/tts/stream_adapter.d.ts.map +1 -1
  99. package/dist/tts/stream_adapter.js +15 -4
  100. package/dist/tts/stream_adapter.js.map +1 -1
  101. package/dist/tts/tts.cjs +109 -6
  102. package/dist/tts/tts.cjs.map +1 -1
  103. package/dist/tts/tts.d.ts +24 -1
  104. package/dist/tts/tts.d.ts.map +1 -1
  105. package/dist/tts/tts.js +107 -5
  106. package/dist/tts/tts.js.map +1 -1
  107. package/dist/vad.cjs +43 -2
  108. package/dist/vad.cjs.map +1 -1
  109. package/dist/vad.d.ts +21 -4
  110. package/dist/vad.d.ts.map +1 -1
  111. package/dist/vad.js +43 -2
  112. package/dist/vad.js.map +1 -1
  113. package/package.json +1 -1
  114. package/src/index.ts +2 -1
  115. package/src/llm/index.ts +2 -0
  116. package/src/llm/llm.ts +55 -3
  117. package/src/metrics/base.ts +127 -0
  118. package/src/metrics/index.ts +20 -0
  119. package/src/metrics/usage_collector.ts +40 -0
  120. package/src/metrics/utils.ts +100 -0
  121. package/src/multimodal/multimodal_agent.ts +12 -17
  122. package/src/pipeline/agent_output.ts +14 -7
  123. package/src/pipeline/index.ts +1 -1
  124. package/src/pipeline/pipeline_agent.ts +210 -95
  125. package/src/pipeline/speech_handle.ts +67 -2
  126. package/src/stt/index.ts +2 -0
  127. package/src/stt/stream_adapter.ts +17 -5
  128. package/src/stt/stt.ts +67 -3
  129. package/src/tts/index.ts +2 -0
  130. package/src/tts/stream_adapter.ts +17 -4
  131. package/src/tts/tts.ts +127 -4
  132. package/src/vad.ts +61 -4
@@ -17,10 +17,11 @@ import type {
17
17
  FunctionContext,
18
18
  LLM,
19
19
  } from '../llm/index.js';
20
- import { LLMStream } from '../llm/index.js';
20
+ import { LLMEvent, LLMStream } from '../llm/index.js';
21
21
  import { ChatContext, ChatMessage, ChatRole } from '../llm/index.js';
22
22
  import { log } from '../log.js';
23
- import { type STT, StreamAdapter as STTStreamAdapter } from '../stt/index.js';
23
+ import type { AgentMetrics, PipelineEOUMetrics } from '../metrics/base.js';
24
+ import { type STT, StreamAdapter as STTStreamAdapter, SpeechEventType } from '../stt/index.js';
24
25
  import {
25
26
  SentenceTokenizer as BasicSentenceTokenizer,
26
27
  WordTokenizer as BasicWordTokenizer,
@@ -28,9 +29,9 @@ import {
28
29
  } from '../tokenize/basic/index.js';
29
30
  import type { SentenceTokenizer, WordTokenizer } from '../tokenize/tokenizer.js';
30
31
  import type { TTS } from '../tts/index.js';
31
- import { StreamAdapter as TTSStreamAdapter } from '../tts/index.js';
32
+ import { TTSEvent, StreamAdapter as TTSStreamAdapter } from '../tts/index.js';
32
33
  import { AsyncIterableQueue, CancellablePromise, Future, gracefullyCancel } from '../utils.js';
33
- import type { VAD, VADEvent } from '../vad.js';
34
+ import { type VAD, type VADEvent, VADEventType } from '../vad.js';
34
35
  import type { SpeechSource, SynthesisHandle } from './agent_output.js';
35
36
  import { AgentOutput } from './agent_output.js';
36
37
  import { AgentPlayout, AgentPlayoutEvent } from './agent_playout.js';
@@ -39,6 +40,7 @@ import { SpeechHandle } from './speech_handle.js';
39
40
 
40
41
  export type AgentState = 'initializing' | 'thinking' | 'listening' | 'speaking';
41
42
  export const AGENT_STATE_ATTRIBUTE = 'lk.agent.state';
43
+ let speechData: { sequenceId: string } | undefined;
42
44
 
43
45
  export type BeforeLLMCallback = (
44
46
  agent: VoicePipelineAgent,
@@ -60,6 +62,7 @@ export enum VPAEvent {
60
62
  AGENT_SPEECH_INTERRUPTED,
61
63
  FUNCTION_CALLS_COLLECTED,
62
64
  FUNCTION_CALLS_FINISHED,
65
+ METRICS_COLLECTED,
63
66
  }
64
67
 
65
68
  export type VPACallbacks = {
@@ -72,12 +75,14 @@ export type VPACallbacks = {
72
75
  [VPAEvent.AGENT_SPEECH_INTERRUPTED]: (msg: ChatMessage) => void;
73
76
  [VPAEvent.FUNCTION_CALLS_COLLECTED]: (funcs: FunctionCallInfo[]) => void;
74
77
  [VPAEvent.FUNCTION_CALLS_FINISHED]: (funcs: CallableFunctionResult[]) => void;
78
+ [VPAEvent.METRICS_COLLECTED]: (metrics: AgentMetrics) => void;
75
79
  };
76
80
 
77
81
  export class AgentCallContext {
78
82
  #agent: VoicePipelineAgent;
79
83
  #llmStream: LLMStream;
80
84
  #metadata = new Map<string, any>();
85
+ #extraChatMessages: ChatMessage[] = [];
81
86
  static #current: AgentCallContext;
82
87
 
83
88
  constructor(agent: VoicePipelineAgent, llmStream: LLMStream) {
@@ -105,6 +110,14 @@ export class AgentCallContext {
105
110
  get llmStream(): LLMStream {
106
111
  return this.#llmStream;
107
112
  }
113
+
114
+ get extraChatMessages() {
115
+ return this.#extraChatMessages;
116
+ }
117
+
118
+ addExtraChatMessage(message: ChatMessage) {
119
+ this.#extraChatMessages.push(message);
120
+ }
108
121
  }
109
122
 
110
123
  const defaultBeforeLLMCallback: BeforeLLMCallback = (
@@ -171,7 +184,7 @@ export interface VPAOptions {
171
184
  interruptMinWords: number;
172
185
  /** Delay to wait before considering the user speech done. */
173
186
  minEndpointingDelay: number;
174
- maxRecursiveFncCalls: number;
187
+ maxNestedFncCalls: number;
175
188
  /* Whether to preemptively synthesize responses. */
176
189
  preemptiveSynthesis: boolean;
177
190
  /*
@@ -201,7 +214,7 @@ const defaultVPAOptions: VPAOptions = {
201
214
  interruptSpeechDuration: 50,
202
215
  interruptMinWords: 0,
203
216
  minEndpointingDelay: 500,
204
- maxRecursiveFncCalls: 1,
217
+ maxNestedFncCalls: 1,
205
218
  preemptiveSynthesis: false,
206
219
  beforeLLMCallback: defaultBeforeLLMCallback,
207
220
  beforeTTSCallback: defaultBeforeTTSCallback,
@@ -229,7 +242,6 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
229
242
  #transcribedInterimText = '';
230
243
  #speechQueueOpen = new Future();
231
244
  #speechQueue = new AsyncIterableQueue<SpeechHandle | typeof VoicePipelineAgent.FLUSH_SENTINEL>();
232
- #lastEndOfSpeechTime?: number;
233
245
  #updateStateTask?: CancellablePromise<void>;
234
246
  #started = false;
235
247
  #room?: Room;
@@ -237,6 +249,8 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
237
249
  #deferredValidation: DeferredReplyValidation;
238
250
  #logger = log();
239
251
  #agentPublication?: LocalTrackPublication;
252
+ #lastFinalTranscriptTime?: number;
253
+ #lastSpeechTime?: number;
240
254
 
241
255
  constructor(
242
256
  /** Voice Activity Detection instance. */
@@ -317,6 +331,25 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
317
331
  if (this.#started) {
318
332
  throw new Error('voice assistant already started');
319
333
  }
334
+
335
+ this.#stt.on(SpeechEventType.METRICS_COLLECTED, (metrics) => {
336
+ this.emit(VPAEvent.METRICS_COLLECTED, metrics);
337
+ });
338
+
339
+ this.#tts.on(TTSEvent.METRICS_COLLECTED, (metrics) => {
340
+ if (!speechData) return;
341
+ this.emit(VPAEvent.METRICS_COLLECTED, { ...metrics, sequenceId: speechData.sequenceId });
342
+ });
343
+
344
+ this.#llm.on(LLMEvent.METRICS_COLLECTED, (metrics) => {
345
+ if (!speechData) return;
346
+ this.emit(VPAEvent.METRICS_COLLECTED, { ...metrics, sequenceId: speechData.sequenceId });
347
+ });
348
+
349
+ this.#vad.on(VADEventType.METRICS_COLLECTED, (metrics) => {
350
+ this.emit(VPAEvent.METRICS_COLLECTED, metrics);
351
+ });
352
+
320
353
  room.on(RoomEvent.ParticipantConnected, (participant: RemoteParticipant) => {
321
354
  // automatically link to the first participant that connects, if not already linked
322
355
  if (this.#participant) {
@@ -344,12 +377,51 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
344
377
  source: string | LLMStream | AsyncIterable<string>,
345
378
  allowInterruptions = true,
346
379
  addToChatCtx = true,
347
- ) {
380
+ ): Promise<SpeechHandle> {
348
381
  await this.#trackPublishedFut.await;
382
+
383
+ let callContext: AgentCallContext | undefined;
384
+ let fncSource: string | AsyncIterable<string> | undefined;
385
+ if (addToChatCtx) {
386
+ callContext = AgentCallContext.getCurrent();
387
+ if (source instanceof LLMStream) {
388
+ this.#logger.warn('LLMStream will be ignored for function call chat context');
389
+ } else if (typeof source === 'string') {
390
+ fncSource = source;
391
+ } else {
392
+ fncSource = source;
393
+ source = new AsyncIterableQueue<string>();
394
+ }
395
+ }
396
+
349
397
  const newHandle = SpeechHandle.createAssistantSpeech(allowInterruptions, addToChatCtx);
350
398
  const synthesisHandle = this.#synthesizeAgentSpeech(newHandle.id, source);
351
399
  newHandle.initialize(source, synthesisHandle);
352
- this.#addSpeechForPlayout(newHandle);
400
+
401
+ if (this.#playingSpeech && !this.#playingSpeech.nestedSpeechFinished) {
402
+ this.#playingSpeech.addNestedSpeech(newHandle);
403
+ } else {
404
+ this.#addSpeechForPlayout(newHandle);
405
+ }
406
+
407
+ if (callContext && fncSource) {
408
+ let text: string;
409
+ if (typeof source === 'string') {
410
+ text = fncSource as string;
411
+ } else {
412
+ text = '';
413
+ for await (const chunk of fncSource) {
414
+ (source as AsyncIterableQueue<string>).put(chunk);
415
+ text += chunk;
416
+ }
417
+ (source as AsyncIterableQueue<string>).close();
418
+ }
419
+
420
+ callContext.addExtraChatMessage(ChatMessage.create({ text, role: ChatRole.ASSISTANT }));
421
+ this.#logger.child({ text }).debug('added speech to function call chat context');
422
+ }
423
+
424
+ return newHandle;
353
425
  }
354
426
 
355
427
  #updateState(state: AgentState, delay = 0) {
@@ -410,11 +482,14 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
410
482
  if (event.speechDuration >= this.#opts.interruptSpeechDuration) {
411
483
  this.#interruptIfPossible();
412
484
  }
485
+
486
+ if (event.rawAccumulatedSpeech > 0) {
487
+ this.#lastSpeechTime = Date.now() - event.rawAccumulatedSilence;
488
+ }
413
489
  });
414
490
  this.#humanInput.on(HumanInputEvent.END_OF_SPEECH, (event) => {
415
491
  this.emit(VPAEvent.USER_STARTED_SPEAKING);
416
492
  this.#deferredValidation.onHumanEndOfSpeech(event);
417
- this.#lastEndOfSpeechTime = Date.now();
418
493
  });
419
494
  this.#humanInput.on(HumanInputEvent.INTERIM_TRANSCRIPT, (event) => {
420
495
  this.#transcribedInterimText = event.alternatives![0].text;
@@ -423,7 +498,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
423
498
  const newTranscript = event.alternatives![0].text;
424
499
  if (!newTranscript) return;
425
500
 
426
- this.#logger.child({ userTranscript: newTranscript }).debug('received user transcript');
501
+ this.#lastFinalTranscriptTime = Date.now();
427
502
  this.#transcribedText += (this.#transcribedText ? ' ' : '') + newTranscript;
428
503
 
429
504
  if (
@@ -520,8 +595,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
520
595
  // add it to the chat context for this new reply synthesis
521
596
  copiedCtx.messages.push(
522
597
  ChatMessage.create({
523
- // TODO(nbsp): uhhh unsure where to get the played text here
524
- // text: playingSpeech.synthesisHandle.(theres no ttsForwarder here)
598
+ text: playingSpeech.synthesisHandle.text,
525
599
  role: ChatRole.ASSISTANT,
526
600
  }),
527
601
  );
@@ -535,33 +609,31 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
535
609
  }),
536
610
  );
537
611
 
538
- if (cancelled) resolve();
539
- let llmStream = await this.#opts.beforeLLMCallback(this, copiedCtx);
540
- if (llmStream === false) {
541
- handle?.cancel();
542
- return;
543
- }
544
-
545
- if (cancelled) resolve();
546
- // fallback to default impl if no custom/user stream is returned
547
- if (!(llmStream instanceof LLMStream)) {
548
- llmStream = (await defaultBeforeLLMCallback(this, copiedCtx)) as LLMStream;
549
- }
612
+ speechData = { sequenceId: handle!.id };
550
613
 
551
- if (handle!.interrupted) {
552
- return;
553
- }
614
+ try {
615
+ if (cancelled) resolve();
616
+ let llmStream = await this.#opts.beforeLLMCallback(this, copiedCtx);
617
+ if (llmStream === false) {
618
+ handle?.cancel();
619
+ return;
620
+ }
554
621
 
555
- const synthesisHandle = this.#synthesizeAgentSpeech(handle!.id, llmStream);
556
- handle!.initialize(llmStream, synthesisHandle);
622
+ if (cancelled) resolve();
623
+ // fallback to default impl if no custom/user stream is returned
624
+ if (!(llmStream instanceof LLMStream)) {
625
+ llmStream = (await defaultBeforeLLMCallback(this, copiedCtx)) as LLMStream;
626
+ }
557
627
 
558
- // TODO(theomonnom): find a more reliable way to get the elapsed time from the last EOS
559
- // (VAD could not have detected any speech — maybe unlikely?)
560
- const elapsed = !!this.#lastEndOfSpeechTime
561
- ? Math.round((Date.now() - this.#lastEndOfSpeechTime) * 1000) / 1000
562
- : -1;
628
+ if (handle!.interrupted) {
629
+ return;
630
+ }
563
631
 
564
- this.#logger.child({ speechId: handle!.id, elapsed }).debug('synthesizing agent reply');
632
+ const synthesisHandle = this.#synthesizeAgentSpeech(handle!.id, llmStream);
633
+ handle!.initialize(llmStream, synthesisHandle);
634
+ } finally {
635
+ speechData = undefined;
636
+ }
565
637
  resolve();
566
638
  });
567
639
  }
@@ -620,83 +692,109 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
620
692
  }
621
693
  commitUserQuestionIfNeeded();
622
694
 
623
- // TODO(nbsp): what goes here
624
- let collectedText = '';
695
+ const collectedText = handle.synthesisHandle.text;
625
696
  const isUsingTools = handle.source instanceof LLMStream && !!handle.source.functionCalls.length;
626
- const extraToolsMessages = []; // additional messages from the functions to add to the context
627
- let interrupted = handle.interrupted;
697
+ const interrupted = handle.interrupted;
698
+
699
+ const executeFunctionCalls = async () => {
700
+ // if the answer is using tools, execute the functions and automatically generate
701
+ // a response to the user question from the returned values
702
+ if (!isUsingTools || interrupted) return;
703
+
704
+ if (handle.fncNestedDepth >= this.#opts.maxNestedFncCalls) {
705
+ this.#logger
706
+ .child({ speechId: handle.id, fncNestedDepth: handle.fncNestedDepth })
707
+ .warn('max function calls nested depth reached');
708
+ return;
709
+ }
628
710
 
629
- // if the answer is using tools, execute the functions and automatically generate
630
- // a response to the user question from the returned values
631
- if (isUsingTools && !interrupted) {
632
711
  if (!userQuestion || !handle.userCommitted) {
633
712
  throw new Error('user speech should have been committed before using tools');
634
713
  }
635
714
  const llmStream = handle.source;
636
- let newFunctionCalls = llmStream.functionCalls;
637
-
638
- for (let i = 0; i < this.#opts.maxRecursiveFncCalls; i++) {
639
- this.emit(VPAEvent.FUNCTION_CALLS_COLLECTED, newFunctionCalls);
640
- const calledFuncs: FunctionCallInfo[] = [];
641
- for (const func of newFunctionCalls) {
642
- const task = func.func.execute(func.params).then(
643
- (result) => ({ name: func.name, toolCallId: func.toolCallId, result }),
644
- (error) => ({ name: func.name, toolCallId: func.toolCallId, error }),
645
- );
646
- calledFuncs.push({ ...func, task });
715
+ const newFunctionCalls = llmStream.functionCalls;
716
+
717
+ new AgentCallContext(this, llmStream);
718
+
719
+ this.emit(VPAEvent.FUNCTION_CALLS_COLLECTED, newFunctionCalls);
720
+ const calledFuncs: FunctionCallInfo[] = [];
721
+ for (const func of newFunctionCalls) {
722
+ const task = func.func.execute(func.params).then(
723
+ (result) => ({ name: func.name, toolCallId: func.toolCallId, result }),
724
+ (error) => ({ name: func.name, toolCallId: func.toolCallId, error }),
725
+ );
726
+ calledFuncs.push({ ...func, task });
727
+ this.#logger
728
+ .child({ function: func.name, speechId: handle.id })
729
+ .debug('executing AI function');
730
+ try {
731
+ await task;
732
+ } catch {
647
733
  this.#logger
648
734
  .child({ function: func.name, speechId: handle.id })
649
- .debug('executing AI function');
650
- try {
651
- await task;
652
- } catch {
653
- this.#logger
654
- .child({ function: func.name, speechId: handle.id })
655
- .error('error executing AI function');
656
- }
735
+ .error('error executing AI function');
657
736
  }
737
+ }
658
738
 
659
- const toolCallsInfo = [];
660
- const toolCallsResults = [];
661
- for (const fnc of calledFuncs) {
662
- // ignore the function calls that return void
663
- const task = await fnc.task;
664
- if (!task || task.result === undefined) continue;
665
- toolCallsInfo.push(fnc);
666
- toolCallsResults.push(ChatMessage.createToolFromFunctionResult(task));
667
- }
739
+ const toolCallsInfo = [];
740
+ const toolCallsResults = [];
741
+ for (const fnc of calledFuncs) {
742
+ // ignore the function calls that return void
743
+ const task = await fnc.task;
744
+ if (!task || task.result === undefined) continue;
745
+ toolCallsInfo.push(fnc);
746
+ toolCallsResults.push(ChatMessage.createToolFromFunctionResult(task));
747
+ }
748
+
749
+ if (!toolCallsInfo.length) return;
668
750
 
669
- if (!toolCallsInfo.length) break;
751
+ // generate an answer from the tool calls
752
+ const extraToolsMessages = [ChatMessage.createToolCalls(toolCallsInfo, collectedText)];
753
+ extraToolsMessages.push(...toolCallsResults);
670
754
 
671
- // generate an answer from the tool calls
672
- extraToolsMessages.push(ChatMessage.createToolCalls(toolCallsInfo, collectedText));
673
- extraToolsMessages.push(...toolCallsResults);
755
+ // create a nested speech handle
756
+ const newSpeechHandle = SpeechHandle.createToolSpeech(
757
+ handle.allowInterruptions,
758
+ handle.addToChatCtx,
759
+ handle.fncNestedDepth + 1,
760
+ extraToolsMessages,
761
+ );
674
762
 
675
- const chatCtx = handle.source.chatCtx.copy();
676
- chatCtx.messages.push(...extraToolsMessages);
763
+ // synthesize the tool speech with the chat ctx from llmStream
764
+ const chatCtx = handle.source.chatCtx.copy();
765
+ chatCtx.messages.push(...extraToolsMessages);
766
+ chatCtx.messages.push(...AgentCallContext.getCurrent().extraChatMessages);
677
767
 
678
- const answerLLMStream = this.llm.chat({
679
- chatCtx,
680
- fncCtx: this.fncCtx,
681
- });
682
- const answerSynthesis = this.#synthesizeAgentSpeech(handle.id, answerLLMStream);
683
- // replace the synthesis handle with the new one to allow interruption
684
- handle.synthesisHandle = answerSynthesis;
685
- const playHandle = answerSynthesis.play();
686
- await playHandle.join().await;
687
-
688
- // TODO(nbsp): what text goes here
689
- collectedText = '';
690
- interrupted = answerSynthesis.interrupted;
691
- newFunctionCalls = answerLLMStream.functionCalls;
692
-
693
- this.emit(VPAEvent.FUNCTION_CALLS_FINISHED, calledFuncs);
694
- if (!newFunctionCalls) break;
768
+ const answerLLMStream = this.llm.chat({
769
+ chatCtx,
770
+ fncCtx: this.fncCtx,
771
+ });
772
+ const answerSynthesis = this.#synthesizeAgentSpeech(newSpeechHandle.id, answerLLMStream);
773
+ newSpeechHandle.initialize(answerLLMStream, answerSynthesis);
774
+ handle.addNestedSpeech(newSpeechHandle);
775
+
776
+ this.emit(VPAEvent.FUNCTION_CALLS_FINISHED, calledFuncs);
777
+ };
778
+
779
+ const task = executeFunctionCalls().then(() => {
780
+ handle.markNestedSpeechFinished();
781
+ });
782
+ while (!handle.nestedSpeechFinished) {
783
+ const changed = handle.nestedSpeechChanged();
784
+ await Promise.race([changed, task]);
785
+ while (handle.nestedSpeechHandles.length) {
786
+ const speech = handle.nestedSpeechHandles[0]!;
787
+ this.#playingSpeech = speech;
788
+ await this.#playSpeech(speech);
789
+ handle.nestedSpeechHandles.shift();
790
+ this.#playingSpeech = handle;
695
791
  }
696
792
  }
697
793
 
698
794
  if (handle.addToChatCtx && (!userQuestion || handle.userCommitted)) {
699
- this.chatCtx.messages.push(...extraToolsMessages);
795
+ if (handle.extraToolsMessages) {
796
+ this.chatCtx.messages.push(...handle.extraToolsMessages);
797
+ }
700
798
  if (interrupted) {
701
799
  collectedText + '…';
702
800
  }
@@ -718,6 +816,8 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
718
816
  speechId: handle.id,
719
817
  })
720
818
  .debug('committed agent speech');
819
+
820
+ handle.setDone();
721
821
  }
722
822
  }
723
823
 
@@ -777,6 +877,21 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
777
877
 
778
878
  this.#logger.child({ speechId: this.#pendingAgentReply.id }).debug('validated agent reply');
779
879
 
880
+ if (this.#lastSpeechTime) {
881
+ const timeSinceLastSpeech = Date.now() - this.#lastSpeechTime;
882
+ const transcriptionDelay = Math.max(
883
+ (this.#lastFinalTranscriptTime || 0) - this.#lastSpeechTime,
884
+ 0,
885
+ );
886
+ const metrics: PipelineEOUMetrics = {
887
+ timestamp: Date.now(),
888
+ sequenceId: this.#pendingAgentReply.id,
889
+ endOfUtteranceDelay: timeSinceLastSpeech,
890
+ transcriptionDelay,
891
+ };
892
+ this.emit(VPAEvent.METRICS_COLLECTED, metrics);
893
+ }
894
+
780
895
  this.#addSpeechForPlayout(this.#pendingAgentReply);
781
896
  this.#pendingAgentReply = undefined;
782
897
  this.#transcribedInterimText = '';
@@ -2,8 +2,8 @@
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import { randomUUID } from 'crypto';
5
- import type { LLMStream } from '../llm/index.js';
6
- import { Future } from '../utils.js';
5
+ import type { ChatMessage, LLMStream } from '../llm/index.js';
6
+ import { AsyncIterableQueue, Future } from '../utils.js';
7
7
  import type { SynthesisHandle } from './agent_output.js';
8
8
 
9
9
  export class SpeechHandle {
@@ -14,10 +14,16 @@ export class SpeechHandle {
14
14
  #userQuestion: string;
15
15
  #userCommitted = false;
16
16
  #initFut = new Future();
17
+ #doneFut = new Future();
17
18
  #speechCommitted = false;
18
19
  #source?: string | LLMStream | AsyncIterable<string>;
19
20
  #synthesisHandle?: SynthesisHandle;
20
21
  #initialized = false;
22
+ #fncNestedDepth: number;
23
+ #fncExtraToolsMesages?: ChatMessage[];
24
+ #nestedSpeechHandles: SpeechHandle[] = [];
25
+ #nestedSpeechChanged = new AsyncIterableQueue<void>();
26
+ #nestedSpeechFinished = false;
21
27
 
22
28
  constructor(
23
29
  id: string,
@@ -25,12 +31,16 @@ export class SpeechHandle {
25
31
  addToChatCtx: boolean,
26
32
  isReply: boolean,
27
33
  userQuestion: string,
34
+ fncNestedDepth = 0,
35
+ extraToolsMessages: ChatMessage[] | undefined = undefined,
28
36
  ) {
29
37
  this.#id = id;
30
38
  this.#allowInterruptions = allowInterruptions;
31
39
  this.#addToChatCtx = addToChatCtx;
32
40
  this.#isReply = isReply;
33
41
  this.#userQuestion = userQuestion;
42
+ this.#fncNestedDepth = fncNestedDepth;
43
+ this.#fncExtraToolsMesages = extraToolsMessages;
34
44
  }
35
45
 
36
46
  static createAssistantReply(
@@ -45,6 +55,23 @@ export class SpeechHandle {
45
55
  return new SpeechHandle(randomUUID(), allowInterruptions, addToChatCtx, false, '');
46
56
  }
47
57
 
58
+ static createToolSpeech(
59
+ allowInterruptions: boolean,
60
+ addToChatCtx: boolean,
61
+ fncNestedDepth: number,
62
+ extraToolsMessages: ChatMessage[],
63
+ ): SpeechHandle {
64
+ return new SpeechHandle(
65
+ randomUUID(),
66
+ allowInterruptions,
67
+ addToChatCtx,
68
+ false,
69
+ '',
70
+ fncNestedDepth,
71
+ extraToolsMessages,
72
+ );
73
+ }
74
+
48
75
  async waitForInitialization() {
49
76
  await this.#initFut.await;
50
77
  }
@@ -122,6 +149,43 @@ export class SpeechHandle {
122
149
  return !!this.#synthesisHandle?.interrupted;
123
150
  }
124
151
 
152
+ get fncNestedDepth(): number {
153
+ return this.#fncNestedDepth;
154
+ }
155
+
156
+ get extraToolsMessages(): ChatMessage[] | undefined {
157
+ return this.#fncExtraToolsMesages;
158
+ }
159
+
160
+ addNestedSpeech(handle: SpeechHandle) {
161
+ this.#nestedSpeechHandles.push(handle);
162
+ this.#nestedSpeechChanged.put();
163
+ }
164
+
165
+ get nestedSpeechHandles(): SpeechHandle[] {
166
+ return this.#nestedSpeechHandles;
167
+ }
168
+
169
+ async nestedSpeechChanged() {
170
+ await this.#nestedSpeechChanged.next();
171
+ }
172
+
173
+ get nestedSpeechFinished(): boolean {
174
+ return this.#nestedSpeechFinished;
175
+ }
176
+
177
+ markNestedSpeechFinished() {
178
+ this.#nestedSpeechFinished = true;
179
+ }
180
+
181
+ join() {
182
+ return this.#doneFut.await;
183
+ }
184
+
185
+ setDone() {
186
+ this.#doneFut.resolve();
187
+ }
188
+
125
189
  interrupt() {
126
190
  if (!this.#allowInterruptions) {
127
191
  throw new Error('interruptions are not allowed');
@@ -131,6 +195,7 @@ export class SpeechHandle {
131
195
 
132
196
  cancel() {
133
197
  this.#initFut.reject(new Error());
198
+ this.#nestedSpeechChanged.close();
134
199
  this.#synthesisHandle?.interrupt();
135
200
  }
136
201
  }
package/src/stt/index.ts CHANGED
@@ -6,6 +6,8 @@ export {
6
6
  type SpeechEvent,
7
7
  type SpeechData,
8
8
  type STTCapabilities,
9
+ type RecognitionUsage,
10
+ type STTCallbacks,
9
11
  SpeechEventType,
10
12
  STT,
11
13
  SpeechStream,
@@ -10,14 +10,20 @@ import { STT, SpeechEventType, SpeechStream } from './stt.js';
10
10
  export class StreamAdapter extends STT {
11
11
  #stt: STT;
12
12
  #vad: VAD;
13
+ label: string;
13
14
 
14
15
  constructor(stt: STT, vad: VAD) {
15
16
  super({ streaming: true, interimResults: false });
16
17
  this.#stt = stt;
17
18
  this.#vad = vad;
19
+ this.label = `stt.StreamAdapter<${this.#stt.label}>`;
20
+
21
+ this.#stt.on(SpeechEventType.METRICS_COLLECTED, (metrics) => {
22
+ this.emit(SpeechEventType.METRICS_COLLECTED, metrics);
23
+ });
18
24
  }
19
25
 
20
- recognize(frame: AudioFrame): Promise<SpeechEvent> {
26
+ _recognize(frame: AudioFrame): Promise<SpeechEvent> {
21
27
  return this.#stt.recognize(frame);
22
28
  }
23
29
 
@@ -29,15 +35,21 @@ export class StreamAdapter extends STT {
29
35
  export class StreamAdapterWrapper extends SpeechStream {
30
36
  #stt: STT;
31
37
  #vadStream: VADStream;
38
+ label: string;
32
39
 
33
40
  constructor(stt: STT, vad: VAD) {
34
- super();
41
+ super(stt);
35
42
  this.#stt = stt;
36
43
  this.#vadStream = vad.stream();
44
+ this.label = `stt.StreamAdapterWrapper<${this.#stt.label}>`;
37
45
 
38
46
  this.#run();
39
47
  }
40
48
 
49
+ async monitorMetrics() {
50
+ return; // do nothing
51
+ }
52
+
41
53
  async #run() {
42
54
  const forwardInput = async () => {
43
55
  for await (const input of this.input) {
@@ -54,17 +66,17 @@ export class StreamAdapterWrapper extends SpeechStream {
54
66
  for await (const ev of this.#vadStream) {
55
67
  switch (ev.type) {
56
68
  case VADEventType.START_OF_SPEECH:
57
- this.queue.put({ type: SpeechEventType.START_OF_SPEECH });
69
+ this.output.put({ type: SpeechEventType.START_OF_SPEECH });
58
70
  break;
59
71
  case VADEventType.END_OF_SPEECH:
60
- this.queue.put({ type: SpeechEventType.END_OF_SPEECH });
72
+ this.output.put({ type: SpeechEventType.END_OF_SPEECH });
61
73
 
62
74
  const event = await this.#stt.recognize(ev.frames);
63
75
  if (!event.alternatives![0].text) {
64
76
  continue;
65
77
  }
66
78
 
67
- this.queue.put(event);
79
+ this.output.put(event);
68
80
  break;
69
81
  }
70
82
  }