@livekit/agents 0.5.2 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140)
  1. package/dist/index.cjs +3 -0
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.ts +2 -1
  4. package/dist/index.d.ts.map +1 -1
  5. package/dist/index.js +2 -0
  6. package/dist/index.js.map +1 -1
  7. package/dist/job.cjs.map +1 -1
  8. package/dist/job.js.map +1 -1
  9. package/dist/llm/index.cjs +2 -0
  10. package/dist/llm/index.cjs.map +1 -1
  11. package/dist/llm/index.d.ts +1 -1
  12. package/dist/llm/index.d.ts.map +1 -1
  13. package/dist/llm/index.js +2 -0
  14. package/dist/llm/index.js.map +1 -1
  15. package/dist/llm/llm.cjs +47 -3
  16. package/dist/llm/llm.cjs.map +1 -1
  17. package/dist/llm/llm.d.ts +15 -2
  18. package/dist/llm/llm.d.ts.map +1 -1
  19. package/dist/llm/llm.js +46 -3
  20. package/dist/llm/llm.js.map +1 -1
  21. package/dist/metrics/base.cjs +44 -0
  22. package/dist/metrics/base.cjs.map +1 -0
  23. package/dist/metrics/base.d.ts +96 -0
  24. package/dist/metrics/base.d.ts.map +1 -0
  25. package/dist/metrics/base.js +20 -0
  26. package/dist/metrics/base.js.map +1 -0
  27. package/dist/metrics/index.cjs +35 -0
  28. package/dist/metrics/index.cjs.map +1 -0
  29. package/dist/metrics/index.d.ts +5 -0
  30. package/dist/metrics/index.d.ts.map +1 -0
  31. package/dist/metrics/index.js +9 -0
  32. package/dist/metrics/index.js.map +1 -0
  33. package/dist/metrics/usage_collector.cjs +53 -0
  34. package/dist/metrics/usage_collector.cjs.map +1 -0
  35. package/dist/metrics/usage_collector.d.ts +14 -0
  36. package/dist/metrics/usage_collector.d.ts.map +1 -0
  37. package/dist/metrics/usage_collector.js +29 -0
  38. package/dist/metrics/usage_collector.js.map +1 -0
  39. package/dist/metrics/utils.cjs +104 -0
  40. package/dist/metrics/utils.cjs.map +1 -0
  41. package/dist/metrics/utils.d.ts +10 -0
  42. package/dist/metrics/utils.d.ts.map +1 -0
  43. package/dist/metrics/utils.js +73 -0
  44. package/dist/metrics/utils.js.map +1 -0
  45. package/dist/multimodal/multimodal_agent.cjs +34 -16
  46. package/dist/multimodal/multimodal_agent.cjs.map +1 -1
  47. package/dist/multimodal/multimodal_agent.d.ts +4 -5
  48. package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
  49. package/dist/multimodal/multimodal_agent.js +34 -16
  50. package/dist/multimodal/multimodal_agent.js.map +1 -1
  51. package/dist/pipeline/index.cjs +2 -0
  52. package/dist/pipeline/index.cjs.map +1 -1
  53. package/dist/pipeline/index.d.ts +1 -1
  54. package/dist/pipeline/index.d.ts.map +1 -1
  55. package/dist/pipeline/index.js +3 -1
  56. package/dist/pipeline/index.js.map +1 -1
  57. package/dist/pipeline/pipeline_agent.cjs +166 -66
  58. package/dist/pipeline/pipeline_agent.cjs.map +1 -1
  59. package/dist/pipeline/pipeline_agent.d.ts +10 -4
  60. package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
  61. package/dist/pipeline/pipeline_agent.js +169 -69
  62. package/dist/pipeline/pipeline_agent.js.map +1 -1
  63. package/dist/pipeline/speech_handle.cjs +49 -1
  64. package/dist/pipeline/speech_handle.cjs.map +1 -1
  65. package/dist/pipeline/speech_handle.d.ts +12 -2
  66. package/dist/pipeline/speech_handle.d.ts.map +1 -1
  67. package/dist/pipeline/speech_handle.js +50 -2
  68. package/dist/pipeline/speech_handle.js.map +1 -1
  69. package/dist/stt/index.cjs.map +1 -1
  70. package/dist/stt/index.d.ts +1 -1
  71. package/dist/stt/index.d.ts.map +1 -1
  72. package/dist/stt/index.js.map +1 -1
  73. package/dist/stt/stream_adapter.cjs +15 -5
  74. package/dist/stt/stream_adapter.cjs.map +1 -1
  75. package/dist/stt/stream_adapter.d.ts +4 -1
  76. package/dist/stt/stream_adapter.d.ts.map +1 -1
  77. package/dist/stt/stream_adapter.js +15 -5
  78. package/dist/stt/stream_adapter.js.map +1 -1
  79. package/dist/stt/stt.cjs +46 -2
  80. package/dist/stt/stt.cjs.map +1 -1
  81. package/dist/stt/stt.d.ts +25 -3
  82. package/dist/stt/stt.d.ts.map +1 -1
  83. package/dist/stt/stt.js +46 -2
  84. package/dist/stt/stt.js.map +1 -1
  85. package/dist/tts/index.cjs +4 -2
  86. package/dist/tts/index.cjs.map +1 -1
  87. package/dist/tts/index.d.ts +1 -1
  88. package/dist/tts/index.d.ts.map +1 -1
  89. package/dist/tts/index.js +3 -1
  90. package/dist/tts/index.js.map +1 -1
  91. package/dist/tts/stream_adapter.cjs +14 -3
  92. package/dist/tts/stream_adapter.cjs.map +1 -1
  93. package/dist/tts/stream_adapter.d.ts +3 -0
  94. package/dist/tts/stream_adapter.d.ts.map +1 -1
  95. package/dist/tts/stream_adapter.js +15 -4
  96. package/dist/tts/stream_adapter.js.map +1 -1
  97. package/dist/tts/tts.cjs +109 -6
  98. package/dist/tts/tts.cjs.map +1 -1
  99. package/dist/tts/tts.d.ts +24 -1
  100. package/dist/tts/tts.d.ts.map +1 -1
  101. package/dist/tts/tts.js +107 -5
  102. package/dist/tts/tts.js.map +1 -1
  103. package/dist/utils.cjs +11 -4
  104. package/dist/utils.cjs.map +1 -1
  105. package/dist/utils.d.ts.map +1 -1
  106. package/dist/utils.js +11 -4
  107. package/dist/utils.js.map +1 -1
  108. package/dist/vad.cjs +43 -2
  109. package/dist/vad.cjs.map +1 -1
  110. package/dist/vad.d.ts +21 -4
  111. package/dist/vad.d.ts.map +1 -1
  112. package/dist/vad.js +43 -2
  113. package/dist/vad.js.map +1 -1
  114. package/dist/worker.cjs +5 -2
  115. package/dist/worker.cjs.map +1 -1
  116. package/dist/worker.d.ts.map +1 -1
  117. package/dist/worker.js +5 -2
  118. package/dist/worker.js.map +1 -1
  119. package/package.json +3 -3
  120. package/src/index.ts +2 -1
  121. package/src/job.ts +3 -3
  122. package/src/llm/index.ts +2 -0
  123. package/src/llm/llm.ts +55 -3
  124. package/src/metrics/base.ts +127 -0
  125. package/src/metrics/index.ts +20 -0
  126. package/src/metrics/usage_collector.ts +40 -0
  127. package/src/metrics/utils.ts +100 -0
  128. package/src/multimodal/multimodal_agent.ts +57 -23
  129. package/src/pipeline/index.ts +1 -1
  130. package/src/pipeline/pipeline_agent.ts +208 -89
  131. package/src/pipeline/speech_handle.ts +67 -2
  132. package/src/stt/index.ts +2 -0
  133. package/src/stt/stream_adapter.ts +17 -5
  134. package/src/stt/stt.ts +67 -3
  135. package/src/tts/index.ts +2 -0
  136. package/src/tts/stream_adapter.ts +17 -4
  137. package/src/tts/tts.ts +127 -4
  138. package/src/utils.ts +12 -4
  139. package/src/vad.ts +61 -4
  140. package/src/worker.ts +7 -3
@@ -17,10 +17,11 @@ import type {
17
17
  FunctionContext,
18
18
  LLM,
19
19
  } from '../llm/index.js';
20
- import { LLMStream } from '../llm/index.js';
20
+ import { LLMEvent, LLMStream } from '../llm/index.js';
21
21
  import { ChatContext, ChatMessage, ChatRole } from '../llm/index.js';
22
22
  import { log } from '../log.js';
23
- import { type STT, StreamAdapter as STTStreamAdapter } from '../stt/index.js';
23
+ import type { AgentMetrics, PipelineEOUMetrics } from '../metrics/base.js';
24
+ import { type STT, StreamAdapter as STTStreamAdapter, SpeechEventType } from '../stt/index.js';
24
25
  import {
25
26
  SentenceTokenizer as BasicSentenceTokenizer,
26
27
  WordTokenizer as BasicWordTokenizer,
@@ -28,9 +29,9 @@ import {
28
29
  } from '../tokenize/basic/index.js';
29
30
  import type { SentenceTokenizer, WordTokenizer } from '../tokenize/tokenizer.js';
30
31
  import type { TTS } from '../tts/index.js';
31
- import { StreamAdapter as TTSStreamAdapter } from '../tts/index.js';
32
+ import { TTSEvent, StreamAdapter as TTSStreamAdapter } from '../tts/index.js';
32
33
  import { AsyncIterableQueue, CancellablePromise, Future, gracefullyCancel } from '../utils.js';
33
- import type { VAD, VADEvent } from '../vad.js';
34
+ import { type VAD, type VADEvent, VADEventType } from '../vad.js';
34
35
  import type { SpeechSource, SynthesisHandle } from './agent_output.js';
35
36
  import { AgentOutput } from './agent_output.js';
36
37
  import { AgentPlayout, AgentPlayoutEvent } from './agent_playout.js';
@@ -39,6 +40,7 @@ import { SpeechHandle } from './speech_handle.js';
39
40
 
40
41
  export type AgentState = 'initializing' | 'thinking' | 'listening' | 'speaking';
41
42
  export const AGENT_STATE_ATTRIBUTE = 'lk.agent.state';
43
+ let speechData: { sequenceId: string } | undefined;
42
44
 
43
45
  export type BeforeLLMCallback = (
44
46
  agent: VoicePipelineAgent,
@@ -60,6 +62,7 @@ export enum VPAEvent {
60
62
  AGENT_SPEECH_INTERRUPTED,
61
63
  FUNCTION_CALLS_COLLECTED,
62
64
  FUNCTION_CALLS_FINISHED,
65
+ METRICS_COLLECTED,
63
66
  }
64
67
 
65
68
  export type VPACallbacks = {
@@ -72,12 +75,14 @@ export type VPACallbacks = {
72
75
  [VPAEvent.AGENT_SPEECH_INTERRUPTED]: (msg: ChatMessage) => void;
73
76
  [VPAEvent.FUNCTION_CALLS_COLLECTED]: (funcs: FunctionCallInfo[]) => void;
74
77
  [VPAEvent.FUNCTION_CALLS_FINISHED]: (funcs: CallableFunctionResult[]) => void;
78
+ [VPAEvent.METRICS_COLLECTED]: (metrics: AgentMetrics) => void;
75
79
  };
76
80
 
77
81
  export class AgentCallContext {
78
82
  #agent: VoicePipelineAgent;
79
83
  #llmStream: LLMStream;
80
84
  #metadata = new Map<string, any>();
85
+ #extraChatMessages: ChatMessage[] = [];
81
86
  static #current: AgentCallContext;
82
87
 
83
88
  constructor(agent: VoicePipelineAgent, llmStream: LLMStream) {
@@ -105,6 +110,14 @@ export class AgentCallContext {
105
110
  get llmStream(): LLMStream {
106
111
  return this.#llmStream;
107
112
  }
113
+
114
+ get extraChatMessages() {
115
+ return this.#extraChatMessages;
116
+ }
117
+
118
+ addExtraChatMessage(message: ChatMessage) {
119
+ this.#extraChatMessages.push(message);
120
+ }
108
121
  }
109
122
 
110
123
  const defaultBeforeLLMCallback: BeforeLLMCallback = (
@@ -171,7 +184,7 @@ export interface VPAOptions {
171
184
  interruptMinWords: number;
172
185
  /** Delay to wait before considering the user speech done. */
173
186
  minEndpointingDelay: number;
174
- maxRecursiveFncCalls: number;
187
+ maxNestedFncCalls: number;
175
188
  /* Whether to preemptively synthesize responses. */
176
189
  preemptiveSynthesis: boolean;
177
190
  /*
@@ -201,7 +214,7 @@ const defaultVPAOptions: VPAOptions = {
201
214
  interruptSpeechDuration: 50,
202
215
  interruptMinWords: 0,
203
216
  minEndpointingDelay: 500,
204
- maxRecursiveFncCalls: 1,
217
+ maxNestedFncCalls: 1,
205
218
  preemptiveSynthesis: false,
206
219
  beforeLLMCallback: defaultBeforeLLMCallback,
207
220
  beforeTTSCallback: defaultBeforeTTSCallback,
@@ -229,7 +242,6 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
229
242
  #transcribedInterimText = '';
230
243
  #speechQueueOpen = new Future();
231
244
  #speechQueue = new AsyncIterableQueue<SpeechHandle | typeof VoicePipelineAgent.FLUSH_SENTINEL>();
232
- #lastEndOfSpeechTime?: number;
233
245
  #updateStateTask?: CancellablePromise<void>;
234
246
  #started = false;
235
247
  #room?: Room;
@@ -237,6 +249,8 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
237
249
  #deferredValidation: DeferredReplyValidation;
238
250
  #logger = log();
239
251
  #agentPublication?: LocalTrackPublication;
252
+ #lastFinalTranscriptTime?: number;
253
+ #lastSpeechTime?: number;
240
254
 
241
255
  constructor(
242
256
  /** Voice Activity Detection instance. */
@@ -317,12 +331,31 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
317
331
  if (this.#started) {
318
332
  throw new Error('voice assistant already started');
319
333
  }
334
+
335
+ this.#stt.on(SpeechEventType.METRICS_COLLECTED, (metrics) => {
336
+ this.emit(VPAEvent.METRICS_COLLECTED, metrics);
337
+ });
338
+
339
+ this.#tts.on(TTSEvent.METRICS_COLLECTED, (metrics) => {
340
+ if (!speechData) return;
341
+ this.emit(VPAEvent.METRICS_COLLECTED, { ...metrics, sequenceId: speechData.sequenceId });
342
+ });
343
+
344
+ this.#llm.on(LLMEvent.METRICS_COLLECTED, (metrics) => {
345
+ if (!speechData) return;
346
+ this.emit(VPAEvent.METRICS_COLLECTED, { ...metrics, sequenceId: speechData.sequenceId });
347
+ });
348
+
349
+ this.#vad.on(VADEventType.METRICS_COLLECTED, (metrics) => {
350
+ this.emit(VPAEvent.METRICS_COLLECTED, metrics);
351
+ });
352
+
320
353
  room.on(RoomEvent.ParticipantConnected, (participant: RemoteParticipant) => {
321
354
  // automatically link to the first participant that connects, if not already linked
322
355
  if (this.#participant) {
323
356
  return;
324
357
  }
325
- this.#linkParticipant.call(this, participant.identity);
358
+ this.#linkParticipant.call(this, participant.identity!);
326
359
  });
327
360
 
328
361
  this.#room = room;
@@ -332,7 +365,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
332
365
  if (typeof participant === 'string') {
333
366
  this.#linkParticipant(participant);
334
367
  } else {
335
- this.#linkParticipant(participant.identity);
368
+ this.#linkParticipant(participant.identity!);
336
369
  }
337
370
  }
338
371
 
@@ -344,12 +377,51 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
344
377
  source: string | LLMStream | AsyncIterable<string>,
345
378
  allowInterruptions = true,
346
379
  addToChatCtx = true,
347
- ) {
380
+ ): Promise<SpeechHandle> {
348
381
  await this.#trackPublishedFut.await;
382
+
383
+ let callContext: AgentCallContext | undefined;
384
+ let fncSource: string | AsyncIterable<string> | undefined;
385
+ if (addToChatCtx) {
386
+ callContext = AgentCallContext.getCurrent();
387
+ if (source instanceof LLMStream) {
388
+ this.#logger.warn('LLMStream will be ignored for function call chat context');
389
+ } else if (typeof source === 'string') {
390
+ fncSource = source;
391
+ } else {
392
+ fncSource = source;
393
+ source = new AsyncIterableQueue<string>();
394
+ }
395
+ }
396
+
349
397
  const newHandle = SpeechHandle.createAssistantSpeech(allowInterruptions, addToChatCtx);
350
398
  const synthesisHandle = this.#synthesizeAgentSpeech(newHandle.id, source);
351
399
  newHandle.initialize(source, synthesisHandle);
352
- this.#addSpeechForPlayout(newHandle);
400
+
401
+ if (this.#playingSpeech && !this.#playingSpeech.nestedSpeechFinished) {
402
+ this.#playingSpeech.addNestedSpeech(newHandle);
403
+ } else {
404
+ this.#addSpeechForPlayout(newHandle);
405
+ }
406
+
407
+ if (callContext && fncSource) {
408
+ let text: string;
409
+ if (typeof source === 'string') {
410
+ text = fncSource as string;
411
+ } else {
412
+ text = '';
413
+ for await (const chunk of fncSource) {
414
+ (source as AsyncIterableQueue<string>).put(chunk);
415
+ text += chunk;
416
+ }
417
+ (source as AsyncIterableQueue<string>).close();
418
+ }
419
+
420
+ callContext.addExtraChatMessage(ChatMessage.create({ text, role: ChatRole.ASSISTANT }));
421
+ this.#logger.child({ text }).debug('added speech to function call chat context');
422
+ }
423
+
424
+ return newHandle;
353
425
  }
354
426
 
355
427
  #updateState(state: AgentState, delay = 0) {
@@ -410,11 +482,14 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
410
482
  if (event.speechDuration >= this.#opts.interruptSpeechDuration) {
411
483
  this.#interruptIfPossible();
412
484
  }
485
+
486
+ if (event.rawAccumulatedSpeech > 0) {
487
+ this.#lastSpeechTime = Date.now() - event.rawAccumulatedSilence;
488
+ }
413
489
  });
414
490
  this.#humanInput.on(HumanInputEvent.END_OF_SPEECH, (event) => {
415
491
  this.emit(VPAEvent.USER_STARTED_SPEAKING);
416
492
  this.#deferredValidation.onHumanEndOfSpeech(event);
417
- this.#lastEndOfSpeechTime = Date.now();
418
493
  });
419
494
  this.#humanInput.on(HumanInputEvent.INTERIM_TRANSCRIPT, (event) => {
420
495
  this.#transcribedInterimText = event.alternatives![0].text;
@@ -423,7 +498,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
423
498
  const newTranscript = event.alternatives![0].text;
424
499
  if (!newTranscript) return;
425
500
 
426
- this.#logger.child({ userTranscript: newTranscript }).debug('received user transcript');
501
+ this.#lastFinalTranscriptTime = Date.now();
427
502
  this.#transcribedText += (this.#transcribedText ? ' ' : '') + newTranscript;
428
503
 
429
504
  if (
@@ -534,33 +609,31 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
534
609
  }),
535
610
  );
536
611
 
537
- if (cancelled) resolve();
538
- let llmStream = await this.#opts.beforeLLMCallback(this, copiedCtx);
539
- if (llmStream === false) {
540
- handle?.cancel();
541
- return;
542
- }
543
-
544
- if (cancelled) resolve();
545
- // fallback to default impl if no custom/user stream is returned
546
- if (!(llmStream instanceof LLMStream)) {
547
- llmStream = (await defaultBeforeLLMCallback(this, copiedCtx)) as LLMStream;
548
- }
612
+ speechData = { sequenceId: handle!.id };
549
613
 
550
- if (handle!.interrupted) {
551
- return;
552
- }
614
+ try {
615
+ if (cancelled) resolve();
616
+ let llmStream = await this.#opts.beforeLLMCallback(this, copiedCtx);
617
+ if (llmStream === false) {
618
+ handle?.cancel();
619
+ return;
620
+ }
553
621
 
554
- const synthesisHandle = this.#synthesizeAgentSpeech(handle!.id, llmStream);
555
- handle!.initialize(llmStream, synthesisHandle);
622
+ if (cancelled) resolve();
623
+ // fallback to default impl if no custom/user stream is returned
624
+ if (!(llmStream instanceof LLMStream)) {
625
+ llmStream = (await defaultBeforeLLMCallback(this, copiedCtx)) as LLMStream;
626
+ }
556
627
 
557
- // TODO(theomonnom): find a more reliable way to get the elapsed time from the last EOS
558
- // (VAD could not have detected any speech — maybe unlikely?)
559
- const elapsed = !!this.#lastEndOfSpeechTime
560
- ? Math.round((Date.now() - this.#lastEndOfSpeechTime) * 1000) / 1000
561
- : -1;
628
+ if (handle!.interrupted) {
629
+ return;
630
+ }
562
631
 
563
- this.#logger.child({ speechId: handle!.id, elapsed }).debug('synthesizing agent reply');
632
+ const synthesisHandle = this.#synthesizeAgentSpeech(handle!.id, llmStream);
633
+ handle!.initialize(llmStream, synthesisHandle);
634
+ } finally {
635
+ speechData = undefined;
636
+ }
564
637
  resolve();
565
638
  });
566
639
  }
@@ -621,78 +694,107 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
621
694
 
622
695
  const collectedText = handle.synthesisHandle.text;
623
696
  const isUsingTools = handle.source instanceof LLMStream && !!handle.source.functionCalls.length;
624
- const extraToolsMessages = []; // additional messages from the functions to add to the context
625
- let interrupted = handle.interrupted;
697
+ const interrupted = handle.interrupted;
698
+
699
+ const executeFunctionCalls = async () => {
700
+ // if the answer is using tools, execute the functions and automatically generate
701
+ // a response to the user question from the returned values
702
+ if (!isUsingTools || interrupted) return;
703
+
704
+ if (handle.fncNestedDepth >= this.#opts.maxNestedFncCalls) {
705
+ this.#logger
706
+ .child({ speechId: handle.id, fncNestedDepth: handle.fncNestedDepth })
707
+ .warn('max function calls nested depth reached');
708
+ return;
709
+ }
626
710
 
627
- // if the answer is using tools, execute the functions and automatically generate
628
- // a response to the user question from the returned values
629
- if (isUsingTools && !interrupted) {
630
711
  if (!userQuestion || !handle.userCommitted) {
631
712
  throw new Error('user speech should have been committed before using tools');
632
713
  }
633
714
  const llmStream = handle.source;
634
- let newFunctionCalls = llmStream.functionCalls;
635
-
636
- for (let i = 0; i < this.#opts.maxRecursiveFncCalls; i++) {
637
- this.emit(VPAEvent.FUNCTION_CALLS_COLLECTED, newFunctionCalls);
638
- const calledFuncs: FunctionCallInfo[] = [];
639
- for (const func of newFunctionCalls) {
640
- const task = func.func.execute(func.params).then(
641
- (result) => ({ name: func.name, toolCallId: func.toolCallId, result }),
642
- (error) => ({ name: func.name, toolCallId: func.toolCallId, error }),
643
- );
644
- calledFuncs.push({ ...func, task });
715
+ const newFunctionCalls = llmStream.functionCalls;
716
+
717
+ new AgentCallContext(this, llmStream);
718
+
719
+ this.emit(VPAEvent.FUNCTION_CALLS_COLLECTED, newFunctionCalls);
720
+ const calledFuncs: FunctionCallInfo[] = [];
721
+ for (const func of newFunctionCalls) {
722
+ const task = func.func.execute(func.params).then(
723
+ (result) => ({ name: func.name, toolCallId: func.toolCallId, result }),
724
+ (error) => ({ name: func.name, toolCallId: func.toolCallId, error }),
725
+ );
726
+ calledFuncs.push({ ...func, task });
727
+ this.#logger
728
+ .child({ function: func.name, speechId: handle.id })
729
+ .debug('executing AI function');
730
+ try {
731
+ await task;
732
+ } catch {
645
733
  this.#logger
646
734
  .child({ function: func.name, speechId: handle.id })
647
- .debug('executing AI function');
648
- try {
649
- await task;
650
- } catch {
651
- this.#logger
652
- .child({ function: func.name, speechId: handle.id })
653
- .error('error executing AI function');
654
- }
735
+ .error('error executing AI function');
655
736
  }
737
+ }
656
738
 
657
- const toolCallsInfo = [];
658
- const toolCallsResults = [];
659
- for (const fnc of calledFuncs) {
660
- // ignore the function calls that return void
661
- const task = await fnc.task;
662
- if (!task || task.result === undefined) continue;
663
- toolCallsInfo.push(fnc);
664
- toolCallsResults.push(ChatMessage.createToolFromFunctionResult(task));
665
- }
739
+ const toolCallsInfo = [];
740
+ const toolCallsResults = [];
741
+ for (const fnc of calledFuncs) {
742
+ // ignore the function calls that return void
743
+ const task = await fnc.task;
744
+ if (!task || task.result === undefined) continue;
745
+ toolCallsInfo.push(fnc);
746
+ toolCallsResults.push(ChatMessage.createToolFromFunctionResult(task));
747
+ }
666
748
 
667
- if (!toolCallsInfo.length) break;
749
+ if (!toolCallsInfo.length) return;
668
750
 
669
- // generate an answer from the tool calls
670
- extraToolsMessages.push(ChatMessage.createToolCalls(toolCallsInfo, collectedText));
671
- extraToolsMessages.push(...toolCallsResults);
751
+ // generate an answer from the tool calls
752
+ const extraToolsMessages = [ChatMessage.createToolCalls(toolCallsInfo, collectedText)];
753
+ extraToolsMessages.push(...toolCallsResults);
672
754
 
673
- const chatCtx = handle.source.chatCtx.copy();
674
- chatCtx.messages.push(...extraToolsMessages);
755
+ // create a nested speech handle
756
+ const newSpeechHandle = SpeechHandle.createToolSpeech(
757
+ handle.allowInterruptions,
758
+ handle.addToChatCtx,
759
+ handle.fncNestedDepth + 1,
760
+ extraToolsMessages,
761
+ );
675
762
 
676
- const answerLLMStream = this.llm.chat({
677
- chatCtx,
678
- fncCtx: this.fncCtx,
679
- });
680
- const answerSynthesis = this.#synthesizeAgentSpeech(handle.id, answerLLMStream);
681
- // replace the synthesis handle with the new one to allow interruption
682
- handle.synthesisHandle = answerSynthesis;
683
- const playHandle = answerSynthesis.play();
684
- await playHandle.join().await;
763
+ // synthesize the tool speech with the chat ctx from llmStream
764
+ const chatCtx = handle.source.chatCtx.copy();
765
+ chatCtx.messages.push(...extraToolsMessages);
766
+ chatCtx.messages.push(...AgentCallContext.getCurrent().extraChatMessages);
685
767
 
686
- interrupted = answerSynthesis.interrupted;
687
- newFunctionCalls = answerLLMStream.functionCalls;
768
+ const answerLLMStream = this.llm.chat({
769
+ chatCtx,
770
+ fncCtx: this.fncCtx,
771
+ });
772
+ const answerSynthesis = this.#synthesizeAgentSpeech(newSpeechHandle.id, answerLLMStream);
773
+ newSpeechHandle.initialize(answerLLMStream, answerSynthesis);
774
+ handle.addNestedSpeech(newSpeechHandle);
775
+
776
+ this.emit(VPAEvent.FUNCTION_CALLS_FINISHED, calledFuncs);
777
+ };
688
778
 
689
- this.emit(VPAEvent.FUNCTION_CALLS_FINISHED, calledFuncs);
690
- if (!newFunctionCalls) break;
779
+ const task = executeFunctionCalls().then(() => {
780
+ handle.markNestedSpeechFinished();
781
+ });
782
+ while (!handle.nestedSpeechFinished) {
783
+ const changed = handle.nestedSpeechChanged();
784
+ await Promise.race([changed, task]);
785
+ while (handle.nestedSpeechHandles.length) {
786
+ const speech = handle.nestedSpeechHandles[0]!;
787
+ this.#playingSpeech = speech;
788
+ await this.#playSpeech(speech);
789
+ handle.nestedSpeechHandles.shift();
790
+ this.#playingSpeech = handle;
691
791
  }
692
792
  }
693
793
 
694
794
  if (handle.addToChatCtx && (!userQuestion || handle.userCommitted)) {
695
- this.chatCtx.messages.push(...extraToolsMessages);
795
+ if (handle.extraToolsMessages) {
796
+ this.chatCtx.messages.push(...handle.extraToolsMessages);
797
+ }
696
798
  if (interrupted) {
697
799
  collectedText + '…';
698
800
  }
@@ -714,6 +816,8 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
714
816
  speechId: handle.id,
715
817
  })
716
818
  .debug('committed agent speech');
819
+
820
+ handle.setDone();
717
821
  }
718
822
  }
719
823
 
@@ -773,6 +877,21 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter<
773
877
 
774
878
  this.#logger.child({ speechId: this.#pendingAgentReply.id }).debug('validated agent reply');
775
879
 
880
+ if (this.#lastSpeechTime) {
881
+ const timeSinceLastSpeech = Date.now() - this.#lastSpeechTime;
882
+ const transcriptionDelay = Math.max(
883
+ (this.#lastFinalTranscriptTime || 0) - this.#lastSpeechTime,
884
+ 0,
885
+ );
886
+ const metrics: PipelineEOUMetrics = {
887
+ timestamp: Date.now(),
888
+ sequenceId: this.#pendingAgentReply.id,
889
+ endOfUtteranceDelay: timeSinceLastSpeech,
890
+ transcriptionDelay,
891
+ };
892
+ this.emit(VPAEvent.METRICS_COLLECTED, metrics);
893
+ }
894
+
776
895
  this.#addSpeechForPlayout(this.#pendingAgentReply);
777
896
  this.#pendingAgentReply = undefined;
778
897
  this.#transcribedInterimText = '';
@@ -2,8 +2,8 @@
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import { randomUUID } from 'crypto';
5
- import type { LLMStream } from '../llm/index.js';
6
- import { Future } from '../utils.js';
5
+ import type { ChatMessage, LLMStream } from '../llm/index.js';
6
+ import { AsyncIterableQueue, Future } from '../utils.js';
7
7
  import type { SynthesisHandle } from './agent_output.js';
8
8
 
9
9
  export class SpeechHandle {
@@ -14,10 +14,16 @@ export class SpeechHandle {
14
14
  #userQuestion: string;
15
15
  #userCommitted = false;
16
16
  #initFut = new Future();
17
+ #doneFut = new Future();
17
18
  #speechCommitted = false;
18
19
  #source?: string | LLMStream | AsyncIterable<string>;
19
20
  #synthesisHandle?: SynthesisHandle;
20
21
  #initialized = false;
22
+ #fncNestedDepth: number;
23
+ #fncExtraToolsMesages?: ChatMessage[];
24
+ #nestedSpeechHandles: SpeechHandle[] = [];
25
+ #nestedSpeechChanged = new AsyncIterableQueue<void>();
26
+ #nestedSpeechFinished = false;
21
27
 
22
28
  constructor(
23
29
  id: string,
@@ -25,12 +31,16 @@ export class SpeechHandle {
25
31
  addToChatCtx: boolean,
26
32
  isReply: boolean,
27
33
  userQuestion: string,
34
+ fncNestedDepth = 0,
35
+ extraToolsMessages: ChatMessage[] | undefined = undefined,
28
36
  ) {
29
37
  this.#id = id;
30
38
  this.#allowInterruptions = allowInterruptions;
31
39
  this.#addToChatCtx = addToChatCtx;
32
40
  this.#isReply = isReply;
33
41
  this.#userQuestion = userQuestion;
42
+ this.#fncNestedDepth = fncNestedDepth;
43
+ this.#fncExtraToolsMesages = extraToolsMessages;
34
44
  }
35
45
 
36
46
  static createAssistantReply(
@@ -45,6 +55,23 @@ export class SpeechHandle {
45
55
  return new SpeechHandle(randomUUID(), allowInterruptions, addToChatCtx, false, '');
46
56
  }
47
57
 
58
+ static createToolSpeech(
59
+ allowInterruptions: boolean,
60
+ addToChatCtx: boolean,
61
+ fncNestedDepth: number,
62
+ extraToolsMessages: ChatMessage[],
63
+ ): SpeechHandle {
64
+ return new SpeechHandle(
65
+ randomUUID(),
66
+ allowInterruptions,
67
+ addToChatCtx,
68
+ false,
69
+ '',
70
+ fncNestedDepth,
71
+ extraToolsMessages,
72
+ );
73
+ }
74
+
48
75
  async waitForInitialization() {
49
76
  await this.#initFut.await;
50
77
  }
@@ -122,6 +149,43 @@ export class SpeechHandle {
122
149
  return !!this.#synthesisHandle?.interrupted;
123
150
  }
124
151
 
152
+ get fncNestedDepth(): number {
153
+ return this.#fncNestedDepth;
154
+ }
155
+
156
+ get extraToolsMessages(): ChatMessage[] | undefined {
157
+ return this.#fncExtraToolsMesages;
158
+ }
159
+
160
+ addNestedSpeech(handle: SpeechHandle) {
161
+ this.#nestedSpeechHandles.push(handle);
162
+ this.#nestedSpeechChanged.put();
163
+ }
164
+
165
+ get nestedSpeechHandles(): SpeechHandle[] {
166
+ return this.#nestedSpeechHandles;
167
+ }
168
+
169
+ async nestedSpeechChanged() {
170
+ await this.#nestedSpeechChanged.next();
171
+ }
172
+
173
+ get nestedSpeechFinished(): boolean {
174
+ return this.#nestedSpeechFinished;
175
+ }
176
+
177
+ markNestedSpeechFinished() {
178
+ this.#nestedSpeechFinished = true;
179
+ }
180
+
181
+ join() {
182
+ return this.#doneFut.await;
183
+ }
184
+
185
+ setDone() {
186
+ this.#doneFut.resolve();
187
+ }
188
+
125
189
  interrupt() {
126
190
  if (!this.#allowInterruptions) {
127
191
  throw new Error('interruptions are not allowed');
@@ -131,6 +195,7 @@ export class SpeechHandle {
131
195
 
132
196
  cancel() {
133
197
  this.#initFut.reject(new Error());
198
+ this.#nestedSpeechChanged.close();
134
199
  this.#synthesisHandle?.interrupt();
135
200
  }
136
201
  }
package/src/stt/index.ts CHANGED
@@ -6,6 +6,8 @@ export {
6
6
  type SpeechEvent,
7
7
  type SpeechData,
8
8
  type STTCapabilities,
9
+ type RecognitionUsage,
10
+ type STTCallbacks,
9
11
  SpeechEventType,
10
12
  STT,
11
13
  SpeechStream,
@@ -10,14 +10,20 @@ import { STT, SpeechEventType, SpeechStream } from './stt.js';
10
10
  export class StreamAdapter extends STT {
11
11
  #stt: STT;
12
12
  #vad: VAD;
13
+ label: string;
13
14
 
14
15
  constructor(stt: STT, vad: VAD) {
15
16
  super({ streaming: true, interimResults: false });
16
17
  this.#stt = stt;
17
18
  this.#vad = vad;
19
+ this.label = `stt.StreamAdapter<${this.#stt.label}>`;
20
+
21
+ this.#stt.on(SpeechEventType.METRICS_COLLECTED, (metrics) => {
22
+ this.emit(SpeechEventType.METRICS_COLLECTED, metrics);
23
+ });
18
24
  }
19
25
 
20
- recognize(frame: AudioFrame): Promise<SpeechEvent> {
26
+ _recognize(frame: AudioFrame): Promise<SpeechEvent> {
21
27
  return this.#stt.recognize(frame);
22
28
  }
23
29
 
@@ -29,15 +35,21 @@ export class StreamAdapter extends STT {
29
35
  export class StreamAdapterWrapper extends SpeechStream {
30
36
  #stt: STT;
31
37
  #vadStream: VADStream;
38
+ label: string;
32
39
 
33
40
  constructor(stt: STT, vad: VAD) {
34
- super();
41
+ super(stt);
35
42
  this.#stt = stt;
36
43
  this.#vadStream = vad.stream();
44
+ this.label = `stt.StreamAdapterWrapper<${this.#stt.label}>`;
37
45
 
38
46
  this.#run();
39
47
  }
40
48
 
49
+ async monitorMetrics() {
50
+ return; // do nothing
51
+ }
52
+
41
53
  async #run() {
42
54
  const forwardInput = async () => {
43
55
  for await (const input of this.input) {
@@ -54,17 +66,17 @@ export class StreamAdapterWrapper extends SpeechStream {
54
66
  for await (const ev of this.#vadStream) {
55
67
  switch (ev.type) {
56
68
  case VADEventType.START_OF_SPEECH:
57
- this.queue.put({ type: SpeechEventType.START_OF_SPEECH });
69
+ this.output.put({ type: SpeechEventType.START_OF_SPEECH });
58
70
  break;
59
71
  case VADEventType.END_OF_SPEECH:
60
- this.queue.put({ type: SpeechEventType.END_OF_SPEECH });
72
+ this.output.put({ type: SpeechEventType.END_OF_SPEECH });
61
73
 
62
74
  const event = await this.#stt.recognize(ev.frames);
63
75
  if (!event.alternatives![0].text) {
64
76
  continue;
65
77
  }
66
78
 
67
- this.queue.put(event);
79
+ this.output.put(event);
68
80
  break;
69
81
  }
70
82
  }