@livekit/agents 1.0.14 → 1.0.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. package/dist/cli.cjs +12 -12
  2. package/dist/cli.cjs.map +1 -1
  3. package/dist/cli.d.cts +3 -3
  4. package/dist/cli.d.ts +3 -3
  5. package/dist/cli.d.ts.map +1 -1
  6. package/dist/cli.js +13 -13
  7. package/dist/cli.js.map +1 -1
  8. package/dist/inference/stt.cjs.map +1 -1
  9. package/dist/inference/stt.d.ts.map +1 -1
  10. package/dist/inference/stt.js +1 -1
  11. package/dist/inference/stt.js.map +1 -1
  12. package/dist/inference/tts.cjs.map +1 -1
  13. package/dist/inference/tts.d.cts +2 -1
  14. package/dist/inference/tts.d.ts +2 -1
  15. package/dist/inference/tts.d.ts.map +1 -1
  16. package/dist/inference/tts.js +1 -5
  17. package/dist/inference/tts.js.map +1 -1
  18. package/dist/llm/chat_context.cjs +78 -0
  19. package/dist/llm/chat_context.cjs.map +1 -1
  20. package/dist/llm/chat_context.d.cts +16 -0
  21. package/dist/llm/chat_context.d.ts +16 -0
  22. package/dist/llm/chat_context.d.ts.map +1 -1
  23. package/dist/llm/chat_context.js +78 -0
  24. package/dist/llm/chat_context.js.map +1 -1
  25. package/dist/llm/chat_context.test.cjs +531 -0
  26. package/dist/llm/chat_context.test.cjs.map +1 -1
  27. package/dist/llm/chat_context.test.js +531 -0
  28. package/dist/llm/chat_context.test.js.map +1 -1
  29. package/dist/llm/tool_context.cjs +43 -2
  30. package/dist/llm/tool_context.cjs.map +1 -1
  31. package/dist/llm/tool_context.d.cts +39 -11
  32. package/dist/llm/tool_context.d.ts +39 -11
  33. package/dist/llm/tool_context.d.ts.map +1 -1
  34. package/dist/llm/tool_context.js +42 -3
  35. package/dist/llm/tool_context.js.map +1 -1
  36. package/dist/llm/tool_context.test.cjs +197 -0
  37. package/dist/llm/tool_context.test.cjs.map +1 -1
  38. package/dist/llm/tool_context.test.js +175 -0
  39. package/dist/llm/tool_context.test.js.map +1 -1
  40. package/dist/llm/utils.cjs +17 -11
  41. package/dist/llm/utils.cjs.map +1 -1
  42. package/dist/llm/utils.d.cts +1 -2
  43. package/dist/llm/utils.d.ts +1 -2
  44. package/dist/llm/utils.d.ts.map +1 -1
  45. package/dist/llm/utils.js +17 -11
  46. package/dist/llm/utils.js.map +1 -1
  47. package/dist/llm/zod-utils.cjs +99 -0
  48. package/dist/llm/zod-utils.cjs.map +1 -0
  49. package/dist/llm/zod-utils.d.cts +65 -0
  50. package/dist/llm/zod-utils.d.ts +65 -0
  51. package/dist/llm/zod-utils.d.ts.map +1 -0
  52. package/dist/llm/zod-utils.js +61 -0
  53. package/dist/llm/zod-utils.js.map +1 -0
  54. package/dist/llm/zod-utils.test.cjs +389 -0
  55. package/dist/llm/zod-utils.test.cjs.map +1 -0
  56. package/dist/llm/zod-utils.test.js +372 -0
  57. package/dist/llm/zod-utils.test.js.map +1 -0
  58. package/dist/metrics/base.cjs.map +1 -1
  59. package/dist/metrics/base.d.cts +7 -0
  60. package/dist/metrics/base.d.ts +7 -0
  61. package/dist/metrics/base.d.ts.map +1 -1
  62. package/dist/stt/stt.cjs +1 -0
  63. package/dist/stt/stt.cjs.map +1 -1
  64. package/dist/stt/stt.d.cts +7 -1
  65. package/dist/stt/stt.d.ts +7 -1
  66. package/dist/stt/stt.d.ts.map +1 -1
  67. package/dist/stt/stt.js +1 -0
  68. package/dist/stt/stt.js.map +1 -1
  69. package/dist/vad.cjs +16 -0
  70. package/dist/vad.cjs.map +1 -1
  71. package/dist/vad.d.cts +6 -0
  72. package/dist/vad.d.ts +6 -0
  73. package/dist/vad.d.ts.map +1 -1
  74. package/dist/vad.js +16 -0
  75. package/dist/vad.js.map +1 -1
  76. package/dist/voice/agent_activity.cjs +83 -8
  77. package/dist/voice/agent_activity.cjs.map +1 -1
  78. package/dist/voice/agent_activity.d.cts +6 -2
  79. package/dist/voice/agent_activity.d.ts +6 -2
  80. package/dist/voice/agent_activity.d.ts.map +1 -1
  81. package/dist/voice/agent_activity.js +83 -8
  82. package/dist/voice/agent_activity.js.map +1 -1
  83. package/dist/voice/agent_session.cjs +3 -2
  84. package/dist/voice/agent_session.cjs.map +1 -1
  85. package/dist/voice/agent_session.d.cts +2 -1
  86. package/dist/voice/agent_session.d.ts +2 -1
  87. package/dist/voice/agent_session.d.ts.map +1 -1
  88. package/dist/voice/agent_session.js +3 -2
  89. package/dist/voice/agent_session.js.map +1 -1
  90. package/dist/voice/audio_recognition.cjs +138 -16
  91. package/dist/voice/audio_recognition.cjs.map +1 -1
  92. package/dist/voice/audio_recognition.d.cts +11 -0
  93. package/dist/voice/audio_recognition.d.ts +11 -0
  94. package/dist/voice/audio_recognition.d.ts.map +1 -1
  95. package/dist/voice/audio_recognition.js +138 -16
  96. package/dist/voice/audio_recognition.js.map +1 -1
  97. package/dist/voice/generation.cjs +8 -3
  98. package/dist/voice/generation.cjs.map +1 -1
  99. package/dist/voice/generation.d.ts.map +1 -1
  100. package/dist/voice/generation.js +8 -3
  101. package/dist/voice/generation.js.map +1 -1
  102. package/dist/voice/room_io/_input.cjs.map +1 -1
  103. package/dist/voice/room_io/_input.d.ts.map +1 -1
  104. package/dist/voice/room_io/_input.js +0 -1
  105. package/dist/voice/room_io/_input.js.map +1 -1
  106. package/dist/worker.cjs +17 -11
  107. package/dist/worker.cjs.map +1 -1
  108. package/dist/worker.d.cts +16 -9
  109. package/dist/worker.d.ts +16 -9
  110. package/dist/worker.d.ts.map +1 -1
  111. package/dist/worker.js +16 -12
  112. package/dist/worker.js.map +1 -1
  113. package/package.json +5 -4
  114. package/src/cli.ts +17 -17
  115. package/src/inference/stt.ts +2 -1
  116. package/src/inference/tts.ts +2 -5
  117. package/src/llm/__snapshots__/zod-utils.test.ts.snap +341 -0
  118. package/src/llm/chat_context.test.ts +607 -0
  119. package/src/llm/chat_context.ts +106 -0
  120. package/src/llm/tool_context.test.ts +210 -1
  121. package/src/llm/tool_context.ts +101 -17
  122. package/src/llm/utils.ts +18 -15
  123. package/src/llm/zod-utils.test.ts +476 -0
  124. package/src/llm/zod-utils.ts +144 -0
  125. package/src/metrics/base.ts +7 -0
  126. package/src/stt/stt.ts +6 -0
  127. package/src/vad.ts +18 -0
  128. package/src/voice/agent_activity.ts +119 -9
  129. package/src/voice/agent_session.ts +3 -1
  130. package/src/voice/audio_recognition.ts +235 -57
  131. package/src/voice/generation.ts +8 -3
  132. package/src/voice/room_io/_input.ts +1 -1
  133. package/src/worker.ts +29 -18
@@ -22,6 +22,7 @@ import {
22
22
  type ToolContext,
23
23
  } from '../llm/index.js';
24
24
  import type { LLMError } from '../llm/llm.js';
25
+ import { isSameToolChoice, isSameToolContext } from '../llm/tool_context.js';
25
26
  import { log } from '../log.js';
26
27
  import type {
27
28
  EOUMetrics,
@@ -43,6 +44,7 @@ import { type AgentSession, type TurnDetectionMode } from './agent_session.js';
43
44
  import {
44
45
  AudioRecognition,
45
46
  type EndOfTurnInfo,
47
+ type PreemptiveGenerationInfo,
46
48
  type RecognitionHooks,
47
49
  type _TurnDetector,
48
50
  } from './audio_recognition.js';
@@ -71,6 +73,16 @@ import { SpeechHandle } from './speech_handle.js';
71
73
  // equivalent to Python's contextvars
72
74
  const speechHandleStorage = new AsyncLocalStorage<SpeechHandle>();
73
75
 
76
+ interface PreemptiveGeneration {
77
+ speechHandle: SpeechHandle;
78
+ userMessage: ChatMessage;
79
+ info: PreemptiveGenerationInfo;
80
+ chatCtx: ChatContext;
81
+ tools: ToolContext;
82
+ toolChoice: ToolChoice | null;
83
+ createdAt: number;
84
+ }
85
+
74
86
  export class AgentActivity implements RecognitionHooks {
75
87
  private static readonly REPLY_TASK_CANCEL_TIMEOUT = 5000;
76
88
  private started = false;
@@ -87,6 +99,7 @@ export class AgentActivity implements RecognitionHooks {
87
99
  private audioStream = new DeferredReadableStream<AudioFrame>();
88
100
  // default to null as None, which maps to the default provider tool choice value
89
101
  private toolChoice: ToolChoice | null = null;
102
+ private _preemptiveGeneration?: PreemptiveGeneration;
90
103
 
91
104
  agent: Agent;
92
105
  agentSession: AgentSession;
@@ -589,8 +602,12 @@ export class AgentActivity implements RecognitionHooks {
589
602
  this.agentSession._updateUserState('speaking');
590
603
  }
591
604
 
592
- onEndOfSpeech(_ev: VADEvent): void {
593
- this.agentSession._updateUserState('listening');
605
+ onEndOfSpeech(ev: VADEvent): void {
606
+ let speechEndTime = Date.now();
607
+ if (ev) {
608
+ speechEndTime = speechEndTime - ev.silenceDuration;
609
+ }
610
+ this.agentSession._updateUserState('listening', speechEndTime);
594
611
  }
595
612
 
596
613
  onVADInferenceDone(ev: VADEvent): void {
@@ -664,6 +681,55 @@ export class AgentActivity implements RecognitionHooks {
664
681
  );
665
682
  }
666
683
 
684
+ onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
685
+ if (
686
+ !this.agentSession.options.preemptiveGeneration ||
687
+ this.draining ||
688
+ (this._currentSpeech !== undefined && !this._currentSpeech.interrupted) ||
689
+ !(this.llm instanceof LLM)
690
+ ) {
691
+ return;
692
+ }
693
+
694
+ this.cancelPreemptiveGeneration();
695
+
696
+ this.logger.info(
697
+ {
698
+ newTranscript: info.newTranscript,
699
+ transcriptConfidence: info.transcriptConfidence,
700
+ },
701
+ 'starting preemptive generation',
702
+ );
703
+
704
+ const userMessage = ChatMessage.create({
705
+ role: 'user',
706
+ content: info.newTranscript,
707
+ });
708
+ const chatCtx = this.agent.chatCtx.copy();
709
+ const speechHandle = this.generateReply({
710
+ userMessage,
711
+ chatCtx,
712
+ scheduleSpeech: false,
713
+ });
714
+
715
+ this._preemptiveGeneration = {
716
+ speechHandle,
717
+ userMessage,
718
+ info,
719
+ chatCtx: chatCtx.copy(),
720
+ tools: { ...this.tools },
721
+ toolChoice: this.toolChoice,
722
+ createdAt: Date.now(),
723
+ };
724
+ }
725
+
726
+ private cancelPreemptiveGeneration(): void {
727
+ if (this._preemptiveGeneration !== undefined) {
728
+ this._preemptiveGeneration.speechHandle._cancel();
729
+ this._preemptiveGeneration = undefined;
730
+ }
731
+ }
732
+
667
733
  private createSpeechTask(options: {
668
734
  task: Task<void>;
669
735
  ownedSpeechHandle?: SpeechHandle;
@@ -694,6 +760,7 @@ export class AgentActivity implements RecognitionHooks {
694
760
 
695
761
  async onEndOfTurn(info: EndOfTurnInfo): Promise<boolean> {
696
762
  if (this.draining) {
763
+ this.cancelPreemptiveGeneration();
697
764
  this.logger.warn({ user_input: info.newTranscript }, 'skipping user input, task is draining');
698
765
  // copied from python:
699
766
  // TODO(shubhra): should we "forward" this new turn to the next agent/activity?
@@ -710,6 +777,7 @@ export class AgentActivity implements RecognitionHooks {
710
777
  info.newTranscript.split(' ').length < this.agentSession.options.minInterruptionWords
711
778
  ) {
712
779
  // avoid interruption if the new_transcript is too short
780
+ this.cancelPreemptiveGeneration();
713
781
  this.logger.info('skipping user input, new_transcript is too short');
714
782
  return false;
715
783
  }
@@ -775,6 +843,7 @@ export class AgentActivity implements RecognitionHooks {
775
843
  instructions?: string;
776
844
  toolChoice?: ToolChoice | null;
777
845
  allowInterruptions?: boolean;
846
+ scheduleSpeech?: boolean;
778
847
  }): SpeechHandle {
779
848
  const {
780
849
  userMessage,
@@ -782,6 +851,7 @@ export class AgentActivity implements RecognitionHooks {
782
851
  instructions: defaultInstructions,
783
852
  toolChoice: defaultToolChoice,
784
853
  allowInterruptions: defaultAllowInterruptions,
854
+ scheduleSpeech = true,
785
855
  } = options;
786
856
 
787
857
  let instructions = defaultInstructions;
@@ -871,7 +941,9 @@ export class AgentActivity implements RecognitionHooks {
871
941
  task.finally(() => this.onPipelineReplyDone());
872
942
  }
873
943
 
874
- this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
944
+ if (scheduleSpeech) {
945
+ this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
946
+ }
875
947
  return handle;
876
948
  }
877
949
 
@@ -977,9 +1049,40 @@ export class AgentActivity implements RecognitionHooks {
977
1049
  return;
978
1050
  }
979
1051
 
980
- // Ensure the new message is passed to generateReply
981
- // This preserves the original message id, making it easier for users to track responses
982
- const speechHandle = this.generateReply({ userMessage, chatCtx });
1052
+ let speechHandle: SpeechHandle | undefined;
1053
+ if (this._preemptiveGeneration !== undefined) {
1054
+ const preemptive = this._preemptiveGeneration;
1055
+ // make sure the onUserTurnCompleted didn't change some request parameters
1056
+ // otherwise invalidate the preemptive generation
1057
+ if (
1058
+ preemptive.info.newTranscript === userMessage?.textContent &&
1059
+ preemptive.chatCtx.isEquivalent(chatCtx) &&
1060
+ isSameToolContext(preemptive.tools, this.tools) &&
1061
+ isSameToolChoice(preemptive.toolChoice, this.toolChoice)
1062
+ ) {
1063
+ speechHandle = preemptive.speechHandle;
1064
+ this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
1065
+ this.logger.debug(
1066
+ {
1067
+ preemptiveLeadTime: Date.now() - preemptive.createdAt,
1068
+ },
1069
+ 'using preemptive generation',
1070
+ );
1071
+ } else {
1072
+ this.logger.warn(
1073
+ 'preemptive generation enabled but chat context or tools have changed after `onUserTurnCompleted`',
1074
+ );
1075
+ preemptive.speechHandle._cancel();
1076
+ }
1077
+
1078
+ this._preemptiveGeneration = undefined;
1079
+ }
1080
+
1081
+ if (speechHandle === undefined) {
1082
+ // Ensure the new message is passed to generateReply
1083
+ // This preserves the original message id, making it easier for users to track responses
1084
+ speechHandle = this.generateReply({ userMessage, chatCtx });
1085
+ }
983
1086
 
984
1087
  const eouMetrics: EOUMetrics = {
985
1088
  type: 'eou_metrics',
@@ -987,6 +1090,7 @@ export class AgentActivity implements RecognitionHooks {
987
1090
  endOfUtteranceDelayMs: info.endOfUtteranceDelay,
988
1091
  transcriptionDelayMs: info.transcriptionDelay,
989
1092
  onUserTurnCompletedDelayMs: callbackDuration,
1093
+ lastSpeakingTimeMs: info.stoppedSpeakingAt ?? 0,
990
1094
  speechId: speechHandle.id,
991
1095
  };
992
1096
 
@@ -1139,10 +1243,9 @@ export class AgentActivity implements RecognitionHooks {
1139
1243
 
1140
1244
  chatCtx = chatCtx.copy();
1141
1245
 
1246
+ // Insert new message into temporary chat context for LLM inference
1142
1247
  if (newMessage) {
1143
1248
  chatCtx.insert(newMessage);
1144
- this.agent._chatCtx.insert(newMessage);
1145
- this.agentSession._conversationItemAdded(newMessage);
1146
1249
  }
1147
1250
 
1148
1251
  if (instructions) {
@@ -1157,7 +1260,6 @@ export class AgentActivity implements RecognitionHooks {
1157
1260
  }
1158
1261
  }
1159
1262
 
1160
- this.agentSession._updateAgentState('thinking');
1161
1263
  const tasks: Array<Task<void>> = [];
1162
1264
  const [llmTask, llmGenData] = performLLMInference(
1163
1265
  // preserve `this` context in llmNode
@@ -1185,6 +1287,12 @@ export class AgentActivity implements RecognitionHooks {
1185
1287
 
1186
1288
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
1187
1289
 
1290
+ // Add new message to actual chat context if the speech is scheduled
1291
+ if (newMessage && speechHandle.scheduled) {
1292
+ this.agent._chatCtx.insert(newMessage);
1293
+ this.agentSession._conversationItemAdded(newMessage);
1294
+ }
1295
+
1188
1296
  if (speechHandle.interrupted) {
1189
1297
  replyAbortController.abort();
1190
1298
  await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
@@ -1917,6 +2025,7 @@ export class AgentActivity implements RecognitionHooks {
1917
2025
  try {
1918
2026
  if (this._draining) return;
1919
2027
 
2028
+ this.cancelPreemptiveGeneration();
1920
2029
  this.createSpeechTask({
1921
2030
  task: Task.from(() => this.agent.onExit()),
1922
2031
  name: 'AgentActivity_onExit',
@@ -1937,6 +2046,7 @@ export class AgentActivity implements RecognitionHooks {
1937
2046
  this.logger.warn('task closing without draining');
1938
2047
  }
1939
2048
 
2049
+ this.cancelPreemptiveGeneration();
1940
2050
  // Unregister event handlers to prevent duplicate metrics
1941
2051
  if (this.llm instanceof LLM) {
1942
2052
  this.llm.off('metrics_collected', this.onMetricsCollected);
@@ -57,6 +57,7 @@ export interface VoiceOptions {
57
57
  minEndpointingDelay: number;
58
58
  maxEndpointingDelay: number;
59
59
  maxToolSteps: number;
60
+ preemptiveGeneration: boolean;
60
61
  }
61
62
 
62
63
  const defaultVoiceOptions: VoiceOptions = {
@@ -67,6 +68,7 @@ const defaultVoiceOptions: VoiceOptions = {
67
68
  minEndpointingDelay: 500,
68
69
  maxEndpointingDelay: 6000,
69
70
  maxToolSteps: 3,
71
+ preemptiveGeneration: false,
70
72
  } as const;
71
73
 
72
74
  export type TurnDetectionMode = 'stt' | 'vad' | 'realtime_llm' | 'manual' | _TurnDetector;
@@ -421,7 +423,7 @@ export class AgentSession<
421
423
  }
422
424
 
423
425
  /** @internal */
424
- _updateUserState(state: UserState) {
426
+ _updateUserState(state: UserState, _lastSpeakingTime?: number) {
425
427
  if (this.userState === state) {
426
428
  return;
427
429
  }
@@ -17,8 +17,16 @@ import type { STTNode } from './io.js';
17
17
 
18
18
  export interface EndOfTurnInfo {
19
19
  newTranscript: string;
20
+ transcriptConfidence: number;
20
21
  transcriptionDelay: number;
21
22
  endOfUtteranceDelay: number;
23
+ startedSpeakingAt: number | undefined;
24
+ stoppedSpeakingAt: number | undefined;
25
+ }
26
+
27
+ export interface PreemptiveGenerationInfo {
28
+ newTranscript: string;
29
+ transcriptConfidence: number;
22
30
  }
23
31
 
24
32
  export interface RecognitionHooks {
@@ -28,6 +36,7 @@ export interface RecognitionHooks {
28
36
  onInterimTranscript: (ev: SpeechEvent) => void;
29
37
  onFinalTranscript: (ev: SpeechEvent) => void;
30
38
  onEndOfTurn: (info: EndOfTurnInfo) => Promise<boolean>;
39
+ onPreemptiveGeneration: (info: PreemptiveGenerationInfo) => void;
31
40
 
32
41
  retrieveChatCtx: () => ChatContext;
33
42
  }
@@ -63,7 +72,10 @@ export class AudioRecognition {
63
72
  private lastFinalTranscriptTime = 0;
64
73
  private audioTranscript = '';
65
74
  private audioInterimTranscript = '';
66
- private lastSpeakingTime = 0;
75
+ private audioPreflightTranscript = '';
76
+ private finalTranscriptConfidence: number[] = [];
77
+ private lastSpeakingTime: number | undefined;
78
+ private speechStartTime: number | undefined;
67
79
  private userTurnCommitted = false;
68
80
  private speaking = false;
69
81
  private sampleRate?: number;
@@ -144,6 +156,7 @@ export class AudioRecognition {
144
156
  case SpeechEventType.FINAL_TRANSCRIPT:
145
157
  this.hooks.onFinalTranscript(ev);
146
158
  const transcript = ev.alternatives?.[0]?.text;
159
+ const confidence = ev.alternatives?.[0]?.confidence ?? 0;
147
160
  this.lastLanguage = ev.alternatives?.[0]?.language;
148
161
 
149
162
  if (!transcript) {
@@ -162,34 +175,144 @@ export class AudioRecognition {
162
175
  this.lastFinalTranscriptTime = Date.now();
163
176
  this.audioTranscript += ` ${transcript}`;
164
177
  this.audioTranscript = this.audioTranscript.trimStart();
178
+ this.finalTranscriptConfidence.push(confidence);
179
+ const transcriptChanged = this.audioTranscript !== this.audioPreflightTranscript;
165
180
  this.audioInterimTranscript = '';
181
+ this.audioPreflightTranscript = '';
182
+
183
+ if (!this.vad || this.lastSpeakingTime === undefined) {
184
+ // vad disabled, use stt timestamp
185
+ // TODO: this would screw up transcription latency metrics
186
+ // but we'll live with it for now.
187
+ // the correct way is to ensure STT fires SpeechEventType.END_OF_SPEECH
188
+ // and using that timestamp for lastSpeakingTime
189
+ this.lastSpeakingTime = Date.now();
190
+ }
166
191
 
167
- if (!this.speaking) {
168
- if (!this.vad) {
169
- // Copied from python agents:
170
- // vad disabled, use stt timestamp
171
- // TODO: this would screw up transcription latency metrics
172
- // but we'll live with it for now.
173
- // the correct way is to ensure STT fires SpeechEventType.END_OF_SPEECH
174
- // and using that timestamp for _last_speaking_time
175
- this.lastSpeakingTime = Date.now();
192
+ if (this.vadBaseTurnDetection || this.userTurnCommitted) {
193
+ if (transcriptChanged) {
194
+ this.logger.debug(
195
+ { transcript: this.audioTranscript },
196
+ 'triggering preemptive generation (FINAL_TRANSCRIPT)',
197
+ );
198
+ this.hooks.onPreemptiveGeneration({
199
+ newTranscript: this.audioTranscript,
200
+ transcriptConfidence:
201
+ this.finalTranscriptConfidence.length > 0
202
+ ? this.finalTranscriptConfidence.reduce((a, b) => a + b, 0) /
203
+ this.finalTranscriptConfidence.length
204
+ : 0,
205
+ });
176
206
  }
177
207
 
178
- if (this.vadBaseTurnDetection || this.userTurnCommitted) {
208
+ if (!this.speaking) {
179
209
  const chatCtx = this.hooks.retrieveChatCtx();
180
210
  this.logger.debug('running EOU detection on stt FINAL_TRANSCRIPT');
181
211
  this.runEOUDetection(chatCtx);
182
212
  }
183
213
  }
184
214
  break;
215
+ case SpeechEventType.PREFLIGHT_TRANSCRIPT:
216
+ this.hooks.onInterimTranscript(ev);
217
+ const preflightTranscript = ev.alternatives?.[0]?.text ?? '';
218
+ const preflightConfidence = ev.alternatives?.[0]?.confidence ?? 0;
219
+ const preflightLanguage = ev.alternatives?.[0]?.language;
220
+
221
+ const MIN_LANGUAGE_DETECTION_LENGTH = 5;
222
+ if (
223
+ !this.lastLanguage ||
224
+ (preflightLanguage && preflightTranscript.length > MIN_LANGUAGE_DETECTION_LENGTH)
225
+ ) {
226
+ this.lastLanguage = preflightLanguage;
227
+ }
228
+
229
+ if (!preflightTranscript) {
230
+ return;
231
+ }
232
+
233
+ this.logger.debug(
234
+ {
235
+ user_transcript: preflightTranscript,
236
+ language: this.lastLanguage,
237
+ },
238
+ 'received user preflight transcript',
239
+ );
240
+
241
+ // still need to increment it as it's used for turn detection,
242
+ this.lastFinalTranscriptTime = Date.now();
243
+ // preflight transcript includes all pre-committed transcripts (including final transcript from the previous STT run)
244
+ this.audioPreflightTranscript =
245
+ `${this.audioTranscript} ${preflightTranscript}`.trimStart();
246
+ this.audioInterimTranscript = preflightTranscript;
247
+
248
+ if (!this.vad || this.lastSpeakingTime === undefined) {
249
+ // vad disabled, use stt timestamp
250
+ this.lastSpeakingTime = Date.now();
251
+ }
252
+
253
+ if (this.turnDetectionMode !== 'manual' || this.userTurnCommitted) {
254
+ const confidenceVals = [...this.finalTranscriptConfidence, preflightConfidence];
255
+ this.logger.debug(
256
+ {
257
+ transcript:
258
+ this.audioPreflightTranscript.length > 100
259
+ ? this.audioPreflightTranscript.slice(0, 100) + '...'
260
+ : this.audioPreflightTranscript,
261
+ },
262
+ 'triggering preemptive generation (PREFLIGHT_TRANSCRIPT)',
263
+ );
264
+ this.hooks.onPreemptiveGeneration({
265
+ newTranscript: this.audioPreflightTranscript,
266
+ transcriptConfidence:
267
+ confidenceVals.length > 0
268
+ ? confidenceVals.reduce((a, b) => a + b, 0) / confidenceVals.length
269
+ : 0,
270
+ });
271
+ }
272
+ break;
185
273
  case SpeechEventType.INTERIM_TRANSCRIPT:
186
274
  this.logger.debug({ transcript: ev.alternatives?.[0]?.text }, 'interim transcript');
187
275
  this.hooks.onInterimTranscript(ev);
188
276
  this.audioInterimTranscript = ev.alternatives?.[0]?.text ?? '';
189
277
  break;
278
+ case SpeechEventType.START_OF_SPEECH:
279
+ if (this.turnDetectionMode !== 'stt') break;
280
+ this.hooks.onStartOfSpeech({
281
+ type: VADEventType.START_OF_SPEECH,
282
+ samplesIndex: 0,
283
+ timestamp: Date.now(),
284
+ speechDuration: 0,
285
+ silenceDuration: 0,
286
+ frames: [],
287
+ probability: 0,
288
+ inferenceDuration: 0,
289
+ speaking: true,
290
+ rawAccumulatedSilence: 0,
291
+ rawAccumulatedSpeech: 0,
292
+ });
293
+ this.speaking = true;
294
+ this.lastSpeakingTime = Date.now();
295
+
296
+ this.bounceEOUTask?.cancel();
297
+ break;
190
298
  case SpeechEventType.END_OF_SPEECH:
191
299
  if (this.turnDetectionMode !== 'stt') break;
300
+ this.hooks.onEndOfSpeech({
301
+ type: VADEventType.END_OF_SPEECH,
302
+ samplesIndex: 0,
303
+ timestamp: Date.now(),
304
+ speechDuration: 0,
305
+ silenceDuration: 0,
306
+ frames: [],
307
+ probability: 0,
308
+ inferenceDuration: 0,
309
+ speaking: false,
310
+ rawAccumulatedSilence: 0,
311
+ rawAccumulatedSpeech: 0,
312
+ });
313
+ this.speaking = false;
192
314
  this.userTurnCommitted = true;
315
+ this.lastSpeakingTime = Date.now();
193
316
 
194
317
  if (!this.speaking) {
195
318
  const chatCtx = this.hooks.retrieveChatCtx();
@@ -222,61 +345,106 @@ export class AudioRecognition {
222
345
  // disable EOU model if manual turn detection enabled
223
346
  this.audioTranscript && this.turnDetectionMode !== 'manual' ? this.turnDetector : undefined;
224
347
 
225
- const bounceEOUTask = (lastSpeakingTime: number) => async (controller: AbortController) => {
226
- let endpointingDelay = this.minEndpointingDelay;
227
-
228
- // TODO(AJS-74): need to support actual turn detection model plugins for following code to run
229
- if (turnDetector) {
230
- this.logger.debug('Running turn detector model');
231
- if (!turnDetector.supportsLanguage(this.lastLanguage)) {
232
- this.logger.debug(`Turn detector does not support language ${this.lastLanguage}`);
233
- } else {
234
- const endOfTurnProbability = await turnDetector.predictEndOfTurn(chatCtx);
235
- this.logger.debug(
236
- { endOfTurnProbability, language: this.lastLanguage },
237
- 'end of turn probability',
238
- );
239
-
240
- const unlikelyThreshold = await turnDetector.unlikelyThreshold(this.lastLanguage);
241
- this.logger.debug(
242
- {
243
- unlikelyThreshold,
244
- endOfTurnProbability,
245
- language: this.lastLanguage,
246
- transcript: this.audioTranscript,
247
- },
248
- 'EOU Detection',
249
- );
348
+ const bounceEOUTask =
349
+ (
350
+ lastSpeakingTime: number | undefined,
351
+ lastFinalTranscriptTime: number,
352
+ speechStartTime: number | undefined,
353
+ ) =>
354
+ async (controller: AbortController) => {
355
+ let endpointingDelay = this.minEndpointingDelay;
250
356
 
251
- if (unlikelyThreshold && endOfTurnProbability < unlikelyThreshold) {
252
- endpointingDelay = this.maxEndpointingDelay;
357
+ if (turnDetector) {
358
+ this.logger.debug('Running turn detector model');
359
+ if (!turnDetector.supportsLanguage(this.lastLanguage)) {
360
+ this.logger.debug(`Turn detector does not support language ${this.lastLanguage}`);
361
+ } else {
362
+ const endOfTurnProbability = await turnDetector.predictEndOfTurn(chatCtx);
363
+ this.logger.debug(
364
+ { endOfTurnProbability, language: this.lastLanguage },
365
+ 'end of turn probability',
366
+ );
367
+
368
+ const unlikelyThreshold = await turnDetector.unlikelyThreshold(this.lastLanguage);
369
+ this.logger.debug(
370
+ {
371
+ unlikelyThreshold,
372
+ endOfTurnProbability,
373
+ language: this.lastLanguage,
374
+ transcript: this.audioTranscript,
375
+ },
376
+ 'EOU Detection',
377
+ );
378
+
379
+ if (unlikelyThreshold && endOfTurnProbability < unlikelyThreshold) {
380
+ endpointingDelay = this.maxEndpointingDelay;
381
+ }
253
382
  }
254
383
  }
255
- }
256
384
 
257
- const extraSleep = lastSpeakingTime + endpointingDelay - Date.now();
258
- // add delay to see if there's a potential upcoming EOU task that cancels this one
259
- await delay(Math.max(extraSleep, 0), { signal: controller.signal });
385
+ let extraSleep = endpointingDelay;
386
+ if (lastSpeakingTime !== undefined) {
387
+ extraSleep += lastSpeakingTime - Date.now();
388
+ }
260
389
 
261
- this.logger.debug({ transcript: this.audioTranscript }, 'end of user turn');
390
+ if (extraSleep > 0) {
391
+ // add delay to see if there's a potential upcoming EOU task that cancels this one
392
+ await delay(Math.max(extraSleep, 0), { signal: controller.signal });
393
+ }
262
394
 
263
- const committed = await this.hooks.onEndOfTurn({
264
- newTranscript: this.audioTranscript,
265
- transcriptionDelay: Math.max(this.lastFinalTranscriptTime - lastSpeakingTime, 0),
266
- endOfUtteranceDelay: Date.now() - lastSpeakingTime,
267
- });
395
+ this.logger.debug({ transcript: this.audioTranscript }, 'end of user turn');
396
+
397
+ const confidenceAvg =
398
+ this.finalTranscriptConfidence.length > 0
399
+ ? this.finalTranscriptConfidence.reduce((a, b) => a + b, 0) /
400
+ this.finalTranscriptConfidence.length
401
+ : 0;
402
+
403
+ let startedSpeakingAt: number | undefined;
404
+ let stoppedSpeakingAt: number | undefined;
405
+ let transcriptionDelay: number | undefined;
406
+ let endOfUtteranceDelay: number | undefined;
407
+
408
+ // sometimes, we can't calculate the metrics because VAD was unreliable.
409
+ // in this case, we just ignore the calculation, it's better than providing likely wrong values
410
+ if (
411
+ lastFinalTranscriptTime !== 0 &&
412
+ lastSpeakingTime !== undefined &&
413
+ speechStartTime !== undefined
414
+ ) {
415
+ startedSpeakingAt = speechStartTime;
416
+ stoppedSpeakingAt = lastSpeakingTime;
417
+ transcriptionDelay = Math.max(lastFinalTranscriptTime - lastSpeakingTime, 0);
418
+ endOfUtteranceDelay = Date.now() - lastSpeakingTime;
419
+ }
268
420
 
269
- if (committed) {
270
- // clear the transcript if the user turn was committed
271
- this.audioTranscript = '';
272
- }
421
+ const committed = await this.hooks.onEndOfTurn({
422
+ newTranscript: this.audioTranscript,
423
+ transcriptConfidence: confidenceAvg,
424
+ transcriptionDelay: transcriptionDelay ?? 0,
425
+ endOfUtteranceDelay: endOfUtteranceDelay ?? 0,
426
+ startedSpeakingAt,
427
+ stoppedSpeakingAt,
428
+ });
429
+
430
+ if (committed) {
431
+ // clear the transcript if the user turn was committed
432
+ this.audioTranscript = '';
433
+ this.finalTranscriptConfidence = [];
434
+ this.lastSpeakingTime = undefined;
435
+ this.lastFinalTranscriptTime = 0;
436
+ this.speechStartTime = undefined;
437
+ }
273
438
 
274
- this.userTurnCommitted = false;
275
- };
439
+ this.userTurnCommitted = false;
440
+ };
276
441
 
277
442
  // cancel any existing EOU task
278
443
  this.bounceEOUTask?.cancel();
279
- this.bounceEOUTask = Task.from(bounceEOUTask(this.lastSpeakingTime));
444
+ // copy the values before awaiting (the values can change)
445
+ this.bounceEOUTask = Task.from(
446
+ bounceEOUTask(this.lastSpeakingTime, this.lastFinalTranscriptTime, this.speechStartTime),
447
+ );
280
448
 
281
449
  this.bounceEOUTask.result
282
450
  .then(() => {
@@ -376,13 +544,21 @@ export class AudioRecognition {
376
544
  break;
377
545
  case VADEventType.INFERENCE_DONE:
378
546
  this.hooks.onVADInferenceDone(ev);
547
+ // for metrics, get the "earliest" signal of speech as possible
548
+ if (ev.rawAccumulatedSpeech > 0.0) {
549
+ this.lastSpeakingTime = Date.now();
550
+
551
+ if (this.speechStartTime === undefined) {
552
+ this.speechStartTime = Date.now();
553
+ }
554
+ }
379
555
  break;
380
556
  case VADEventType.END_OF_SPEECH:
381
557
  this.logger.debug('VAD task: END_OF_SPEECH');
382
558
  this.hooks.onEndOfSpeech(ev);
383
- this.speaking = false;
559
+
384
560
  // when VAD fires END_OF_SPEECH, it already waited for the silence_duration
385
- this.lastSpeakingTime = Date.now() - ev.silenceDuration;
561
+ this.speaking = false;
386
562
 
387
563
  if (
388
564
  this.vadBaseTurnDetection ||
@@ -412,6 +588,8 @@ export class AudioRecognition {
412
588
  clearUserTurn() {
413
589
  this.audioTranscript = '';
414
590
  this.audioInterimTranscript = '';
591
+ this.audioPreflightTranscript = '';
592
+ this.finalTranscriptConfidence = [];
415
593
  this.userTurnCommitted = false;
416
594
 
417
595
  this.sttTask?.cancelAndWait().finally(() => {
@@ -4,7 +4,6 @@
4
4
  import type { AudioFrame } from '@livekit/rtc-node';
5
5
  import { AudioResampler } from '@livekit/rtc-node';
6
6
  import type { ReadableStream, ReadableStreamDefaultReader } from 'stream/web';
7
- import { ZodObject } from 'zod';
8
7
  import {
9
8
  type ChatContext,
10
9
  ChatMessage,
@@ -19,6 +18,7 @@ import {
19
18
  isFunctionTool,
20
19
  isToolError,
21
20
  } from '../llm/tool_context.js';
21
+ import { isZodSchema, parseZodSchema } from '../llm/zod-utils.js';
22
22
  import { log } from '../log.js';
23
23
  import { IdentityTransform } from '../stream/identity_transform.js';
24
24
  import { Future, Task, shortuuid, toError } from '../utils.js';
@@ -732,8 +732,13 @@ export function performToolExecutions({
732
732
  try {
733
733
  const jsonArgs = JSON.parse(toolCall.args);
734
734
 
735
- if (tool.parameters instanceof ZodObject) {
736
- parsedArgs = tool.parameters.parse(jsonArgs);
735
+ if (isZodSchema(tool.parameters)) {
736
+ const result = await parseZodSchema<object>(tool.parameters, jsonArgs);
737
+ if (result.success) {
738
+ parsedArgs = result.data;
739
+ } else {
740
+ throw result.error;
741
+ }
737
742
  } else {
738
743
  parsedArgs = jsonArgs;
739
744
  }