@livekit/agents 1.0.15 → 1.0.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. package/dist/cli.cjs +12 -12
  2. package/dist/cli.cjs.map +1 -1
  3. package/dist/cli.d.cts +3 -3
  4. package/dist/cli.d.ts +3 -3
  5. package/dist/cli.d.ts.map +1 -1
  6. package/dist/cli.js +13 -13
  7. package/dist/cli.js.map +1 -1
  8. package/dist/inference/stt.cjs.map +1 -1
  9. package/dist/inference/stt.d.ts.map +1 -1
  10. package/dist/inference/stt.js +1 -1
  11. package/dist/inference/stt.js.map +1 -1
  12. package/dist/inference/tts.cjs.map +1 -1
  13. package/dist/inference/tts.d.cts +2 -1
  14. package/dist/inference/tts.d.ts +2 -1
  15. package/dist/inference/tts.d.ts.map +1 -1
  16. package/dist/inference/tts.js +1 -5
  17. package/dist/inference/tts.js.map +1 -1
  18. package/dist/llm/chat_context.cjs +78 -0
  19. package/dist/llm/chat_context.cjs.map +1 -1
  20. package/dist/llm/chat_context.d.cts +16 -0
  21. package/dist/llm/chat_context.d.ts +16 -0
  22. package/dist/llm/chat_context.d.ts.map +1 -1
  23. package/dist/llm/chat_context.js +78 -0
  24. package/dist/llm/chat_context.js.map +1 -1
  25. package/dist/llm/chat_context.test.cjs +531 -0
  26. package/dist/llm/chat_context.test.cjs.map +1 -1
  27. package/dist/llm/chat_context.test.js +531 -0
  28. package/dist/llm/chat_context.test.js.map +1 -1
  29. package/dist/llm/tool_context.cjs +40 -0
  30. package/dist/llm/tool_context.cjs.map +1 -1
  31. package/dist/llm/tool_context.d.cts +2 -0
  32. package/dist/llm/tool_context.d.ts +2 -0
  33. package/dist/llm/tool_context.d.ts.map +1 -1
  34. package/dist/llm/tool_context.js +38 -0
  35. package/dist/llm/tool_context.js.map +1 -1
  36. package/dist/metrics/base.cjs.map +1 -1
  37. package/dist/metrics/base.d.cts +7 -0
  38. package/dist/metrics/base.d.ts +7 -0
  39. package/dist/metrics/base.d.ts.map +1 -1
  40. package/dist/stt/stt.cjs +1 -0
  41. package/dist/stt/stt.cjs.map +1 -1
  42. package/dist/stt/stt.d.cts +7 -1
  43. package/dist/stt/stt.d.ts +7 -1
  44. package/dist/stt/stt.d.ts.map +1 -1
  45. package/dist/stt/stt.js +1 -0
  46. package/dist/stt/stt.js.map +1 -1
  47. package/dist/voice/agent_activity.cjs +83 -8
  48. package/dist/voice/agent_activity.cjs.map +1 -1
  49. package/dist/voice/agent_activity.d.cts +6 -2
  50. package/dist/voice/agent_activity.d.ts +6 -2
  51. package/dist/voice/agent_activity.d.ts.map +1 -1
  52. package/dist/voice/agent_activity.js +83 -8
  53. package/dist/voice/agent_activity.js.map +1 -1
  54. package/dist/voice/agent_session.cjs +3 -2
  55. package/dist/voice/agent_session.cjs.map +1 -1
  56. package/dist/voice/agent_session.d.cts +2 -1
  57. package/dist/voice/agent_session.d.ts +2 -1
  58. package/dist/voice/agent_session.d.ts.map +1 -1
  59. package/dist/voice/agent_session.js +3 -2
  60. package/dist/voice/agent_session.js.map +1 -1
  61. package/dist/voice/audio_recognition.cjs +138 -16
  62. package/dist/voice/audio_recognition.cjs.map +1 -1
  63. package/dist/voice/audio_recognition.d.cts +11 -0
  64. package/dist/voice/audio_recognition.d.ts +11 -0
  65. package/dist/voice/audio_recognition.d.ts.map +1 -1
  66. package/dist/voice/audio_recognition.js +138 -16
  67. package/dist/voice/audio_recognition.js.map +1 -1
  68. package/dist/voice/room_io/_input.cjs.map +1 -1
  69. package/dist/voice/room_io/_input.d.ts.map +1 -1
  70. package/dist/voice/room_io/_input.js +0 -1
  71. package/dist/voice/room_io/_input.js.map +1 -1
  72. package/dist/worker.cjs +17 -11
  73. package/dist/worker.cjs.map +1 -1
  74. package/dist/worker.d.cts +16 -9
  75. package/dist/worker.d.ts +16 -9
  76. package/dist/worker.d.ts.map +1 -1
  77. package/dist/worker.js +16 -12
  78. package/dist/worker.js.map +1 -1
  79. package/package.json +1 -1
  80. package/src/cli.ts +17 -17
  81. package/src/inference/stt.ts +2 -1
  82. package/src/inference/tts.ts +2 -5
  83. package/src/llm/chat_context.test.ts +607 -0
  84. package/src/llm/chat_context.ts +106 -0
  85. package/src/llm/tool_context.ts +44 -0
  86. package/src/metrics/base.ts +7 -0
  87. package/src/stt/stt.ts +6 -0
  88. package/src/voice/agent_activity.ts +119 -9
  89. package/src/voice/agent_session.ts +3 -1
  90. package/src/voice/audio_recognition.ts +235 -57
  91. package/src/voice/room_io/_input.ts +1 -1
  92. package/src/worker.ts +29 -18
package/src/llm/tool_context.ts CHANGED
@@ -187,6 +187,50 @@ export type ToolContext<UserData = UnknownUserData> = {
187
187
  [name: string]: FunctionTool<any, UserData, any>;
188
188
  };
189
189
 
190
+ export function isSameToolContext(ctx1: ToolContext, ctx2: ToolContext): boolean {
191
+ const toolNames = new Set(Object.keys(ctx1));
192
+ const toolNames2 = new Set(Object.keys(ctx2));
193
+
194
+ if (toolNames.size !== toolNames2.size) {
195
+ return false;
196
+ }
197
+
198
+ for (const name of toolNames) {
199
+ if (!toolNames2.has(name)) {
200
+ return false;
201
+ }
202
+
203
+ const tool1 = ctx1[name];
204
+ const tool2 = ctx2[name];
205
+
206
+ if (!tool1 || !tool2) {
207
+ return false;
208
+ }
209
+
210
+ if (tool1.description !== tool2.description) {
211
+ return false;
212
+ }
213
+ }
214
+
215
+ return true;
216
+ }
217
+
218
+ export function isSameToolChoice(choice1: ToolChoice | null, choice2: ToolChoice | null): boolean {
219
+ if (choice1 === choice2) {
220
+ return true;
221
+ }
222
+ if (choice1 === null || choice2 === null) {
223
+ return false;
224
+ }
225
+ if (typeof choice1 === 'string' && typeof choice2 === 'string') {
226
+ return choice1 === choice2;
227
+ }
228
+ if (typeof choice1 === 'object' && typeof choice2 === 'object') {
229
+ return choice1.type === choice2.type && choice1.function.name === choice2.function.name;
230
+ }
231
+ return false;
232
+ }
233
+
190
234
  /**
191
235
  * Create a function tool with inferred parameters from the schema.
192
236
  */
package/src/metrics/base.ts CHANGED
@@ -91,6 +91,13 @@ export type EOUMetrics = {
91
91
  * Time taken to invoke the user's `Agent.onUserTurnCompleted` callback.
92
92
  */
93
93
  onUserTurnCompletedDelayMs: number;
94
+ /**
95
+ * The time the user stopped speaking.
96
+ */
97
+ lastSpeakingTimeMs: number;
98
+ /**
99
+ * The ID of the speech handle.
100
+ */
94
101
  speechId?: string;
95
102
  };
96
103
 
package/src/stt/stt.ts CHANGED
@@ -38,6 +38,12 @@ export enum SpeechEventType {
38
38
  END_OF_SPEECH = 3,
39
39
  /** Usage event, emitted periodically to indicate usage metrics. */
40
40
  RECOGNITION_USAGE = 4,
41
+ /**
42
+ * Preflight transcript, emitted before final transcript when STT has high confidence
43
+ * but hasn't fully committed yet. Includes all pre-committed transcripts including
44
+ * final transcript from the previous STT run.
45
+ */
46
+ PREFLIGHT_TRANSCRIPT = 5,
41
47
  }
42
48
 
43
49
  /** SpeechData contains metadata about this {@link SpeechEvent}. */
package/src/voice/agent_activity.ts CHANGED
@@ -22,6 +22,7 @@ import {
22
22
  type ToolContext,
23
23
  } from '../llm/index.js';
24
24
  import type { LLMError } from '../llm/llm.js';
25
+ import { isSameToolChoice, isSameToolContext } from '../llm/tool_context.js';
25
26
  import { log } from '../log.js';
26
27
  import type {
27
28
  EOUMetrics,
@@ -43,6 +44,7 @@ import { type AgentSession, type TurnDetectionMode } from './agent_session.js';
43
44
  import {
44
45
  AudioRecognition,
45
46
  type EndOfTurnInfo,
47
+ type PreemptiveGenerationInfo,
46
48
  type RecognitionHooks,
47
49
  type _TurnDetector,
48
50
  } from './audio_recognition.js';
@@ -71,6 +73,16 @@ import { SpeechHandle } from './speech_handle.js';
71
73
  // equivalent to Python's contextvars
72
74
  const speechHandleStorage = new AsyncLocalStorage<SpeechHandle>();
73
75
 
76
+ interface PreemptiveGeneration {
77
+ speechHandle: SpeechHandle;
78
+ userMessage: ChatMessage;
79
+ info: PreemptiveGenerationInfo;
80
+ chatCtx: ChatContext;
81
+ tools: ToolContext;
82
+ toolChoice: ToolChoice | null;
83
+ createdAt: number;
84
+ }
85
+
74
86
  export class AgentActivity implements RecognitionHooks {
75
87
  private static readonly REPLY_TASK_CANCEL_TIMEOUT = 5000;
76
88
  private started = false;
@@ -87,6 +99,7 @@ export class AgentActivity implements RecognitionHooks {
87
99
  private audioStream = new DeferredReadableStream<AudioFrame>();
88
100
  // default to null as None, which maps to the default provider tool choice value
89
101
  private toolChoice: ToolChoice | null = null;
102
+ private _preemptiveGeneration?: PreemptiveGeneration;
90
103
 
91
104
  agent: Agent;
92
105
  agentSession: AgentSession;
@@ -589,8 +602,12 @@ export class AgentActivity implements RecognitionHooks {
589
602
  this.agentSession._updateUserState('speaking');
590
603
  }
591
604
 
592
- onEndOfSpeech(_ev: VADEvent): void {
593
- this.agentSession._updateUserState('listening');
605
+ onEndOfSpeech(ev: VADEvent): void {
606
+ let speechEndTime = Date.now();
607
+ if (ev) {
608
+ speechEndTime = speechEndTime - ev.silenceDuration;
609
+ }
610
+ this.agentSession._updateUserState('listening', speechEndTime);
594
611
  }
595
612
 
596
613
  onVADInferenceDone(ev: VADEvent): void {
@@ -664,6 +681,55 @@ export class AgentActivity implements RecognitionHooks {
664
681
  );
665
682
  }
666
683
 
684
+ onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
685
+ if (
686
+ !this.agentSession.options.preemptiveGeneration ||
687
+ this.draining ||
688
+ (this._currentSpeech !== undefined && !this._currentSpeech.interrupted) ||
689
+ !(this.llm instanceof LLM)
690
+ ) {
691
+ return;
692
+ }
693
+
694
+ this.cancelPreemptiveGeneration();
695
+
696
+ this.logger.info(
697
+ {
698
+ newTranscript: info.newTranscript,
699
+ transcriptConfidence: info.transcriptConfidence,
700
+ },
701
+ 'starting preemptive generation',
702
+ );
703
+
704
+ const userMessage = ChatMessage.create({
705
+ role: 'user',
706
+ content: info.newTranscript,
707
+ });
708
+ const chatCtx = this.agent.chatCtx.copy();
709
+ const speechHandle = this.generateReply({
710
+ userMessage,
711
+ chatCtx,
712
+ scheduleSpeech: false,
713
+ });
714
+
715
+ this._preemptiveGeneration = {
716
+ speechHandle,
717
+ userMessage,
718
+ info,
719
+ chatCtx: chatCtx.copy(),
720
+ tools: { ...this.tools },
721
+ toolChoice: this.toolChoice,
722
+ createdAt: Date.now(),
723
+ };
724
+ }
725
+
726
+ private cancelPreemptiveGeneration(): void {
727
+ if (this._preemptiveGeneration !== undefined) {
728
+ this._preemptiveGeneration.speechHandle._cancel();
729
+ this._preemptiveGeneration = undefined;
730
+ }
731
+ }
732
+
667
733
  private createSpeechTask(options: {
668
734
  task: Task<void>;
669
735
  ownedSpeechHandle?: SpeechHandle;
@@ -694,6 +760,7 @@ export class AgentActivity implements RecognitionHooks {
694
760
 
695
761
  async onEndOfTurn(info: EndOfTurnInfo): Promise<boolean> {
696
762
  if (this.draining) {
763
+ this.cancelPreemptiveGeneration();
697
764
  this.logger.warn({ user_input: info.newTranscript }, 'skipping user input, task is draining');
698
765
  // copied from python:
699
766
  // TODO(shubhra): should we "forward" this new turn to the next agent/activity?
@@ -710,6 +777,7 @@ export class AgentActivity implements RecognitionHooks {
710
777
  info.newTranscript.split(' ').length < this.agentSession.options.minInterruptionWords
711
778
  ) {
712
779
  // avoid interruption if the new_transcript is too short
780
+ this.cancelPreemptiveGeneration();
713
781
  this.logger.info('skipping user input, new_transcript is too short');
714
782
  return false;
715
783
  }
@@ -775,6 +843,7 @@ export class AgentActivity implements RecognitionHooks {
775
843
  instructions?: string;
776
844
  toolChoice?: ToolChoice | null;
777
845
  allowInterruptions?: boolean;
846
+ scheduleSpeech?: boolean;
778
847
  }): SpeechHandle {
779
848
  const {
780
849
  userMessage,
@@ -782,6 +851,7 @@ export class AgentActivity implements RecognitionHooks {
782
851
  instructions: defaultInstructions,
783
852
  toolChoice: defaultToolChoice,
784
853
  allowInterruptions: defaultAllowInterruptions,
854
+ scheduleSpeech = true,
785
855
  } = options;
786
856
 
787
857
  let instructions = defaultInstructions;
@@ -871,7 +941,9 @@ export class AgentActivity implements RecognitionHooks {
871
941
  task.finally(() => this.onPipelineReplyDone());
872
942
  }
873
943
 
874
- this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
944
+ if (scheduleSpeech) {
945
+ this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
946
+ }
875
947
  return handle;
876
948
  }
877
949
 
@@ -977,9 +1049,40 @@ export class AgentActivity implements RecognitionHooks {
977
1049
  return;
978
1050
  }
979
1051
 
980
- // Ensure the new message is passed to generateReply
981
- // This preserves the original message id, making it easier for users to track responses
982
- const speechHandle = this.generateReply({ userMessage, chatCtx });
1052
+ let speechHandle: SpeechHandle | undefined;
1053
+ if (this._preemptiveGeneration !== undefined) {
1054
+ const preemptive = this._preemptiveGeneration;
1055
+ // make sure the onUserTurnCompleted didn't change some request parameters
1056
+ // otherwise invalidate the preemptive generation
1057
+ if (
1058
+ preemptive.info.newTranscript === userMessage?.textContent &&
1059
+ preemptive.chatCtx.isEquivalent(chatCtx) &&
1060
+ isSameToolContext(preemptive.tools, this.tools) &&
1061
+ isSameToolChoice(preemptive.toolChoice, this.toolChoice)
1062
+ ) {
1063
+ speechHandle = preemptive.speechHandle;
1064
+ this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
1065
+ this.logger.debug(
1066
+ {
1067
+ preemptiveLeadTime: Date.now() - preemptive.createdAt,
1068
+ },
1069
+ 'using preemptive generation',
1070
+ );
1071
+ } else {
1072
+ this.logger.warn(
1073
+ 'preemptive generation enabled but chat context or tools have changed after `onUserTurnCompleted`',
1074
+ );
1075
+ preemptive.speechHandle._cancel();
1076
+ }
1077
+
1078
+ this._preemptiveGeneration = undefined;
1079
+ }
1080
+
1081
+ if (speechHandle === undefined) {
1082
+ // Ensure the new message is passed to generateReply
1083
+ // This preserves the original message id, making it easier for users to track responses
1084
+ speechHandle = this.generateReply({ userMessage, chatCtx });
1085
+ }
983
1086
 
984
1087
  const eouMetrics: EOUMetrics = {
985
1088
  type: 'eou_metrics',
@@ -987,6 +1090,7 @@ export class AgentActivity implements RecognitionHooks {
987
1090
  endOfUtteranceDelayMs: info.endOfUtteranceDelay,
988
1091
  transcriptionDelayMs: info.transcriptionDelay,
989
1092
  onUserTurnCompletedDelayMs: callbackDuration,
1093
+ lastSpeakingTimeMs: info.stoppedSpeakingAt ?? 0,
990
1094
  speechId: speechHandle.id,
991
1095
  };
992
1096
 
@@ -1139,10 +1243,9 @@ export class AgentActivity implements RecognitionHooks {
1139
1243
 
1140
1244
  chatCtx = chatCtx.copy();
1141
1245
 
1246
+ // Insert new message into temporary chat context for LLM inference
1142
1247
  if (newMessage) {
1143
1248
  chatCtx.insert(newMessage);
1144
- this.agent._chatCtx.insert(newMessage);
1145
- this.agentSession._conversationItemAdded(newMessage);
1146
1249
  }
1147
1250
 
1148
1251
  if (instructions) {
@@ -1157,7 +1260,6 @@ export class AgentActivity implements RecognitionHooks {
1157
1260
  }
1158
1261
  }
1159
1262
 
1160
- this.agentSession._updateAgentState('thinking');
1161
1263
  const tasks: Array<Task<void>> = [];
1162
1264
  const [llmTask, llmGenData] = performLLMInference(
1163
1265
  // preserve `this` context in llmNode
@@ -1185,6 +1287,12 @@ export class AgentActivity implements RecognitionHooks {
1185
1287
 
1186
1288
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
1187
1289
 
1290
+ // Add new message to actual chat context if the speech is scheduled
1291
+ if (newMessage && speechHandle.scheduled) {
1292
+ this.agent._chatCtx.insert(newMessage);
1293
+ this.agentSession._conversationItemAdded(newMessage);
1294
+ }
1295
+
1188
1296
  if (speechHandle.interrupted) {
1189
1297
  replyAbortController.abort();
1190
1298
  await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
@@ -1917,6 +2025,7 @@ export class AgentActivity implements RecognitionHooks {
1917
2025
  try {
1918
2026
  if (this._draining) return;
1919
2027
 
2028
+ this.cancelPreemptiveGeneration();
1920
2029
  this.createSpeechTask({
1921
2030
  task: Task.from(() => this.agent.onExit()),
1922
2031
  name: 'AgentActivity_onExit',
@@ -1937,6 +2046,7 @@ export class AgentActivity implements RecognitionHooks {
1937
2046
  this.logger.warn('task closing without draining');
1938
2047
  }
1939
2048
 
2049
+ this.cancelPreemptiveGeneration();
1940
2050
  // Unregister event handlers to prevent duplicate metrics
1941
2051
  if (this.llm instanceof LLM) {
1942
2052
  this.llm.off('metrics_collected', this.onMetricsCollected);
package/src/voice/agent_session.ts CHANGED
@@ -57,6 +57,7 @@ export interface VoiceOptions {
57
57
  minEndpointingDelay: number;
58
58
  maxEndpointingDelay: number;
59
59
  maxToolSteps: number;
60
+ preemptiveGeneration: boolean;
60
61
  }
61
62
 
62
63
  const defaultVoiceOptions: VoiceOptions = {
@@ -67,6 +68,7 @@ const defaultVoiceOptions: VoiceOptions = {
67
68
  minEndpointingDelay: 500,
68
69
  maxEndpointingDelay: 6000,
69
70
  maxToolSteps: 3,
71
+ preemptiveGeneration: false,
70
72
  } as const;
71
73
 
72
74
  export type TurnDetectionMode = 'stt' | 'vad' | 'realtime_llm' | 'manual' | _TurnDetector;
@@ -421,7 +423,7 @@ export class AgentSession<
421
423
  }
422
424
 
423
425
  /** @internal */
424
- _updateUserState(state: UserState) {
426
+ _updateUserState(state: UserState, _lastSpeakingTime?: number) {
425
427
  if (this.userState === state) {
426
428
  return;
427
429
  }