@livekit/agents 1.0.47 → 1.0.49

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151) hide show
  1. package/dist/beta/index.cjs +29 -0
  2. package/dist/beta/index.cjs.map +1 -0
  3. package/dist/beta/index.d.cts +2 -0
  4. package/dist/beta/index.d.ts +2 -0
  5. package/dist/beta/index.d.ts.map +1 -0
  6. package/dist/beta/index.js +7 -0
  7. package/dist/beta/index.js.map +1 -0
  8. package/dist/beta/workflows/index.cjs +29 -0
  9. package/dist/beta/workflows/index.cjs.map +1 -0
  10. package/dist/beta/workflows/index.d.cts +2 -0
  11. package/dist/beta/workflows/index.d.ts +2 -0
  12. package/dist/beta/workflows/index.d.ts.map +1 -0
  13. package/dist/beta/workflows/index.js +7 -0
  14. package/dist/beta/workflows/index.js.map +1 -0
  15. package/dist/beta/workflows/task_group.cjs +162 -0
  16. package/dist/beta/workflows/task_group.cjs.map +1 -0
  17. package/dist/beta/workflows/task_group.d.cts +32 -0
  18. package/dist/beta/workflows/task_group.d.ts +32 -0
  19. package/dist/beta/workflows/task_group.d.ts.map +1 -0
  20. package/dist/beta/workflows/task_group.js +138 -0
  21. package/dist/beta/workflows/task_group.js.map +1 -0
  22. package/dist/cpu.cjs +189 -0
  23. package/dist/cpu.cjs.map +1 -0
  24. package/dist/cpu.d.cts +24 -0
  25. package/dist/cpu.d.ts +24 -0
  26. package/dist/cpu.d.ts.map +1 -0
  27. package/dist/cpu.js +152 -0
  28. package/dist/cpu.js.map +1 -0
  29. package/dist/cpu.test.cjs +227 -0
  30. package/dist/cpu.test.cjs.map +1 -0
  31. package/dist/cpu.test.js +204 -0
  32. package/dist/cpu.test.js.map +1 -0
  33. package/dist/index.cjs +3 -0
  34. package/dist/index.cjs.map +1 -1
  35. package/dist/index.d.cts +2 -1
  36. package/dist/index.d.ts +2 -1
  37. package/dist/index.d.ts.map +1 -1
  38. package/dist/index.js +2 -0
  39. package/dist/index.js.map +1 -1
  40. package/dist/inference/api_protos.d.cts +59 -59
  41. package/dist/inference/api_protos.d.ts +59 -59
  42. package/dist/inference/llm.cjs.map +1 -1
  43. package/dist/inference/llm.d.cts +1 -1
  44. package/dist/inference/llm.d.ts +1 -1
  45. package/dist/inference/llm.d.ts.map +1 -1
  46. package/dist/inference/llm.js.map +1 -1
  47. package/dist/inference/tts.cjs.map +1 -1
  48. package/dist/inference/tts.d.cts +6 -0
  49. package/dist/inference/tts.d.ts +6 -0
  50. package/dist/inference/tts.d.ts.map +1 -1
  51. package/dist/inference/tts.js.map +1 -1
  52. package/dist/llm/chat_context.cjs +89 -1
  53. package/dist/llm/chat_context.cjs.map +1 -1
  54. package/dist/llm/chat_context.d.cts +10 -1
  55. package/dist/llm/chat_context.d.ts +10 -1
  56. package/dist/llm/chat_context.d.ts.map +1 -1
  57. package/dist/llm/chat_context.js +89 -1
  58. package/dist/llm/chat_context.js.map +1 -1
  59. package/dist/llm/chat_context.test.cjs +43 -0
  60. package/dist/llm/chat_context.test.cjs.map +1 -1
  61. package/dist/llm/chat_context.test.js +43 -0
  62. package/dist/llm/chat_context.test.js.map +1 -1
  63. package/dist/llm/index.cjs +2 -0
  64. package/dist/llm/index.cjs.map +1 -1
  65. package/dist/llm/index.d.cts +1 -1
  66. package/dist/llm/index.d.ts +1 -1
  67. package/dist/llm/index.d.ts.map +1 -1
  68. package/dist/llm/index.js +3 -1
  69. package/dist/llm/index.js.map +1 -1
  70. package/dist/llm/provider_format/index.d.cts +1 -1
  71. package/dist/llm/provider_format/index.d.ts +1 -1
  72. package/dist/llm/tool_context.cjs +7 -0
  73. package/dist/llm/tool_context.cjs.map +1 -1
  74. package/dist/llm/tool_context.d.cts +10 -2
  75. package/dist/llm/tool_context.d.ts +10 -2
  76. package/dist/llm/tool_context.d.ts.map +1 -1
  77. package/dist/llm/tool_context.js +6 -0
  78. package/dist/llm/tool_context.js.map +1 -1
  79. package/dist/utils.cjs +1 -0
  80. package/dist/utils.cjs.map +1 -1
  81. package/dist/utils.d.ts.map +1 -1
  82. package/dist/utils.js +1 -0
  83. package/dist/utils.js.map +1 -1
  84. package/dist/version.cjs +1 -1
  85. package/dist/version.js +1 -1
  86. package/dist/voice/agent.cjs +9 -0
  87. package/dist/voice/agent.cjs.map +1 -1
  88. package/dist/voice/agent.d.cts +1 -0
  89. package/dist/voice/agent.d.ts +1 -0
  90. package/dist/voice/agent.d.ts.map +1 -1
  91. package/dist/voice/agent.js +9 -0
  92. package/dist/voice/agent.js.map +1 -1
  93. package/dist/voice/agent_activity.cjs +67 -16
  94. package/dist/voice/agent_activity.cjs.map +1 -1
  95. package/dist/voice/agent_activity.d.cts +7 -0
  96. package/dist/voice/agent_activity.d.ts +7 -0
  97. package/dist/voice/agent_activity.d.ts.map +1 -1
  98. package/dist/voice/agent_activity.js +68 -17
  99. package/dist/voice/agent_activity.js.map +1 -1
  100. package/dist/voice/agent_session.cjs +27 -1
  101. package/dist/voice/agent_session.cjs.map +1 -1
  102. package/dist/voice/agent_session.d.cts +6 -0
  103. package/dist/voice/agent_session.d.ts +6 -0
  104. package/dist/voice/agent_session.d.ts.map +1 -1
  105. package/dist/voice/agent_session.js +27 -1
  106. package/dist/voice/agent_session.js.map +1 -1
  107. package/dist/voice/room_io/room_io.cjs +11 -2
  108. package/dist/voice/room_io/room_io.cjs.map +1 -1
  109. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  110. package/dist/voice/room_io/room_io.js +12 -3
  111. package/dist/voice/room_io/room_io.js.map +1 -1
  112. package/dist/voice/testing/fake_llm.cjs +127 -0
  113. package/dist/voice/testing/fake_llm.cjs.map +1 -0
  114. package/dist/voice/testing/fake_llm.d.cts +30 -0
  115. package/dist/voice/testing/fake_llm.d.ts +30 -0
  116. package/dist/voice/testing/fake_llm.d.ts.map +1 -0
  117. package/dist/voice/testing/fake_llm.js +103 -0
  118. package/dist/voice/testing/fake_llm.js.map +1 -0
  119. package/dist/voice/testing/index.cjs +3 -0
  120. package/dist/voice/testing/index.cjs.map +1 -1
  121. package/dist/voice/testing/index.d.cts +1 -0
  122. package/dist/voice/testing/index.d.ts +1 -0
  123. package/dist/voice/testing/index.d.ts.map +1 -1
  124. package/dist/voice/testing/index.js +2 -0
  125. package/dist/voice/testing/index.js.map +1 -1
  126. package/dist/worker.cjs +6 -29
  127. package/dist/worker.cjs.map +1 -1
  128. package/dist/worker.d.ts.map +1 -1
  129. package/dist/worker.js +6 -19
  130. package/dist/worker.js.map +1 -1
  131. package/package.json +1 -1
  132. package/src/beta/index.ts +9 -0
  133. package/src/beta/workflows/index.ts +9 -0
  134. package/src/beta/workflows/task_group.ts +194 -0
  135. package/src/cpu.test.ts +239 -0
  136. package/src/cpu.ts +173 -0
  137. package/src/index.ts +2 -1
  138. package/src/inference/llm.ts +2 -0
  139. package/src/inference/tts.ts +8 -1
  140. package/src/llm/chat_context.test.ts +48 -0
  141. package/src/llm/chat_context.ts +123 -0
  142. package/src/llm/index.ts +1 -0
  143. package/src/llm/tool_context.ts +14 -0
  144. package/src/utils.ts +5 -0
  145. package/src/voice/agent.ts +11 -0
  146. package/src/voice/agent_activity.ts +102 -16
  147. package/src/voice/agent_session.ts +33 -2
  148. package/src/voice/room_io/room_io.ts +14 -3
  149. package/src/voice/testing/fake_llm.ts +138 -0
  150. package/src/voice/testing/index.ts +2 -0
  151. package/src/worker.ts +34 -50
@@ -2,6 +2,8 @@
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import { describe, expect, it } from 'vitest';
5
+ import { initializeLogger } from '../log.js';
6
+ import { FakeLLM } from '../voice/testing/fake_llm.js';
5
7
  import {
6
8
  type AudioContent,
7
9
  ChatContext,
@@ -13,6 +15,8 @@ import {
13
15
  ReadonlyChatContext,
14
16
  } from './chat_context.js';
15
17
 
18
+ initializeLogger({ pretty: false, level: 'error' });
19
+
16
20
  describe('ChatContext.toJSON', () => {
17
21
  it('should match snapshot for empty context', () => {
18
22
  const context = new ChatContext();
@@ -283,6 +287,50 @@ describe('ChatContext.toJSON', () => {
283
287
  });
284
288
  });
285
289
 
290
+ describe('ChatContext._summarize', () => {
291
+ it('keeps chronological timestamps with summary + tail', async () => {
292
+ const ctx = new ChatContext();
293
+ ctx.addMessage({ role: 'system', content: 'System prompt', createdAt: 0 });
294
+ ctx.addMessage({ role: 'user', content: 'hello', createdAt: 1000 });
295
+ ctx.addMessage({ role: 'assistant', content: 'hi there', createdAt: 2000 });
296
+ ctx.insert(
297
+ new FunctionCallOutput({
298
+ callId: 'call_1',
299
+ name: 'lookup',
300
+ output: '{"ok":true}',
301
+ isError: false,
302
+ createdAt: 3500,
303
+ }),
304
+ );
305
+ ctx.addMessage({ role: 'user', content: 'my color is blue', createdAt: 3000 });
306
+ ctx.addMessage({ role: 'assistant', content: 'noted', createdAt: 4000 });
307
+
308
+ const fake = new FakeLLM([
309
+ {
310
+ input: 'Conversation to summarize:\n\nuser: hello\nassistant: hi there',
311
+ content: 'condensed head',
312
+ },
313
+ ]);
314
+
315
+ await ctx._summarize(fake, { keepLastTurns: 1 });
316
+
317
+ const summary = ctx.items.find(
318
+ (item) =>
319
+ item.type === 'message' && item.role === 'assistant' && item.extra?.is_summary === true,
320
+ );
321
+ expect(summary).toBeDefined();
322
+ if (!summary || summary.type !== 'message') {
323
+ throw new Error('summary message is missing');
324
+ }
325
+
326
+ expect(summary.createdAt).toBeCloseTo(2999.999, 6);
327
+
328
+ const createdAts = ctx.items.map((item) => item.createdAt);
329
+ const sorted = [...createdAts].sort((a, b) => a - b);
330
+ expect(createdAts).toEqual(sorted);
331
+ });
332
+ });
333
+
286
334
  describe('ReadonlyChatContext with immutable array', () => {
287
335
  it('should have readonly property set to true', () => {
288
336
  const items: ChatItem[] = [
@@ -3,6 +3,7 @@
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import type { AudioFrame, VideoFrame } from '@livekit/rtc-node';
5
5
  import { createImmutableArray, shortuuid } from '../utils.js';
6
+ import type { LLM } from './llm.js';
6
7
  import { type ProviderFormat, toChatCtx } from './provider_format/index.js';
7
8
  import type { JSONObject, JSONValue, ToolContext } from './tool_context.js';
8
9
 
@@ -95,12 +96,15 @@ export class ChatMessage {
95
96
 
96
97
  createdAt: number;
97
98
 
99
+ extra: Record<string, unknown>;
100
+
98
101
  constructor(params: {
99
102
  role: ChatRole;
100
103
  content: ChatContent[] | string;
101
104
  id?: string;
102
105
  interrupted?: boolean;
103
106
  createdAt?: number;
107
+ extra?: Record<string, unknown>;
104
108
  }) {
105
109
  const {
106
110
  role,
@@ -108,12 +112,14 @@ export class ChatMessage {
108
112
  id = shortuuid('item_'),
109
113
  interrupted = false,
110
114
  createdAt = Date.now(),
115
+ extra = {},
111
116
  } = params;
112
117
  this.id = id;
113
118
  this.role = role;
114
119
  this.content = Array.isArray(content) ? content : [content];
115
120
  this.interrupted = interrupted;
116
121
  this.createdAt = createdAt;
122
+ this.extra = extra;
117
123
  }
118
124
 
119
125
  static create(params: {
@@ -122,6 +128,7 @@ export class ChatMessage {
122
128
  id?: string;
123
129
  interrupted?: boolean;
124
130
  createdAt?: number;
131
+ extra?: Record<string, unknown>;
125
132
  }) {
126
133
  return new ChatMessage(params);
127
134
  }
@@ -401,6 +408,7 @@ export class AgentHandoffItem {
401
408
  }
402
409
  }
403
410
 
411
+ // TODO(parity): Add AgentConfigUpdate type to ChatItem union
404
412
  export type ChatItem = ChatMessage | FunctionCall | FunctionCallOutput | AgentHandoffItem;
405
413
 
406
414
  export class ChatContext {
@@ -431,6 +439,7 @@ export class ChatContext {
431
439
  id?: string;
432
440
  interrupted?: boolean;
433
441
  createdAt?: number;
442
+ extra?: Record<string, unknown>;
434
443
  }): ChatMessage {
435
444
  const msg = new ChatMessage(params);
436
445
  if (params.createdAt !== undefined) {
@@ -463,11 +472,13 @@ export class ChatContext {
463
472
  return idx !== -1 ? idx : undefined;
464
473
  }
465
474
 
475
+ // TODO(parity): Add excludeConfigUpdate option when AgentConfigUpdate is ported
466
476
  copy(
467
477
  options: {
468
478
  excludeFunctionCall?: boolean;
469
479
  excludeInstructions?: boolean;
470
480
  excludeEmptyMessage?: boolean;
481
+ excludeHandoff?: boolean;
471
482
  toolCtx?: ToolContext<any>; // eslint-disable-line @typescript-eslint/no-explicit-any
472
483
  } = {},
473
484
  ): ChatContext {
@@ -475,6 +486,7 @@ export class ChatContext {
475
486
  excludeFunctionCall = false,
476
487
  excludeInstructions = false,
477
488
  excludeEmptyMessage = false,
489
+ excludeHandoff = false,
478
490
  toolCtx,
479
491
  } = options;
480
492
  const items: ChatItem[] = [];
@@ -500,6 +512,10 @@ export class ChatContext {
500
512
  continue;
501
513
  }
502
514
 
515
+ if (excludeHandoff && item.type === 'agent_handoff') {
516
+ continue;
517
+ }
518
+
503
519
  if (toolCtx !== undefined && isToolCallOrOutput(item) && toolCtx[item.name] === undefined) {
504
520
  continue;
505
521
  }
@@ -510,6 +526,7 @@ export class ChatContext {
510
526
  return new ChatContext(items);
511
527
  }
512
528
 
529
+ // TODO(parity): Add excludeConfigUpdate option when AgentConfigUpdate is ported
513
530
  merge(
514
531
  other: ChatContext,
515
532
  options: {
@@ -762,6 +779,112 @@ export class ChatContext {
762
779
  return true;
763
780
  }
764
781
 
782
+ async _summarize(llm: LLM, options: { keepLastTurns?: number } = {}): Promise<ChatContext> {
783
+ const { keepLastTurns = 2 } = options;
784
+
785
+ const toSummarize: ChatMessage[] = [];
786
+ for (const item of this._items) {
787
+ if (item.type !== 'message') continue;
788
+ if (item.role !== 'user' && item.role !== 'assistant') continue;
789
+ if (item.extra?.is_summary === true) continue;
790
+
791
+ const text = (item.textContent ?? '').trim();
792
+ if (text) {
793
+ toSummarize.push(item);
794
+ }
795
+ }
796
+
797
+ if (toSummarize.length === 0) {
798
+ return this;
799
+ }
800
+
801
+ const tailN = Math.max(0, Math.min(toSummarize.length, keepLastTurns * 2));
802
+ let head: ChatMessage[];
803
+ let tail: ChatMessage[];
804
+ if (tailN === 0) {
805
+ head = toSummarize;
806
+ tail = [];
807
+ } else {
808
+ head = toSummarize.slice(0, -tailN);
809
+ tail = toSummarize.slice(-tailN);
810
+ }
811
+
812
+ if (head.length === 0) {
813
+ return this;
814
+ }
815
+
816
+ const sourceText = head
817
+ .map((m) => `${m.role}: ${(m.textContent ?? '').trim()}`)
818
+ .join('\n')
819
+ .trim();
820
+
821
+ if (!sourceText) {
822
+ return this;
823
+ }
824
+
825
+ // TODO: refactor this into LLMStream.collect API.
826
+ const promptCtx = new ChatContext();
827
+ promptCtx.addMessage({
828
+ role: 'system',
829
+ content:
830
+ 'Compress older chat history into a short, faithful summary.\n' +
831
+ 'Focus on user goals, constraints, decisions, key facts/preferences/entities, and pending tasks.\n' +
832
+ 'Exclude chit-chat and greetings. Be concise.',
833
+ });
834
+ promptCtx.addMessage({
835
+ role: 'user',
836
+ content: `Conversation to summarize:\n\n${sourceText}`,
837
+ });
838
+
839
+ const chunks: string[] = [];
840
+ for await (const chunk of llm.chat({ chatCtx: promptCtx })) {
841
+ if (chunk.delta?.content) {
842
+ chunks.push(chunk.delta.content);
843
+ }
844
+ }
845
+
846
+ const summary = chunks.join('').trim();
847
+ if (!summary) {
848
+ return this;
849
+ }
850
+
851
+ const tailStartTs = tail.length > 0 ? tail[0]!.createdAt : Infinity;
852
+
853
+ const preserved: ChatItem[] = [];
854
+ for (const it of this._items) {
855
+ if (
856
+ (it.type === 'function_call' || it.type === 'function_call_output') &&
857
+ it.createdAt < tailStartTs
858
+ ) {
859
+ continue;
860
+ }
861
+
862
+ if (it.type === 'message' && (it.role === 'user' || it.role === 'assistant')) {
863
+ continue;
864
+ }
865
+
866
+ preserved.push(it);
867
+ }
868
+
869
+ this._items = preserved;
870
+
871
+ const createdAtHint =
872
+ tail.length > 0 ? tail[0]!.createdAt - 1e-3 : head[head.length - 1]!.createdAt + 1e-3;
873
+
874
+ this.addMessage({
875
+ role: 'assistant',
876
+ content: `[history summary]\n${summary}`,
877
+ createdAt: createdAtHint,
878
+ extra: { is_summary: true },
879
+ });
880
+
881
+ for (const msg of tail) {
882
+ this.insert(msg);
883
+ }
884
+
885
+ return this;
886
+ }
887
+
765
888
  /**
766
889
  * Indicates whether the context is read-only
767
890
  */
package/src/llm/index.ts CHANGED
@@ -6,6 +6,7 @@ export {
6
6
  isFunctionTool,
7
7
  tool,
8
8
  ToolError,
9
+ ToolFlag,
9
10
  type AgentHandoff,
10
11
  type FunctionTool,
11
12
  type ProviderDefinedTool,
@@ -80,6 +80,13 @@ export class ToolError extends Error {
80
80
  }
81
81
  }
82
82
 
83
+ export const ToolFlag = {
84
+ NONE: 0,
85
+ IGNORE_ON_ENTER: 1 << 0,
86
+ } as const;
87
+
88
+ export type ToolFlag = (typeof ToolFlag)[keyof typeof ToolFlag];
89
+
83
90
  export interface AgentHandoff {
84
91
  /**
85
92
  * The agent to handoff to.
@@ -178,6 +185,8 @@ export interface FunctionTool<
178
185
  */
179
186
  execute: ToolExecuteFunction<Parameters, UserData, Result>;
180
187
 
188
+ flags: number;
189
+
181
190
  [FUNCTION_TOOL_SYMBOL]: true;
182
191
  }
183
192
 
@@ -242,10 +251,12 @@ export function tool<
242
251
  description,
243
252
  parameters,
244
253
  execute,
254
+ flags,
245
255
  }: {
246
256
  description: string;
247
257
  parameters: Schema;
248
258
  execute: ToolExecuteFunction<InferToolInput<Schema>, UserData, Result>;
259
+ flags?: number;
249
260
  }): FunctionTool<InferToolInput<Schema>, UserData, Result>;
250
261
 
251
262
  /**
@@ -254,10 +265,12 @@ export function tool<
254
265
  export function tool<UserData = UnknownUserData, Result = unknown>({
255
266
  description,
256
267
  execute,
268
+ flags,
257
269
  }: {
258
270
  description: string;
259
271
  parameters?: never;
260
272
  execute: ToolExecuteFunction<Record<string, never>, UserData, Result>;
273
+ flags?: number;
261
274
  }): FunctionTool<Record<string, never>, UserData, Result>;
262
275
 
263
276
  /**
@@ -295,6 +308,7 @@ export function tool(tool: any): any {
295
308
  description: tool.description,
296
309
  parameters,
297
310
  execute: tool.execute,
311
+ flags: tool.flags ?? ToolFlag.NONE,
298
312
  [TOOL_SYMBOL]: true,
299
313
  [FUNCTION_TOOL_SYMBOL]: true,
300
314
  };
package/src/utils.ts CHANGED
@@ -173,6 +173,11 @@ export class Future<T = void> {
173
173
  this.#rejected = true;
174
174
  this.#error = error;
175
175
  this.#rejectPromise(error);
176
+ // Python calls Future.exception() right after set_exception() to silence
177
+ // "exception was never retrieved" warnings. In JS, consume the rejection
178
+ // immediately so Node does not emit unhandled-rejection noise before a
179
+ // later await/catch observes it.
180
+ void this.#await.catch(() => undefined);
176
181
  }
177
182
  }
178
183
 
@@ -302,6 +302,17 @@ export class Agent<UserData = any> {
302
302
  this._agentActivity.updateChatCtx(chatCtx);
303
303
  }
304
304
 
305
+ // TODO(parity): Add when AgentConfigUpdate is ported to ChatContext.
306
+ async updateTools(tools: ToolContext): Promise<void> {
307
+ if (!this._agentActivity) {
308
+ this._tools = { ...tools };
309
+ this._chatCtx = this._chatCtx.copy({ toolCtx: this._tools });
310
+ return;
311
+ }
312
+
313
+ await this._agentActivity.updateTools(tools);
314
+ }
315
+
305
316
  static default = {
306
317
  async sttNode(
307
318
  agent: Agent,
@@ -7,7 +7,7 @@ import type { Span } from '@opentelemetry/api';
7
7
  import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api';
8
8
  import { Heap } from 'heap-js';
9
9
  import { AsyncLocalStorage } from 'node:async_hooks';
10
- import { ReadableStream } from 'node:stream/web';
10
+ import { ReadableStream, TransformStream } from 'node:stream/web';
11
11
  import { type ChatContext, ChatMessage } from '../llm/chat_context.js';
12
12
  import {
13
13
  type ChatItem,
@@ -23,6 +23,7 @@ import {
23
23
  type RealtimeSession,
24
24
  type ToolChoice,
25
25
  type ToolContext,
26
+ ToolFlag,
26
27
  } from '../llm/index.js';
27
28
  import type { LLMError } from '../llm/llm.js';
28
29
  import { isSameToolChoice, isSameToolContext } from '../llm/tool_context.js';
@@ -83,6 +84,12 @@ import { SpeechHandle } from './speech_handle.js';
83
84
  import { setParticipantSpanAttributes } from './utils.js';
84
85
 
85
86
  export const agentActivityStorage = new AsyncLocalStorage<AgentActivity>();
87
+ export const onEnterStorage = new AsyncLocalStorage<OnEnterData>();
88
+
89
+ interface OnEnterData {
90
+ session: AgentSession;
91
+ agent: Agent;
92
+ }
86
93
 
87
94
  interface PreemptiveGeneration {
88
95
  speechHandle: SpeechHandle;
@@ -312,6 +319,8 @@ export class AgentActivity implements RecognitionHooks {
312
319
  }
313
320
  }
314
321
 
322
+ // TODO(parity): Record initial AgentConfigUpdate in chat context
323
+
315
324
  // metrics and error handling
316
325
  if (this.llm instanceof LLM) {
317
326
  this.llm.on('metrics_collected', this.onMetricsCollected);
@@ -354,11 +363,13 @@ export class AgentActivity implements RecognitionHooks {
354
363
  if (runOnEnter) {
355
364
  this._onEnterTask = this.createSpeechTask({
356
365
  taskFn: () =>
357
- tracer.startActiveSpan(async () => this.agent.onEnter(), {
358
- name: 'on_enter',
359
- context: trace.setSpan(ROOT_CONTEXT, startSpan),
360
- attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
361
- }),
366
+ onEnterStorage.run({ session: this.agentSession, agent: this.agent }, () =>
367
+ tracer.startActiveSpan(async () => this.agent.onEnter(), {
368
+ name: 'on_enter',
369
+ context: trace.setSpan(ROOT_CONTEXT, startSpan),
370
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
371
+ }),
372
+ ),
362
373
  inlineTask: true,
363
374
  name: 'AgentActivity_onEnter',
364
375
  });
@@ -446,6 +457,20 @@ export class AgentActivity implements RecognitionHooks {
446
457
  }
447
458
  }
448
459
 
460
+ // TODO: Add when AgentConfigUpdate is ported to ChatContext.
461
+ async updateTools(tools: ToolContext): Promise<void> {
462
+ this.agent._tools = { ...tools };
463
+
464
+ if (this.realtimeSession) {
465
+ await this.realtimeSession.updateTools(tools);
466
+ }
467
+
468
+ if (this.llm instanceof LLM) {
469
  + // for realtime LLM, we assume the server will remove invalid tool messages
470
+ await this.updateChatCtx(this.agent._chatCtx.copy({ toolCtx: tools }));
471
+ }
472
+ }
473
+
449
474
  updateOptions({ toolChoice }: { toolChoice?: ToolChoice | null }): void {
450
475
  if (toolChoice !== undefined) {
451
476
  this.toolChoice = toolChoice;
@@ -460,15 +485,36 @@ export class AgentActivity implements RecognitionHooks {
460
485
  void this.audioStream.close();
461
486
  this.audioStream = new MultiInputStream<AudioFrame>();
462
487
 
488
+ // Filter is applied on this.audioStream.stream (downstream of MultiInputStream) rather
489
+ // than on the source audioStream via pipeThrough. pipeThrough locks its source stream, so
490
+ // if it were applied directly on audioStream, that lock would survive MultiInputStream.close()
491
+ // and make audioStream permanently locked for subsequent attachAudioInput calls (e.g. handoff).
492
+ const aecWarmupAudioFilter = new TransformStream<AudioFrame, AudioFrame>({
493
+ transform: (frame, controller) => {
494
+ const shouldDiscardForAecWarmup =
495
+ this.agentSession.agentState === 'speaking' && this.agentSession._aecWarmupRemaining > 0;
496
+ if (!shouldDiscardForAecWarmup) {
497
+ controller.enqueue(frame);
498
+ }
499
+ },
500
+ });
501
+
463
502
  this.audioStreamId = this.audioStream.addInputStream(audioStream);
464
- const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
465
503
 
466
- if (this.realtimeSession) {
504
+ if (this.realtimeSession && this.audioRecognition) {
505
+ const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream
506
+ .pipeThrough(aecWarmupAudioFilter)
507
+ .tee();
467
508
  this.realtimeSession.setInputAudioStream(realtimeAudioStream);
468
- }
469
-
470
- if (this.audioRecognition) {
471
509
  this.audioRecognition.setInputAudioStream(recognitionAudioStream);
510
+ } else if (this.realtimeSession) {
511
+ this.realtimeSession.setInputAudioStream(
512
+ this.audioStream.stream.pipeThrough(aecWarmupAudioFilter),
513
+ );
514
+ } else if (this.audioRecognition) {
515
+ this.audioRecognition.setInputAudioStream(
516
+ this.audioStream.stream.pipeThrough(aecWarmupAudioFilter),
517
+ );
472
518
  }
473
519
  }
474
520
 
@@ -730,6 +776,11 @@ export class AgentActivity implements RecognitionHooks {
730
776
  }
731
777
 
732
778
  private interruptByAudioActivity(): void {
779
+ if (this.agentSession._aecWarmupRemaining > 0) {
780
+ // Disable interruption from audio activity while AEC warmup is active.
781
+ return;
782
+ }
783
+
733
784
  if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
734
785
  // skip speech handle interruption if server side turn detection is enabled
735
786
  return;
@@ -1129,12 +1180,25 @@ export class AgentActivity implements RecognitionHooks {
1129
1180
  instructions = `${this.agent.instructions}\n${instructions}`;
1130
1181
  }
1131
1182
 
1183
+ // Filter out tools with IGNORE_ON_ENTER flag when generateReply is called inside onEnter
1184
+ const onEnterData = onEnterStorage.getStore();
1185
+ const shouldFilterTools =
1186
+ onEnterData?.agent === this.agent && onEnterData?.session === this.agentSession;
1187
+
1188
+ const tools = shouldFilterTools
1189
+ ? Object.fromEntries(
1190
+ Object.entries(this.agent.toolCtx).filter(
1191
+ ([, fnTool]) => !(fnTool.flags & ToolFlag.IGNORE_ON_ENTER),
1192
+ ),
1193
+ )
1194
+ : this.agent.toolCtx;
1195
+
1132
1196
  const task = this.createSpeechTask({
1133
1197
  taskFn: (abortController: AbortController) =>
1134
1198
  this.pipelineReplyTask(
1135
1199
  handle,
1136
1200
  chatCtx ?? this.agent.chatCtx,
1137
- this.agent.toolCtx,
1201
+ tools,
1138
1202
  {
1139
1203
  toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
1140
1204
  },
@@ -1172,7 +1236,24 @@ export class AgentActivity implements RecognitionHooks {
1172
1236
 
1173
1237
  this.realtimeSession?.interrupt();
1174
1238
 
1175
- if (currentSpeech === undefined) {
1239
+ if (force) {
1240
+ // Force-interrupt (used during shutdown): cancel all speech tasks so they
1241
+ // don't block on I/O that will never complete (e.g. audioOutput.waitForPlayout()
1242
+ // when the room is disconnected). Mark the current speech as done immediately
1243
+ // so the interrupt future resolves without waiting for tasks to finish.
1244
+ // Clear the queue so mainTask doesn't dequeue already-interrupted handles
1245
+ // and hang on _waitForGeneration() (the generation future created by
1246
+ // _authorizeGeneration would never resolve since _markDone is a no-op
1247
+ // once doneFut is already settled).
1248
+ for (const task of this.speechTasks) {
1249
+ task.cancel();
1250
+ }
1251
+ if (currentSpeech && !currentSpeech.done()) {
1252
+ currentSpeech._markDone();
1253
+ }
1254
+ this.speechQueue.clear();
1255
+ future.resolve();
1256
+ } else if (currentSpeech === undefined) {
1176
1257
  future.resolve();
1177
1258
  } else {
1178
1259
  currentSpeech.addDoneCallback(() => {
@@ -1680,9 +1761,7 @@ export class AgentActivity implements RecognitionHooks {
1680
1761
  }
1681
1762
 
1682
1763
  replyAbortController.abort();
1683
- await Promise.allSettled(
1684
- tasks.map((task) => task.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT)),
1685
- );
1764
+ await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1686
1765
 
1687
1766
  let forwardedText = textOut?.text || '';
1688
1767
 
@@ -2511,6 +2590,13 @@ export class AgentActivity implements RecognitionHooks {
2511
2590
  const unlock = await this.lock.lock();
2512
2591
  try {
2513
2592
  this.cancelPreemptiveGeneration();
2593
+
2594
+ await cancelAndWait(Array.from(this.speechTasks), AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
2595
+
2596
+ if (this._currentSpeech && !this._currentSpeech.done()) {
2597
+ this._currentSpeech._markDone();
2598
+ }
2599
+
2514
2600
  await this._closeSessionResources();
2515
2601
 
2516
2602
  if (this._mainTask) {
@@ -77,6 +77,7 @@ export interface VoiceOptions {
77
77
  maxToolSteps: number;
78
78
  preemptiveGeneration: boolean;
79
79
  userAwayTimeout?: number | null;
80
+ aecWarmupDuration: number | null;
80
81
  useTtsAlignedTranscript: boolean;
81
82
  }
82
83
 
@@ -90,6 +91,7 @@ const defaultVoiceOptions: VoiceOptions = {
90
91
  maxToolSteps: 3,
91
92
  preemptiveGeneration: false,
92
93
  userAwayTimeout: 15.0,
94
+ aecWarmupDuration: 3000,
93
95
  useTtsAlignedTranscript: true,
94
96
  } as const;
95
97
 
@@ -158,6 +160,8 @@ export class AgentSession<
158
160
  private closingTask: Promise<void> | null = null;
159
161
  private userAwayTimer: NodeJS.Timeout | null = null;
160
162
 
163
+ private _aecWarmupTimer: NodeJS.Timeout | null = null;
164
+
161
165
  // Connection options for STT, LLM, and TTS
162
166
  private _connOptions: ResolvedSessionConnectOptions;
163
167
 
@@ -169,6 +173,9 @@ export class AgentSession<
169
173
  private userSpeakingSpan?: Span;
170
174
  private agentSpeakingSpan?: Span;
171
175
 
176
+ /** @internal */
177
+ _aecWarmupRemaining = 0;
178
+
172
179
  /** @internal */
173
180
  _recorderIO?: RecorderIO;
174
181
 
@@ -241,6 +248,7 @@ export class AgentSession<
241
248
  // This is the "global" chat context, it holds the entire conversation history
242
249
  this._chatCtx = ChatContext.empty();
243
250
  this.options = { ...defaultVoiceOptions, ...voiceOptions };
251
+ this._aecWarmupRemaining = this.options.aecWarmupDuration ?? 0;
244
252
 
245
253
  this._onUserInputTranscribed = this._onUserInputTranscribed.bind(this);
246
254
  this.on(AgentSessionEventTypes.UserInputTranscribed, this._onUserInputTranscribed);
@@ -774,7 +782,9 @@ export class AgentSession<
774
782
  if (this.closingTask) {
775
783
  return;
776
784
  }
777
- this.closeImpl(reason, error, drain);
785
+ this.closingTask = this.closeImpl(reason, error, drain).finally(() => {
786
+ this.closingTask = null;
787
+ });
778
788
  }
779
789
 
780
790
  /** @internal */
@@ -845,6 +855,14 @@ export class AgentSession<
845
855
  this.agentSpeakingSpan = undefined;
846
856
  }
847
857
 
858
+ if (state === 'speaking' && this._aecWarmupRemaining > 0 && this._aecWarmupTimer === null) {
859
+ this._aecWarmupTimer = setTimeout(() => this._onAecWarmupExpired(), this._aecWarmupRemaining);
860
+ this.logger.debug(
861
+ { warmupDurationMs: this._aecWarmupRemaining },
862
+ 'aec warmup active, disabling interruptions',
863
+ );
864
+ }
865
+
848
866
  const oldState = this._agentState;
849
867
  this._agentState = state;
850
868
 
@@ -938,6 +956,19 @@ export class AgentSession<
938
956
  }
939
957
  }
940
958
 
959
+ /** @internal */
960
+ _onAecWarmupExpired(): void {
961
+ if (this._aecWarmupRemaining > 0) {
962
+ this.logger.debug('aec warmup expired, re-enabling interruptions');
963
+ }
964
+
965
+ this._aecWarmupRemaining = 0;
966
+ if (this._aecWarmupTimer !== null) {
967
+ clearTimeout(this._aecWarmupTimer);
968
+ this._aecWarmupTimer = null;
969
+ }
970
+ }
971
+
941
972
  private _onUserInputTranscribed(ev: UserInputTranscribedEvent): void {
942
973
  if (this.userState === 'away' && ev.isFinal) {
943
974
  this.logger.debug('User returned from away state due to speech input');
@@ -969,6 +1000,7 @@ export class AgentSession<
969
1000
  }
970
1001
 
971
1002
  this._cancelUserAwayTimer();
1003
+ this._onAecWarmupExpired();
972
1004
  this.off(AgentSessionEventTypes.UserInputTranscribed, this._onUserInputTranscribed);
973
1005
 
974
1006
  if (this.activity) {
@@ -976,7 +1008,6 @@ export class AgentSession<
976
1008
  try {
977
1009
  await this.activity.interrupt({ force: true }).await;
978
1010
  } catch (error) {
979
- // Uninterruptible speech can throw during forced interruption.
980
1011
  this.logger.warn({ error }, 'Error interrupting activity');
981
1012
  }
982
1013
  }