@livekit/agents 1.0.17 → 1.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. package/dist/inference/llm.cjs +35 -13
  2. package/dist/inference/llm.cjs.map +1 -1
  3. package/dist/inference/llm.d.cts +10 -5
  4. package/dist/inference/llm.d.ts +10 -5
  5. package/dist/inference/llm.d.ts.map +1 -1
  6. package/dist/inference/llm.js +35 -13
  7. package/dist/inference/llm.js.map +1 -1
  8. package/dist/llm/chat_context.d.cts +1 -1
  9. package/dist/llm/chat_context.d.ts +1 -1
  10. package/dist/llm/llm.cjs.map +1 -1
  11. package/dist/llm/llm.d.cts +1 -1
  12. package/dist/llm/llm.d.ts +1 -1
  13. package/dist/llm/llm.d.ts.map +1 -1
  14. package/dist/llm/llm.js.map +1 -1
  15. package/dist/llm/provider_format/google.cjs.map +1 -1
  16. package/dist/llm/provider_format/google.d.cts +1 -1
  17. package/dist/llm/provider_format/google.d.ts +1 -1
  18. package/dist/llm/provider_format/google.d.ts.map +1 -1
  19. package/dist/llm/provider_format/google.js.map +1 -1
  20. package/dist/llm/provider_format/index.d.cts +1 -1
  21. package/dist/llm/provider_format/index.d.ts +1 -1
  22. package/dist/llm/provider_format/index.d.ts.map +1 -1
  23. package/dist/llm/realtime.cjs.map +1 -1
  24. package/dist/llm/realtime.d.cts +4 -0
  25. package/dist/llm/realtime.d.ts +4 -0
  26. package/dist/llm/realtime.d.ts.map +1 -1
  27. package/dist/llm/realtime.js.map +1 -1
  28. package/dist/llm/utils.cjs +2 -2
  29. package/dist/llm/utils.cjs.map +1 -1
  30. package/dist/llm/utils.d.cts +1 -1
  31. package/dist/llm/utils.d.ts +1 -1
  32. package/dist/llm/utils.d.ts.map +1 -1
  33. package/dist/llm/utils.js +2 -2
  34. package/dist/llm/utils.js.map +1 -1
  35. package/dist/llm/zod-utils.cjs +6 -3
  36. package/dist/llm/zod-utils.cjs.map +1 -1
  37. package/dist/llm/zod-utils.d.cts +1 -1
  38. package/dist/llm/zod-utils.d.ts +1 -1
  39. package/dist/llm/zod-utils.d.ts.map +1 -1
  40. package/dist/llm/zod-utils.js +6 -3
  41. package/dist/llm/zod-utils.js.map +1 -1
  42. package/dist/llm/zod-utils.test.cjs +83 -0
  43. package/dist/llm/zod-utils.test.cjs.map +1 -1
  44. package/dist/llm/zod-utils.test.js +83 -0
  45. package/dist/llm/zod-utils.test.js.map +1 -1
  46. package/dist/utils.cjs.map +1 -1
  47. package/dist/utils.d.cts +7 -0
  48. package/dist/utils.d.ts +7 -0
  49. package/dist/utils.d.ts.map +1 -1
  50. package/dist/utils.js.map +1 -1
  51. package/dist/voice/agent_activity.cjs +69 -20
  52. package/dist/voice/agent_activity.cjs.map +1 -1
  53. package/dist/voice/agent_activity.d.ts.map +1 -1
  54. package/dist/voice/agent_activity.js +69 -20
  55. package/dist/voice/agent_activity.js.map +1 -1
  56. package/dist/voice/agent_session.cjs +40 -1
  57. package/dist/voice/agent_session.cjs.map +1 -1
  58. package/dist/voice/agent_session.d.cts +5 -0
  59. package/dist/voice/agent_session.d.ts +5 -0
  60. package/dist/voice/agent_session.d.ts.map +1 -1
  61. package/dist/voice/agent_session.js +40 -1
  62. package/dist/voice/agent_session.js.map +1 -1
  63. package/dist/voice/interruption_detection.test.cjs +114 -0
  64. package/dist/voice/interruption_detection.test.cjs.map +1 -0
  65. package/dist/voice/interruption_detection.test.js +113 -0
  66. package/dist/voice/interruption_detection.test.js.map +1 -0
  67. package/dist/voice/room_io/room_io.cjs +3 -0
  68. package/dist/voice/room_io/room_io.cjs.map +1 -1
  69. package/dist/voice/room_io/room_io.d.cts +1 -0
  70. package/dist/voice/room_io/room_io.d.ts +1 -0
  71. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  72. package/dist/voice/room_io/room_io.js +3 -0
  73. package/dist/voice/room_io/room_io.js.map +1 -1
  74. package/package.json +3 -3
  75. package/src/inference/llm.ts +53 -21
  76. package/src/llm/__snapshots__/zod-utils.test.ts.snap +218 -0
  77. package/src/llm/llm.ts +1 -1
  78. package/src/llm/provider_format/google.ts +4 -4
  79. package/src/llm/realtime.ts +8 -1
  80. package/src/llm/utils.ts +7 -2
  81. package/src/llm/zod-utils.test.ts +101 -0
  82. package/src/llm/zod-utils.ts +12 -3
  83. package/src/utils.ts +17 -0
  84. package/src/voice/agent_activity.ts +96 -24
  85. package/src/voice/agent_session.ts +54 -0
  86. package/src/voice/interruption_detection.test.ts +151 -0
  87. package/src/voice/room_io/room_io.ts +4 -0
package/src/llm/__snapshots__/zod-utils.test.ts.snap CHANGED
@@ -339,3 +339,221 @@ exports[`Zod Utils > zodSchemaToJsonSchema > Zod v4 schemas > should handle v4 s
339
339
  "type": "object",
340
340
  }
341
341
  `;
342
+
343
+ exports[`Zod Utils > zodSchemaToJsonSchema > strict parameter > should handle arrays in strict mode 1`] = `
344
+ {
345
+ "$schema": "http://json-schema.org/draft-07/schema#",
346
+ "additionalProperties": false,
347
+ "properties": {
348
+ "numbers": {
349
+ "items": {
350
+ "type": "number",
351
+ },
352
+ "type": "array",
353
+ },
354
+ "tags": {
355
+ "items": {
356
+ "type": "string",
357
+ },
358
+ "type": "array",
359
+ },
360
+ },
361
+ "required": [
362
+ "tags",
363
+ "numbers",
364
+ ],
365
+ "type": "object",
366
+ }
367
+ `;
368
+
369
+ exports[`Zod Utils > zodSchemaToJsonSchema > strict parameter > should handle default values in strict mode 1`] = `
370
+ {
371
+ "$schema": "http://json-schema.org/draft-07/schema#",
372
+ "additionalProperties": false,
373
+ "properties": {
374
+ "active": {
375
+ "default": true,
376
+ "type": "boolean",
377
+ },
378
+ "name": {
379
+ "type": "string",
380
+ },
381
+ "role": {
382
+ "default": "user",
383
+ "type": "string",
384
+ },
385
+ },
386
+ "required": [
387
+ "name",
388
+ "role",
389
+ "active",
390
+ ],
391
+ "type": "object",
392
+ }
393
+ `;
394
+
395
+ exports[`Zod Utils > zodSchemaToJsonSchema > strict parameter > should handle nested objects in strict mode 1`] = `
396
+ {
397
+ "$schema": "http://json-schema.org/draft-07/schema#",
398
+ "additionalProperties": false,
399
+ "properties": {
400
+ "metadata": {
401
+ "additionalProperties": false,
402
+ "properties": {
403
+ "created": {
404
+ "type": "string",
405
+ },
406
+ },
407
+ "required": [
408
+ "created",
409
+ ],
410
+ "type": "object",
411
+ },
412
+ "user": {
413
+ "additionalProperties": false,
414
+ "properties": {
415
+ "email": {
416
+ "anyOf": [
417
+ {
418
+ "type": "string",
419
+ },
420
+ {
421
+ "type": "null",
422
+ },
423
+ ],
424
+ },
425
+ "name": {
426
+ "type": "string",
427
+ },
428
+ },
429
+ "required": [
430
+ "name",
431
+ "email",
432
+ ],
433
+ "type": "object",
434
+ },
435
+ },
436
+ "required": [
437
+ "user",
438
+ "metadata",
439
+ ],
440
+ "type": "object",
441
+ }
442
+ `;
443
+
444
+ exports[`Zod Utils > zodSchemaToJsonSchema > strict parameter > should handle nullable fields in strict mode 1`] = `
445
+ {
446
+ "$schema": "http://json-schema.org/draft-07/schema#",
447
+ "additionalProperties": false,
448
+ "properties": {
449
+ "optional": {
450
+ "anyOf": [
451
+ {
452
+ "type": "string",
453
+ },
454
+ {
455
+ "type": "null",
456
+ },
457
+ ],
458
+ },
459
+ "required": {
460
+ "type": "string",
461
+ },
462
+ },
463
+ "required": [
464
+ "required",
465
+ "optional",
466
+ ],
467
+ "type": "object",
468
+ }
469
+ `;
470
+
471
+ exports[`Zod Utils > zodSchemaToJsonSchema > strict parameter > should handle optional fields in strict mode 1`] = `
472
+ {
473
+ "$schema": "http://json-schema.org/draft-07/schema#",
474
+ "additionalProperties": false,
475
+ "properties": {
476
+ "optional": {
477
+ "anyOf": [
478
+ {
479
+ "type": "string",
480
+ },
481
+ {
482
+ "type": "null",
483
+ },
484
+ ],
485
+ },
486
+ "required": {
487
+ "type": "string",
488
+ },
489
+ },
490
+ "required": [
491
+ "required",
492
+ "optional",
493
+ ],
494
+ "type": "object",
495
+ }
496
+ `;
497
+
498
+ exports[`Zod Utils > zodSchemaToJsonSchema > strict parameter > should handle v3 schemas in strict mode 1`] = `
499
+ {
500
+ "$schema": "https://json-schema.org/draft/2019-09/schema#",
501
+ "additionalProperties": false,
502
+ "properties": {
503
+ "age": {
504
+ "type": [
505
+ "number",
506
+ "null",
507
+ ],
508
+ },
509
+ "name": {
510
+ "type": "string",
511
+ },
512
+ },
513
+ "required": [
514
+ "name",
515
+ "age",
516
+ ],
517
+ "type": "object",
518
+ }
519
+ `;
520
+
521
+ exports[`Zod Utils > zodSchemaToJsonSchema > strict parameter > should produce standard JSON schema with strict: false 1`] = `
522
+ {
523
+ "$schema": "http://json-schema.org/draft-07/schema#",
524
+ "additionalProperties": false,
525
+ "properties": {
526
+ "age": {
527
+ "type": "number",
528
+ },
529
+ "name": {
530
+ "type": "string",
531
+ },
532
+ },
533
+ "required": [
534
+ "name",
535
+ "age",
536
+ ],
537
+ "type": "object",
538
+ }
539
+ `;
540
+
541
+ exports[`Zod Utils > zodSchemaToJsonSchema > strict parameter > should produce strict JSON schema with strict: true 1`] = `
542
+ {
543
+ "$schema": "http://json-schema.org/draft-07/schema#",
544
+ "additionalProperties": false,
545
+ "properties": {
546
+ "age": {
547
+ "type": "number",
548
+ },
549
+ "name": {
550
+ "type": "string",
551
+ },
552
+ },
553
+ "required": [
554
+ "name",
555
+ "age",
556
+ ],
557
+ "type": "object",
558
+ }
559
+ `;
package/src/llm/llm.ts CHANGED
@@ -78,7 +78,7 @@ export abstract class LLM extends (EventEmitter as new () => TypedEmitter<LLMCal
78
78
  connOptions?: APIConnectOptions;
79
79
  parallelToolCalls?: boolean;
80
80
  toolChoice?: ToolChoice;
81
- extraKwargs?: Record<string, any>;
81
+ extraKwargs?: Record<string, unknown>;
82
82
  }): LLMStream;
83
83
 
84
84
  /**
package/src/llm/provider_format/google.ts CHANGED
@@ -12,11 +12,11 @@ export interface GoogleFormatData {
12
12
  export async function toChatCtx(
13
13
  chatCtx: ChatContext,
14
14
  injectDummyUserMessage: boolean = true,
15
- ): Promise<[Record<string, any>[], GoogleFormatData]> {
16
- const turns: Record<string, any>[] = [];
15
+ ): Promise<[Record<string, unknown>[], GoogleFormatData]> {
16
+ const turns: Record<string, unknown>[] = [];
17
17
  const systemMessages: string[] = [];
18
18
  let currentRole: string | null = null;
19
- let parts: Record<string, any>[] = [];
19
+ let parts: Record<string, unknown>[] = [];
20
20
 
21
21
  // Flatten all grouped tool calls to get individual messages
22
22
  const itemGroups = groupToolCalls(chatCtx);
@@ -104,7 +104,7 @@ export async function toChatCtx(
104
104
  ];
105
105
  }
106
106
 
107
- async function toImagePart(image: ImageContent): Promise<Record<string, any>> {
107
+ async function toImagePart(image: ImageContent): Promise<Record<string, unknown>> {
108
108
  const cacheKey = 'serialized_image';
109
109
  if (!image._cache[cacheKey]) {
110
110
  image._cache[cacheKey] = await serializeImage(image);
package/src/llm/realtime.ts CHANGED
@@ -19,6 +19,7 @@ export interface MessageGeneration {
19
19
  messageId: string;
20
20
  textStream: ReadableStream<string>;
21
21
  audioStream: ReadableStream<AudioFrame>;
22
+ modalities?: Promise<('text' | 'audio')[]>;
22
23
  }
23
24
 
24
25
  export interface GenerationCreatedEvent {
@@ -40,6 +41,7 @@ export interface RealtimeCapabilities {
40
41
  turnDetection: boolean;
41
42
  userTranscription: boolean;
42
43
  autoToolReplyGeneration: boolean;
44
+ audioOutput: boolean;
43
45
  }
44
46
 
45
47
  export interface InputTranscriptionCompleted {
@@ -121,7 +123,12 @@ export abstract class RealtimeSession extends EventEmitter {
121
123
  /**
122
124
  * Truncate the message at the given audio end time
123
125
  */
124
- abstract truncate(options: { messageId: string; audioEndMs: number }): Promise<void>;
126
+ abstract truncate(options: {
127
+ messageId: string;
128
+ audioEndMs: number;
129
+ modalities?: ('text' | 'audio')[];
130
+ audioTranscript?: string;
131
+ }): Promise<void>;
125
132
 
126
133
  async close(): Promise<void> {
127
134
  this._mainTask.cancel();
package/src/llm/utils.ts CHANGED
@@ -323,9 +323,14 @@ export function computeChatCtxDiff(oldCtx: ChatContext, newCtx: ChatContext): Di
323
323
  };
324
324
  }
325
325
 
326
- export function toJsonSchema(schema: ToolInputSchema<any>, isOpenai: boolean = true): JSONSchema7 {
326
+ export function toJsonSchema(
327
+ schema: ToolInputSchema<any>,
328
+ isOpenai: boolean = true,
329
+ strict: boolean = false,
330
+ ): JSONSchema7 {
327
331
  if (isZodSchema(schema)) {
328
- return zodSchemaToJsonSchema(schema, isOpenai);
332
+ return zodSchemaToJsonSchema(schema, isOpenai, strict);
329
333
  }
334
+
330
335
  return schema as JSONSchema7;
331
336
  }
package/src/llm/zod-utils.test.ts CHANGED
@@ -260,6 +260,107 @@ describe('Zod Utils', () => {
260
260
  expect(jsonSchema7).toHaveProperty('properties');
261
261
  });
262
262
  });
263
+
264
+ describe('strict parameter', () => {
265
+ it('should produce strict JSON schema with strict: true', () => {
266
+ const schema = z4.object({
267
+ name: z4.string(),
268
+ age: z4.number(),
269
+ });
270
+
271
+ const strictSchema = zodSchemaToJsonSchema(schema, true, true);
272
+ expect(strictSchema).toMatchSnapshot();
273
+ });
274
+
275
+ it('should handle nullable fields in strict mode', () => {
276
+ const schema = z4.object({
277
+ required: z4.string(),
278
+ optional: z4.string().nullable(),
279
+ });
280
+
281
+ const strictSchema = zodSchemaToJsonSchema(schema, true, true);
282
+ expect(strictSchema).toMatchSnapshot();
283
+ });
284
+
285
+ it('should handle default values in strict mode', () => {
286
+ const schema = z4.object({
287
+ name: z4.string(),
288
+ role: z4.string().default('user'),
289
+ active: z4.boolean().default(true),
290
+ });
291
+
292
+ const strictSchema = zodSchemaToJsonSchema(schema, true, true);
293
+ expect(strictSchema).toMatchSnapshot();
294
+ });
295
+
296
+ it('should handle nested objects in strict mode', () => {
297
+ const schema = z4.object({
298
+ user: z4.object({
299
+ name: z4.string(),
300
+ email: z4.string().nullable(),
301
+ }),
302
+ metadata: z4.object({
303
+ created: z4.string(),
304
+ }),
305
+ });
306
+
307
+ const strictSchema = zodSchemaToJsonSchema(schema, true, true);
308
+ expect(strictSchema).toMatchSnapshot();
309
+ });
310
+
311
+ it('should handle arrays in strict mode', () => {
312
+ const schema = z4.object({
313
+ tags: z4.array(z4.string()),
314
+ numbers: z4.array(z4.number()),
315
+ });
316
+
317
+ const strictSchema = zodSchemaToJsonSchema(schema, true, true);
318
+ expect(strictSchema).toMatchSnapshot();
319
+ });
320
+
321
+ it('should handle v3 schemas in strict mode', () => {
322
+ const schema = z3.object({
323
+ name: z3.string(),
324
+ age: z3.number().optional(),
325
+ });
326
+
327
+ const strictSchema = zodSchemaToJsonSchema(schema, true, true);
328
+ expect(strictSchema).toMatchSnapshot();
329
+ });
330
+
331
+ it('should throw error when using .optional() without .nullable() in strict mode', () => {
332
+ const schema = z4.object({
333
+ required: z4.string(),
334
+ optional: z4.string().optional(),
335
+ });
336
+
337
+ expect(() => zodSchemaToJsonSchema(schema, true, true)).toThrow(
338
+ /uses `.optional\(\)` without `.nullable\(\)` which is not supported by the API/,
339
+ );
340
+ });
341
+
342
+ it('should throw error for nested .optional() fields in strict mode', () => {
343
+ const schema = z4.object({
344
+ user: z4.object({
345
+ name: z4.string(),
346
+ email: z4.string().optional(),
347
+ }),
348
+ });
349
+
350
+ expect(() => zodSchemaToJsonSchema(schema, true, true)).toThrow(
351
+ /uses `.optional\(\)` without `.nullable\(\)` which is not supported by the API/,
352
+ );
353
+ });
354
+
355
+ it('should NOT throw error when using .optional() in non-strict mode', () => {
356
+ const schema = z4.object({
357
+ required: z4.string(),
358
+ optional: z4.string().optional(),
359
+ });
360
+
361
+ expect(() => zodSchemaToJsonSchema(schema, true, false)).not.toThrow();
362
+ });
363
+ });
263
364
  });
264
365
 
265
366
  describe('parseZodSchema', () => {
package/src/llm/zod-utils.ts CHANGED
@@ -2,6 +2,7 @@
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import type { JSONSchema7 } from 'json-schema';
5
+ import { toStrictJsonSchema } from 'openai/lib/transform';
5
6
  import { zodToJsonSchema as zodToJsonSchemaV3 } from 'zod-to-json-schema';
6
7
  import type * as z3 from 'zod/v3';
7
8
  import * as z4 from 'zod/v4';
@@ -101,12 +102,18 @@ export function isZodObjectSchema(schema: ZodSchema): boolean {
101
102
  * @param isOpenai - Whether to use OpenAI-specific formatting (default: true)
102
103
  * @returns A JSON Schema representation of the Zod schema
103
104
  */
104
- export function zodSchemaToJsonSchema(schema: ZodSchema, isOpenai: boolean = true): JSONSchema7 {
105
+ export function zodSchemaToJsonSchema(
106
+ schema: ZodSchema,
107
+ isOpenai: boolean = true,
108
+ strict: boolean = false,
109
+ ): JSONSchema7 {
110
+ let result: JSONSchema7;
111
+
105
112
  if (isZod4Schema(schema)) {
106
113
  // Zod v4 has native toJSONSchema support
107
114
  // Configuration adapted from Vercel AI SDK to support OpenAPI conversion for Google
108
115
  // Source: https://github.com/vercel/ai/blob/main/packages/provider-utils/src/schema.ts#L255-L258
109
- return z4.toJSONSchema(schema, {
116
+ result = z4.toJSONSchema(schema, {
110
117
  target: 'draft-7',
111
118
  io: 'output',
112
119
  reused: 'inline', // Don't use references by default (to support openapi conversion for google)
@@ -115,11 +122,13 @@ export function zodSchemaToJsonSchema(schema: ZodSchema, isOpenai: boolean = tru
115
122
  // Zod v3 requires the zod-to-json-schema library
116
123
  // Configuration adapted from Vercel AI SDK
117
124
  // $refStrategy: 'none' is equivalent to v4's reused: 'inline'
118
- return zodToJsonSchemaV3(schema, {
125
+ result = zodToJsonSchemaV3(schema, {
119
126
  target: isOpenai ? 'openAi' : 'jsonSchema7',
120
127
  $refStrategy: 'none', // Don't use references by default (to support openapi conversion for google)
121
128
  }) as JSONSchema7;
122
129
  }
130
+
131
+ return strict ? (toStrictJsonSchema(result) as JSONSchema7) : result;
123
132
  }
124
133
 
125
134
  /**
package/src/utils.ts CHANGED
@@ -15,6 +15,23 @@ import { TransformStream, type TransformStreamDefaultController } from 'node:str
15
15
  import { v4 as uuidv4 } from 'uuid';
16
16
  import { log } from './log.js';
17
17
 
18
+ /**
19
+ * Recursively expands all nested properties of a type,
20
+ * resolving aliases so as to inspect the real shape in IDE.
21
+ */
22
+ // eslint-disable-next-line @typescript-eslint/ban-types
23
+ export type Expand<T> = T extends Function
24
+ ? T
25
+ : T extends object
26
+ ? T extends Array<infer U>
27
+ ? Array<Expand<U>>
28
+ : T extends Map<infer K, infer V>
29
+ ? Map<Expand<K>, Expand<V>>
30
+ : T extends Set<infer M>
31
+ ? Set<Expand<M>>
32
+ : { [K in keyof T]: Expand<T[K]> }
33
+ : T;
34
+
18
35
  /** Union of a single and a list of {@link AudioFrame}s */
19
36
  export type AudioBuffer = AudioFrame[] | AudioFrame;
20
37
 
package/src/voice/agent_activity.ts CHANGED
@@ -235,6 +235,14 @@ export class AgentActivity implements RecognitionHooks {
235
235
  } catch (error) {
236
236
  this.logger.error(error, 'failed to update the tools');
237
237
  }
238
+
239
+ if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
240
+ this.logger.error(
241
+ 'audio output is enabled but RealtimeModel has no audio modality ' +
242
+ 'and no TTS is set. Either enable audio modality in the RealtimeModel ' +
243
+ 'or set a TTS model.',
244
+ );
245
+ }
238
246
  } else if (this.llm instanceof LLM) {
239
247
  try {
240
248
  updateInstructions({
@@ -625,11 +633,21 @@ export class AgentActivity implements RecognitionHooks {
625
633
  return;
626
634
  }
627
635
 
636
+ // Refactored interruption word count check:
637
+ // - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0
638
+ // - Apply check to all STT results: empty string, undefined, or any length
639
+ // - This ensures consistent behavior across all interruption scenarios
628
640
  if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
629
641
  const text = this.audioRecognition.currentTranscript;
630
-
631
642
  // TODO(shubhra): better word splitting for multi-language
632
- if (text && splitWords(text, true).length < this.agentSession.options.minInterruptionWords) {
643
+
644
+ // Normalize text: convert undefined/null to empty string for consistent word counting
645
+ const normalizedText = text ?? '';
646
+ const wordCount = splitWords(normalizedText, true).length;
647
+
648
+ // Only allow interruption if word count meets or exceeds minInterruptionWords
649
+ // This applies to all cases: empty strings, partial speech, and full speech
650
+ if (wordCount < this.agentSession.options.minInterruptionWords) {
633
651
  return;
634
652
  }
635
653
  }
@@ -767,19 +785,30 @@ export class AgentActivity implements RecognitionHooks {
767
785
  return true;
768
786
  }
769
787
 
788
+ // Refactored interruption word count check for consistency with onVADInferenceDone:
789
+ // - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0
790
+ // - Use consistent word splitting logic with splitWords (matching onVADInferenceDone pattern)
770
791
  if (
771
792
  this.stt &&
772
793
  this.turnDetection !== 'manual' &&
773
794
  this._currentSpeech &&
774
795
  this._currentSpeech.allowInterruptions &&
775
796
  !this._currentSpeech.interrupted &&
776
- this.agentSession.options.minInterruptionWords > 0 &&
777
- info.newTranscript.split(' ').length < this.agentSession.options.minInterruptionWords
797
+ this.agentSession.options.minInterruptionWords > 0
778
798
  ) {
779
- // avoid interruption if the new_transcript is too short
780
- this.cancelPreemptiveGeneration();
781
- this.logger.info('skipping user input, new_transcript is too short');
782
- return false;
799
+ const wordCount = splitWords(info.newTranscript, true).length;
800
+ if (wordCount < this.agentSession.options.minInterruptionWords) {
801
+ // avoid interruption if the new_transcript contains fewer words than minInterruptionWords
802
+ this.cancelPreemptiveGeneration();
803
+ this.logger.info(
804
+ {
805
+ wordCount,
806
+ minInterruptionWords: this.agentSession.options.minInterruptionWords,
807
+ },
808
+ 'skipping user input, word count below minimum interruption threshold',
809
+ );
810
+ return false;
811
+ }
783
812
  }
784
813
 
785
814
  const oldTask = this._userTurnCompletedTask;
@@ -1612,7 +1641,7 @@ export class AgentActivity implements RecognitionHooks {
1612
1641
 
1613
1642
  const readMessages = async (
1614
1643
  abortController: AbortController,
1615
- outputs: Array<[string, _TextOut | null, _AudioOut | null]>,
1644
+ outputs: Array<[string, _TextOut | null, _AudioOut | null, ('text' | 'audio')[] | undefined]>,
1616
1645
  ) => {
1617
1646
  replyAbortController.signal.addEventListener('abort', () => abortController.abort(), {
1618
1647
  once: true,
@@ -1627,7 +1656,25 @@ export class AgentActivity implements RecognitionHooks {
1627
1656
  );
1628
1657
  break;
1629
1658
  }
1630
- const trNodeResult = await this.agent.transcriptionNode(msg.textStream, modelSettings);
1659
+
1660
+ const msgModalities = msg.modalities ? await msg.modalities : undefined;
1661
+ let ttsTextInput: ReadableStream<string> | null = null;
1662
+ let trTextInput: ReadableStream<string>;
1663
+
1664
+ if (msgModalities && !msgModalities.includes('audio') && this.tts) {
1665
+ if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
1666
+ this.logger.warn(
1667
+ 'text response received from realtime API, falling back to use a TTS model.',
1668
+ );
1669
+ }
1670
+ const [_ttsTextInput, _trTextInput] = msg.textStream.tee();
1671
+ ttsTextInput = _ttsTextInput;
1672
+ trTextInput = _trTextInput;
1673
+ } else {
1674
+ trTextInput = msg.textStream;
1675
+ }
1676
+
1677
+ const trNodeResult = await this.agent.transcriptionNode(trTextInput, modelSettings);
1631
1678
  let textOut: _TextOut | null = null;
1632
1679
  if (trNodeResult) {
1633
1680
  const [textForwardTask, _textOut] = performTextForwarding(
@@ -1638,30 +1685,51 @@ export class AgentActivity implements RecognitionHooks {
1638
1685
  forwardTasks.push(textForwardTask);
1639
1686
  textOut = _textOut;
1640
1687
  }
1688
+
1641
1689
  let audioOut: _AudioOut | null = null;
1642
1690
  if (audioOutput) {
1643
- const realtimeAudio = await this.agent.realtimeAudioOutputNode(
1644
- msg.audioStream,
1645
- modelSettings,
1646
- );
1647
- if (realtimeAudio) {
1691
+ let realtimeAudioResult: ReadableStream<AudioFrame> | null = null;
1692
+
1693
+ if (ttsTextInput) {
1694
+ const [ttsTask, ttsStream] = performTTSInference(
1695
+ (...args) => this.agent.ttsNode(...args),
1696
+ ttsTextInput,
1697
+ modelSettings,
1698
+ abortController,
1699
+ );
1700
+ tasks.push(ttsTask);
1701
+ realtimeAudioResult = ttsStream;
1702
+ } else if (msgModalities && msgModalities.includes('audio')) {
1703
+ realtimeAudioResult = await this.agent.realtimeAudioOutputNode(
1704
+ msg.audioStream,
1705
+ modelSettings,
1706
+ );
1707
+ } else if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
1708
+ this.logger.error(
1709
+ 'Text message received from Realtime API with audio modality. ' +
1710
+ 'This usually happens when text chat context is synced to the API. ' +
1711
+ 'Try to add a TTS model as fallback or use text modality with TTS instead.',
1712
+ );
1713
+ } else {
1714
+ this.logger.warn(
1715
+ 'audio output is enabled but neither tts nor realtime audio is available',
1716
+ );
1717
+ }
1718
+
1719
+ if (realtimeAudioResult) {
1648
1720
  const [forwardTask, _audioOut] = performAudioForwarding(
1649
- realtimeAudio,
1721
+ realtimeAudioResult,
1650
1722
  audioOutput,
1651
1723
  abortController,
1652
1724
  );
1653
1725
  forwardTasks.push(forwardTask);
1654
1726
  audioOut = _audioOut;
1655
1727
  audioOut.firstFrameFut.await.finally(onFirstFrame);
1656
- } else {
1657
- this.logger.warn(
1658
- 'audio output is enabled but neither tts nor realtime audio is available',
1659
- );
1660
1728
  }
1661
1729
  } else if (textOut) {
1662
1730
  textOut.firstTextFut.await.finally(onFirstFrame);
1663
1731
  }
1664
- outputs.push([msg.messageId, textOut, audioOut]);
1732
+ outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
1665
1733
  }
1666
1734
  await waitFor(forwardTasks);
1667
1735
  } catch (error) {
@@ -1671,7 +1739,9 @@ export class AgentActivity implements RecognitionHooks {
1671
1739
  }
1672
1740
  };
1673
1741
 
1674
- const messageOutputs: Array<[string, _TextOut | null, _AudioOut | null]> = [];
1742
+ const messageOutputs: Array<
1743
+ [string, _TextOut | null, _AudioOut | null, ('text' | 'audio')[] | undefined]
1744
+ > = [];
1675
1745
  const tasks = [
1676
1746
  Task.from(
1677
1747
  (controller) => readMessages(controller, messageOutputs),
@@ -1750,7 +1820,7 @@ export class AgentActivity implements RecognitionHooks {
1750
1820
 
1751
1821
  if (messageOutputs.length > 0) {
1752
1822
  // there should be only one message
1753
- const [msgId, textOut, audioOut] = messageOutputs[0]!;
1823
+ const [msgId, textOut, audioOut, msgModalities] = messageOutputs[0]!;
1754
1824
  let forwardedText = textOut?.text || '';
1755
1825
 
1756
1826
  if (audioOutput) {
@@ -1775,6 +1845,8 @@ export class AgentActivity implements RecognitionHooks {
1775
1845
  this.realtimeSession.truncate({
1776
1846
  messageId: msgId,
1777
1847
  audioEndMs: Math.floor(playbackPosition),
1848
+ modalities: msgModalities,
1849
+ audioTranscript: forwardedText,
1778
1850
  });
1779
1851
  }
1780
1852
 
@@ -1805,7 +1877,7 @@ export class AgentActivity implements RecognitionHooks {
1805
1877
 
1806
1878
  if (messageOutputs.length > 0) {
1807
1879
  // there should be only one message
1808
- const [msgId, textOut, _] = messageOutputs[0]!;
1880
+ const [msgId, textOut, _, __] = messageOutputs[0]!;
1809
1881
  const message = ChatMessage.create({
1810
1882
  role: 'assistant',
1811
1883
  content: textOut?.text || '',