npm - @livekit/agents - Versions diffs - 1.0.17 → 1.0.18 - Mend

@livekit/agents 1.0.17 → 1.0.18

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (87)

package/dist/inference/llm.cjs +35 -13
package/dist/inference/llm.cjs.map +1 -1
package/dist/inference/llm.d.cts +10 -5
package/dist/inference/llm.d.ts +10 -5
package/dist/inference/llm.d.ts.map +1 -1
package/dist/inference/llm.js +35 -13
package/dist/inference/llm.js.map +1 -1
package/dist/llm/chat_context.d.cts +1 -1
package/dist/llm/chat_context.d.ts +1 -1
package/dist/llm/llm.cjs.map +1 -1
package/dist/llm/llm.d.cts +1 -1
package/dist/llm/llm.d.ts +1 -1
package/dist/llm/llm.d.ts.map +1 -1
package/dist/llm/llm.js.map +1 -1
package/dist/llm/provider_format/google.cjs.map +1 -1
package/dist/llm/provider_format/google.d.cts +1 -1
package/dist/llm/provider_format/google.d.ts +1 -1
package/dist/llm/provider_format/google.d.ts.map +1 -1
package/dist/llm/provider_format/google.js.map +1 -1
package/dist/llm/provider_format/index.d.cts +1 -1
package/dist/llm/provider_format/index.d.ts +1 -1
package/dist/llm/provider_format/index.d.ts.map +1 -1
package/dist/llm/realtime.cjs.map +1 -1
package/dist/llm/realtime.d.cts +4 -0
package/dist/llm/realtime.d.ts +4 -0
package/dist/llm/realtime.d.ts.map +1 -1
package/dist/llm/realtime.js.map +1 -1
package/dist/llm/utils.cjs +2 -2
package/dist/llm/utils.cjs.map +1 -1
package/dist/llm/utils.d.cts +1 -1
package/dist/llm/utils.d.ts +1 -1
package/dist/llm/utils.d.ts.map +1 -1
package/dist/llm/utils.js +2 -2
package/dist/llm/utils.js.map +1 -1
package/dist/llm/zod-utils.cjs +6 -3
package/dist/llm/zod-utils.cjs.map +1 -1
package/dist/llm/zod-utils.d.cts +1 -1
package/dist/llm/zod-utils.d.ts +1 -1
package/dist/llm/zod-utils.d.ts.map +1 -1
package/dist/llm/zod-utils.js +6 -3
package/dist/llm/zod-utils.js.map +1 -1
package/dist/llm/zod-utils.test.cjs +83 -0
package/dist/llm/zod-utils.test.cjs.map +1 -1
package/dist/llm/zod-utils.test.js +83 -0
package/dist/llm/zod-utils.test.js.map +1 -1
package/dist/utils.cjs.map +1 -1
package/dist/utils.d.cts +7 -0
package/dist/utils.d.ts +7 -0
package/dist/utils.d.ts.map +1 -1
package/dist/utils.js.map +1 -1
package/dist/voice/agent_activity.cjs +69 -20
package/dist/voice/agent_activity.cjs.map +1 -1
package/dist/voice/agent_activity.d.ts.map +1 -1
package/dist/voice/agent_activity.js +69 -20
package/dist/voice/agent_activity.js.map +1 -1
package/dist/voice/agent_session.cjs +40 -1
package/dist/voice/agent_session.cjs.map +1 -1
package/dist/voice/agent_session.d.cts +5 -0
package/dist/voice/agent_session.d.ts +5 -0
package/dist/voice/agent_session.d.ts.map +1 -1
package/dist/voice/agent_session.js +40 -1
package/dist/voice/agent_session.js.map +1 -1
package/dist/voice/interruption_detection.test.cjs +114 -0
package/dist/voice/interruption_detection.test.cjs.map +1 -0
package/dist/voice/interruption_detection.test.js +113 -0
package/dist/voice/interruption_detection.test.js.map +1 -0
package/dist/voice/room_io/room_io.cjs +3 -0
package/dist/voice/room_io/room_io.cjs.map +1 -1
package/dist/voice/room_io/room_io.d.cts +1 -0
package/dist/voice/room_io/room_io.d.ts +1 -0
package/dist/voice/room_io/room_io.d.ts.map +1 -1
package/dist/voice/room_io/room_io.js +3 -0
package/dist/voice/room_io/room_io.js.map +1 -1
package/package.json +3 -3
package/src/inference/llm.ts +53 -21
package/src/llm/__snapshots__/zod-utils.test.ts.snap +218 -0
package/src/llm/llm.ts +1 -1
package/src/llm/provider_format/google.ts +4 -4
package/src/llm/realtime.ts +8 -1
package/src/llm/utils.ts +7 -2
package/src/llm/zod-utils.test.ts +101 -0
package/src/llm/zod-utils.ts +12 -3
package/src/utils.ts +17 -0
package/src/voice/agent_activity.ts +96 -24
package/src/voice/agent_session.ts +54 -0
package/src/voice/interruption_detection.test.ts +151 -0
package/src/voice/room_io/room_io.ts +4 -0

package/src/llm/__snapshots__/zod-utils.test.ts.snap CHANGED Viewed

@@ -339,3 +339,221 @@ exports[`Zod Utils > zodSchemaToJsonSchema > Zod v4 schemas > should handle v4 s
   "type": "object",
 }
 `;
+exports[`Zod Utils > zodSchemaToJsonSchema > strict parameter > should handle arrays in strict mode 1`] = `
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "additionalProperties": false,
+  "properties": {
+    "numbers": {
+      "items": {
+        "type": "number",
+      },
+      "type": "array",
+    },
+    "tags": {
+      "items": {
+        "type": "string",
+      },
+      "type": "array",
+    },
+  },
+  "required": [
+    "tags",
+    "numbers",
+  ],
+  "type": "object",
+}
+`;
+exports[`Zod Utils > zodSchemaToJsonSchema > strict parameter > should handle default values in strict mode 1`] = `
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "additionalProperties": false,
+  "properties": {
+    "active": {
+      "default": true,
+      "type": "boolean",
+    },
+    "name": {
+      "type": "string",
+    },
+    "role": {
+      "default": "user",
+      "type": "string",
+    },
+  },
+  "required": [
+    "name",
+    "role",
+    "active",
+  ],
+  "type": "object",
+}
+`;
+exports[`Zod Utils > zodSchemaToJsonSchema > strict parameter > should handle nested objects in strict mode 1`] = `
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "additionalProperties": false,
+  "properties": {
+    "metadata": {
+      "additionalProperties": false,
+      "properties": {
+        "created": {
+          "type": "string",
+        },
+      },
+      "required": [
+        "created",
+      ],
+      "type": "object",
+    },
+    "user": {
+      "additionalProperties": false,
+      "properties": {
+        "email": {
+          "anyOf": [
+            {
+              "type": "string",
+            },
+            {
+              "type": "null",
+            },
+          ],
+        },
+        "name": {
+          "type": "string",
+        },
+      },
+      "required": [
+        "name",
+        "email",
+      ],
+      "type": "object",
+    },
+  },
+  "required": [
+    "user",
+    "metadata",
+  ],
+  "type": "object",
+}
+`;
+exports[`Zod Utils > zodSchemaToJsonSchema > strict parameter > should handle nullable fields in strict mode 1`] = `
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "additionalProperties": false,
+  "properties": {
+    "optional": {
+      "anyOf": [
+        {
+          "type": "string",
+        },
+        {
+          "type": "null",
+        },
+      ],
+    },
+    "required": {
+      "type": "string",
+    },
+  },
+  "required": [
+    "required",
+    "optional",
+  ],
+  "type": "object",
+}
+`;
+exports[`Zod Utils > zodSchemaToJsonSchema > strict parameter > should handle optional fields in strict mode 1`] = `
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "additionalProperties": false,
+  "properties": {
+    "optional": {
+      "anyOf": [
+        {
+          "type": "string",
+        },
+        {
+          "type": "null",
+        },
+      ],
+    },
+    "required": {
+      "type": "string",
+    },
+  },
+  "required": [
+    "required",
+    "optional",
+  ],
+  "type": "object",
+}
+`;
+exports[`Zod Utils > zodSchemaToJsonSchema > strict parameter > should handle v3 schemas in strict mode 1`] = `
+{
+  "$schema": "https://json-schema.org/draft/2019-09/schema#",
+  "additionalProperties": false,
+  "properties": {
+    "age": {
+      "type": [
+        "number",
+        "null",
+      ],
+    },
+    "name": {
+      "type": "string",
+    },
+  },
+  "required": [
+    "name",
+    "age",
+  ],
+  "type": "object",
+}
+`;
+exports[`Zod Utils > zodSchemaToJsonSchema > strict parameter > should produce standard JSON schema with strict: false 1`] = `
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "additionalProperties": false,
+  "properties": {
+    "age": {
+      "type": "number",
+    },
+    "name": {
+      "type": "string",
+    },
+  },
+  "required": [
+    "name",
+    "age",
+  ],
+  "type": "object",
+}
+`;
+exports[`Zod Utils > zodSchemaToJsonSchema > strict parameter > should produce strict JSON schema with strict: true 1`] = `
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "additionalProperties": false,
+  "properties": {
+    "age": {
+      "type": "number",
+    },
+    "name": {
+      "type": "string",
+    },
+  },
+  "required": [
+    "name",
+    "age",
+  ],
+  "type": "object",
+}
+`;

package/src/llm/llm.ts CHANGED Viewed

@@ -78,7 +78,7 @@ export abstract class LLM extends (EventEmitter as new () => TypedEmitter<LLMCal
     connOptions?: APIConnectOptions;
     parallelToolCalls?: boolean;
     toolChoice?: ToolChoice;
-    extraKwargs?: Record<string, any>;
+    extraKwargs?: Record<string, unknown>;
   }): LLMStream;
   /**

package/src/llm/provider_format/google.ts CHANGED Viewed

@@ -12,11 +12,11 @@ export interface GoogleFormatData {
 export async function toChatCtx(
   chatCtx: ChatContext,
   injectDummyUserMessage: boolean = true,
-): Promise<[Record<string, any>[], GoogleFormatData]> {
-  const turns: Record<string, any>[] = [];
+): Promise<[Record<string, unknown>[], GoogleFormatData]> {
+  const turns: Record<string, unknown>[] = [];
   const systemMessages: string[] = [];
   let currentRole: string | null = null;
-  let parts: Record<string, any>[] = [];
+  let parts: Record<string, unknown>[] = [];
   // Flatten all grouped tool calls to get individual messages
   const itemGroups = groupToolCalls(chatCtx);
@@ -104,7 +104,7 @@ export async function toChatCtx(
   ];
 }
-async function toImagePart(image: ImageContent): Promise<Record<string, any>> {
+async function toImagePart(image: ImageContent): Promise<Record<string, unknown>> {
   const cacheKey = 'serialized_image';
   if (!image._cache[cacheKey]) {
     image._cache[cacheKey] = await serializeImage(image);

package/src/llm/realtime.ts CHANGED Viewed

@@ -19,6 +19,7 @@ export interface MessageGeneration {
   messageId: string;
   textStream: ReadableStream<string>;
   audioStream: ReadableStream<AudioFrame>;
+  modalities?: Promise<('text' | 'audio')[]>;
 }
 export interface GenerationCreatedEvent {
@@ -40,6 +41,7 @@ export interface RealtimeCapabilities {
   turnDetection: boolean;
   userTranscription: boolean;
   autoToolReplyGeneration: boolean;
+  audioOutput: boolean;
 }
 export interface InputTranscriptionCompleted {
@@ -121,7 +123,12 @@ export abstract class RealtimeSession extends EventEmitter {
   /**
    * Truncate the message at the given audio end time
    */
-  abstract truncate(options: { messageId: string; audioEndMs: number }): Promise<void>;
+  abstract truncate(options: {
+    messageId: string;
+    audioEndMs: number;
+    modalities?: ('text' | 'audio')[];
+    audioTranscript?: string;
+  }): Promise<void>;
   async close(): Promise<void> {
     this._mainTask.cancel();

package/src/llm/utils.ts CHANGED Viewed

@@ -323,9 +323,14 @@ export function computeChatCtxDiff(oldCtx: ChatContext, newCtx: ChatContext): Di
   };
 }
-export function toJsonSchema(schema: ToolInputSchema<any>, isOpenai: boolean = true): JSONSchema7 {
+export function toJsonSchema(
+  schema: ToolInputSchema<any>,
+  isOpenai: boolean = true,
+  strict: boolean = false,
+): JSONSchema7 {
   if (isZodSchema(schema)) {
-    return zodSchemaToJsonSchema(schema, isOpenai);
+    return zodSchemaToJsonSchema(schema, isOpenai, strict);
   }
   return schema as JSONSchema7;
 }

package/src/llm/zod-utils.test.ts CHANGED Viewed

@@ -260,6 +260,107 @@ describe('Zod Utils', () => {
         expect(jsonSchema7).toHaveProperty('properties');
       });
     });
+    describe('strict parameter', () => {
+      it('should produce strict JSON schema with strict: true', () => {
+        const schema = z4.object({
+          name: z4.string(),
+          age: z4.number(),
+        });
+        const strictSchema = zodSchemaToJsonSchema(schema, true, true);
+        expect(strictSchema).toMatchSnapshot();
+      });
+      it('should handle nullable fields in strict mode', () => {
+        const schema = z4.object({
+          required: z4.string(),
+          optional: z4.string().nullable(),
+        });
+        const strictSchema = zodSchemaToJsonSchema(schema, true, true);
+        expect(strictSchema).toMatchSnapshot();
+      });
+      it('should handle default values in strict mode', () => {
+        const schema = z4.object({
+          name: z4.string(),
+          role: z4.string().default('user'),
+          active: z4.boolean().default(true),
+        });
+        const strictSchema = zodSchemaToJsonSchema(schema, true, true);
+        expect(strictSchema).toMatchSnapshot();
+      });
+      it('should handle nested objects in strict mode', () => {
+        const schema = z4.object({
+          user: z4.object({
+            name: z4.string(),
+            email: z4.string().nullable(),
+          }),
+          metadata: z4.object({
+            created: z4.string(),
+          }),
+        });
+        const strictSchema = zodSchemaToJsonSchema(schema, true, true);
+        expect(strictSchema).toMatchSnapshot();
+      });
+      it('should handle arrays in strict mode', () => {
+        const schema = z4.object({
+          tags: z4.array(z4.string()),
+          numbers: z4.array(z4.number()),
+        });
+        const strictSchema = zodSchemaToJsonSchema(schema, true, true);
+        expect(strictSchema).toMatchSnapshot();
+      });
+      it('should handle v3 schemas in strict mode', () => {
+        const schema = z3.object({
+          name: z3.string(),
+          age: z3.number().optional(),
+        });
+        const strictSchema = zodSchemaToJsonSchema(schema, true, true);
+        expect(strictSchema).toMatchSnapshot();
+      });
+      it('should throw error when using .optional() without .nullable() in strict mode', () => {
+        const schema = z4.object({
+          required: z4.string(),
+          optional: z4.string().optional(),
+        });
+        expect(() => zodSchemaToJsonSchema(schema, true, true)).toThrow(
+          /uses `.optional\(\)` without `.nullable\(\)` which is not supported by the API/,
+        );
+      });
+      it('should throw error for nested .optional() fields in strict mode', () => {
+        const schema = z4.object({
+          user: z4.object({
+            name: z4.string(),
+            email: z4.string().optional(),
+          }),
+        });
+        expect(() => zodSchemaToJsonSchema(schema, true, true)).toThrow(
+          /uses `.optional\(\)` without `.nullable\(\)` which is not supported by the API/,
+        );
+      });
+      it('should NOT throw error when using .optional() in non-strict mode', () => {
+        const schema = z4.object({
+          required: z4.string(),
+          optional: z4.string().optional(),
+        });
+        expect(() => zodSchemaToJsonSchema(schema, true, false)).not.toThrow();
+      });
+    });
   });
   describe('parseZodSchema', () => {

package/src/llm/zod-utils.ts CHANGED Viewed

@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 import type { JSONSchema7 } from 'json-schema';
+import { toStrictJsonSchema } from 'openai/lib/transform';
 import { zodToJsonSchema as zodToJsonSchemaV3 } from 'zod-to-json-schema';
 import type * as z3 from 'zod/v3';
 import * as z4 from 'zod/v4';
@@ -101,12 +102,18 @@ export function isZodObjectSchema(schema: ZodSchema): boolean {
  * @param isOpenai - Whether to use OpenAI-specific formatting (default: true)
  * @returns A JSON Schema representation of the Zod schema
  */
-export function zodSchemaToJsonSchema(schema: ZodSchema, isOpenai: boolean = true): JSONSchema7 {
+export function zodSchemaToJsonSchema(
+  schema: ZodSchema,
+  isOpenai: boolean = true,
+  strict: boolean = false,
+): JSONSchema7 {
+  let result: JSONSchema7;
   if (isZod4Schema(schema)) {
     // Zod v4 has native toJSONSchema support
     // Configuration adapted from Vercel AI SDK to support OpenAPI conversion for Google
     // Source: https://github.com/vercel/ai/blob/main/packages/provider-utils/src/schema.ts#L255-L258
-    return z4.toJSONSchema(schema, {
+    result = z4.toJSONSchema(schema, {
       target: 'draft-7',
       io: 'output',
       reused: 'inline', // Don't use references by default (to support openapi conversion for google)
@@ -115,11 +122,13 @@ export function zodSchemaToJsonSchema(schema: ZodSchema, isOpenai: boolean = tru
     // Zod v3 requires the zod-to-json-schema library
     // Configuration adapted from Vercel AI SDK
     // $refStrategy: 'none' is equivalent to v4's reused: 'inline'
-    return zodToJsonSchemaV3(schema, {
+    result = zodToJsonSchemaV3(schema, {
       target: isOpenai ? 'openAi' : 'jsonSchema7',
       $refStrategy: 'none', // Don't use references by default (to support openapi conversion for google)
     }) as JSONSchema7;
   }
+  return strict ? (toStrictJsonSchema(result) as JSONSchema7) : result;
 }
 /**

package/src/utils.ts CHANGED Viewed

@@ -15,6 +15,23 @@ import { TransformStream, type TransformStreamDefaultController } from 'node:str
 import { v4 as uuidv4 } from 'uuid';
 import { log } from './log.js';
+/**
+ * Recursively expands all nested properties of a type,
+ * resolving aliases so as to inspect the real shape in IDE.
+ */
+// eslint-disable-next-line @typescript-eslint/ban-types
+export type Expand<T> = T extends Function
+  ? T
+  : T extends object
+    ? T extends Array<infer U>
+      ? Array<Expand<U>>
+      : T extends Map<infer K, infer V>
+        ? Map<Expand<K>, Expand<V>>
+        : T extends Set<infer M>
+          ? Set<Expand<M>>
+          : { [K in keyof T]: Expand<T[K]> }
+    : T;
 /** Union of a single and a list of {@link AudioFrame}s */
 export type AudioBuffer = AudioFrame[] | AudioFrame;

package/src/voice/agent_activity.ts CHANGED Viewed

@@ -235,6 +235,14 @@ export class AgentActivity implements RecognitionHooks {
         } catch (error) {
           this.logger.error(error, 'failed to update the tools');
         }
+        if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
+          this.logger.error(
+            'audio output is enabled but RealtimeModel has no audio modality ' +
+              'and no TTS is set. Either enable audio modality in the RealtimeModel ' +
+              'or set a TTS model.',
+          );
+        }
       } else if (this.llm instanceof LLM) {
         try {
           updateInstructions({
@@ -625,11 +633,21 @@ export class AgentActivity implements RecognitionHooks {
       return;
     }
+    // Refactored interruption word count check:
+    // - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0
+    // - Apply check to all STT results: empty string, undefined, or any length
+    // - This ensures consistent behavior across all interruption scenarios
     if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
       const text = this.audioRecognition.currentTranscript;
       // TODO(shubhra): better word splitting for multi-language
-      if (text && splitWords(text, true).length < this.agentSession.options.minInterruptionWords) {
+      // Normalize text: convert undefined/null to empty string for consistent word counting
+      const normalizedText = text ?? '';
+      const wordCount = splitWords(normalizedText, true).length;
+      // Only allow interruption if word count meets or exceeds minInterruptionWords
+      // This applies to all cases: empty strings, partial speech, and full speech
+      if (wordCount < this.agentSession.options.minInterruptionWords) {
         return;
       }
     }
@@ -767,19 +785,30 @@ export class AgentActivity implements RecognitionHooks {
       return true;
     }
+    // Refactored interruption word count check for consistency with onVADInferenceDone:
+    // - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0
+    // - Use consistent word splitting logic with splitWords (matching onVADInferenceDone pattern)
     if (
       this.stt &&
       this.turnDetection !== 'manual' &&
       this._currentSpeech &&
       this._currentSpeech.allowInterruptions &&
       !this._currentSpeech.interrupted &&
-      this.agentSession.options.minInterruptionWords > 0 &&
-      info.newTranscript.split(' ').length < this.agentSession.options.minInterruptionWords
+      this.agentSession.options.minInterruptionWords > 0
     ) {
-      // avoid interruption if the new_transcript is too short
-      this.cancelPreemptiveGeneration();
-      this.logger.info('skipping user input, new_transcript is too short');
-      return false;
+      const wordCount = splitWords(info.newTranscript, true).length;
+      if (wordCount < this.agentSession.options.minInterruptionWords) {
+        // avoid interruption if the new_transcript contains fewer words than minInterruptionWords
+        this.cancelPreemptiveGeneration();
+        this.logger.info(
+          {
+            wordCount,
+            minInterruptionWords: this.agentSession.options.minInterruptionWords,
+          },
+          'skipping user input, word count below minimum interruption threshold',
+        );
+        return false;
+      }
     }
     const oldTask = this._userTurnCompletedTask;
@@ -1612,7 +1641,7 @@ export class AgentActivity implements RecognitionHooks {
     const readMessages = async (
       abortController: AbortController,
-      outputs: Array<[string, _TextOut | null, _AudioOut | null]>,
+      outputs: Array<[string, _TextOut | null, _AudioOut | null, ('text' | 'audio')[] | undefined]>,
     ) => {
       replyAbortController.signal.addEventListener('abort', () => abortController.abort(), {
         once: true,
@@ -1627,7 +1656,25 @@ export class AgentActivity implements RecognitionHooks {
             );
             break;
           }
-          const trNodeResult = await this.agent.transcriptionNode(msg.textStream, modelSettings);
+          const msgModalities = msg.modalities ? await msg.modalities : undefined;
+          let ttsTextInput: ReadableStream<string> | null = null;
+          let trTextInput: ReadableStream<string>;
+          if (msgModalities && !msgModalities.includes('audio') && this.tts) {
+            if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
+              this.logger.warn(
+                'text response received from realtime API, falling back to use a TTS model.',
+              );
+            }
+            const [_ttsTextInput, _trTextInput] = msg.textStream.tee();
+            ttsTextInput = _ttsTextInput;
+            trTextInput = _trTextInput;
+          } else {
+            trTextInput = msg.textStream;
+          }
+          const trNodeResult = await this.agent.transcriptionNode(trTextInput, modelSettings);
           let textOut: _TextOut | null = null;
           if (trNodeResult) {
             const [textForwardTask, _textOut] = performTextForwarding(
@@ -1638,30 +1685,51 @@ export class AgentActivity implements RecognitionHooks {
             forwardTasks.push(textForwardTask);
             textOut = _textOut;
           }
           let audioOut: _AudioOut | null = null;
           if (audioOutput) {
-            const realtimeAudio = await this.agent.realtimeAudioOutputNode(
-              msg.audioStream,
-              modelSettings,
-            );
-            if (realtimeAudio) {
+            let realtimeAudioResult: ReadableStream<AudioFrame> | null = null;
+            if (ttsTextInput) {
+              const [ttsTask, ttsStream] = performTTSInference(
+                (...args) => this.agent.ttsNode(...args),
+                ttsTextInput,
+                modelSettings,
+                abortController,
+              );
+              tasks.push(ttsTask);
+              realtimeAudioResult = ttsStream;
+            } else if (msgModalities && msgModalities.includes('audio')) {
+              realtimeAudioResult = await this.agent.realtimeAudioOutputNode(
+                msg.audioStream,
+                modelSettings,
+              );
+            } else if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
+              this.logger.error(
+                'Text message received from Realtime API with audio modality. ' +
+                  'This usually happens when text chat context is synced to the API. ' +
+                  'Try to add a TTS model as fallback or use text modality with TTS instead.',
+              );
+            } else {
+              this.logger.warn(
+                'audio output is enabled but neither tts nor realtime audio is available',
+              );
+            }
+            if (realtimeAudioResult) {
               const [forwardTask, _audioOut] = performAudioForwarding(
-                realtimeAudio,
+                realtimeAudioResult,
                 audioOutput,
                 abortController,
               );
               forwardTasks.push(forwardTask);
               audioOut = _audioOut;
               audioOut.firstFrameFut.await.finally(onFirstFrame);
-            } else {
-              this.logger.warn(
-                'audio output is enabled but neither tts nor realtime audio is available',
-              );
             }
           } else if (textOut) {
             textOut.firstTextFut.await.finally(onFirstFrame);
           }
-          outputs.push([msg.messageId, textOut, audioOut]);
+          outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
         }
         await waitFor(forwardTasks);
       } catch (error) {
@@ -1671,7 +1739,9 @@ export class AgentActivity implements RecognitionHooks {
       }
     };
-    const messageOutputs: Array<[string, _TextOut | null, _AudioOut | null]> = [];
+    const messageOutputs: Array<
+      [string, _TextOut | null, _AudioOut | null, ('text' | 'audio')[] | undefined]
+    > = [];
     const tasks = [
       Task.from(
         (controller) => readMessages(controller, messageOutputs),
@@ -1750,7 +1820,7 @@ export class AgentActivity implements RecognitionHooks {
       if (messageOutputs.length > 0) {
         // there should be only one message
-        const [msgId, textOut, audioOut] = messageOutputs[0]!;
+        const [msgId, textOut, audioOut, msgModalities] = messageOutputs[0]!;
         let forwardedText = textOut?.text || '';
         if (audioOutput) {
@@ -1775,6 +1845,8 @@ export class AgentActivity implements RecognitionHooks {
           this.realtimeSession.truncate({
             messageId: msgId,
             audioEndMs: Math.floor(playbackPosition),
+            modalities: msgModalities,
+            audioTranscript: forwardedText,
           });
         }
@@ -1805,7 +1877,7 @@ export class AgentActivity implements RecognitionHooks {
     if (messageOutputs.length > 0) {
       // there should be only one message
-      const [msgId, textOut, _] = messageOutputs[0]!;
+      const [msgId, textOut, _, __] = messageOutputs[0]!;
       const message = ChatMessage.create({
         role: 'assistant',
         content: textOut?.text || '',