npm - @livekit/agents-plugin-openai - Versions diffs - 0.9.3 → 1.0.0-next.0 - Mend

@livekit/agents-plugin-openai 0.9.3 → 1.0.0-next.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47)

package/dist/index.cjs +16 -5
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +4 -4
package/dist/index.d.ts +4 -4
package/dist/index.d.ts.map +1 -1
package/dist/index.js +14 -3
package/dist/index.js.map +1 -1
package/dist/llm.cjs +156 -197
package/dist/llm.cjs.map +1 -1
package/dist/llm.d.cts +27 -8
package/dist/llm.d.ts +27 -8
package/dist/llm.d.ts.map +1 -1
package/dist/llm.js +164 -188
package/dist/llm.js.map +1 -1
package/dist/models.cjs +14 -0
package/dist/models.cjs.map +1 -1
package/dist/models.d.cts +11 -6
package/dist/models.d.ts +11 -6
package/dist/models.d.ts.map +1 -1
package/dist/models.js +6 -0
package/dist/models.js.map +1 -1
package/dist/realtime/api_proto.cjs.map +1 -1
package/dist/realtime/api_proto.d.cts +15 -0
package/dist/realtime/api_proto.d.ts +15 -0
package/dist/realtime/api_proto.d.ts.map +1 -1
package/dist/realtime/api_proto.js.map +1 -1
package/dist/realtime/realtime_model.cjs +1057 -820
package/dist/realtime/realtime_model.cjs.map +1 -1
package/dist/realtime/realtime_model.d.cts +126 -160
package/dist/realtime/realtime_model.d.ts +126 -160
package/dist/realtime/realtime_model.d.ts.map +1 -1
package/dist/realtime/realtime_model.js +1067 -825
package/dist/realtime/realtime_model.js.map +1 -1
package/dist/tts.cjs +5 -5
package/dist/tts.cjs.map +1 -1
package/dist/tts.d.cts +2 -1
package/dist/tts.d.ts +2 -1
package/dist/tts.d.ts.map +1 -1
package/dist/tts.js +6 -6
package/dist/tts.js.map +1 -1
package/package.json +9 -7
package/src/index.ts +19 -5
package/src/llm.ts +227 -228
package/src/models.ts +83 -5
package/src/realtime/api_proto.ts +15 -1
package/src/realtime/realtime_model.ts +1305 -996
package/src/tts.ts +6 -6

package/src/realtime/realtime_model.ts CHANGED

@@ -1,561 +1,644 @@
 // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
+import type { metrics } from '@livekit/agents';
 import {
-  AsyncIterableQueue,
+  type APIConnectOptions,
+  APIConnectionError,
+  APIError,
+  AudioByteStream,
+  DEFAULT_API_CONNECT_OPTIONS,
   Future,
   Queue,
+  Task,
+  cancelAndWait,
+  isAPIError,
   llm,
   log,
-  mergeFrames,
-  metrics,
-  multimodal,
+  shortuuid,
+  stream,
 } from '@livekit/agents';
-import { AudioFrame } from '@livekit/rtc-node';
-import { once } from 'node:events';
-import { WebSocket } from 'ws';
+import { Mutex } from '@livekit/mutex';
+import type { AudioResampler } from '@livekit/rtc-node';
+import { AudioFrame, combineAudioFrames } from '@livekit/rtc-node';
+import { delay } from '@std/async';
+import type { GenerationCreatedEvent } from 'agents/dist/llm/realtime.js';
+import { type MessageEvent, WebSocket } from 'ws';
 import * as api_proto from './api_proto.js';
-interface ModelOptions {
-  modalities: ['text', 'audio'] | ['text'];
-  instructions: string;
+const SAMPLE_RATE = 24000;
+const NUM_CHANNELS = 1;
+const BASE_URL = 'https://api.openai.com/v1';
+const MOCK_AUDIO_ID_PREFIX = 'lk_mock_audio_item_';
+interface RealtimeOptions {
+  model: api_proto.Model;
   voice: api_proto.Voice;
-  inputAudioFormat: api_proto.AudioFormat;
-  outputAudioFormat: api_proto.AudioFormat;
-  inputAudioTranscription: api_proto.InputAudioTranscription | null;
-  turnDetection: api_proto.TurnDetectionType | null;
   temperature: number;
-  maxResponseOutputTokens: number;
-  model: api_proto.Model;
+  toolChoice?: llm.ToolChoice;
+  inputAudioTranscription?: api_proto.InputAudioTranscription | null;
+  // TODO(shubhra): add inputAudioNoiseReduction
+  turnDetection?: api_proto.TurnDetectionType | null;
+  maxResponseOutputTokens?: number | 'inf';
+  speed?: number;
+  // TODO(shubhra): add openai tracing options
   apiKey?: string;
   baseURL: string;
   isAzure: boolean;
+  azureDeployment?: string;
   entraToken?: string;
   apiVersion?: string;
+  maxSessionDuration: number;
+  // reset the connection after this many seconds if provided
+  connOptions: APIConnectOptions;
 }
-export interface RealtimeResponse {
-  id: string;
-  status: api_proto.ResponseStatus;
-  statusDetails: api_proto.ResponseStatusDetails | null;
-  usage: api_proto.ModelUsage | null;
-  output: RealtimeOutput[];
-  doneFut: Future;
-  createdTimestamp: number;
-  firstTokenTimestamp?: number;
+interface MessageGeneration {
+  messageId: string;
+  textChannel: stream.StreamChannel<string>;
+  audioChannel: stream.StreamChannel<AudioFrame>;
+  audioTranscript: string;
 }
-export interface RealtimeOutput {
-  responseId: string;
-  itemId: string;
-  outputIndex: number;
-  role: api_proto.Role;
-  type: 'message' | 'function_call';
-  content: RealtimeContent[];
-  doneFut: Future;
+interface ResponseGeneration {
+  messageChannel: stream.StreamChannel<llm.MessageGeneration>;
+  functionChannel: stream.StreamChannel<llm.FunctionCall>;
+  messages: Map<string, MessageGeneration>;
+  /** @internal */
+  _doneFut: Future;
+  /** @internal */
+  _createdTimestamp: number;
+  /** @internal */
+  _firstTokenTimestamp?: number;
 }
-export interface RealtimeContent {
-  responseId: string;
-  itemId: string;
-  outputIndex: number;
-  contentIndex: number;
-  text: string;
-  audio: AudioFrame[];
-  textStream: AsyncIterableQueue<string>;
-  audioStream: AsyncIterableQueue<AudioFrame>;
-  toolCalls: RealtimeToolCall[];
-  contentType: api_proto.Modality;
+class CreateResponseHandle {
+  instructions?: string;
+  doneFut: Future<llm.GenerationCreatedEvent>;
+  // TODO(shubhra): add timeout
+  constructor({ instructions }: { instructions?: string }) {
+    this.instructions = instructions;
+    this.doneFut = new Future();
+  }
 }
-export interface RealtimeToolCall {
-  name: string;
-  arguments: string;
-  toolCallID: string;
-}
+// default values got from a "default" session from their API
+const DEFAULT_FIRST_RETRY_INTERVAL_MS = 100;
+const DEFAULT_TEMPERATURE = 0.8;
+const DEFAULT_TURN_DETECTION: api_proto.TurnDetectionType = {
+  type: 'server_vad',
+  threshold: 0.5,
+  prefix_padding_ms: 300,
+  silence_duration_ms: 200,
+  create_response: true,
+  interrupt_response: true,
+};
+const DEFAULT_INPUT_AUDIO_TRANSCRIPTION: api_proto.InputAudioTranscription = {
+  model: 'gpt-4o-mini-transcribe',
+};
+const DEFAULT_TOOL_CHOICE: llm.ToolChoice = 'auto';
+const DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS: number | 'inf' = 'inf';
+const AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION: api_proto.InputAudioTranscription = {
+  model: 'whisper-1',
+};
+const AZURE_DEFAULT_TURN_DETECTION: api_proto.TurnDetectionType = {
+  type: 'server_vad',
+  threshold: 0.5,
+  prefix_padding_ms: 300,
+  silence_duration_ms: 200,
+  create_response: true,
+};
+const DEFAULT_MAX_SESSION_DURATION = 20 * 60 * 1000; // 20 minutes
+const DEFAULT_REALTIME_MODEL_OPTIONS = {
+  model: 'gpt-4o-realtime-preview',
+  voice: 'alloy',
+  temperature: DEFAULT_TEMPERATURE,
+  inputAudioTranscription: DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
+  turnDetection: DEFAULT_TURN_DETECTION,
+  toolChoice: DEFAULT_TOOL_CHOICE,
+  maxResponseOutputTokens: DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS,
+  maxSessionDuration: DEFAULT_MAX_SESSION_DURATION,
+  connOptions: DEFAULT_API_CONNECT_OPTIONS,
+};
+export class RealtimeModel extends llm.RealtimeModel {
+  sampleRate = api_proto.SAMPLE_RATE;
+  numChannels = api_proto.NUM_CHANNELS;
+  inFrameSize = api_proto.IN_FRAME_SIZE;
+  outFrameSize = api_proto.OUT_FRAME_SIZE;
-export interface InputSpeechTranscriptionCompleted {
-  itemId: string;
-  transcript: string;
-}
+  /* @internal */
+  _options: RealtimeOptions;
-export interface InputSpeechTranscriptionFailed {
-  itemId: string;
-  message: string;
-}
+  constructor(
+    options: {
+      model?: string;
+      voice?: string;
+      temperature?: number;
+      toolChoice?: llm.ToolChoice;
+      baseURL?: string;
+      inputAudioTranscription?: api_proto.InputAudioTranscription | null;
+      // TODO(shubhra): add inputAudioNoiseReduction
+      turnDetection?: api_proto.TurnDetectionType | null;
+      speed?: number;
+      // TODO(shubhra): add openai tracing options
+      azureDeployment?: string;
+      apiKey?: string;
+      entraToken?: string;
+      apiVersion?: string;
+      maxSessionDuration?: number;
+      connOptions?: APIConnectOptions;
+    } = {},
+  ) {
+    super({
+      messageTruncation: true,
+      turnDetection: options.turnDetection !== null,
+      userTranscription: options.inputAudioTranscription !== null,
+      autoToolReplyGeneration: false,
+    });
-export interface InputSpeechStarted {
-  itemId: string;
-}
+    const isAzure = !!(options.apiVersion || options.entraToken || options.azureDeployment);
-export interface InputSpeechCommitted {
-  itemId: string;
-}
+    if (options.apiKey === '' && !isAzure) {
+      throw new Error(
+        'OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environmental variable',
+      );
+    }
+    const apiKey = options.apiKey || process.env.OPENAI_API_KEY;
+    if (!apiKey && !isAzure) {
+      throw new Error(
+        'OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environmental variable',
+      );
+    }
-class InputAudioBuffer {
-  #session: RealtimeSession;
+    if (!options.baseURL && isAzure) {
+      const azureEndpoint = process.env.AZURE_OPENAI_ENDPOINT;
+      if (!azureEndpoint) {
+        throw new Error(
+          'Missing Azure endpoint. Please pass base_url or set AZURE_OPENAI_ENDPOINT environment variable.',
+        );
+      }
+      options.baseURL = `${azureEndpoint.replace(/\/$/, '')}/openai`;
+    }
-  constructor(session: RealtimeSession) {
-    this.#session = session;
+    this._options = {
+      ...DEFAULT_REALTIME_MODEL_OPTIONS,
+      ...options,
+      baseURL: options.baseURL || BASE_URL,
+      apiKey,
+      isAzure,
+      model: options.model || DEFAULT_REALTIME_MODEL_OPTIONS.model,
+    };
   }
-  append(frame: AudioFrame) {
-    this.#session.queueMsg({
-      type: 'input_audio_buffer.append',
-      audio: Buffer.from(frame.data.buffer).toString('base64'),
+  /**
+   * Create a RealtimeModel instance configured for Azure OpenAI Service.
+   *
+   * @param azureDeployment - The name of your Azure OpenAI deployment.
+   * @param azureEndpoint - The endpoint URL for your Azure OpenAI resource. If undefined, will attempt to read from the environment variable AZURE_OPENAI_ENDPOINT.
+   * @param apiVersion - API version to use with Azure OpenAI Service. If undefined, will attempt to read from the environment variable OPENAI_API_VERSION.
+   * @param apiKey - Azure OpenAI API key. If undefined, will attempt to read from the environment variable AZURE_OPENAI_API_KEY.
+   * @param entraToken - Azure Entra authentication token. Required if not using API key authentication.
+   * @param baseURL - Base URL for the API endpoint. If undefined, constructed from the azure_endpoint.
+   * @param voice - Voice setting for audio outputs. Defaults to "alloy".
+   * @param inputAudioTranscription - Options for transcribing input audio. Defaults to @see DEFAULT_INPUT_AUDIO_TRANSCRIPTION.
+   * @param turnDetection - Options for server-based voice activity detection (VAD). Defaults to @see DEFAULT_SERVER_VAD_OPTIONS.
+   * @param temperature - Sampling temperature for response generation. Defaults to @see DEFAULT_TEMPERATURE.
+   * @param speed - Speed of the audio output. Defaults to 1.0.
+   * @param maxResponseOutputTokens - Maximum number of tokens in the response. Defaults to @see DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS.
+   * @param maxSessionDuration - Maximum duration of the session in milliseconds. Defaults to @see DEFAULT_MAX_SESSION_DURATION.
+   *
+   * @returns A RealtimeModel instance configured for Azure OpenAI Service.
+   *
+   * @throws Error if required Azure parameters are missing or invalid.
+   */
+  static withAzure({
+    azureDeployment,
+    azureEndpoint,
+    apiVersion,
+    apiKey,
+    entraToken,
+    baseURL,
+    voice = 'alloy',
+    inputAudioTranscription = AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
+    turnDetection = AZURE_DEFAULT_TURN_DETECTION,
+    temperature = 0.8,
+    speed,
+  }: {
+    azureDeployment: string;
+    azureEndpoint?: string;
+    apiVersion?: string;
+    apiKey?: string;
+    entraToken?: string;
+    baseURL?: string;
+    voice?: string;
+    inputAudioTranscription?: api_proto.InputAudioTranscription;
+    // TODO(shubhra): add inputAudioNoiseReduction
+    turnDetection?: api_proto.TurnDetectionType;
+    temperature?: number;
+    speed?: number;
+  }) {
+    apiKey = apiKey || process.env.AZURE_OPENAI_API_KEY;
+    if (!apiKey && !entraToken) {
+      throw new Error(
+        'Missing credentials. Please pass one of `apiKey`, `entraToken`, or the `AZURE_OPENAI_API_KEY` environment variable.',
+      );
+    }
+    apiVersion = apiVersion || process.env.OPENAI_API_VERSION;
+    if (!apiVersion) {
+      throw new Error(
+        'Must provide either the `apiVersion` argument or the `OPENAI_API_VERSION` environment variable',
+      );
+    }
+    if (!baseURL) {
+      azureEndpoint = azureEndpoint || process.env.AZURE_OPENAI_ENDPOINT;
+      if (!azureEndpoint) {
+        throw new Error(
+          'Missing Azure endpoint. Please pass the `azure_endpoint` parameter or set the `AZURE_OPENAI_ENDPOINT` environment variable.',
+        );
+      }
+      baseURL = `${azureEndpoint.replace(/\/$/, '')}/openai`;
+    }
+    return new RealtimeModel({
+      voice,
+      inputAudioTranscription,
+      turnDetection,
+      temperature,
+      speed,
+      apiKey,
+      azureDeployment,
+      apiVersion,
+      entraToken,
+      baseURL,
     });
   }
-  clear() {
-    this.#session.queueMsg({
-      type: 'input_audio_buffer.clear',
-    });
+  session() {
+    return new RealtimeSession(this);
   }
-  commit() {
-    this.#session.queueMsg({
-      type: 'input_audio_buffer.commit',
-    });
+  async close() {
+    return;
   }
 }
-class ConversationItem {
-  #session: RealtimeSession;
-  #logger = log();
-  constructor(session: RealtimeSession) {
-    this.#session = session;
-  }
+function processBaseURL({
+  baseURL,
+  model,
+  isAzure = false,
+  azureDeployment,
+  apiVersion,
+}: {
+  baseURL: string;
+  model: string;
+  isAzure: boolean;
+  azureDeployment?: string;
+  apiVersion?: string;
+}): string {
+  const url = new URL([baseURL, 'realtime'].join('/'));
-  truncate(itemId: string, contentIndex: number, audioEnd: number) {
-    this.#session.queueMsg({
-      type: 'conversation.item.truncate',
-      item_id: itemId,
-      content_index: contentIndex,
-      audio_end_ms: audioEnd,
-    });
+  if (url.protocol === 'https:') {
+    url.protocol = 'wss:';
   }
-  delete(itemId: string) {
-    this.#session.queueMsg({
-      type: 'conversation.item.delete',
-      item_id: itemId,
-    });
+  // ensure "/realtime" is added if the path is empty OR "/v1"
+  if (!url.pathname || ['', '/v1', '/openai'].includes(url.pathname.replace(/\/$/, ''))) {
+    url.pathname = url.pathname.replace(/\/$/, '') + '/realtime';
+  } else {
+    url.pathname = url.pathname.replace(/\/$/, '');
   }
-  create(message: llm.ChatMessage, previousItemId?: string): void {
-    if (!message.content) {
-      return;
+  const queryParams: Record<string, string> = {};
+  if (isAzure) {
+    if (apiVersion) {
+      queryParams['api-version'] = apiVersion;
     }
+    if (azureDeployment) {
+      queryParams['deployment'] = azureDeployment;
+    }
+  } else {
+    queryParams['model'] = model;
+  }
-    let event: api_proto.ConversationItemCreateEvent;
+  for (const [key, value] of Object.entries(queryParams)) {
+    url.searchParams.set(key, value);
+  }
-    if (message.toolCallId) {
-      if (typeof message.content !== 'string') {
-        throw new TypeError('message.content must be a string');
-      }
+  return url.toString();
+}
-      event = {
-        type: 'conversation.item.create',
-        previous_item_id: previousItemId,
-        item: {
-          type: 'function_call_output',
-          call_id: message.toolCallId,
-          output: message.content,
-        },
-      };
-    } else {
-      let content = message.content;
-      if (!Array.isArray(content)) {
-        content = [content];
-      }
+/**
+ * A session for the OpenAI Realtime API.
+ *
+ * This class is used to interact with the OpenAI Realtime API.
+ * It is responsible for sending events to the OpenAI Realtime API and receiving events from it.
+ *
+ * It exposes two more events:
+ * - openai_server_event_received: expose the raw server events from the OpenAI Realtime API
+ * - openai_client_event_queued: expose the raw client events sent to the OpenAI Realtime API
+ */
+export class RealtimeSession extends llm.RealtimeSession {
+  private _tools: llm.ToolContext = {};
+  private remoteChatCtx: llm.RemoteChatContext = new llm.RemoteChatContext();
+  private messageChannel = new Queue<api_proto.ClientEvent>();
+  private inputResampler?: AudioResampler;
+  private instructions?: string;
+  private oaiRealtimeModel: RealtimeModel;
+  private currentGeneration?: ResponseGeneration;
+  private responseCreatedFutures: { [id: string]: CreateResponseHandle } = {};
+  private textModeRecoveryRetries: number = 0;
+  private itemCreateFutures: { [id: string]: Future } = {};
+  private itemDeleteFutures: { [id: string]: Future } = {};
+  private updateChatCtxLock = new Mutex();
+  private updateFuncCtxLock = new Mutex();
+  // 100ms chunks
+  private bstream = new AudioByteStream(SAMPLE_RATE, NUM_CHANNELS, SAMPLE_RATE / 10);
+  private pushedDurationMs: number = 0;
-      if (message.role === llm.ChatRole.USER) {
-        const contents: (api_proto.InputTextContent | api_proto.InputAudioContent)[] = [];
-        for (const c of content) {
-          if (typeof c === 'string') {
-            contents.push({
-              type: 'input_text',
-              text: c,
-            });
-          } else if (
-            // typescript type guard for determining ChatAudio vs ChatImage
-            ((c: llm.ChatAudio | llm.ChatImage): c is llm.ChatAudio => {
-              return (c as llm.ChatAudio).frame !== undefined;
-            })(c)
-          ) {
-            contents.push({
-              type: 'input_audio',
-              audio: Buffer.from(mergeFrames(c.frame).data.buffer).toString('base64'),
-            });
-          }
-        }
+  #logger = log();
+  #task: Promise<void>;
+  #closed = false;
-        event = {
-          type: 'conversation.item.create',
-          previous_item_id: previousItemId,
-          item: {
-            type: 'message',
-            role: 'user',
-            content: contents,
-          },
-        };
-      } else if (message.role === llm.ChatRole.ASSISTANT) {
-        const contents: api_proto.TextContent[] = [];
-        for (const c of content) {
-          if (typeof c === 'string') {
-            contents.push({
-              type: 'text',
-              text: c,
-            });
-          } else if (
-            // typescript type guard for determining ChatAudio vs ChatImage
-            ((c: llm.ChatAudio | llm.ChatImage): c is llm.ChatAudio => {
-              return (c as llm.ChatAudio).frame !== undefined;
-            })(c)
-          ) {
-            this.#logger.warn('audio content in assistant message is not supported');
-          }
-        }
+  constructor(realtimeModel: RealtimeModel) {
+    super(realtimeModel);
-        event = {
-          type: 'conversation.item.create',
-          previous_item_id: previousItemId,
-          item: {
-            type: 'message',
-            role: 'assistant',
-            content: contents,
-          },
-        };
-      } else if (message.role === llm.ChatRole.SYSTEM) {
-        const contents: api_proto.InputTextContent[] = [];
-        for (const c of content) {
-          if (typeof c === 'string') {
-            contents.push({
-              type: 'input_text',
-              text: c,
-            });
-          } else if (
-            // typescript type guard for determining ChatAudio vs ChatImage
-            ((c: llm.ChatAudio | llm.ChatImage): c is llm.ChatAudio => {
-              return (c as llm.ChatAudio).frame !== undefined;
-            })(c)
-          ) {
-            this.#logger.warn('audio content in system message is not supported');
-          }
-        }
+    this.oaiRealtimeModel = realtimeModel;
-        event = {
-          type: 'conversation.item.create',
-          previous_item_id: previousItemId,
-          item: {
-            type: 'message',
-            role: 'system',
-            content: contents,
-          },
-        };
-      } else {
-        this.#logger
-          .child({ message })
-          .warn('chat message is not supported inside the realtime API');
-        return;
-      }
-    }
+    this.#task = this.#mainTask();
-    this.#session.queueMsg(event);
+    this.sendEvent(this.createSessionUpdateEvent());
   }
-}
-class Conversation {
-  #session: RealtimeSession;
-  constructor(session: RealtimeSession) {
-    this.#session = session;
+  sendEvent(command: api_proto.ClientEvent): void {
+    this.messageChannel.put(command);
   }
-  get item(): ConversationItem {
-    return new ConversationItem(this.#session);
+  private createSessionUpdateEvent(): api_proto.SessionUpdateEvent {
+    return {
+      type: 'session.update',
+      session: {
+        model: this.oaiRealtimeModel._options.model,
+        voice: this.oaiRealtimeModel._options.voice,
+        input_audio_format: 'pcm16',
+        output_audio_format: 'pcm16',
+        modalities: ['text', 'audio'],
+        turn_detection: this.oaiRealtimeModel._options.turnDetection,
+        input_audio_transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
+        // TODO(shubhra): add inputAudioNoiseReduction
+        temperature: this.oaiRealtimeModel._options.temperature,
+        tool_choice: toOaiToolChoice(this.oaiRealtimeModel._options.toolChoice),
+        max_response_output_tokens:
+          this.oaiRealtimeModel._options.maxResponseOutputTokens === Infinity
+            ? 'inf'
+            : this.oaiRealtimeModel._options.maxResponseOutputTokens,
+        // TODO(shubhra): add tracing options
+        instructions: this.instructions,
+        speed: this.oaiRealtimeModel._options.speed,
+      },
+    };
   }
-}
-class Response {
-  #session: RealtimeSession;
-  constructor(session: RealtimeSession) {
-    this.#session = session;
+  get chatCtx() {
+    return this.remoteChatCtx.toChatCtx();
   }
-  create() {
-    this.#session.queueMsg({
-      type: 'response.create',
-    });
+  get tools() {
+    return { ...this._tools } as llm.ToolContext;
   }
-  cancel() {
-    this.#session.queueMsg({
-      type: 'response.cancel',
-    });
-  }
-}
+  async updateChatCtx(_chatCtx: llm.ChatContext): Promise<void> {
+    const unlock = await this.updateChatCtxLock.lock();
+    const events = this.createChatCtxUpdateEvents(_chatCtx);
+    const futures: Future<void>[] = [];
-interface ContentPtr {
-  response_id: string;
-  output_index: number;
-  content_index: number;
-}
+    for (const event of events) {
+      const future = new Future<void>();
+      futures.push(future);
-export class RealtimeModel extends multimodal.RealtimeModel {
-  sampleRate = api_proto.SAMPLE_RATE;
-  numChannels = api_proto.NUM_CHANNELS;
-  inFrameSize = api_proto.IN_FRAME_SIZE;
-  outFrameSize = api_proto.OUT_FRAME_SIZE;
+      if (event.type === 'conversation.item.create') {
+        this.itemCreateFutures[event.item.id] = future;
+      } else if (event.type == 'conversation.item.delete') {
+        this.itemDeleteFutures[event.item_id] = future;
+      }
-  #defaultOpts: ModelOptions;
-  #sessions: RealtimeSession[] = [];
+      this.sendEvent(event);
+    }
-  static withAzure({
-    baseURL,
-    azureDeployment,
-    apiVersion = '2024-10-01-preview',
-    apiKey = undefined,
-    entraToken = undefined,
-    instructions = '',
-    modalities = ['text', 'audio'],
-    voice = 'alloy',
-    inputAudioFormat = 'pcm16',
-    outputAudioFormat = 'pcm16',
-    inputAudioTranscription = { model: 'whisper-1' },
-    turnDetection = { type: 'server_vad' },
-    temperature = 0.8,
-    maxResponseOutputTokens = Infinity,
-  }: {
-    baseURL: string;
-    azureDeployment: string;
-    apiVersion?: string;
-    apiKey?: string;
-    entraToken?: string;
-    instructions?: string;
-    modalities?: ['text', 'audio'] | ['text'];
-    voice?: api_proto.Voice;
-    inputAudioFormat?: api_proto.AudioFormat;
-    outputAudioFormat?: api_proto.AudioFormat;
-    inputAudioTranscription?: api_proto.InputAudioTranscription;
-    turnDetection?: api_proto.TurnDetectionType;
-    temperature?: number;
-    maxResponseOutputTokens?: number;
-  }) {
-    return new RealtimeModel({
-      isAzure: true,
-      baseURL: new URL('openai', baseURL).toString(),
-      model: azureDeployment,
-      apiVersion,
-      apiKey,
-      entraToken,
-      instructions,
-      modalities,
-      voice,
-      inputAudioFormat,
-      outputAudioFormat,
-      inputAudioTranscription,
-      turnDetection,
-      temperature,
-      maxResponseOutputTokens,
-    });
-  }
+    if (futures.length === 0) {
+      unlock();
+      return;
+    }
-  constructor({
-    modalities = ['text', 'audio'],
-    instructions = '',
-    voice = 'alloy',
-    inputAudioFormat = 'pcm16',
-    outputAudioFormat = 'pcm16',
-    inputAudioTranscription = { model: 'whisper-1' },
-    turnDetection = { type: 'server_vad' },
-    temperature = 0.8,
-    maxResponseOutputTokens = Infinity,
-    model = 'gpt-4o-realtime-preview-2024-10-01',
-    apiKey = process.env.OPENAI_API_KEY || '',
-    baseURL = api_proto.BASE_URL,
-    // used for microsoft
-    isAzure = false,
-    apiVersion = undefined,
-    entraToken = undefined,
-  }: {
-    modalities?: ['text', 'audio'] | ['text'];
-    instructions?: string;
-    voice?: api_proto.Voice;
-    inputAudioFormat?: api_proto.AudioFormat;
-    outputAudioFormat?: api_proto.AudioFormat;
-    inputAudioTranscription?: api_proto.InputAudioTranscription;
-    turnDetection?: api_proto.TurnDetectionType;
-    temperature?: number;
-    maxResponseOutputTokens?: number;
-    model?: api_proto.Model;
-    apiKey?: string;
-    baseURL?: string;
-    isAzure?: boolean;
-    apiVersion?: string;
-    entraToken?: string;
-  }) {
-    super();
+    try {
+      // wait for futures to resolve or timeout
+      await Promise.race([
+        Promise.all(futures),
+        delay(5000).then(() => {
+          throw new Error('Chat ctx update events timed out');
+        }),
+      ]);
+    } catch (e) {
+      this.#logger.error((e as Error).message);
+      throw e;
+    } finally {
+      unlock();
+    }
+  }
-    if (apiKey === '' && !(isAzure && entraToken)) {
-      throw new Error(
-        'OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environmental variable',
+  private createChatCtxUpdateEvents(
+    chatCtx: llm.ChatContext,
+    addMockAudio: boolean = false,
+  ): (api_proto.ConversationItemCreateEvent | api_proto.ConversationItemDeleteEvent)[] {
+    const newChatCtx = chatCtx.copy();
+    if (addMockAudio) {
+      newChatCtx.items.push(createMockAudioItem());
+    } else {
+      // clean up existing mock audio items
+      newChatCtx.items = newChatCtx.items.filter(
+        (item) => !item.id.startsWith(MOCK_AUDIO_ID_PREFIX),
       );
     }
-    this.#defaultOpts = {
-      modalities,
-      instructions,
-      voice,
-      inputAudioFormat,
-      outputAudioFormat,
-      inputAudioTranscription,
-      turnDetection,
-      temperature,
-      maxResponseOutputTokens,
-      model,
-      apiKey,
-      baseURL,
-      isAzure,
-      apiVersion,
-      entraToken,
-    };
-  }
+    const events: (
+      | api_proto.ConversationItemCreateEvent
+      | api_proto.ConversationItemDeleteEvent
+    )[] = [];
+    const diffOps = llm.computeChatCtxDiff(this.chatCtx, newChatCtx);
+    for (const op of diffOps.toRemove) {
+      events.push({
+        type: 'conversation.item.delete',
+        item_id: op,
+        event_id: shortuuid('chat_ctx_delete_'),
+      } as api_proto.ConversationItemDeleteEvent);
+    }
-  get sessions(): RealtimeSession[] {
-    return this.#sessions;
+    for (const [previousId, id] of diffOps.toCreate) {
+      const chatItem = newChatCtx.getById(id);
+      if (!chatItem) {
+        throw new Error(`Chat item ${id} not found`);
+      }
+      events.push({
+        type: 'conversation.item.create',
+        item: livekitItemToOpenAIItem(chatItem),
+        previous_item_id: previousId ?? undefined,
+        event_id: shortuuid('chat_ctx_create_'),
+      } as api_proto.ConversationItemCreateEvent);
+    }
+    return events;
   }
-  session({
-    fncCtx,
-    chatCtx,
-    modalities = this.#defaultOpts.modalities,
-    instructions = this.#defaultOpts.instructions,
-    voice = this.#defaultOpts.voice,
-    inputAudioFormat = this.#defaultOpts.inputAudioFormat,
-    outputAudioFormat = this.#defaultOpts.outputAudioFormat,
-    inputAudioTranscription = this.#defaultOpts.inputAudioTranscription,
-    turnDetection = this.#defaultOpts.turnDetection,
-    temperature = this.#defaultOpts.temperature,
-    maxResponseOutputTokens = this.#defaultOpts.maxResponseOutputTokens,
-  }: {
-    fncCtx?: llm.FunctionContext;
-    chatCtx?: llm.ChatContext;
-    modalities?: ['text', 'audio'] | ['text'];
-    instructions?: string;
-    voice?: api_proto.Voice;
-    inputAudioFormat?: api_proto.AudioFormat;
-    outputAudioFormat?: api_proto.AudioFormat;
-    inputAudioTranscription?: api_proto.InputAudioTranscription | null;
-    turnDetection?: api_proto.TurnDetectionType | null;
-    temperature?: number;
-    maxResponseOutputTokens?: number;
-  }): RealtimeSession {
-    const opts: ModelOptions = {
-      modalities,
-      instructions,
-      voice,
-      inputAudioFormat,
-      outputAudioFormat,
-      inputAudioTranscription,
-      turnDetection,
-      temperature,
-      maxResponseOutputTokens,
-      model: this.#defaultOpts.model,
-      apiKey: this.#defaultOpts.apiKey,
-      baseURL: this.#defaultOpts.baseURL,
-      isAzure: this.#defaultOpts.isAzure,
-      apiVersion: this.#defaultOpts.apiVersion,
-      entraToken: this.#defaultOpts.entraToken,
-    };
+  async updateTools(_tools: llm.ToolContext): Promise<void> {
+    const unlock = await this.updateFuncCtxLock.lock();
+    const ev = this.createToolsUpdateEvent(_tools);
+    this.sendEvent(ev);
-    const newSession = new RealtimeSession(opts, {
-      chatCtx: chatCtx || new llm.ChatContext(),
-      fncCtx,
-    });
-    this.#sessions.push(newSession);
-    return newSession;
-  }
+    if (!ev.session.tools) {
+      throw new Error('Tools are missing in the session update event');
+    }
-  async close() {
-    await Promise.allSettled(this.#sessions.map((session) => session.close()));
+    // TODO(brian): these logics below are noops I think, leaving it here to keep
+    // parity with the python but we should remove them later
+    const retainedToolNames = new Set(ev.session.tools.map((tool) => tool.name));
+    const retainedTools = Object.fromEntries(
+      Object.entries(_tools).filter(
+        ([name, tool]) => llm.isFunctionTool(tool) && retainedToolNames.has(name),
+      ),
+    );
+    this._tools = retainedTools as llm.ToolContext;
+    unlock();
   }
-}
-export class RealtimeSession extends multimodal.RealtimeSession {
-  #chatCtx: llm.ChatContext | undefined = undefined;
-  #fncCtx: llm.FunctionContext | undefined = undefined;
-  #opts: ModelOptions;
-  #pendingResponses: { [id: string]: RealtimeResponse } = {};
-  #sessionId = 'not-connected';
-  #ws: WebSocket | null = null;
-  #expiresAt: number | null = null;
-  #logger = log();
-  #task: Promise<void>;
-  #closing = true;
-  #sendQueue = new Queue<api_proto.ClientEvent>();
+  /**
+   * Builds a `session.update` event whose `tools` list contains every function
+   * tool in `_tools`, with parameters converted through `llm.toJsonSchema`.
+   *
+   * Entries that are not function tools, or whose parameters fail conversion,
+   * are logged and left out of the event.
+   *
+   * @param _tools - tool context to convert
+   * @returns the session update event (model + tools) with a fresh event id
+   */
+  private createToolsUpdateEvent(_tools: llm.ToolContext): api_proto.SessionUpdateEvent {
+    const oaiTools: api_proto.Tool[] = [];
-  constructor(
-    opts: ModelOptions,
-    { fncCtx, chatCtx }: { fncCtx?: llm.FunctionContext; chatCtx?: llm.ChatContext },
-  ) {
-    super();
-    this.#opts = opts;
-    this.#chatCtx = chatCtx;
-    this.#fncCtx = fncCtx;
-    this.#task = this.#start();
-    this.sessionUpdate({
-      modalities: this.#opts.modalities,
-      instructions: this.#opts.instructions,
-      voice: this.#opts.voice,
-      inputAudioFormat: this.#opts.inputAudioFormat,
-      outputAudioFormat: this.#opts.outputAudioFormat,
-      inputAudioTranscription: this.#opts.inputAudioTranscription,
-      turnDetection: this.#opts.turnDetection,
-      temperature: this.#opts.temperature,
-      maxResponseOutputTokens: this.#opts.maxResponseOutputTokens,
-      toolChoice: 'auto',
-    });
+    for (const [name, tool] of Object.entries(_tools)) {
+      if (!llm.isFunctionTool(tool)) {
+        this.#logger.error({ name, tool }, "OpenAI Realtime API doesn't support this tool type");
+        continue;
+      }
+      const { parameters: toolParameters, description } = tool;
+      try {
+        const parameters = llm.toJsonSchema(
+          toolParameters,
+        ) as unknown as api_proto.Tool['parameters'];
+        oaiTools.push({
+          name,
+          description,
+          parameters: parameters,
+          type: 'function',
+        });
+      } catch (error) {
+        // Keep the underlying failure in the log so a bad schema can be diagnosed.
+        this.#logger.error(
+          { name, tool, error },
+          "OpenAI Realtime API doesn't support this tool type",
+        );
+      }
+    }
+    return {
+      type: 'session.update',
+      session: {
+        model: this.oaiRealtimeModel._options.model,
+        tools: oaiTools,
+      },
+      event_id: shortuuid('tools_update_'),
+    };
   }
-  get chatCtx(): llm.ChatContext | undefined {
-    return this.#chatCtx;
+  /**
+   * Sends a `session.update` event carrying the new instructions, then stores
+   * them on `this.instructions`.
+   *
+   * @param _instructions - instruction text to apply to the session
+   */
+  async updateInstructions(_instructions: string): Promise<void> {
+    const updateEvent = {
+      type: 'session.update',
+      session: { instructions: _instructions },
+      event_id: shortuuid('instructions_update_'),
+    } as api_proto.SessionUpdateEvent;
+    this.sendEvent(updateEvent);
+    this.instructions = _instructions;
   }
-  get fncCtx(): llm.FunctionContext | undefined {
-    return this.#fncCtx;
+  /**
+   * Records the tool choice on the model options and sends it to the server in
+   * a `session.update` event.
+   *
+   * @param toolChoice - tool choice to apply; converted with `toOaiToolChoice`
+   */
+  updateOptions({ toolChoice }: { toolChoice?: llm.ToolChoice }): void {
+    this.oaiRealtimeModel._options.toolChoice = toolChoice;
+    const sessionPatch: api_proto.SessionUpdateEvent['session'] = {
+      tool_choice: toOaiToolChoice(toolChoice),
+    };
+    // TODO(brian): add other options here
+    this.sendEvent({
+      type: 'session.update',
+      session: sessionPatch,
+      event_id: shortuuid('options_update_'),
+    });
   }
-  set fncCtx(ctx: llm.FunctionContext | undefined) {
-    this.#fncCtx = ctx;
+  /**
+   * Resamples the frame, re-chunks it through `this.bstream`, and sends each
+   * resulting chunk as a base64 `input_audio_buffer.append` event, adding the
+   * chunk's duration to `pushedDurationMs`.
+   *
+   * Only the bytes covered by each typed-array view are used: `.buffer` alone
+   * would include unrelated bytes whenever a view is backed by a larger buffer.
+   *
+   * @param frame - audio frame to push
+   */
+  pushAudio(frame: AudioFrame): void {
+    for (const f of this.resampleAudio(frame)) {
+      const { buffer: srcBuffer, byteOffset: srcOffset, byteLength: srcLength } = f.data;
+      // Copy only when the view is a window into a larger buffer.
+      const pcm =
+        srcOffset === 0 && srcLength === srcBuffer.byteLength
+          ? srcBuffer
+          : srcBuffer.slice(srcOffset, srcOffset + srcLength);
+      for (const nf of this.bstream.write(pcm)) {
+        const { buffer, byteOffset, byteLength } = nf.data;
+        this.sendEvent({
+          type: 'input_audio_buffer.append',
+          audio: Buffer.from(buffer, byteOffset, byteLength).toString('base64'),
+        } as api_proto.InputAudioBufferAppendEvent);
+        // TODO(AJS-102): use frame.durationMs once available in rtc-node
+        this.pushedDurationMs += (nf.samplesPerChannel / nf.sampleRate) * 1000;
+      }
+    }
   }
-  get conversation(): Conversation {
-    return new Conversation(this);
+  /**
+   * Sends `input_audio_buffer.commit` and resets `pushedDurationMs`, but only
+   * when more than 100 ms of audio has been pushed since the last reset.
+   */
+  async commitAudio(): Promise<void> {
+    // OpenAI requires at least 100ms of audio
+    if (!(this.pushedDurationMs > 100)) {
+      return;
+    }
+    const commitEvent = {
+      type: 'input_audio_buffer.commit',
+    } as api_proto.InputAudioBufferCommitEvent;
+    this.sendEvent(commitEvent);
+    this.pushedDurationMs = 0;
   }
-  get inputAudioBuffer(): InputAudioBuffer {
-    return new InputAudioBuffer(this);
+  /** Sends `input_audio_buffer.clear` and resets the pushed-duration counter. */
+  async clearAudio(): Promise<void> {
+    const clearEvent = {
+      type: 'input_audio_buffer.clear',
+    } as api_proto.InputAudioBufferClearEvent;
+    this.sendEvent(clearEvent);
+    this.pushedDurationMs = 0;
   }
-  get response(): Response {
-    return new Response(this);
+  /**
+   * Creates a user-initiated response and resets the text-mode recovery retry
+   * counter.
+   *
+   * @param instructions - optional instructions passed to `createResponse`
+   * @returns the value of the response handle's `doneFut`
+   */
+  async generateReply(instructions?: string): Promise<llm.GenerationCreatedEvent> {
+    const responseHandle = this.createResponse({ instructions, userInitiated: true });
+    this.textModeRecoveryRetries = 0;
+    return responseHandle.doneFut.await;
   }
-  get expiration(): number {
-    if (!this.#expiresAt) {
-      throw new Error('session not started');
-    }
-    return this.#expiresAt * 1000;
+  /** Sends a `response.cancel` event to the server. */
+  async interrupt(): Promise<void> {
+    const cancelEvent = {
+      type: 'response.cancel',
+    } as api_proto.ResponseCancelEvent;
+    this.sendEvent(cancelEvent);
   }
-  queueMsg(command: api_proto.ClientEvent): void {
-    this.#sendQueue.put(command);
+  /**
+   * Sends `conversation.item.truncate` for content index 0 of the given item.
+   *
+   * @param _options - `messageId` of the item and `audioEndMs` truncation point
+   */
+  async truncate(_options: { messageId: string; audioEndMs: number }): Promise<void> {
+    const { messageId, audioEndMs } = _options;
+    const truncateEvent = {
+      type: 'conversation.item.truncate',
+      content_index: 0,
+      item_id: messageId,
+      audio_end_ms: audioEndMs,
+    } as api_proto.ConversationItemTruncateEvent;
+    this.sendEvent(truncateEvent);
   }
   /// Truncates the data field of the event to the specified maxLength to avoid overwhelming logs
@@ -588,646 +671,872 @@ export class RealtimeSession extends multimodal.RealtimeSession {
     return untypedEvent;
   }
-  sessionUpdate({
-    modalities = this.#opts.modalities,
-    instructions = this.#opts.instructions,
-    voice = this.#opts.voice,
-    inputAudioFormat = this.#opts.inputAudioFormat,
-    outputAudioFormat = this.#opts.outputAudioFormat,
-    inputAudioTranscription = this.#opts.inputAudioTranscription,
-    turnDetection = this.#opts.turnDetection,
-    temperature = this.#opts.temperature,
-    maxResponseOutputTokens = this.#opts.maxResponseOutputTokens,
-    toolChoice = 'auto',
-    selectedTools = Object.keys(this.#fncCtx || {}),
-  }: {
-    modalities: ['text', 'audio'] | ['text'];
-    instructions?: string;
-    voice?: api_proto.Voice;
-    inputAudioFormat?: api_proto.AudioFormat;
-    outputAudioFormat?: api_proto.AudioFormat;
-    inputAudioTranscription?: api_proto.InputAudioTranscription | null;
-    turnDetection?: api_proto.TurnDetectionType | null;
-    temperature?: number;
-    maxResponseOutputTokens?: number;
-    toolChoice?: api_proto.ToolChoice;
-    selectedTools?: string[];
-  }) {
-    this.#opts = {
-      modalities,
-      instructions,
-      voice,
-      inputAudioFormat,
-      outputAudioFormat,
-      inputAudioTranscription,
-      turnDetection,
-      temperature,
-      maxResponseOutputTokens,
-      model: this.#opts.model,
-      apiKey: this.#opts.apiKey,
-      baseURL: this.#opts.baseURL,
-      isAzure: this.#opts.isAzure,
-      apiVersion: this.#opts.apiVersion,
-      entraToken: this.#opts.entraToken,
+  /**
+   * Opens a WebSocket to the Realtime endpoint and resolves once it is open.
+   *
+   * Authentication headers depend on `isAzure`: an Entra bearer token or an
+   * `api-key` header for Azure, otherwise a bearer API key plus the
+   * `OpenAI-Beta` header.
+   *
+   * The returned promise settles exactly once: on `open`, on `error`, on
+   * `close`, or when `connOptions.timeoutMs` elapses. An `error` listener is
+   * attached during the handshake so a failed connection rejects the promise
+   * instead of surfacing as an unhandled `error` event on the socket.
+   *
+   * @returns the open WebSocket
+   * @throws Error when Azure is selected but neither credential is configured
+   * @throws APIConnectionError when the connection fails, closes, or times out
+   */
+  private async createWsConn(): Promise<WebSocket> {
+    const headers: Record<string, string> = {
+      'User-Agent': 'LiveKit-Agents-JS',
     };
-    const tools = this.#fncCtx
-      ? Object.entries(this.#fncCtx)
-          .filter(([name]) => selectedTools.includes(name))
-          .map(([name, func]) => ({
-            type: 'function' as const,
-            name,
-            description: func.description,
-            parameters:
-              // don't format parameters if they are raw openai params
-              func.parameters.type == ('object' as const)
-                ? func.parameters
-                : llm.oaiParams(func.parameters),
-          }))
-      : [];
-    const sessionUpdateEvent: api_proto.SessionUpdateEvent = {
-      type: 'session.update',
-      session: {
-        modalities: this.#opts.modalities,
-        instructions: this.#opts.instructions,
-        voice: this.#opts.voice,
-        input_audio_format: this.#opts.inputAudioFormat,
-        output_audio_format: this.#opts.outputAudioFormat,
-        input_audio_transcription: this.#opts.inputAudioTranscription,
-        turn_detection: this.#opts.turnDetection,
-        temperature: this.#opts.temperature,
-        max_response_output_tokens:
-          this.#opts.maxResponseOutputTokens === Infinity
-            ? 'inf'
-            : this.#opts.maxResponseOutputTokens,
-        tools,
-        tool_choice: toolChoice,
-      },
-    };
-    if (this.#opts.isAzure && this.#opts.maxResponseOutputTokens === Infinity) {
-      // microsoft doesn't support inf for max_response_output_tokens, but accepts no args
-      sessionUpdateEvent.session.max_response_output_tokens = undefined;
-    }
+    if (this.oaiRealtimeModel._options.isAzure) {
+      // Microsoft API has two ways of authentication
+      // 1. Entra token set as `Bearer` token
+      // 2. API key set as `api_key` header (also accepts query string)
+      if (this.oaiRealtimeModel._options.entraToken) {
+        headers.Authorization = `Bearer ${this.oaiRealtimeModel._options.entraToken}`;
+      } else if (this.oaiRealtimeModel._options.apiKey) {
+        headers['api-key'] = this.oaiRealtimeModel._options.apiKey;
+      } else {
+        throw new Error('Microsoft API key or entraToken is required');
+      }
+    } else {
+      headers.Authorization = `Bearer ${this.oaiRealtimeModel._options.apiKey}`;
+      headers['OpenAI-Beta'] = 'realtime=v1';
+    }
-    this.queueMsg(sessionUpdateEvent);
-  }
+    const url = processBaseURL({
+      baseURL: this.oaiRealtimeModel._options.baseURL,
+      model: this.oaiRealtimeModel._options.model,
+      isAzure: this.oaiRealtimeModel._options.isAzure,
+      apiVersion: this.oaiRealtimeModel._options.apiVersion,
+      azureDeployment: this.oaiRealtimeModel._options.azureDeployment,
+    });
-  /** Create an empty audio message with the given duration. */
-  #createEmptyUserAudioMessage(duration: number): llm.ChatMessage {
-    const samples = duration * api_proto.SAMPLE_RATE;
-    return new llm.ChatMessage({
-      role: llm.ChatRole.USER,
-      content: {
-        frame: new AudioFrame(
-          new Int16Array(samples * api_proto.NUM_CHANNELS),
-          api_proto.SAMPLE_RATE,
-          api_proto.NUM_CHANNELS,
-          samples,
-        ),
-      },
+    this.#logger.debug(`Connecting to OpenAI Realtime API at ${url}`);
+    return new Promise((resolve, reject) => {
+      const ws = new WebSocket(url, { headers });
+      let waiting = true;
+      const timeout = setTimeout(() => {
+        settle(() =>
+          reject(new APIConnectionError({ message: 'WebSocket connection timeout' })),
+        );
+        // Closing a socket that is still connecting makes it emit `error` and
+        // `close`; both are absorbed by the listeners below.
+        ws.close();
+      }, this.oaiRealtimeModel._options.connOptions.timeoutMs);
+      // Runs `outcome` only for the first of open / error / close / timeout.
+      const settle = (outcome: () => void): void => {
+        if (!waiting) return;
+        waiting = false;
+        clearTimeout(timeout);
+        outcome();
+      };
+      ws.once('open', () => {
+        settle(() => resolve(ws));
+      });
+      ws.once('error', (error: Error) => {
+        settle(() =>
+          reject(
+            new APIConnectionError({
+              message: `OpenAI Realtime API connection error: ${error.message}`,
+            }),
+          ),
+        );
+      });
+      ws.once('close', () => {
+        settle(() =>
+          reject(new APIConnectionError({ message: 'OpenAI Realtime API connection closed' })),
+        );
+      });
     });
   }
-  /**
-   * Try to recover from a text response to audio mode.
-   *
-   * @remarks
-   * Sometimes the OpenAI Realtime API returns text instead of audio responses.
-   * This method tries to recover from this by requesting a new response after deleting the text
-   * response and creating an empty user audio message.
-   */
-  recoverFromTextResponse(itemId: string) {
-    if (itemId) {
-      this.conversation.item.delete(itemId);
-    }
-    this.conversation.item.create(this.#createEmptyUserAudioMessage(1));
-    this.response.create();
-  }
+  /**
+   * Connection supervisor: keeps a WebSocket session running until the session
+   * is closed, reconnecting after each session ends or fails.
+   *
+   * On every pass after the first, `reconnect` replays the session options,
+   * the tools and the local chat context onto the new socket before `runWs`
+   * takes over. Failures that are retryable API errors are retried up to
+   * `connOptions.maxRetry` times; anything else is reported through
+   * `emitError` as non-recoverable and rethrown.
+   *
+   * The socket is created inside the `try` so that connection failures go
+   * through the same reporting and retry path as failures of a live session.
+   */
+  async #mainTask(): Promise<void> {
+    let reconnecting = false;
+    let numRetries = 0;
+    let wsConn: WebSocket | null = null;
+    const maxRetries = this.oaiRealtimeModel._options.connOptions.maxRetry;
-  #start(): Promise<void> {
-    return new Promise(async (resolve, reject) => {
-      const headers: Record<string, string> = {
-        'User-Agent': 'LiveKit-Agents-JS',
-      };
-      if (this.#opts.isAzure) {
-        // Microsoft API has two ways of authentication
-        // 1. Entra token set as `Bearer` token
-        // 2. API key set as `api_key` header (also accepts query string)
-        if (this.#opts.entraToken) {
-          headers.Authorization = `Bearer ${this.#opts.entraToken}`;
-        } else if (this.#opts.apiKey) {
-          headers['api-key'] = this.#opts.apiKey;
-        } else {
-          reject(new Error('Microsoft API key or entraToken is required'));
-          return;
-        }
-      } else {
-        headers.Authorization = `Bearer ${this.#opts.apiKey}`;
-        headers['OpenAI-Beta'] = 'realtime=v1';
-      }
-      const url = new URL([this.#opts.baseURL, 'realtime'].join('/'));
-      if (url.protocol === 'https:') {
-        url.protocol = 'wss:';
-      }
+    // Replays session state onto a freshly opened socket.
+    const reconnect = async () => {
+      this.#logger.debug(
+        {
+          maxSessionDuration: this.oaiRealtimeModel._options.maxSessionDuration,
+        },
+        'Reconnecting to OpenAI Realtime API',
+      );
-      // Construct query parameters
-      const queryParams: Record<string, string> = {};
-      if (this.#opts.isAzure) {
-        queryParams['api-version'] = this.#opts.apiVersion ?? '2024-10-01-preview';
-        queryParams['deployment'] = this.#opts.model;
-      } else {
-        queryParams['model'] = this.#opts.model;
-      }
+      const events: api_proto.ClientEvent[] = [];
-      for (const [key, value] of Object.entries(queryParams)) {
-        url.searchParams.set(key, value);
+      // options and instructions
+      events.push(this.createSessionUpdateEvent());
+      // tools
+      if (Object.keys(this._tools).length > 0) {
+        events.push(this.createToolsUpdateEvent(this._tools));
       }
-      console.debug('Connecting to OpenAI Realtime API at ', url.toString());
-      this.#ws = new WebSocket(url.toString(), {
-        headers: headers,
+      // chat context
+      const chatCtx = this.chatCtx.copy({
+        excludeFunctionCall: true,
+        excludeInstructions: true,
+        excludeEmptyMessage: true,
       });
-      this.#ws.onerror = (error) => {
-        reject(new Error('OpenAI Realtime WebSocket error: ' + error.message));
-      };
-      await once(this.#ws, 'open');
-      this.#closing = false;
+      const oldChatCtx = this.remoteChatCtx;
+      this.remoteChatCtx = new llm.RemoteChatContext();
+      events.push(...this.createChatCtxUpdateEvents(chatCtx));
-      this.#ws.onmessage = (message) => {
-        const event: api_proto.ServerEvent = JSON.parse(message.data as string);
-        this.#logger.debug(`<- ${JSON.stringify(this.#loggableEvent(event))}`);
-        switch (event.type) {
-          case 'error':
-            this.#handleError(event);
-            break;
-          case 'session.created':
-            this.#handleSessionCreated(event);
-            break;
-          case 'session.updated':
-            this.#handleSessionUpdated(event);
-            break;
-          case 'conversation.created':
-            this.#handleConversationCreated(event);
-            break;
-          case 'input_audio_buffer.committed':
-            this.#handleInputAudioBufferCommitted(event);
-            break;
-          case 'input_audio_buffer.cleared':
-            this.#handleInputAudioBufferCleared(event);
-            break;
-          case 'input_audio_buffer.speech_started':
-            this.#handleInputAudioBufferSpeechStarted(event);
-            break;
-          case 'input_audio_buffer.speech_stopped':
-            this.#handleInputAudioBufferSpeechStopped(event);
-            break;
-          case 'conversation.item.created':
-            this.#handleConversationItemCreated(event);
-            break;
-          case 'conversation.item.input_audio_transcription.completed':
-            this.#handleConversationItemInputAudioTranscriptionCompleted(event);
-            break;
-          case 'conversation.item.input_audio_transcription.failed':
-            this.#handleConversationItemInputAudioTranscriptionFailed(event);
-            break;
-          case 'conversation.item.truncated':
-            this.#handleConversationItemTruncated(event);
-            break;
-          case 'conversation.item.deleted':
-            this.#handleConversationItemDeleted(event);
-            break;
-          case 'response.created':
-            this.#handleResponseCreated(event);
-            break;
-          case 'response.done':
-            this.#handleResponseDone(event);
-            break;
-          case 'response.output_item.added':
-            this.#handleResponseOutputItemAdded(event);
-            break;
-          case 'response.output_item.done':
-            this.#handleResponseOutputItemDone(event);
-            break;
-          case 'response.content_part.added':
-            this.#handleResponseContentPartAdded(event);
-            break;
-          case 'response.content_part.done':
-            this.#handleResponseContentPartDone(event);
-            break;
-          case 'response.text.delta':
-            this.#handleResponseTextDelta(event);
-            break;
-          case 'response.text.done':
-            this.#handleResponseTextDone(event);
-            break;
-          case 'response.audio_transcript.delta':
-            this.#handleResponseAudioTranscriptDelta(event);
-            break;
-          case 'response.audio_transcript.done':
-            this.#handleResponseAudioTranscriptDone(event);
-            break;
-          case 'response.audio.delta':
-            this.#handleResponseAudioDelta(event);
-            break;
-          case 'response.audio.done':
-            this.#handleResponseAudioDone(event);
-            break;
-          case 'response.function_call_arguments.delta':
-            this.#handleResponseFunctionCallArgumentsDelta(event);
-            break;
-          case 'response.function_call_arguments.done':
-            this.#handleResponseFunctionCallArgumentsDone(event);
-            break;
-          case 'rate_limits.updated':
-            this.#handleRateLimitsUpdated(event);
-            break;
+      try {
+        for (const ev of events) {
+          this.emit('openai_client_event_queued', ev);
+          wsConn!.send(JSON.stringify(ev));
         }
-      };
+      } catch (error) {
+        // Restore the previous remote context so a later attempt starts clean.
+        this.remoteChatCtx = oldChatCtx;
+        throw new APIConnectionError({
+          message: 'Failed to send message to OpenAI Realtime API during session re-connection',
+        });
+      }
-      const sendTask = async () => {
-        while (this.#ws && !this.#closing && this.#ws.readyState === WebSocket.OPEN) {
-          try {
-            const event = await this.#sendQueue.get();
-            if (event.type !== 'input_audio_buffer.append') {
-              this.#logger.debug(`-> ${JSON.stringify(this.#loggableEvent(event))}`);
-            }
-            this.#ws.send(JSON.stringify(event));
-          } catch (error) {
-            this.#logger.error('Error sending event:', error);
-          }
+      this.#logger.debug('Reconnected to OpenAI Realtime API');
+      this.emit('session_reconnected', {} as llm.RealtimeSessionReconnectedEvent);
+    };
+    while (!this.#closed) {
+      this.#logger.debug('Creating WebSocket connection to OpenAI Realtime API');
+      try {
+        // Connect inside the try so a failed connection is reported and retried
+        // like any other session failure instead of escaping the loop.
+        wsConn = await this.createWsConn();
+        if (reconnecting) {
+          await reconnect();
+          numRetries = 0;
+        }
+        await this.runWs(wsConn);
+      } catch (error) {
+        if (!isAPIError(error)) {
+          this.emitError({ error: error as Error, recoverable: false });
+          throw error;
         }
-      };
-      sendTask();
+        if (maxRetries === 0 || !error.retryable) {
+          this.emitError({ error: error as Error, recoverable: false });
+          throw error;
+        }
-      this.#ws.onclose = () => {
-        if (this.#expiresAt && Date.now() >= this.#expiresAt * 1000) {
-          this.#closing = true;
+        if (numRetries === maxRetries) {
+          this.emitError({ error: error as Error, recoverable: false });
+          throw new APIConnectionError({
+            message: `OpenAI Realtime API connection failed after ${numRetries} attempts`,
+            options: {
+              body: error,
+              retryable: false,
+            },
+          });
         }
-        if (!this.#closing) {
-          reject(new Error('OpenAI Realtime connection closed unexpectedly'));
+        this.emitError({ error: error as Error, recoverable: true });
+        // The first retry uses its own interval; later ones use the configured one.
+        const retryInterval =
+          numRetries === 0
+            ? DEFAULT_FIRST_RETRY_INTERVAL_MS
+            : this.oaiRealtimeModel._options.connOptions.retryIntervalMs;
+        this.#logger.warn(
+          {
+            attempt: numRetries,
+            maxRetries,
+            error,
+          },
+          `OpenAI Realtime API connection failed, retrying in ${retryInterval / 1000}s`,
+        );
+        await delay(retryInterval);
+        numRetries++;
+      }
+      reconnecting = true;
+    }
+  }
+  /**
+   * Drives one WebSocket session until it ends.
+   *
+   * Three tasks race: forwarding queued client events to the socket, waiting
+   * for the socket to close or error, and a timer of `maxSessionDuration` that
+   * forces a reconnect. If the timer wins while a generation is in flight, the
+   * generation is awaited before returning. All tasks are cancelled and the
+   * socket closed on the way out.
+   *
+   * @param wsConn - an open WebSocket to the Realtime API
+   * @throws APIConnectionError when the socket errors or the session timer fires
+   */
+  private async runWs(wsConn: WebSocket): Promise<void> {
+    const forwardEvents = async (signal: AbortSignal): Promise<void> => {
+      while (!this.#closed && wsConn.readyState === WebSocket.OPEN && !signal.aborted) {
+        try {
+          const event = await this.messageChannel.get();
+          if (signal.aborted) {
+            break;
+          }
+          if (event.type !== 'input_audio_buffer.append') {
+            this.#logger.debug(`(client) -> ${JSON.stringify(this.#loggableEvent(event))}`);
+          }
+          this.emit('openai_client_event_queued', event);
+          wsConn.send(JSON.stringify(event));
+        } catch (error) {
+          // Stop forwarding, but leave a trace of why the loop ended.
+          this.#logger.debug({ error }, 'stopped forwarding events to OpenAI Realtime API');
+          break;
         }
-        this.#ws = null;
-        resolve();
-      };
+      }
+      wsConn.close();
+    };
+    const wsCloseFuture = new Future<void | Error>();
+    wsConn.onerror = (error) => {
+      wsCloseFuture.resolve(new APIConnectionError({ message: error.message }));
+    };
+    wsConn.onclose = () => {
+      wsCloseFuture.resolve();
+    };
+    wsConn.onmessage = (message: MessageEvent) => {
+      let event: api_proto.ServerEvent;
+      try {
+        event = JSON.parse(message.data as string);
+      } catch (error) {
+        // A malformed frame must not throw out of the socket callback.
+        this.#logger.error({ error }, 'failed to parse OpenAI Realtime API server event');
+        return;
+      }
+      this.emit('openai_server_event_received', event);
+      this.#logger.debug(`(server) <- ${JSON.stringify(this.#loggableEvent(event))}`);
+      switch (event.type) {
+        case 'input_audio_buffer.speech_started':
+          this.handleInputAudioBufferSpeechStarted(event);
+          break;
+        case 'input_audio_buffer.speech_stopped':
+          this.handleInputAudioBufferSpeechStopped(event);
+          break;
+        case 'response.created':
+          this.handleResponseCreated(event);
+          break;
+        case 'response.output_item.added':
+          this.handleResponseOutputItemAdded(event);
+          break;
+        case 'conversation.item.created':
+          this.handleConversationItemCreated(event);
+          break;
+        case 'conversation.item.deleted':
+          this.handleConversationItemDeleted(event);
+          break;
+        case 'conversation.item.input_audio_transcription.completed':
+          this.handleConversationItemInputAudioTranscriptionCompleted(event);
+          break;
+        case 'conversation.item.input_audio_transcription.failed':
+          this.handleConversationItemInputAudioTranscriptionFailed(event);
+          break;
+        case 'response.content_part.added':
+          this.handleResponseContentPartAdded(event);
+          break;
+        case 'response.content_part.done':
+          this.handleResponseContentPartDone(event);
+          break;
+        case 'response.audio_transcript.delta':
+          this.handleResponseAudioTranscriptDelta(event);
+          break;
+        case 'response.audio.delta':
+          this.handleResponseAudioDelta(event);
+          break;
+        case 'response.audio_transcript.done':
+          this.handleResponseAudioTranscriptDone(event);
+          break;
+        case 'response.audio.done':
+          this.handleResponseAudioDone(event);
+          break;
+        case 'response.output_item.done':
+          this.handleResponseOutputItemDone(event);
+          break;
+        case 'response.done':
+          this.handleResponseDone(event);
+          break;
+        case 'error':
+          this.handleError(event);
+          break;
+        default:
+          this.#logger.debug(`unhandled event: ${event.type}`);
+          break;
+      }
+    };
+    const sendTask = Task.from(({ signal }) => forwardEvents(signal));
+    // Finishes when the socket closes/errors or when the task is aborted.
+    const wsTask = Task.from(({ signal }) => {
+      const abortPromise = new Promise<void>((resolve) => {
+        signal.addEventListener('abort', () => {
+          resolve();
+        });
+      });
+      return Promise.race([wsCloseFuture.await, abortPromise]);
+    });
+    // Yields an error after maxSessionDuration so the caller reconnects.
+    const waitReconnectTask = Task.from(async ({ signal }) => {
+      await delay(this.oaiRealtimeModel._options.maxSessionDuration, { signal });
+      return new APIConnectionError({
+        message: 'OpenAI Realtime API connection timeout',
+      });
     });
+    try {
+      const result = await Promise.race([wsTask.result, sendTask.result, waitReconnectTask.result]);
+      if (waitReconnectTask.done && this.currentGeneration) {
+        // Let the in-flight generation finish before tearing the socket down.
+        await this.currentGeneration._doneFut.await;
+      }
+      if (result instanceof Error) {
+        throw result;
+      }
+    } finally {
+      await cancelAndWait([wsTask, sendTask, waitReconnectTask], 2000);
+      wsConn.close();
+    }
   }
   async close() {
-    if (!this.#ws) return;
-    this.#closing = true;
-    this.#ws.close();
+    super.close();
+    this.#closed = true;
     await this.#task;
   }
-  #getContent(ptr: ContentPtr): RealtimeContent {
-    const response = this.#pendingResponses[ptr.response_id];
-    const output = response!.output[ptr.output_index];
-    const content = output!.content[ptr.content_index]!;
-    return content;
+  /** Emits `input_speech_started` with an empty payload; the server event is not read. */
+  private handleInputAudioBufferSpeechStarted(
+    _event: api_proto.InputAudioBufferSpeechStartedEvent,
+  ): void {
+    const payload = {} as llm.InputSpeechStartedEvent;
+    this.emit('input_speech_started', payload);
   }
-  #handleError(event: api_proto.ErrorEvent): void {
-    this.#logger.error(`OpenAI Realtime error ${JSON.stringify(event.error)}`);
+  /**
+   * Emits `input_speech_stopped`, flagging whether input transcription is
+   * configured (i.e. the `inputAudioTranscription` option is not `null`).
+   */
+  private handleInputAudioBufferSpeechStopped(
+    _event: api_proto.InputAudioBufferSpeechStoppedEvent,
+  ): void {
+    const userTranscriptionEnabled =
+      this.oaiRealtimeModel._options.inputAudioTranscription !== null;
+    this.emit('input_speech_stopped', { userTranscriptionEnabled } as llm.InputSpeechStoppedEvent);
   }
-  #handleSessionCreated(event: api_proto.SessionCreatedEvent): void {
-    this.#sessionId = event.session.id;
-    this.#expiresAt = event.session.expires_at;
-    this.#logger = this.#logger.child({ sessionId: this.#sessionId });
-  }
+  /**
+   * Starts tracking a new generation for a `response.created` event.
+   *
+   * When the response carries a `client_event_id` in its metadata, any pending
+   * handle stored under that id is re-keyed to the response id and a
+   * `generation_created` event is emitted. Without that id the method returns
+   * right after creating the generation state.
+   *
+   * @throws Error if the response has no id
+   */
+  private handleResponseCreated(event: api_proto.ResponseCreatedEvent): void {
+    const responseId = event.response.id;
+    if (!responseId) {
+      throw new Error('response.id is missing');
+    }
+    this.currentGeneration = {
+      messageChannel: stream.createStreamChannel<llm.MessageGeneration>(),
+      functionChannel: stream.createStreamChannel<llm.FunctionCall>(),
+      messages: new Map(),
+      _doneFut: new Future(),
+      _createdTimestamp: Date.now(),
+    };
-  // eslint-disable-next-line @typescript-eslint/no-unused-vars
-  #handleSessionUpdated(event: api_proto.SessionUpdatedEvent): void {}
+    const clientEventId = event.response.metadata?.client_event_id;
+    if (!clientEventId) return;
-  // eslint-disable-next-line @typescript-eslint/no-unused-vars
-  #handleConversationCreated(event: api_proto.ConversationCreatedEvent): void {}
+    const pendingHandle = this.responseCreatedFutures[clientEventId];
+    if (pendingHandle) {
+      delete this.responseCreatedFutures[clientEventId];
-  #handleInputAudioBufferCommitted(event: api_proto.InputAudioBufferCommittedEvent): void {
-    this.emit('input_speech_committed', {
-      itemId: event.item_id,
-    } as InputSpeechCommitted);
+      // set key to the response id
+      this.responseCreatedFutures[responseId] = pendingHandle;
+    }
+    // the generation_created event is emitted when
+    // 1. the response is not a message on response.output_item.added event
+    // 2. the content is audio on response.content_part.added event
+    // will try to recover from text response on response.content_part.done event
+    const { messageChannel, functionChannel } = this.currentGeneration;
+    this.emit('generation_created', {
+      messageStream: messageChannel.stream(),
+      functionStream: functionChannel.stream(),
+      userInitiated: false,
+    } as GenerationCreatedEvent);
   }
-  // eslint-disable-next-line @typescript-eslint/no-unused-vars
-  #handleInputAudioBufferCleared(event: api_proto.InputAudioBufferClearedEvent): void {}
+  /**
+   * Handles `response.output_item.added`. For any item that is not a message,
+   * the generation event is emitted right away and the text-mode recovery
+   * counter is reset; message items are left for `response.content_part.added`.
+   *
+   * @throws Error if no generation is active, or the item type or response id is missing
+   */
+  private handleResponseOutputItemAdded(event: api_proto.ResponseOutputItemAddedEvent): void {
+    if (!this.currentGeneration) {
+      throw new Error('currentGeneration is not set');
+    }
-  #handleInputAudioBufferSpeechStarted(
-    // eslint-disable-next-line @typescript-eslint/no-unused-vars
-    event: api_proto.InputAudioBufferSpeechStartedEvent,
-  ): void {
-    this.emit('input_speech_started', {
-      itemId: event.item_id,
-    } as InputSpeechStarted);
+    const { item, response_id: responseId } = event;
+    if (!item.type) {
+      throw new Error('item.type is not set');
+    }
+    if (!responseId) {
+      throw new Error('response_id is not set');
+    }
+    if (item.type === 'message') {
+      // messages wait for response.content_part.added
+      return;
+    }
+    // emit immediately if it's not a message
+    this.emitGenerationEvent(responseId);
+    this.textModeRecoveryRetries = 0;
   }
-  #handleInputAudioBufferSpeechStopped(
-    // eslint-disable-next-line @typescript-eslint/no-unused-vars
-    event: api_proto.InputAudioBufferSpeechStoppedEvent,
-  ): void {
-    this.emit('input_speech_stopped');
+  /**
+   * Mirrors a server-side item creation into `remoteChatCtx` (insert failures
+   * are logged, not thrown) and resolves the pending create future for the
+   * item, if one is registered.
+   *
+   * @throws Error if the item has no id
+   */
+  private handleConversationItemCreated(event: api_proto.ConversationItemCreatedEvent): void {
+    const { item, previous_item_id: previousItemId } = event;
+    if (!item.id) {
+      throw new Error('item.id is not set');
+    }
+    try {
+      this.remoteChatCtx.insert(previousItemId, openAIItemToLivekitItem(item));
+    } catch (error) {
+      this.#logger.error({ error, itemId: item.id }, 'failed to insert conversation item');
+    }
+    const pending = this.itemCreateFutures[item.id];
+    if (pending) {
+      pending.resolve();
+      delete this.itemCreateFutures[item.id];
+    }
   }
-  // eslint-disable-next-line @typescript-eslint/no-unused-vars
-  #handleConversationItemCreated(event: api_proto.ConversationItemCreatedEvent): void {}
+  /**
+   * Mirrors a server-side item deletion into `remoteChatCtx` (delete failures
+   * are logged, not thrown) and resolves the pending delete future for the
+   * item, if one is registered.
+   *
+   * @throws Error if the event has no item id
+   */
+  private handleConversationItemDeleted(event: api_proto.ConversationItemDeletedEvent): void {
+    const itemId = event.item_id;
+    if (!itemId) {
+      throw new Error('item_id is not set');
+    }
+    try {
+      this.remoteChatCtx.delete(itemId);
+    } catch (error) {
+      this.#logger.error({ error, itemId }, 'failed to delete conversation item');
+    }
-  #handleConversationItemInputAudioTranscriptionCompleted(
+    const pending = this.itemDeleteFutures[itemId];
+    if (pending) {
+      pending.resolve();
+      delete this.itemDeleteFutures[itemId];
+    }
+  }
+  /**
+   * Appends the completed transcript to the matching chat message in
+   * `remoteChatCtx` and emits `input_audio_transcription_completed`.
+   * Does nothing when the item is not present in the remote context.
+   *
+   * @throws Error if the matching item is not a chat message
+   */
+  private handleConversationItemInputAudioTranscriptionCompleted(
     event: api_proto.ConversationItemInputAudioTranscriptionCompletedEvent,
   ): void {
-    const transcript = event.transcript;
-    this.emit('input_speech_transcription_completed', {
+    const entry = this.remoteChatCtx.get(event.item_id);
+    if (!entry) {
+      return;
+    }
+    const { item } = entry;
+    if (!(item instanceof llm.ChatMessage)) {
+      throw new Error('item is not a chat message');
+    }
+    item.content.push(event.transcript);
+    this.emit('input_audio_transcription_completed', {
       itemId: event.item_id,
-      transcript: transcript,
-    } as InputSpeechTranscriptionCompleted);
+      transcript: event.transcript,
+      isFinal: true,
+    } as llm.InputTranscriptionCompleted);
   }
-  #handleConversationItemInputAudioTranscriptionFailed(
+  /**
+   * Logs a failed input-audio transcription. The error payload is attached to the log entry;
+   * no event is emitted from this handler.
+   */
+  private handleConversationItemInputAudioTranscriptionFailed(
     event: api_proto.ConversationItemInputAudioTranscriptionFailedEvent,
   ): void {
-    const error = event.error;
-    this.#logger.error(`OpenAI Realtime failed to transcribe input audio: ${error.message}`);
-    this.emit('input_speech_transcription_failed', {
-      itemId: event.item_id,
-      message: error.message,
-    } as InputSpeechTranscriptionFailed);
+    this.#logger.error(
+      { error: event.error },
+      'OpenAI Realtime API failed to transcribe input audio',
+    );
   }
-  // eslint-disable-next-line @typescript-eslint/no-unused-vars
-  #handleConversationItemTruncated(event: api_proto.ConversationItemTruncatedEvent): void {}
-  // eslint-disable-next-line @typescript-eslint/no-unused-vars
-  #handleConversationItemDeleted(event: api_proto.ConversationItemDeletedEvent): void {}
-  #handleResponseCreated(responseCreated: api_proto.ResponseCreatedEvent): void {
-    const response = responseCreated.response;
-    const doneFut = new Future();
-    const newResponse: RealtimeResponse = {
-      id: response.id,
-      status: response.status,
-      statusDetails: response.status_details,
-      usage: null,
-      output: [],
-      doneFut: doneFut,
-      createdTimestamp: Date.now(),
-    };
-    this.#pendingResponses[newResponse.id] = newResponse;
-    this.emit('response_created', newResponse);
-  }
+  /**
+   * Handles a content part being added to a response item.
+   *
+   * For an `audio` part: emits the generation event for the response, resets the text-mode
+   * recovery counter (logging the recovery if it was non-zero), creates the text/audio channels
+   * for the item, publishes their streams on the generation's message channel, registers the
+   * item generation, and stamps the first-token time.
+   *
+   * For any other part type: interrupts, and warns once (only while the recovery counter is 0).
+   *
+   * @throws Error when no generation is in progress.
+   */
+  private handleResponseContentPartAdded(event: api_proto.ResponseContentPartAddedEvent): void {
+    if (!this.currentGeneration) {
+      throw new Error('currentGeneration is not set');
+    }
-  #handleResponseDone(event: api_proto.ResponseDoneEvent): void {
-    const responseData = event.response;
-    const responseId = responseData.id;
-    const response = this.#pendingResponses[responseId]!;
-    response.status = responseData.status;
-    response.statusDetails = responseData.status_details;
-    response.usage = responseData.usage ?? null;
-    this.#pendingResponses[responseId] = response;
-    response.doneFut.resolve();
-    let metricsError: Error | undefined;
-    let cancelled = false;
-    switch (response.status) {
-      case 'failed': {
-        if (response.statusDetails.type !== 'failed') break;
-        const err = response.statusDetails.error;
-        metricsError = new metrics.MultimodalLLMError({
-          type: response.statusDetails.type,
-          code: err?.code,
-          message: err?.message,
-        });
-        this.#logger
-          .child({ code: err?.code, error: err?.message })
-          .error('response generation failed');
-        break;
-      }
-      case 'incomplete': {
-        if (response.statusDetails.type !== 'incomplete') break;
-        const reason = response.statusDetails.reason;
-        metricsError = new metrics.MultimodalLLMError({
-          type: response.statusDetails.type,
-          reason,
-        });
-        this.#logger.child({ reason }).error('response generation incomplete');
-        break;
+    const itemId = event.item_id;
+    // NOTE: despite the name, this is the type of the content part, not of the item.
+    const itemType = event.part.type;
+    const responseId = event.response_id;
+    if (itemType === 'audio') {
+      this.emitGenerationEvent(responseId);
+      if (this.textModeRecoveryRetries > 0) {
+        this.#logger.info(
+          { retries: this.textModeRecoveryRetries },
+          'recovered from text-only response',
+        );
+        this.textModeRecoveryRetries = 0;
       }
-      case 'cancelled': {
-        cancelled = true;
-        break;
+      const itemGeneration: MessageGeneration = {
+        messageId: itemId,
+        textChannel: stream.createStreamChannel<string>(),
+        audioChannel: stream.createStreamChannel<AudioFrame>(),
+        audioTranscript: '',
+      };
+      // Publish the readable ends first, then register the item so later deltas can find it.
+      this.currentGeneration.messageChannel.write({
+        messageId: itemId,
+        textStream: itemGeneration.textChannel.stream(),
+        audioStream: itemGeneration.audioChannel.stream(),
+      });
+      this.currentGeneration.messages.set(itemId, itemGeneration);
+      this.currentGeneration._firstTokenTimestamp = Date.now();
+      return;
+    } else {
+      // NOTE(review): the recovery counter is not incremented in this branch; presumably
+      // interrupt() or another handler drives the retry — confirm.
+      this.interrupt();
+      if (this.textModeRecoveryRetries === 0) {
+        this.#logger.warn({ responseId }, 'received text-only response from OpenAI Realtime API');
       }
     }
-    this.emit('response_done', response);
+  }
-    let ttft: number | undefined;
-    if (response.firstTokenTimestamp) {
-      ttft = response.firstTokenTimestamp - response.createdTimestamp;
+  /**
+   * Handles completion of a content part. Parts that are not of type `text` are ignored; for
+   * `text` parts the handler currently only checks that a generation is in progress (text-mode
+   * recovery is still a TODO).
+   *
+   * @throws Error when a `text` part completes while no generation is in progress.
+   */
+  private handleResponseContentPartDone(event: api_proto.ResponseContentPartDoneEvent): void {
+    if (event.part.type !== 'text') {
+      return;
     }
-    const duration = Date.now() - response.createdTimestamp;
-    const usage = response.usage;
-    const metric: metrics.MultimodalLLMMetrics = {
-      timestamp: response.createdTimestamp,
-      requestId: response.id,
-      ttft: ttft!,
-      duration,
-      cancelled,
-      label: this.constructor.name,
-      completionTokens: usage?.output_tokens || 0,
-      promptTokens: usage?.input_tokens || 0,
-      totalTokens: usage?.total_tokens || 0,
-      tokensPerSecond: ((usage?.output_tokens || 0) / duration) * 1000,
-      error: metricsError,
-      inputTokenDetails: {
-        cachedTokens: usage?.input_token_details.cached_tokens || 0,
-        textTokens: usage?.input_token_details.text_tokens || 0,
-        audioTokens: usage?.input_token_details.audio_tokens || 0,
-      },
-      outputTokenDetails: {
-        textTokens: usage?.output_token_details.text_tokens || 0,
-        audioTokens: usage?.output_token_details.audio_tokens || 0,
-      },
-    };
-    this.emit('metrics_collected', metric);
-  }
+    if (!this.currentGeneration) {
+      throw new Error('currentGeneration is not set');
+    }
-  #handleResponseOutputItemAdded(event: api_proto.ResponseOutputItemAddedEvent): void {
-    const responseId = event.response_id;
-    const response = this.#pendingResponses[responseId];
-    const itemData = event.item;
+    // TODO(shubhra): handle text mode recovery
+  }
-    if (itemData.type !== 'message' && itemData.type !== 'function_call') {
-      throw new Error(`Unexpected item type: ${itemData.type}`);
+  /**
+   * Forwards an audio-transcript delta to the text channel of the item's message generation and
+   * appends it to that generation's accumulated `audioTranscript`.
+   *
+   * @throws Error when no generation is in progress or the item has no registered generation.
+   */
+  private handleResponseAudioTranscriptDelta(
+    event: api_proto.ResponseAudioTranscriptDeltaEvent,
+  ): void {
+    if (!this.currentGeneration) {
+      throw new Error('currentGeneration is not set');
     }
-    let role: api_proto.Role;
-    if (itemData.type === 'function_call') {
-      role = 'assistant'; // function_call doesn't have a role field, defaulting it to assistant
+    const itemId = event.item_id;
+    const delta = event.delta;
+    // TODO (shubhra): add timed string support
+    const itemGeneration = this.currentGeneration.messages.get(itemId);
+    if (!itemGeneration) {
+      throw new Error('itemGeneration is not set');
     } else {
-      role = itemData.role;
+      // Stream the delta to consumers and keep the running transcript for later use.
+      itemGeneration.textChannel.write(delta);
+      itemGeneration.audioTranscript += delta;
     }
+  }
-    const newOutput: RealtimeOutput = {
-      responseId: responseId,
-      itemId: itemData.id,
-      outputIndex: event.output_index,
-      type: itemData.type,
-      role: role,
-      content: [],
-      doneFut: new Future(),
-    };
-    response?.output.push(newOutput);
-    this.emit('response_output_added', newOutput);
+  /**
+   * Decodes a base64 audio delta and writes it, as a single `AudioFrame` of 16-bit samples, to
+   * the audio channel of the message generation identified by `event.item_id`.
+   *
+   * @param event - the audio delta event; `delta` is base64 text, `item_id` selects the message.
+   * @throws Error when no generation is in progress or the item has no registered generation.
+   */
+  private handleResponseAudioDelta(event: api_proto.ResponseAudioDeltaEvent): void {
+    const generation = this.currentGeneration;
+    if (!generation) {
+      throw new Error('currentGeneration is not set');
+    }
+    const itemGeneration = generation.messages.get(event.item_id);
+    if (!itemGeneration) {
+      throw new Error('itemGeneration is not set');
+    }
+    // atob yields one character per decoded byte, so each char code is the byte value.
+    const pcmBytes = Uint8Array.from(atob(event.delta), (ch) => ch.charCodeAt(0));
+    const frame = new AudioFrame(
+      new Int16Array(pcmBytes.buffer),
+      api_proto.SAMPLE_RATE,
+      api_proto.NUM_CHANNELS,
+      pcmBytes.length / 2,
+    );
+    itemGeneration.audioChannel.write(frame);
   }
-  #handleResponseOutputItemDone(event: api_proto.ResponseOutputItemDoneEvent): void {
-    const responseId = event.response_id;
-    const response = this.#pendingResponses[responseId];
-    const outputIndex = event.output_index;
-    const output = response!.output[outputIndex];
+  /**
+   * Checks that a generation is in progress when an audio transcript finishes. The event
+   * payload is not used.
+   *
+   * @throws Error when no generation is in progress.
+   */
+  private handleResponseAudioTranscriptDone(
+    _event: api_proto.ResponseAudioTranscriptDoneEvent,
+  ): void {
+    if (this.currentGeneration) {
+      return;
+    }
+    throw new Error('currentGeneration is not set');
+  }
-    if (output?.type === 'function_call') {
-      if (!this.#fncCtx) {
-        this.#logger.error('function call received but no fncCtx is available');
-        return;
-      }
+  /**
+   * Checks that a generation is in progress when response audio finishes. The event payload is
+   * not used.
+   *
+   * @throws Error when no generation is in progress.
+   */
+  private handleResponseAudioDone(_event: api_proto.ResponseAudioDoneEvent): void {
+    if (this.currentGeneration) {
+      return;
+    }
+    throw new Error('currentGeneration is not set');
+  }
+  /**
+   * Handles completion of a response output item.
+   *
+   * - `function_call`: writes the call (call id, name, raw argument string) to the generation's
+   *   function channel.
+   * - `message`: closes the text and audio channels of the item's message generation, if one
+   *   was registered.
+   *
+   * Other item types are ignored.
+   *
+   * @throws Error when no generation is in progress, or when a `function_call` item is missing
+   *   its call id, name or arguments.
+   */
+  private handleResponseOutputItemDone(event: api_proto.ResponseOutputItemDoneEvent): void {
+    if (!this.currentGeneration) {
+      throw new Error('currentGeneration is not set');
+    }
+    const itemId = event.item.id;
+    const itemType = event.item.type;
-      // parse the arguments and call the function inside the fnc_ctx
+    if (itemType === 'function_call') {
       const item = event.item;
-      if (item.type !== 'function_call') {
-        throw new Error('Expected function_call item');
+      // NOTE(review): a truthiness check also rejects an empty-string `arguments` — confirm
+      // that a call without arguments always arrives as a non-empty string.
+      if (!item.call_id || !item.name || !item.arguments) {
+        throw new Error('item is not a function call');
       }
-      const func = this.#fncCtx[item.name];
-      if (!func) {
-        this.#logger.error(`no function with name ${item.name} in fncCtx`);
+      this.currentGeneration.functionChannel.write({
+        callId: item.call_id,
+        name: item.name,
+        args: item.arguments,
+      } as llm.FunctionCall);
+    } else if (itemType === 'message') {
+      const itemGeneration = this.currentGeneration.messages.get(itemId);
+      if (!itemGeneration) {
         return;
       }
+      // text response doesn't have itemGeneration
+      itemGeneration.textChannel.close();
+      itemGeneration.audioChannel.close();
+    }
+  }
-      this.emit('function_call_started', {
-        callId: item.call_id,
-      });
+  /**
+   * Finalizes the current generation when the response is done: closes every per-message
+   * channel plus the function and message channels, appends each message's accumulated audio
+   * transcript to the matching chat message in the remote chat context, resolves the
+   * generation's done future, clears `currentGeneration`, and emits `metrics_collected` with
+   * token usage and timing taken from the event.
+   *
+   * Does nothing when no generation is in progress.
+   */
+  private handleResponseDone(_event: api_proto.ResponseDoneEvent): void {
+    if (!this.currentGeneration) {
+      // OpenAI has a race condition where we could receive response.done without any
+      // previous response.created (This happens generally during interruption)
+      return;
+    }
-      const parsedArgs = JSON.parse(item.arguments);
+    // Capture the timestamps now: currentGeneration is cleared before metrics are computed.
+    const createdTimestamp = this.currentGeneration._createdTimestamp;
+    const firstTokenTimestamp = this.currentGeneration._firstTokenTimestamp;
-      this.#logger.debug(
-        `[Function Call ${item.call_id}] Executing ${item.name} with arguments ${parsedArgs}`,
-      );
+    this.#logger.debug(
+      {
+        messageCount: this.currentGeneration.messages.size,
+      },
+      'Closing generation channels in handleResponseDone',
+    );
-      func.execute(parsedArgs).then(
-        (content) => {
-          this.#logger.debug(`[Function Call ${item.call_id}] ${item.name} returned ${content}`);
-          this.emit('function_call_completed', {
-            callId: item.call_id,
-          });
-          this.conversation.item.create(
-            llm.ChatMessage.createToolFromFunctionResult({
-              name: item.name,
-              toolCallId: item.call_id,
-              result: content,
-            }),
-            output.itemId,
-          );
-          this.response.create();
-        },
-        (error) => {
-          this.#logger.error(`[Function Call ${item.call_id}] ${item.name} failed with ${error}`);
-          // TODO: send it back up as failed?
-          this.emit('function_call_failed', {
-            callId: item.call_id,
-          });
-        },
-      );
+    // NOTE(review): message channels may already have been closed by
+    // handleResponseOutputItemDone; this presumably relies on close() being safe to repeat.
+    for (const generation of this.currentGeneration.messages.values()) {
+      generation.textChannel.close();
+      generation.audioChannel.close();
     }
-    output?.doneFut.resolve();
-    this.emit('response_output_done', output);
-  }
+    this.currentGeneration.functionChannel.close();
+    this.currentGeneration.messageChannel.close();
-  #handleResponseContentPartAdded(event: api_proto.ResponseContentPartAddedEvent): void {
-    const responseId = event.response_id;
-    const response = this.#pendingResponses[responseId];
-    const outputIndex = event.output_index;
-    const output = response!.output[outputIndex];
+    // Persist each message's final transcript on the mirrored chat message, when one exists.
+    for (const itemId of this.currentGeneration.messages.keys()) {
+      const remoteItem = this.remoteChatCtx.get(itemId);
+      if (remoteItem && remoteItem.item instanceof llm.ChatMessage) {
+        remoteItem.item.content.push(this.currentGeneration.messages.get(itemId)!.audioTranscript);
+      }
+    }
-    const textStream = new AsyncIterableQueue<string>();
-    const audioStream = new AsyncIterableQueue<AudioFrame>();
+    this.currentGeneration._doneFut.resolve();
+    this.currentGeneration = undefined;
-    const newContent: RealtimeContent = {
-      responseId: responseId,
-      itemId: event.item_id,
-      outputIndex: outputIndex,
-      contentIndex: event.content_index,
-      text: '',
-      audio: [],
-      textStream: textStream,
-      audioStream: audioStream,
-      toolCalls: [],
-      contentType: event.part.type,
+    // Calculate and emit metrics
+    const usage = _event.response.usage;
+    // -1 marks "no first token recorded".
+    // NOTE(review): ttft is a difference of two timestamps that are divided by 1000 below to
+    // get seconds, so it is left in milliseconds while duration/timestamp are in seconds —
+    // confirm the unit expected by RealtimeModelMetrics.
+    const ttft = firstTokenTimestamp ? firstTokenTimestamp - createdTimestamp : -1;
+    const duration = (Date.now() - createdTimestamp) / 1000; // Convert to seconds
+    const realtimeMetrics: metrics.RealtimeModelMetrics = {
+      type: 'realtime_model_metrics',
+      timestamp: createdTimestamp / 1000, // Convert to seconds
+      requestId: _event.response.id || '',
+      ttft,
+      duration,
+      cancelled: _event.response.status === 'cancelled',
+      label: 'openai_realtime',
+      inputTokens: usage?.input_tokens ?? 0,
+      outputTokens: usage?.output_tokens ?? 0,
+      totalTokens: usage?.total_tokens ?? 0,
+      // Guard against division by zero when the response completes within the same millisecond.
+      tokensPerSecond: duration > 0 ? (usage?.output_tokens ?? 0) / duration : 0,
+      inputTokenDetails: {
+        audioTokens: usage?.input_token_details?.audio_tokens ?? 0,
+        textTokens: usage?.input_token_details?.text_tokens ?? 0,
+        imageTokens: 0, // Not supported yet
+        cachedTokens: usage?.input_token_details?.cached_tokens ?? 0,
+        cachedTokensDetails: usage?.input_token_details?.cached_tokens_details
+          ? {
+              audioTokens: usage?.input_token_details?.cached_tokens_details?.audio_tokens ?? 0,
+              textTokens: usage?.input_token_details?.cached_tokens_details?.text_tokens ?? 0,
+              imageTokens: usage?.input_token_details?.cached_tokens_details?.image_tokens ?? 0,
+            }
+          : undefined,
+      },
+      outputTokenDetails: {
+        textTokens: usage?.output_token_details?.text_tokens ?? 0,
+        audioTokens: usage?.output_token_details?.audio_tokens ?? 0,
+        imageTokens: 0,
+      },
     };
-    output?.content.push(newContent);
-    response!.firstTokenTimestamp = Date.now();
-    this.emit('response_content_added', newContent);
+    this.emit('metrics_collected', realtimeMetrics);
+    // TODO(brian): handle response done but not complete
   }
-  #handleResponseContentPartDone(event: api_proto.ResponseContentPartDoneEvent): void {
-    const content = this.#getContent(event);
-    this.emit('response_content_done', content);
+  /**
+   * Handles an error event from the API: logs it and re-emits it as a recoverable, retryable
+   * `APIError`. Errors whose message starts with 'Cancellation failed' are ignored.
+   *
+   * @param event - the error event received from the API.
+   */
+  private handleError(event: api_proto.ErrorEvent): void {
+    const { error: apiErrorBody } = event;
+    const isFailedCancellation = apiErrorBody.message.startsWith('Cancellation failed');
+    if (isFailedCancellation) {
+      return;
+    }
+    this.#logger.error({ error: apiErrorBody }, 'OpenAI Realtime API returned an error');
+    const wrapped = new APIError(apiErrorBody.message, {
+      body: apiErrorBody,
+      retryable: true,
+    });
+    this.emitError({ error: wrapped, recoverable: true });
+    // TODO(brian): set error for response future if it exists
   }
-  #handleResponseTextDelta(event: api_proto.ResponseTextDeltaEvent): void {
-    this.emit('response_text_delta', event);
+  /**
+   * Emits an `error` event describing a realtime-model failure.
+   *
+   * Node's EventEmitter throws when `error` is emitted and no listener is attached, which would
+   * turn a reported API error into an uncaught exception inside the event handler. The event is
+   * therefore only emitted when at least one `error` listener is registered; otherwise the
+   * error is logged and dropped.
+   *
+   * @param error - the error to report.
+   * @param recoverable - whether the session can continue after this error.
+   */
+  private emitError({ error, recoverable }: { error: Error; recoverable: boolean }): void {
+    // IMPORTANT: only emit error if there are listeners; otherwise emit will throw an error
+    if (this.listenerCount('error') === 0) {
+      this.#logger.warn({ error, recoverable }, 'no error listener registered, dropping error');
+      return;
+    }
+    this.emit('error', {
+      timestamp: Date.now(),
+      // TODO(brian): add label
+      label: '',
+      error,
+      recoverable,
+    } as llm.RealtimeModelError);
   }
-  #handleResponseTextDone(event: api_proto.ResponseTextDoneEvent): void {
-    const content = this.#getContent(event);
-    content.text = event.text;
-    this.emit('response_text_done', event);
+  /**
+   * Generator that yields the frames to send for an input frame. It currently yields the
+   * input frame unchanged: no resampling is performed here.
+   */
+  private *resampleAudio(frame: AudioFrame): Generator<AudioFrame> {
+    yield frame;
   }
-  #handleResponseAudioTranscriptDelta(event: api_proto.ResponseAudioTranscriptDeltaEvent): void {
-    const content = this.#getContent(event);
-    const transcript = event.delta;
-    content.text += transcript;
+  /**
+   * Sends a `response.create` event and returns the handle tracking it.
+   *
+   * @param userInitiated - when true, the handle is registered in `responseCreatedFutures`
+   *   under the generated event id, and that id is sent as `metadata.client_event_id`.
+   * @param instructions - optional instructions to include in the request; also written onto
+   *   `oldHandle` when one is passed.
+   * @param oldHandle - an existing handle to reuse instead of creating a new one.
+   * @returns the reused or newly created handle.
+   */
+  private createResponse({
+    userInitiated,
+    instructions,
+    oldHandle,
+  }: {
+    userInitiated: boolean;
+    instructions?: string;
+    oldHandle?: CreateResponseHandle;
+  }): CreateResponseHandle {
+    const handle = oldHandle || new CreateResponseHandle({ instructions });
+    if (oldHandle && instructions) {
+      handle.instructions = instructions;
+    }
-    content.textStream.put(transcript);
+    const eventId = shortuuid('response_create_');
+    if (userInitiated) {
+      this.responseCreatedFutures[eventId] = handle;
+    }
+    const response: api_proto.ResponseCreateEvent['response'] = {};
+    if (instructions) response.instructions = instructions;
+    if (userInitiated) response.metadata = { client_event_id: eventId };
+    this.sendEvent({
+      type: 'response.create',
+      event_id: eventId,
+      // Send `undefined` rather than an empty object when nothing was set.
+      response: Object.keys(response).length > 0 ? response : undefined,
+    });
+    return handle;
   }
-  #handleResponseAudioTranscriptDone(event: api_proto.ResponseAudioTranscriptDoneEvent): void {
-    const content = this.#getContent(event);
-    content.textStream.close();
+  /**
+   * Builds the `generation_created` event for the current generation and emits it.
+   *
+   * When a handle is registered in `responseCreatedFutures` under `responseId`, the event is
+   * marked user-initiated, the handle is unregistered, and its done future is resolved with the
+   * event unless it has already completed (in which case a warning is logged).
+   *
+   * @param responseId - key used to look up the pending create-response handle.
+   * @throws Error when no generation is in progress.
+   */
+  private emitGenerationEvent(responseId: string): void {
+    const generation = this.currentGeneration;
+    if (!generation) {
+      throw new Error('currentGeneration is not set');
+    }
+    const pendingHandle = this.responseCreatedFutures[responseId];
+    const generationEvent: llm.GenerationCreatedEvent = {
+      messageStream: generation.messageChannel.stream(),
+      functionStream: generation.functionChannel.stream(),
+      userInitiated: Boolean(pendingHandle),
+    };
+    if (pendingHandle) {
+      delete this.responseCreatedFutures[responseId];
+      if (!pendingHandle.doneFut.done) {
+        pendingHandle.doneFut.resolve(generationEvent);
+      } else {
+        this.#logger.warn({ responseId }, 'response received after timeout');
+      }
+    }
+    this.#logger.debug({ responseId }, 'Emitting generation_created event');
+    this.emit('generation_created', generationEvent);
   }
+}
-  #handleResponseAudioDelta(event: api_proto.ResponseAudioDeltaEvent): void {
-    const content = this.#getContent(event);
-    const data = Buffer.from(event.delta, 'base64');
-    const audio = new AudioFrame(
-      new Int16Array(data.buffer),
-      api_proto.SAMPLE_RATE,
-      api_proto.NUM_CHANNELS,
-      data.length / 2,
-    );
-    content.audio.push(audio);
+/**
+ * Converts a LiveKit chat item into the item shape used by the OpenAI Realtime API.
+ *
+ * - `function_call` / `function_call_output` items are mapped field by field.
+ * - `message` items: the `developer` role is sent as `system`; string content becomes `text`
+ *   (assistant) or `input_text` (other roles); image content is skipped; audio content is only
+ *   forwarded for `user` messages, base64-encoded from the combined frames.
+ *
+ * @param item - the chat item to convert.
+ * @returns the equivalent API item.
+ */
+function livekitItemToOpenAIItem(item: llm.ChatItem): api_proto.ItemResource {
+  switch (item.type) {
+    case 'function_call':
+      return {
+        id: item.id,
+        type: 'function_call',
+        call_id: item.callId,
+        name: item.name,
+        arguments: item.args,
+      } as api_proto.FunctionCallItem;
+    case 'function_call_output':
+      return {
+        id: item.id,
+        type: 'function_call_output',
+        call_id: item.callId,
+        output: item.output,
+      } as api_proto.FunctionCallOutputItem;
+    case 'message': {
+      const role = item.role === 'developer' ? 'system' : item.role;
+      const contentList: api_proto.Content[] = [];
+      for (const c of item.content) {
+        if (typeof c === 'string') {
+          contentList.push({
+            type: role === 'assistant' ? 'text' : 'input_text',
+            text: c,
+          } as api_proto.InputTextContent);
+        } else if (c.type === 'image_content') {
+          // not supported for now
+          continue;
+        } else if (c.type === 'audio_content') {
+          if (role === 'user') {
+            // Encode the raw PCM bytes. Buffer.from(typedArray) would copy element VALUES
+            // truncated to one byte each, corrupting 16-bit samples, so wrap the underlying
+            // ArrayBuffer range instead.
+            const pcm = combineAudioFrames(c.frame).data;
+            const encodedAudio = Buffer.from(pcm.buffer, pcm.byteOffset, pcm.byteLength).toString(
+              'base64',
+            );
+            contentList.push({
+              type: 'input_audio',
+              audio: encodedAudio,
+            } as api_proto.InputAudioContent);
+          }
+        }
+      }
+      return {
+        id: item.id,
+        type: 'message',
+        role,
+        content: contentList,
+      } as api_proto.UserItem;
+    }
+  }
+}
-    content.audioStream.put(audio);
+/**
+ * Converts an OpenAI Realtime API item into the corresponding LiveKit chat item.
+ *
+ * For `message` items only `text` and `input_text` content parts are carried over (as plain
+ * strings); other part types are dropped. Function call outputs are always created with
+ * `isError: false`.
+ *
+ * @param item - the API item; must have an id.
+ * @throws Error when `item.id` is not set.
+ */
+function openAIItemToLivekitItem(item: api_proto.ItemResource): llm.ChatItem {
+  if (!item.id) {
+    throw new Error('item.id is not set');
   }
-  #handleResponseAudioDone(event: api_proto.ResponseAudioDoneEvent): void {
-    const content = this.#getContent(event);
-    content.audioStream.close();
+  switch (item.type) {
+    case 'function_call':
+      return llm.FunctionCall.create({
+        id: item.id,
+        callId: item.call_id,
+        name: item.name,
+        args: item.arguments,
+      });
+    case 'function_call_output':
+      return llm.FunctionCallOutput.create({
+        id: item.id,
+        callId: item.call_id,
+        output: item.output,
+        isError: false,
+      });
+    case 'message':
+      const content: llm.ChatContent[] = [];
+      // item.content can be a single object or an array; normalize to array
+      const contents = Array.isArray(item.content) ? item.content : [item.content];
+      for (const c of contents) {
+        if (c.type === 'text' || c.type === 'input_text') {
+          content.push(c.text);
+        }
+      }
+      return llm.ChatMessage.create({
+        id: item.id,
+        role: item.role,
+        content,
+      });
   }
+}
-  #handleResponseFunctionCallArgumentsDelta(
-    // eslint-disable-next-line @typescript-eslint/no-unused-vars
-    event: api_proto.ResponseFunctionCallArgumentsDeltaEvent,
-  ): void {}
+/**
+ * Builds a user chat message holding a single frame of silent 16-bit audio, with an id
+ * generated from `MOCK_AUDIO_ID_PREFIX`.
+ *
+ * The buffer is sized in samples (rounded to a whole number) rather than bytes, so the frame
+ * really lasts `durationSeconds` and fractional durations cannot produce an odd byte count.
+ *
+ * @param durationSeconds - length of the silent clip in seconds (default 2).
+ * @returns a `user` message whose only content part is the silent audio frame.
+ */
+function createMockAudioItem(durationSeconds: number = 2): llm.ChatMessage {
+  const samplesPerChannel = Math.round(durationSeconds * SAMPLE_RATE);
+  // Int16Array is zero-initialised, i.e. silence.
+  const silence = new Int16Array(samplesPerChannel * NUM_CHANNELS);
+  return llm.ChatMessage.create({
+    id: shortuuid(MOCK_AUDIO_ID_PREFIX),
+    role: 'user',
+    content: [
+      {
+        type: 'audio_content',
+        frame: [new AudioFrame(silence, SAMPLE_RATE, NUM_CHANNELS, samplesPerChannel)],
+      } as llm.AudioContent,
+    ],
+  });
+}
+/**
+ * Maps a LiveKit tool choice onto the API's tool-choice value.
+ *
+ * String choices are passed through unchanged, a `function` choice is reduced to the
+ * function's name, and anything else (including `undefined`) becomes `'auto'`.
+ */
+function toOaiToolChoice(toolChoice?: llm.ToolChoice): api_proto.ToolChoice {
+  if (typeof toolChoice === 'string') {
+    return toolChoice;
+  }
-  #handleResponseFunctionCallArgumentsDone(
-    // eslint-disable-next-line @typescript-eslint/no-unused-vars
-    event: api_proto.ResponseFunctionCallArgumentsDoneEvent,
-  ): void {}
+  if (toolChoice?.type === 'function') {
+    return toolChoice.function.name;
+  }
-  // eslint-disable-next-line @typescript-eslint/no-unused-vars
-  #handleRateLimitsUpdated(event: api_proto.RateLimitsUpdatedEvent): void {}
+  return 'auto';
 }