@livekit/agents-plugin-openai 1.0.17 → 1.0.19

This diff shows the published contents of the two package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -1 +1 @@
- {"version":3,"sources":["../../src/realtime/api_proto.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\n\nexport const SAMPLE_RATE = 24000;\nexport const NUM_CHANNELS = 1;\nexport const IN_FRAME_SIZE = 2400; // 100ms\nexport const OUT_FRAME_SIZE = 1200; // 50ms\n\nexport const BASE_URL = 'wss://api.openai.com/v1';\n\nexport type Model = 'gpt-4o-realtime-preview-2024-10-01' | string; // Open-ended, for future models\nexport type Voice =\n | 'alloy'\n | 'shimmer'\n | 'echo'\n | 'ash'\n | 'ballad'\n | 'coral'\n | 'sage'\n | 'verse'\n | string;\nexport type AudioFormat = 'pcm16'; // TODO: 'g711-ulaw' | 'g711-alaw'\nexport type Role = 'system' | 'assistant' | 'user' | 'tool';\nexport type GenerationFinishedReason = 'stop' | 'max_tokens' | 'content_filter' | 'interrupt';\nexport type InputTranscriptionModel = 'whisper-1' | string; // Open-ended, for future models\nexport type Modality = 'text' | 'audio';\nexport type ToolChoice = 'auto' | 'none' | 'required' | string;\nexport type State = 'initializing' | 'listening' | 'thinking' | 'speaking' | string;\nexport type ResponseStatus =\n | 'in_progress'\n | 'completed'\n | 'incomplete'\n | 'cancelled'\n | 'failed'\n | string;\nexport type ClientEventType =\n | 'session.update'\n | 'input_audio_buffer.append'\n | 'input_audio_buffer.commit'\n | 'input_audio_buffer.clear'\n | 'conversation.item.create'\n | 'conversation.item.truncate'\n | 'conversation.item.delete'\n | 'response.create'\n | 'response.cancel';\nexport type ServerEventType =\n | 'error'\n | 'session.created'\n | 'session.updated'\n | 'conversation.created'\n | 'input_audio_buffer.committed'\n | 'input_audio_buffer.cleared'\n | 'input_audio_buffer.speech_started'\n | 'input_audio_buffer.speech_stopped'\n | 'conversation.item.created'\n | 'conversation.item.input_audio_transcription.completed'\n | 'conversation.item.input_audio_transcription.failed'\n | 'conversation.item.truncated'\n | 'conversation.item.deleted'\n | 'response.created'\n | 'response.done'\n | 'response.output_item.added'\n | 'response.output_item.done'\n | 'response.content_part.added'\n | 'response.content_part.done'\n | 'response.text.delta'\n | 'response.text.done'\n | 'response.audio_transcript.delta'\n | 'response.audio_transcript.done'\n | 'response.audio.delta'\n | 'response.audio.done'\n | 'response.function_call_arguments.delta'\n | 'response.function_call_arguments.done'\n | 'rate_limits.updated';\n\nexport type AudioBase64Bytes = string;\n\nexport interface Tool {\n type: 'function';\n name: string;\n description?: string;\n parameters: {\n type: 'object';\n properties: {\n [prop: string]: {\n [prop: string]: any;\n };\n };\n required: string[];\n };\n}\n\nexport type TurnDetectionType =\n | {\n type: 'semantic_vad';\n eagerness?: 'auto' | 'low' | 'medium' | 'high'; // default: auto\n create_response?: boolean; // default: true\n interrupt_response?: boolean; // default: true\n }\n | {\n type: 'server_vad';\n threshold?: number; // 0.0 to 1.0, default: 0.5\n prefix_padding_ms?: number; // default: 300\n silence_duration_ms?: number; // default: 200\n create_response?: boolean; // default: true\n interrupt_response?: boolean; // default: true\n };\n\nexport type InputAudioTranscription = {\n model: InputTranscriptionModel;\n language?: string;\n prompt?: string;\n};\n\nexport interface InputTextContent {\n type: 'input_text';\n text: string;\n}\n\nexport interface InputAudioContent {\n type: 'input_audio';\n audio: AudioBase64Bytes;\n}\n\nexport 
interface TextContent {\n type: 'text';\n text: string;\n}\n\nexport interface AudioContent {\n type: 'audio';\n audio: AudioBase64Bytes;\n transcript: string;\n}\n\nexport type Content = InputTextContent | InputAudioContent | TextContent | AudioContent;\nexport type ContentPart = {\n type: 'text' | 'audio';\n audio?: AudioBase64Bytes;\n transcript?: string;\n};\n\nexport interface BaseItem {\n id: string;\n object: 'realtime.item';\n type: string;\n}\n\nexport interface SystemItem extends BaseItem {\n type: 'message';\n role: 'system';\n content: InputTextContent;\n}\n\nexport interface UserItem extends BaseItem {\n type: 'message';\n role: 'user';\n content: (InputTextContent | InputAudioContent)[];\n}\n\nexport interface AssistantItem extends BaseItem {\n type: 'message';\n role: 'assistant';\n content: (TextContent | AudioContent)[];\n}\n\nexport interface FunctionCallItem extends BaseItem {\n type: 'function_call';\n call_id: string;\n name: string;\n arguments: string;\n}\n\nexport interface FunctionCallOutputItem extends BaseItem {\n type: 'function_call_output';\n call_id: string;\n output: string;\n}\n\nexport type ItemResource =\n | SystemItem\n | UserItem\n | AssistantItem\n | FunctionCallItem\n | FunctionCallOutputItem;\n\n// Session Resource\nexport interface SessionResource {\n id: string;\n object: 'realtime.session';\n model: string;\n modalities: ['text', 'audio'] | ['text']; // default: [\"text\", \"audio\"]\n instructions: string;\n voice: Voice; // default: \"alloy\"\n input_audio_format: AudioFormat; // default: \"pcm16\"\n output_audio_format: AudioFormat; // default: \"pcm16\"\n input_audio_transcription: InputAudioTranscription | null;\n turn_detection: TurnDetectionType | null;\n tools: Tool[];\n tool_choice: ToolChoice; // default: \"auto\"\n temperature: number; // default: 0.8\n max_response_output_tokens: number | 'inf';\n expires_at: number;\n}\n\n// Conversation Resource\nexport interface ConversationResource {\n id: string;\n object: 'realtime.conversation';\n}\n\nexport type ResponseStatusDetails =\n | {\n type: 'incomplete';\n reason: 'max_output_tokens' | 'content_filter' | string;\n }\n | {\n type: 'failed';\n error?: {\n code: 'server_error' | 'rate_limit_exceeded' | string;\n message: string;\n };\n }\n | {\n type: 'cancelled';\n reason: 'turn_detected' | 'client_cancelled' | string;\n };\n\nexport interface ModelUsage {\n total_tokens: number;\n input_tokens: number;\n output_tokens: number;\n input_token_details: {\n text_tokens: number;\n audio_tokens: number;\n cached_tokens: number;\n cached_tokens_details: {\n text_tokens: number;\n audio_tokens: number;\n image_tokens: number;\n };\n };\n output_token_details: {\n text_tokens: number;\n audio_tokens: number;\n };\n}\n\nexport interface ResponseResource {\n id: string;\n object: 'realtime.response';\n status: ResponseStatus;\n status_details: ResponseStatusDetails;\n output: ItemResource[];\n usage?: ModelUsage;\n metadata?: Record<string, string>;\n}\n\n// Client Events\ninterface BaseClientEvent {\n event_id?: string;\n type: ClientEventType;\n}\n\nexport interface SessionUpdateEvent extends BaseClientEvent {\n type: 'session.update';\n session: Partial<{\n model: Model;\n modalities: ['text', 'audio'] | ['text'];\n instructions: string;\n voice: Voice;\n input_audio_format: AudioFormat;\n output_audio_format: AudioFormat;\n input_audio_transcription: InputAudioTranscription | null;\n turn_detection: TurnDetectionType | null;\n tools: Tool[];\n tool_choice: ToolChoice;\n temperature: number;\n 
max_response_output_tokens?: number | 'inf';\n speed?: number;\n }>;\n}\n\nexport interface InputAudioBufferAppendEvent extends BaseClientEvent {\n type: 'input_audio_buffer.append';\n audio: AudioBase64Bytes;\n}\n\nexport interface InputAudioBufferCommitEvent extends BaseClientEvent {\n type: 'input_audio_buffer.commit';\n}\n\nexport interface InputAudioBufferClearEvent extends BaseClientEvent {\n type: 'input_audio_buffer.clear';\n}\n\nexport interface UserItemCreate {\n id: string;\n type: 'message';\n role: 'user';\n content: (InputTextContent | InputAudioContent)[];\n}\n\nexport interface AssistantItemCreate {\n id: string;\n type: 'message';\n role: 'assistant';\n content: TextContent[];\n}\n\nexport interface SystemItemCreate {\n id: string;\n type: 'message';\n role: 'system';\n content: InputTextContent[];\n}\n\nexport interface FunctionCallOutputItemCreate {\n id: string;\n type: 'function_call_output';\n call_id: string;\n output: string;\n}\n\nexport type ConversationItemCreateContent =\n | UserItemCreate\n | AssistantItemCreate\n | SystemItemCreate\n | FunctionCallOutputItemCreate;\n\nexport interface ConversationItemCreateEvent extends BaseClientEvent {\n type: 'conversation.item.create';\n previous_item_id?: string;\n item: ConversationItemCreateContent;\n}\n\nexport interface ConversationItemTruncateEvent extends BaseClientEvent {\n type: 'conversation.item.truncate';\n item_id: string;\n content_index: number;\n audio_end_ms: number;\n}\n\nexport interface ConversationItemDeleteEvent extends BaseClientEvent {\n type: 'conversation.item.delete';\n item_id: string;\n}\n\nexport interface ResponseCreateEvent extends BaseClientEvent {\n type: 'response.create';\n response?: Partial<{\n modalities: ['text', 'audio'] | ['text'];\n instructions: string;\n voice: Voice;\n output_audio_format: AudioFormat;\n tools?: Tool[];\n tool_choice: ToolChoice;\n temperature: number;\n max_output_tokens: number | 'inf';\n metadata?: Record<string, string>;\n }>;\n}\n\nexport interface ResponseCancelEvent extends BaseClientEvent {\n type: 'response.cancel';\n}\n\nexport type ClientEvent =\n | SessionUpdateEvent\n | InputAudioBufferAppendEvent\n | InputAudioBufferCommitEvent\n | InputAudioBufferClearEvent\n | ConversationItemCreateEvent\n | ConversationItemTruncateEvent\n | ConversationItemDeleteEvent\n | ResponseCreateEvent\n | ResponseCancelEvent;\n\ninterface BaseServerEvent {\n event_id: string;\n type: ServerEventType;\n}\n\nexport interface ErrorEvent extends BaseServerEvent {\n type: 'error';\n error: {\n type: 'invalid_request_error' | 'server_error' | string;\n code?: string;\n message: string;\n param: string;\n event_id: string;\n };\n}\n\nexport interface SessionCreatedEvent extends BaseServerEvent {\n type: 'session.created';\n session: SessionResource;\n}\n\nexport interface SessionUpdatedEvent extends BaseServerEvent {\n type: 'session.updated';\n session: SessionResource;\n}\n\nexport interface ConversationCreatedEvent extends BaseServerEvent {\n type: 'conversation.created';\n conversation: ConversationResource;\n}\n\nexport interface InputAudioBufferCommittedEvent extends BaseServerEvent {\n type: 'input_audio_buffer.committed';\n item_id: string;\n}\n\nexport interface InputAudioBufferClearedEvent extends BaseServerEvent {\n type: 'input_audio_buffer.cleared';\n}\n\nexport interface InputAudioBufferSpeechStartedEvent extends BaseServerEvent {\n type: 'input_audio_buffer.speech_started';\n audio_start_ms: number;\n item_id: string;\n}\n\nexport interface 
InputAudioBufferSpeechStoppedEvent extends BaseServerEvent {\n type: 'input_audio_buffer.speech_stopped';\n audio_end_ms: number;\n item_id: string;\n}\n\nexport interface ConversationItemCreatedEvent extends BaseServerEvent {\n type: 'conversation.item.created';\n previous_item_id: string;\n item: ItemResource;\n}\n\nexport interface ConversationItemInputAudioTranscriptionCompletedEvent extends BaseServerEvent {\n type: 'conversation.item.input_audio_transcription.completed';\n item_id: string;\n content_index: number;\n transcript: string;\n}\n\nexport interface ConversationItemInputAudioTranscriptionFailedEvent extends BaseServerEvent {\n type: 'conversation.item.input_audio_transcription.failed';\n item_id: string;\n content_index: number;\n error: {\n type: string;\n code?: string;\n message: string;\n param: null;\n };\n}\n\nexport interface ConversationItemTruncatedEvent extends BaseServerEvent {\n type: 'conversation.item.truncated';\n item_id: string;\n content_index: number;\n audio_end_ms: number;\n}\n\nexport interface ConversationItemDeletedEvent extends BaseServerEvent {\n type: 'conversation.item.deleted';\n item_id: string;\n}\n\nexport interface ResponseCreatedEvent extends BaseServerEvent {\n type: 'response.created';\n response: ResponseResource;\n}\n\nexport interface ResponseDoneEvent extends BaseServerEvent {\n type: 'response.done';\n response: ResponseResource;\n}\n\nexport interface ResponseOutputItemAddedEvent extends BaseServerEvent {\n type: 'response.output_item.added';\n response_id: string;\n output_index: number;\n item: ItemResource;\n}\n\nexport interface ResponseOutputItemDoneEvent extends BaseServerEvent {\n type: 'response.output_item.done';\n response_id: string;\n output_index: number;\n item: ItemResource;\n}\n\nexport interface ResponseContentPartAddedEvent extends BaseServerEvent {\n type: 'response.content_part.added';\n response_id: string;\n item_id: string;\n output_index: number;\n content_index: number;\n part: ContentPart;\n}\n\nexport interface ResponseContentPartDoneEvent extends BaseServerEvent {\n type: 'response.content_part.done';\n response_id: string;\n item_id: string;\n output_index: number;\n content_index: number;\n part: ContentPart;\n}\n\nexport interface ResponseTextDeltaEvent extends BaseServerEvent {\n type: 'response.text.delta';\n response_id: string;\n output_index: number;\n content_index: number;\n delta: string;\n}\n\nexport interface ResponseTextDoneEvent extends BaseServerEvent {\n type: 'response.text.done';\n response_id: string;\n output_index: number;\n content_index: number;\n text: string;\n}\n\nexport interface ResponseAudioTranscriptDeltaEvent extends BaseServerEvent {\n type: 'response.audio_transcript.delta';\n response_id: string;\n item_id: string;\n output_index: number;\n content_index: number;\n delta: string;\n}\n\nexport interface ResponseAudioTranscriptDoneEvent extends BaseServerEvent {\n type: 'response.audio_transcript.done';\n response_id: string;\n output_index: number;\n content_index: number;\n transcript: string;\n}\n\nexport interface ResponseAudioDeltaEvent extends BaseServerEvent {\n type: 'response.audio.delta';\n response_id: string;\n item_id: string;\n output_index: number;\n content_index: number;\n delta: AudioBase64Bytes;\n}\n\nexport interface ResponseAudioDoneEvent extends BaseServerEvent {\n type: 'response.audio.done';\n response_id: string;\n output_index: number;\n content_index: number;\n}\n\nexport interface ResponseFunctionCallArgumentsDeltaEvent extends BaseServerEvent 
{\n type: 'response.function_call_arguments.delta';\n response_id: string;\n output_index: number;\n delta: string;\n}\n\nexport interface ResponseFunctionCallArgumentsDoneEvent extends BaseServerEvent {\n type: 'response.function_call_arguments.done';\n response_id: string;\n output_index: number;\n arguments: string;\n}\n\nexport interface RateLimitsUpdatedEvent extends BaseServerEvent {\n type: 'rate_limits.updated';\n rate_limits: {\n name: 'requests' | 'tokens' | 'input_tokens' | 'output_tokens' | string;\n limit: number;\n remaining: number;\n reset_seconds: number;\n }[];\n}\n\nexport type ServerEvent =\n | ErrorEvent\n | SessionCreatedEvent\n | SessionUpdatedEvent\n | ConversationCreatedEvent\n | InputAudioBufferCommittedEvent\n | InputAudioBufferClearedEvent\n | InputAudioBufferSpeechStartedEvent\n | InputAudioBufferSpeechStoppedEvent\n | ConversationItemCreatedEvent\n | ConversationItemInputAudioTranscriptionCompletedEvent\n | ConversationItemInputAudioTranscriptionFailedEvent\n | ConversationItemTruncatedEvent\n | ConversationItemDeletedEvent\n | ResponseCreatedEvent\n | ResponseDoneEvent\n | ResponseOutputItemAddedEvent\n | ResponseOutputItemDoneEvent\n | ResponseContentPartAddedEvent\n | ResponseContentPartDoneEvent\n | ResponseTextDeltaEvent\n | ResponseTextDoneEvent\n | ResponseAudioTranscriptDeltaEvent\n | ResponseAudioTranscriptDoneEvent\n | ResponseAudioDeltaEvent\n | ResponseAudioDoneEvent\n | ResponseFunctionCallArgumentsDeltaEvent\n | ResponseFunctionCallArgumentsDoneEvent\n | RateLimitsUpdatedEvent;\n"],"mappings":"AAIO,MAAM,cAAc;AACpB,MAAM,eAAe;AACrB,MAAM,gBAAgB;AACtB,MAAM,iBAAiB;AAEvB,MAAM,WAAW;","names":[]}
+ {"version":3,"sources":["../../src/realtime/api_proto.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\n\nexport const SAMPLE_RATE = 24000;\nexport const NUM_CHANNELS = 1;\nexport const IN_FRAME_SIZE = 2400; // 100ms\nexport const OUT_FRAME_SIZE = 1200; // 50ms\n\nexport const BASE_URL = 'wss://api.openai.com/v1';\n\nexport type Model = 'gpt-4o-realtime-preview-2024-10-01' | string; // Open-ended, for future models\nexport type Voice =\n | 'alloy'\n | 'shimmer'\n | 'echo'\n | 'ash'\n | 'ballad'\n | 'coral'\n | 'sage'\n | 'verse'\n | string;\nexport type AudioFormat = 'pcm16'; // TODO: 'g711-ulaw' | 'g711-alaw'\nexport type Role = 'system' | 'assistant' | 'user' | 'tool';\nexport type GenerationFinishedReason = 'stop' | 'max_tokens' | 'content_filter' | 'interrupt';\nexport type InputTranscriptionModel = 'whisper-1' | string; // Open-ended, for future models\nexport type Modality = 'text' | 'audio';\nexport type ToolChoice = 'auto' | 'none' | 'required' | string;\nexport type State = 'initializing' | 'listening' | 'thinking' | 'speaking' | string;\nexport type ResponseStatus =\n | 'in_progress'\n | 'completed'\n | 'incomplete'\n | 'cancelled'\n | 'failed'\n | string;\nexport type ClientEventType =\n | 'session.update'\n | 'input_audio_buffer.append'\n | 'input_audio_buffer.commit'\n | 'input_audio_buffer.clear'\n | 'conversation.item.create'\n | 'conversation.item.truncate'\n | 'conversation.item.delete'\n | 'response.create'\n | 'response.cancel';\nexport type ServerEventType =\n | 'error'\n | 'session.created'\n | 'session.updated'\n | 'conversation.created'\n | 'input_audio_buffer.committed'\n | 'input_audio_buffer.cleared'\n | 'input_audio_buffer.speech_started'\n | 'input_audio_buffer.speech_stopped'\n | 'conversation.item.created'\n | 'conversation.item.input_audio_transcription.completed'\n | 'conversation.item.input_audio_transcription.failed'\n | 'conversation.item.truncated'\n | 'conversation.item.deleted'\n | 'response.created'\n | 'response.done'\n | 'response.output_item.added'\n | 'response.output_item.done'\n | 'response.content_part.added'\n | 'response.content_part.done'\n | 'response.text.delta'\n | 'response.text.done'\n | 'response.audio_transcript.delta'\n | 'response.audio_transcript.done'\n | 'response.audio.delta'\n | 'response.audio.done'\n | 'response.function_call_arguments.delta'\n | 'response.function_call_arguments.done'\n | 'rate_limits.updated';\n\nexport type AudioBase64Bytes = string;\n\nexport interface Tool {\n type: 'function';\n name: string;\n description?: string;\n parameters: {\n type: 'object';\n properties: {\n [prop: string]: {\n [prop: string]: any;\n };\n };\n required: string[];\n };\n}\n\nexport type TurnDetectionType =\n | {\n type: 'semantic_vad';\n eagerness?: 'auto' | 'low' | 'medium' | 'high'; // default: auto\n create_response?: boolean; // default: true\n interrupt_response?: boolean; // default: true\n }\n | {\n type: 'server_vad';\n threshold?: number; // 0.0 to 1.0, default: 0.5\n prefix_padding_ms?: number; // default: 300\n silence_duration_ms?: number; // default: 200\n create_response?: boolean; // default: true\n interrupt_response?: boolean; // default: true\n };\n\nexport type InputAudioTranscription = {\n model: InputTranscriptionModel;\n language?: string;\n prompt?: string;\n};\n\nexport interface InputTextContent {\n type: 'input_text';\n text: string;\n}\n\nexport interface InputAudioContent {\n type: 'input_audio';\n audio: AudioBase64Bytes;\n}\n\nexport 
interface TextContent {\n type: 'text';\n text: string;\n}\n\nexport interface AudioContent {\n type: 'audio';\n audio: AudioBase64Bytes;\n transcript: string;\n}\n\nexport type Content = InputTextContent | InputAudioContent | TextContent | AudioContent;\nexport type ContentPart = {\n type: 'text' | 'audio';\n audio?: AudioBase64Bytes;\n transcript?: string;\n};\n\nexport interface BaseItem {\n id: string;\n object: 'realtime.item';\n type: string;\n}\n\nexport interface SystemItem extends BaseItem {\n type: 'message';\n role: 'system';\n content: InputTextContent;\n}\n\nexport interface UserItem extends BaseItem {\n type: 'message';\n role: 'user';\n content: (InputTextContent | InputAudioContent)[];\n}\n\nexport interface AssistantItem extends BaseItem {\n type: 'message';\n role: 'assistant';\n content: (TextContent | AudioContent)[];\n}\n\nexport interface FunctionCallItem extends BaseItem {\n type: 'function_call';\n call_id: string;\n name: string;\n arguments: string;\n}\n\nexport interface FunctionCallOutputItem extends BaseItem {\n type: 'function_call_output';\n call_id: string;\n output: string;\n}\n\nexport type ItemResource =\n | SystemItem\n | UserItem\n | AssistantItem\n | FunctionCallItem\n | FunctionCallOutputItem;\n\n// Session Resource\nexport interface SessionResource {\n id: string;\n object: 'realtime.session';\n model: string;\n modalities: Modality[]; // default: [\"text\", \"audio\"]\n instructions: string;\n voice: Voice; // default: \"alloy\"\n input_audio_format: AudioFormat; // default: \"pcm16\"\n output_audio_format: AudioFormat; // default: \"pcm16\"\n input_audio_transcription: InputAudioTranscription | null;\n turn_detection: TurnDetectionType | null;\n tools: Tool[];\n tool_choice: ToolChoice; // default: \"auto\"\n temperature: number; // default: 0.8\n max_response_output_tokens: number | 'inf';\n expires_at: number;\n}\n\n// Conversation Resource\nexport interface ConversationResource {\n id: string;\n object: 'realtime.conversation';\n}\n\nexport type ResponseStatusDetails =\n | {\n type: 'incomplete';\n reason: 'max_output_tokens' | 'content_filter' | string;\n }\n | {\n type: 'failed';\n error?: {\n code: 'server_error' | 'rate_limit_exceeded' | string;\n message: string;\n };\n }\n | {\n type: 'cancelled';\n reason: 'turn_detected' | 'client_cancelled' | string;\n };\n\nexport interface ModelUsage {\n total_tokens: number;\n input_tokens: number;\n output_tokens: number;\n input_token_details: {\n text_tokens: number;\n audio_tokens: number;\n cached_tokens: number;\n cached_tokens_details: {\n text_tokens: number;\n audio_tokens: number;\n image_tokens: number;\n };\n };\n output_token_details: {\n text_tokens: number;\n audio_tokens: number;\n };\n}\n\nexport interface ResponseResource {\n id: string;\n object: 'realtime.response';\n status: ResponseStatus;\n status_details: ResponseStatusDetails;\n output: ItemResource[];\n usage?: ModelUsage;\n metadata?: Record<string, string>;\n}\n\n// Client Events\ninterface BaseClientEvent {\n event_id?: string;\n type: ClientEventType;\n}\n\nexport interface SessionUpdateEvent extends BaseClientEvent {\n type: 'session.update';\n session: Partial<{\n model: Model;\n modalities: Modality[];\n instructions: string;\n voice: Voice;\n input_audio_format: AudioFormat;\n output_audio_format: AudioFormat;\n input_audio_transcription: InputAudioTranscription | null;\n turn_detection: TurnDetectionType | null;\n tools: Tool[];\n tool_choice: ToolChoice;\n temperature: number;\n max_response_output_tokens?: number 
| 'inf';\n speed?: number;\n }>;\n}\n\nexport interface InputAudioBufferAppendEvent extends BaseClientEvent {\n type: 'input_audio_buffer.append';\n audio: AudioBase64Bytes;\n}\n\nexport interface InputAudioBufferCommitEvent extends BaseClientEvent {\n type: 'input_audio_buffer.commit';\n}\n\nexport interface InputAudioBufferClearEvent extends BaseClientEvent {\n type: 'input_audio_buffer.clear';\n}\n\nexport interface UserItemCreate {\n id: string;\n type: 'message';\n role: 'user';\n content: (InputTextContent | InputAudioContent)[];\n}\n\nexport interface AssistantItemCreate {\n id: string;\n type: 'message';\n role: 'assistant';\n content: TextContent[];\n}\n\nexport interface SystemItemCreate {\n id: string;\n type: 'message';\n role: 'system';\n content: InputTextContent[];\n}\n\nexport interface FunctionCallOutputItemCreate {\n id: string;\n type: 'function_call_output';\n call_id: string;\n output: string;\n}\n\nexport type ConversationItemCreateContent =\n | UserItemCreate\n | AssistantItemCreate\n | SystemItemCreate\n | FunctionCallOutputItemCreate;\n\nexport interface ConversationItemCreateEvent extends BaseClientEvent {\n type: 'conversation.item.create';\n previous_item_id?: string;\n item: ConversationItemCreateContent;\n}\n\nexport interface ConversationItemTruncateEvent extends BaseClientEvent {\n type: 'conversation.item.truncate';\n item_id: string;\n content_index: number;\n audio_end_ms: number;\n}\n\nexport interface ConversationItemDeleteEvent extends BaseClientEvent {\n type: 'conversation.item.delete';\n item_id: string;\n}\n\nexport interface ResponseCreateEvent extends BaseClientEvent {\n type: 'response.create';\n response?: Partial<{\n modalities: Modality[];\n instructions: string;\n voice: Voice;\n output_audio_format: AudioFormat;\n tools?: Tool[];\n tool_choice: ToolChoice;\n temperature: number;\n max_output_tokens: number | 'inf';\n metadata?: Record<string, string>;\n }>;\n}\n\nexport interface ResponseCancelEvent extends BaseClientEvent {\n type: 'response.cancel';\n}\n\nexport type ClientEvent =\n | SessionUpdateEvent\n | InputAudioBufferAppendEvent\n | InputAudioBufferCommitEvent\n | InputAudioBufferClearEvent\n | ConversationItemCreateEvent\n | ConversationItemTruncateEvent\n | ConversationItemDeleteEvent\n | ResponseCreateEvent\n | ResponseCancelEvent;\n\ninterface BaseServerEvent {\n event_id: string;\n type: ServerEventType;\n}\n\nexport interface ErrorEvent extends BaseServerEvent {\n type: 'error';\n error: {\n type: 'invalid_request_error' | 'server_error' | string;\n code?: string;\n message: string;\n param: string;\n event_id: string;\n };\n}\n\nexport interface SessionCreatedEvent extends BaseServerEvent {\n type: 'session.created';\n session: SessionResource;\n}\n\nexport interface SessionUpdatedEvent extends BaseServerEvent {\n type: 'session.updated';\n session: SessionResource;\n}\n\nexport interface ConversationCreatedEvent extends BaseServerEvent {\n type: 'conversation.created';\n conversation: ConversationResource;\n}\n\nexport interface InputAudioBufferCommittedEvent extends BaseServerEvent {\n type: 'input_audio_buffer.committed';\n item_id: string;\n}\n\nexport interface InputAudioBufferClearedEvent extends BaseServerEvent {\n type: 'input_audio_buffer.cleared';\n}\n\nexport interface InputAudioBufferSpeechStartedEvent extends BaseServerEvent {\n type: 'input_audio_buffer.speech_started';\n audio_start_ms: number;\n item_id: string;\n}\n\nexport interface InputAudioBufferSpeechStoppedEvent extends BaseServerEvent {\n type: 
'input_audio_buffer.speech_stopped';\n audio_end_ms: number;\n item_id: string;\n}\n\nexport interface ConversationItemCreatedEvent extends BaseServerEvent {\n type: 'conversation.item.created';\n previous_item_id: string;\n item: ItemResource;\n}\n\nexport interface ConversationItemInputAudioTranscriptionCompletedEvent extends BaseServerEvent {\n type: 'conversation.item.input_audio_transcription.completed';\n item_id: string;\n content_index: number;\n transcript: string;\n}\n\nexport interface ConversationItemInputAudioTranscriptionFailedEvent extends BaseServerEvent {\n type: 'conversation.item.input_audio_transcription.failed';\n item_id: string;\n content_index: number;\n error: {\n type: string;\n code?: string;\n message: string;\n param: null;\n };\n}\n\nexport interface ConversationItemTruncatedEvent extends BaseServerEvent {\n type: 'conversation.item.truncated';\n item_id: string;\n content_index: number;\n audio_end_ms: number;\n}\n\nexport interface ConversationItemDeletedEvent extends BaseServerEvent {\n type: 'conversation.item.deleted';\n item_id: string;\n}\n\nexport interface ResponseCreatedEvent extends BaseServerEvent {\n type: 'response.created';\n response: ResponseResource;\n}\n\nexport interface ResponseDoneEvent extends BaseServerEvent {\n type: 'response.done';\n response: ResponseResource;\n}\n\nexport interface ResponseOutputItemAddedEvent extends BaseServerEvent {\n type: 'response.output_item.added';\n response_id: string;\n output_index: number;\n item: ItemResource;\n}\n\nexport interface ResponseOutputItemDoneEvent extends BaseServerEvent {\n type: 'response.output_item.done';\n response_id: string;\n output_index: number;\n item: ItemResource;\n}\n\nexport interface ResponseContentPartAddedEvent extends BaseServerEvent {\n type: 'response.content_part.added';\n response_id: string;\n item_id: string;\n output_index: number;\n content_index: number;\n part: ContentPart;\n}\n\nexport interface ResponseContentPartDoneEvent extends BaseServerEvent {\n type: 'response.content_part.done';\n response_id: string;\n item_id: string;\n output_index: number;\n content_index: number;\n part: ContentPart;\n}\n\nexport interface ResponseTextDeltaEvent extends BaseServerEvent {\n type: 'response.text.delta';\n response_id: string;\n item_id: string;\n output_index: number;\n content_index: number;\n delta: string;\n}\n\nexport interface ResponseTextDoneEvent extends BaseServerEvent {\n type: 'response.text.done';\n response_id: string;\n item_id: string;\n output_index: number;\n content_index: number;\n text: string;\n}\n\nexport interface ResponseAudioTranscriptDeltaEvent extends BaseServerEvent {\n type: 'response.audio_transcript.delta';\n response_id: string;\n item_id: string;\n output_index: number;\n content_index: number;\n delta: string;\n}\n\nexport interface ResponseAudioTranscriptDoneEvent extends BaseServerEvent {\n type: 'response.audio_transcript.done';\n response_id: string;\n output_index: number;\n content_index: number;\n transcript: string;\n}\n\nexport interface ResponseAudioDeltaEvent extends BaseServerEvent {\n type: 'response.audio.delta';\n response_id: string;\n item_id: string;\n output_index: number;\n content_index: number;\n delta: AudioBase64Bytes;\n}\n\nexport interface ResponseAudioDoneEvent extends BaseServerEvent {\n type: 'response.audio.done';\n response_id: string;\n output_index: number;\n content_index: number;\n}\n\nexport interface ResponseFunctionCallArgumentsDeltaEvent extends BaseServerEvent {\n type: 
'response.function_call_arguments.delta';\n response_id: string;\n output_index: number;\n delta: string;\n}\n\nexport interface ResponseFunctionCallArgumentsDoneEvent extends BaseServerEvent {\n type: 'response.function_call_arguments.done';\n response_id: string;\n output_index: number;\n arguments: string;\n}\n\nexport interface RateLimitsUpdatedEvent extends BaseServerEvent {\n type: 'rate_limits.updated';\n rate_limits: {\n name: 'requests' | 'tokens' | 'input_tokens' | 'output_tokens' | string;\n limit: number;\n remaining: number;\n reset_seconds: number;\n }[];\n}\n\nexport type ServerEvent =\n | ErrorEvent\n | SessionCreatedEvent\n | SessionUpdatedEvent\n | ConversationCreatedEvent\n | InputAudioBufferCommittedEvent\n | InputAudioBufferClearedEvent\n | InputAudioBufferSpeechStartedEvent\n | InputAudioBufferSpeechStoppedEvent\n | ConversationItemCreatedEvent\n | ConversationItemInputAudioTranscriptionCompletedEvent\n | ConversationItemInputAudioTranscriptionFailedEvent\n | ConversationItemTruncatedEvent\n | ConversationItemDeletedEvent\n | ResponseCreatedEvent\n | ResponseDoneEvent\n | ResponseOutputItemAddedEvent\n | ResponseOutputItemDoneEvent\n | ResponseContentPartAddedEvent\n | ResponseContentPartDoneEvent\n | ResponseTextDeltaEvent\n | ResponseTextDoneEvent\n | ResponseAudioTranscriptDeltaEvent\n | ResponseAudioTranscriptDoneEvent\n | ResponseAudioDeltaEvent\n | ResponseAudioDoneEvent\n | ResponseFunctionCallArgumentsDeltaEvent\n | ResponseFunctionCallArgumentsDoneEvent\n | RateLimitsUpdatedEvent;\n"],"mappings":"AAIO,MAAM,cAAc;AACpB,MAAM,eAAe;AACrB,MAAM,gBAAgB;AACtB,MAAM,iBAAiB;AAEvB,MAAM,WAAW;","names":[]}
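In the sourcemap hunk above, the embedded TypeScript source changes in two ways: the `modalities` fields on `SessionResource`, `SessionUpdateEvent`, and `ResponseCreateEvent` widen from the closed tuple union `['text', 'audio'] | ['text']` to `Modality[]`, and `ResponseTextDeltaEvent`/`ResponseTextDoneEvent` gain an `item_id` field so text deltas can be routed to the message they belong to. A minimal before/after sketch of the type change:

type Modality = 'text' | 'audio';

// 1.0.17: only these two exact shapes were accepted.
type ModalitiesOld = ['text', 'audio'] | ['text'];

// 1.0.19: any combination of modality values is accepted.
type ModalitiesNew = Modality[];

// 1.0.19 also adds item_id to the text streaming events:
interface ResponseTextDeltaEvent {
  type: 'response.text.delta';
  response_id: string;
  item_id: string; // new in 1.0.19
  output_index: number;
  content_index: number;
  delta: string;
}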
@@ -84,7 +84,8 @@ const DEFAULT_REALTIME_MODEL_OPTIONS = {
  toolChoice: DEFAULT_TOOL_CHOICE,
  maxResponseOutputTokens: DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS,
  maxSessionDuration: DEFAULT_MAX_SESSION_DURATION,
- connOptions: import_agents.DEFAULT_API_CONNECT_OPTIONS
+ connOptions: import_agents.DEFAULT_API_CONNECT_OPTIONS,
+ modalities: ["text", "audio"]
  };
  class RealtimeModel extends import_agents.llm.RealtimeModel {
  sampleRate = api_proto.SAMPLE_RATE;
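The hunks that follow make output modalities a first-class option on `RealtimeModel`: a new `modalities` default of `["text", "audio"]` is added, the constructor threads it into the model's capabilities, and the session honors it instead of hardcoding audio output. A usage sketch of the new option; the import style and `realtime` namespace are assumptions based on how the package has been exposed in earlier releases:

import * as openai from '@livekit/agents-plugin-openai';

// Assumption: RealtimeModel is reachable via the plugin's realtime
// namespace, as in previous releases of this package.
const textOnlyModel = new openai.realtime.RealtimeModel({
  modalities: ['text'], // opt out of audio output; default is ['text', 'audio']
});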
@@ -94,11 +95,13 @@ class RealtimeModel extends import_agents.llm.RealtimeModel {
  /* @internal */
  _options;
  constructor(options = {}) {
+ const modalities = options.modalities || DEFAULT_REALTIME_MODEL_OPTIONS.modalities;
  super({
  messageTruncation: true,
  turnDetection: options.turnDetection !== null,
  userTranscription: options.inputAudioTranscription !== null,
- autoToolReplyGeneration: false
+ autoToolReplyGeneration: false,
+ audioOutput: modalities.includes("audio")
  });
  const isAzure = !!(options.apiVersion || options.entraToken || options.azureDeployment);
  if (options.apiKey === "" && !isAzure) {
@@ -121,13 +124,15 @@ class RealtimeModel extends import_agents.llm.RealtimeModel {
  }
  options.baseURL = `${azureEndpoint.replace(/\/$/, "")}/openai`;
  }
+ const { modalities: _, ...optionsWithoutModalities } = options;
  this._options = {
  ...DEFAULT_REALTIME_MODEL_OPTIONS,
- ...options,
+ ...optionsWithoutModalities,
  baseURL: options.baseURL || BASE_URL,
  apiKey,
  isAzure,
- model: options.model || DEFAULT_REALTIME_MODEL_OPTIONS.model
+ model: options.model || DEFAULT_REALTIME_MODEL_OPTIONS.model,
+ modalities
  };
  }
  /**
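The destructuring above strips `modalities` out of the user options before they are spread, presumably because an explicitly passed `modalities: undefined` would otherwise override the default during the spread; the already-resolved `modalities` constant is then re-added at the end. A standalone sketch of the pitfall:

const defaults = { modalities: ['text', 'audio'] };
const options: { modalities?: string[] } = { modalities: undefined };

// An own property set to undefined still wins in a spread merge:
const naive = { ...defaults, ...options };
console.log(naive.modalities); // undefined

// Stripping the key first preserves the resolved value:
const { modalities: _, ...rest } = options;
const resolved = options.modalities || defaults.modalities;
const safe = { ...defaults, ...rest, modalities: resolved };
console.log(safe.modalities); // ['text', 'audio']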
@@ -267,6 +272,7 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
  this.messageChannel.put(command);
  }
  createSessionUpdateEvent() {
+ const modalities = this.oaiRealtimeModel._options.modalities.includes("audio") ? ["text", "audio"] : ["text"];
  return {
  type: "session.update",
  session: {
@@ -274,7 +280,7 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
  voice: this.oaiRealtimeModel._options.voice,
  input_audio_format: "pcm16",
  output_audio_format: "pcm16",
- modalities: ["text", "audio"],
+ modalities,
  turn_detection: this.oaiRealtimeModel._options.turnDetection,
  input_audio_transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
  // TODO(shubhra): add inputAudioNoiseReduction
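With the two hunks above, `session.update` now reflects the configured option: `['text', 'audio']` when audio output is enabled, `['text']` otherwise. A sketch of the resulting payload for a text-only session, with illustrative values for the other fields:

const sessionUpdate = {
  type: 'session.update',
  session: {
    voice: 'alloy',
    input_audio_format: 'pcm16',
    output_audio_format: 'pcm16',
    modalities: ['text'], // previously always hardcoded to ['text', 'audio']
    // ...turn_detection, input_audio_transcription, tools, etc.
  },
};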
@@ -462,12 +468,31 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
  });
  }
  async truncate(_options) {
- this.sendEvent({
- type: "conversation.item.truncate",
- content_index: 0,
- item_id: _options.messageId,
- audio_end_ms: _options.audioEndMs
- });
+ if (!_options.modalities || _options.modalities.includes("audio")) {
+ this.sendEvent({
+ type: "conversation.item.truncate",
+ content_index: 0,
+ item_id: _options.messageId,
+ audio_end_ms: _options.audioEndMs
+ });
+ } else if (_options.audioTranscript !== void 0) {
+ const chatCtx = this.chatCtx.copy();
+ const idx = chatCtx.indexById(_options.messageId);
+ if (idx !== void 0) {
+ const item = chatCtx.items[idx];
+ if (item && item.type === "message") {
+ const newItem = import_agents.llm.ChatMessage.create({
+ ...item,
+ content: [_options.audioTranscript]
+ });
+ chatCtx.items[idx] = newItem;
+ const events = this.createChatCtxUpdateEvents(chatCtx);
+ for (const ev of events) {
+ this.sendEvent(ev);
+ }
+ }
+ }
+ }
  }
  loggableEvent(event) {
  const untypedEvent = {};
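The rewritten `truncate` handles the text-only case: there is no server-side audio to cut, so when the caller passes text-only `modalities` together with an `audioTranscript`, the session instead rewrites the stored chat message to contain only the transcript that was actually played and syncs that edit to the server. A sketch of the decision, with option names taken from the diff above:

// Which branch truncate() takes (sketch):
function truncateMode(opts: { modalities?: string[]; audioTranscript?: string }): string {
  if (!opts.modalities || opts.modalities.includes('audio')) {
    return 'send conversation.item.truncate'; // audio sessions: server-side cut
  }
  if (opts.audioTranscript !== undefined) {
    return 'rewrite chat item with truncated transcript'; // text-only sessions
  }
  return 'no-op';
}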
@@ -683,6 +708,12 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
  case "response.content_part.done":
  this.handleResponseContentPartDone(event);
  break;
+ case "response.text.delta":
+ this.handleResponseTextDelta(event);
+ break;
+ case "response.text.done":
+ this.handleResponseTextDone(event);
+ break;
  case "response.audio_transcript.delta":
  this.handleResponseAudioTranscriptDelta(event);
  break;
@@ -799,6 +830,29 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
  this.textModeRecoveryRetries = 0;
  return;
  }
+ const itemId = event.item.id;
+ if (!itemId) {
+ throw new Error("item.id is not set");
+ }
+ const modalitiesFut = new import_agents.Future();
+ const itemGeneration = {
+ messageId: itemId,
+ textChannel: import_agents.stream.createStreamChannel(),
+ audioChannel: import_agents.stream.createStreamChannel(),
+ audioTranscript: "",
+ modalities: modalitiesFut
+ };
+ if (!this.oaiRealtimeModel.capabilities.audioOutput) {
+ itemGeneration.audioChannel.close();
+ modalitiesFut.resolve(["text"]);
+ }
+ this.currentGeneration.messageChannel.write({
+ messageId: itemId,
+ textStream: itemGeneration.textChannel.stream(),
+ audioStream: itemGeneration.audioChannel.stream(),
+ modalities: modalitiesFut.await
+ });
+ this.currentGeneration.messages.set(itemId, itemGeneration);
  }
  handleConversationItemCreated(event) {
  if (!event.item.id) {
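Message streams are now created as soon as `response.output_item.added` arrives (previously this happened later, in `response.content_part.added`), and each item carries a `modalities` future that downstream consumers can await before deciding whether to read the audio stream. A minimal equivalent of how `import_agents.Future` is used here, inferred purely from the `.resolve()` / `.done` / `.await` usage visible in the diff:

class Future<T> {
  done = false;
  await: Promise<T>;
  private resolvePromise!: (value: T) => void;

  constructor() {
    this.await = new Promise<T>((resolve) => {
      this.resolvePromise = resolve;
    });
  }

  resolve(value: T): void {
    if (this.done) return; // resolve at most once
    this.done = true;
    this.resolvePromise(value);
  }
}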
@@ -859,35 +913,20 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
  }
  const itemId = event.item_id;
  const itemType = event.part.type;
- const responseId = event.response_id;
- if (itemType === "audio") {
- this.resolveGeneration(responseId);
- if (this.textModeRecoveryRetries > 0) {
- this.#logger.info(
- { retries: this.textModeRecoveryRetries },
- "recovered from text-only response"
- );
- this.textModeRecoveryRetries = 0;
- }
- const itemGeneration = {
- messageId: itemId,
- textChannel: import_agents.stream.createStreamChannel(),
- audioChannel: import_agents.stream.createStreamChannel(),
- audioTranscript: ""
- };
- this.currentGeneration.messageChannel.write({
- messageId: itemId,
- textStream: itemGeneration.textChannel.stream(),
- audioStream: itemGeneration.audioChannel.stream()
- });
- this.currentGeneration.messages.set(itemId, itemGeneration);
- this.currentGeneration._firstTokenTimestamp = Date.now();
+ const itemGeneration = this.currentGeneration.messages.get(itemId);
+ if (!itemGeneration) {
+ this.#logger.warn(`itemGeneration not found for itemId=${itemId}`);
  return;
- } else {
- this.interrupt();
- if (this.textModeRecoveryRetries === 0) {
- this.#logger.warn({ responseId }, "received text-only response from OpenAI Realtime API");
- }
+ }
+ if (itemType === "text" && this.oaiRealtimeModel.capabilities.audioOutput) {
+ this.#logger.warn("Text response received from OpenAI Realtime API in audio modality.");
+ }
+ if (!itemGeneration.modalities.done) {
+ const modalityResult = itemType === "text" ? ["text"] : ["audio", "text"];
+ itemGeneration.modalities.resolve(modalityResult);
+ }
+ if (this.currentGeneration._firstTokenTimestamp === void 0) {
+ this.currentGeneration._firstTokenTimestamp = Date.now();
  }
  }
  handleResponseContentPartDone(event) {
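`handleResponseContentPartAdded` no longer creates streams or runs text-mode recovery; it only looks up the generation created at `output_item.added`, resolves its modalities from the first content part, and stamps the time to first token. The resolution rule, as a sketch:

// The first content part fixes the item's modalities:
type Modality = 'text' | 'audio';
function resolveModalities(partType: 'text' | 'audio'): Modality[] {
  return partType === 'text' ? ['text'] : ['audio', 'text'];
}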
@@ -898,6 +937,25 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
  throw new Error("currentGeneration is not set");
  }
  }
+ handleResponseTextDelta(event) {
+ if (!this.currentGeneration) {
+ throw new Error("currentGeneration is not set");
+ }
+ const itemGeneration = this.currentGeneration.messages.get(event.item_id);
+ if (!itemGeneration) {
+ throw new Error("itemGeneration is not set");
+ }
+ if (!this.oaiRealtimeModel.capabilities.audioOutput && !this.currentGeneration._firstTokenTimestamp) {
+ this.currentGeneration._firstTokenTimestamp = Date.now();
+ }
+ itemGeneration.textChannel.write(event.delta);
+ itemGeneration.audioTranscript += event.delta;
+ }
+ handleResponseTextDone(_event) {
+ if (!this.currentGeneration) {
+ throw new Error("currentGeneration is not set");
+ }
+ }
  handleResponseAudioTranscriptDelta(event) {
  if (!this.currentGeneration) {
  throw new Error("currentGeneration is not set");
@@ -920,6 +978,12 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
  if (!itemGeneration) {
  throw new Error("itemGeneration is not set");
  }
+ if (this.currentGeneration._firstTokenTimestamp === void 0) {
+ this.currentGeneration._firstTokenTimestamp = Date.now();
+ }
+ if (!itemGeneration.modalities.done) {
+ itemGeneration.modalities.resolve(["audio", "text"]);
+ }
  const binaryString = atob(event.delta);
  const len = binaryString.length;
  const bytes = new Uint8Array(len);
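The context lines above begin the standard base64-to-bytes decode for audio deltas; completed, the pattern looks like this (a generic sketch, not the exact continuation of the file):

function base64ToBytes(b64: string): Uint8Array {
  const binaryString = atob(b64); // base64 payload to a binary string
  const bytes = new Uint8Array(binaryString.length);
  for (let i = 0; i < binaryString.length; i++) {
    bytes[i] = binaryString.charCodeAt(i); // one byte per char code
  }
  return bytes;
}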
@@ -968,6 +1032,9 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
  }
  itemGeneration.textChannel.close();
  itemGeneration.audioChannel.close();
+ if (!itemGeneration.modalities.done) {
+ itemGeneration.modalities.resolve(this.oaiRealtimeModel._options.modalities);
+ }
  }
  }
  handleResponseDone(_event) {
@@ -986,6 +1053,9 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
  for (const generation of this.currentGeneration.messages.values()) {
  generation.textChannel.close();
  generation.audioChannel.close();
+ if (!generation.modalities.done) {
+ generation.modalities.resolve(this.oaiRealtimeModel._options.modalities);
+ }
  }
  this.currentGeneration.functionChannel.close();
  this.currentGeneration.messageChannel.close();
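Both close paths (`output_item.done` above and the response-level cleanup here) apply the same guard: any modalities future still pending when its generation ends is resolved with the session's configured modalities, so nothing awaiting it can hang. A self-contained sketch of the pattern:

type Modality = 'text' | 'audio';
interface ModalitiesFuture { done: boolean; resolve(value: Modality[]): void; }

function closeGeneration(gen: { modalities: ModalitiesFuture }, configured: Modality[]): void {
  if (!gen.modalities.done) {
    gen.modalities.resolve(configured); // fallback so awaiters never hang
  }
}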
@@ -1145,6 +1215,8 @@ function livekitItemToOpenAIItem(item) {
  role,
  content: contentList
  };
+ default:
+ throw new Error(`Unsupported item type: ${item.type}`);
  }
  }
  function openAIItemToLivekitItem(item) {
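The new `default` case in `livekitItemToOpenAIItem` turns a silently-undefined return for unknown chat item types into a loud error. In TypeScript source this is the usual exhaustiveness pattern; a hedged sketch with a hypothetical item union:

type Item =
  | { type: 'message' }
  | { type: 'function_call' }
  | { type: 'function_call_output' };

function toOpenAIItem(item: Item): object {
  switch (item.type) {
    case 'message':
      return { /* ... */ };
    case 'function_call':
      return { /* ... */ };
    case 'function_call_output':
      return { /* ... */ };
    default:
      // Unreachable while the union stays exhaustive; throws at runtime if a
      // new item type is added without a matching case.
      throw new Error(`Unsupported item type: ${(item as { type: string }).type}`);
  }
}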