@livekit/agents-plugin-openai 1.0.17 → 1.0.19

This diff shows the published contents of the two package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -1 +1 @@
- {"version":3,"sources":["../../src/realtime/api_proto.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\n\nexport const SAMPLE_RATE = 24000;\nexport const NUM_CHANNELS = 1;\nexport const IN_FRAME_SIZE = 2400; // 100ms\nexport const OUT_FRAME_SIZE = 1200; // 50ms\n\nexport const BASE_URL = 'wss://api.openai.com/v1';\n\nexport type Model = 'gpt-4o-realtime-preview-2024-10-01' | string; // Open-ended, for future models\nexport type Voice =\n | 'alloy'\n | 'shimmer'\n | 'echo'\n | 'ash'\n | 'ballad'\n | 'coral'\n | 'sage'\n | 'verse'\n | string;\nexport type AudioFormat = 'pcm16'; // TODO: 'g711-ulaw' | 'g711-alaw'\nexport type Role = 'system' | 'assistant' | 'user' | 'tool';\nexport type GenerationFinishedReason = 'stop' | 'max_tokens' | 'content_filter' | 'interrupt';\nexport type InputTranscriptionModel = 'whisper-1' | string; // Open-ended, for future models\nexport type Modality = 'text' | 'audio';\nexport type ToolChoice = 'auto' | 'none' | 'required' | string;\nexport type State = 'initializing' | 'listening' | 'thinking' | 'speaking' | string;\nexport type ResponseStatus =\n | 'in_progress'\n | 'completed'\n | 'incomplete'\n | 'cancelled'\n | 'failed'\n | string;\nexport type ClientEventType =\n | 'session.update'\n | 'input_audio_buffer.append'\n | 'input_audio_buffer.commit'\n | 'input_audio_buffer.clear'\n | 'conversation.item.create'\n | 'conversation.item.truncate'\n | 'conversation.item.delete'\n | 'response.create'\n | 'response.cancel';\nexport type ServerEventType =\n | 'error'\n | 'session.created'\n | 'session.updated'\n | 'conversation.created'\n | 'input_audio_buffer.committed'\n | 'input_audio_buffer.cleared'\n | 'input_audio_buffer.speech_started'\n | 'input_audio_buffer.speech_stopped'\n | 'conversation.item.created'\n | 'conversation.item.input_audio_transcription.completed'\n | 'conversation.item.input_audio_transcription.failed'\n | 'conversation.item.truncated'\n | 'conversation.item.deleted'\n | 'response.created'\n | 'response.done'\n | 'response.output_item.added'\n | 'response.output_item.done'\n | 'response.content_part.added'\n | 'response.content_part.done'\n | 'response.text.delta'\n | 'response.text.done'\n | 'response.audio_transcript.delta'\n | 'response.audio_transcript.done'\n | 'response.audio.delta'\n | 'response.audio.done'\n | 'response.function_call_arguments.delta'\n | 'response.function_call_arguments.done'\n | 'rate_limits.updated';\n\nexport type AudioBase64Bytes = string;\n\nexport interface Tool {\n type: 'function';\n name: string;\n description?: string;\n parameters: {\n type: 'object';\n properties: {\n [prop: string]: {\n [prop: string]: any;\n };\n };\n required: string[];\n };\n}\n\nexport type TurnDetectionType =\n | {\n type: 'semantic_vad';\n eagerness?: 'auto' | 'low' | 'medium' | 'high'; // default: auto\n create_response?: boolean; // default: true\n interrupt_response?: boolean; // default: true\n }\n | {\n type: 'server_vad';\n threshold?: number; // 0.0 to 1.0, default: 0.5\n prefix_padding_ms?: number; // default: 300\n silence_duration_ms?: number; // default: 200\n create_response?: boolean; // default: true\n interrupt_response?: boolean; // default: true\n };\n\nexport type InputAudioTranscription = {\n model: InputTranscriptionModel;\n language?: string;\n prompt?: string;\n};\n\nexport interface InputTextContent {\n type: 'input_text';\n text: string;\n}\n\nexport interface InputAudioContent {\n type: 'input_audio';\n audio: AudioBase64Bytes;\n}\n\nexport 
interface TextContent {\n type: 'text';\n text: string;\n}\n\nexport interface AudioContent {\n type: 'audio';\n audio: AudioBase64Bytes;\n transcript: string;\n}\n\nexport type Content = InputTextContent | InputAudioContent | TextContent | AudioContent;\nexport type ContentPart = {\n type: 'text' | 'audio';\n audio?: AudioBase64Bytes;\n transcript?: string;\n};\n\nexport interface BaseItem {\n id: string;\n object: 'realtime.item';\n type: string;\n}\n\nexport interface SystemItem extends BaseItem {\n type: 'message';\n role: 'system';\n content: InputTextContent;\n}\n\nexport interface UserItem extends BaseItem {\n type: 'message';\n role: 'user';\n content: (InputTextContent | InputAudioContent)[];\n}\n\nexport interface AssistantItem extends BaseItem {\n type: 'message';\n role: 'assistant';\n content: (TextContent | AudioContent)[];\n}\n\nexport interface FunctionCallItem extends BaseItem {\n type: 'function_call';\n call_id: string;\n name: string;\n arguments: string;\n}\n\nexport interface FunctionCallOutputItem extends BaseItem {\n type: 'function_call_output';\n call_id: string;\n output: string;\n}\n\nexport type ItemResource =\n | SystemItem\n | UserItem\n | AssistantItem\n | FunctionCallItem\n | FunctionCallOutputItem;\n\n// Session Resource\nexport interface SessionResource {\n id: string;\n object: 'realtime.session';\n model: string;\n modalities: ['text', 'audio'] | ['text']; // default: [\"text\", \"audio\"]\n instructions: string;\n voice: Voice; // default: \"alloy\"\n input_audio_format: AudioFormat; // default: \"pcm16\"\n output_audio_format: AudioFormat; // default: \"pcm16\"\n input_audio_transcription: InputAudioTranscription | null;\n turn_detection: TurnDetectionType | null;\n tools: Tool[];\n tool_choice: ToolChoice; // default: \"auto\"\n temperature: number; // default: 0.8\n max_response_output_tokens: number | 'inf';\n expires_at: number;\n}\n\n// Conversation Resource\nexport interface ConversationResource {\n id: string;\n object: 'realtime.conversation';\n}\n\nexport type ResponseStatusDetails =\n | {\n type: 'incomplete';\n reason: 'max_output_tokens' | 'content_filter' | string;\n }\n | {\n type: 'failed';\n error?: {\n code: 'server_error' | 'rate_limit_exceeded' | string;\n message: string;\n };\n }\n | {\n type: 'cancelled';\n reason: 'turn_detected' | 'client_cancelled' | string;\n };\n\nexport interface ModelUsage {\n total_tokens: number;\n input_tokens: number;\n output_tokens: number;\n input_token_details: {\n text_tokens: number;\n audio_tokens: number;\n cached_tokens: number;\n cached_tokens_details: {\n text_tokens: number;\n audio_tokens: number;\n image_tokens: number;\n };\n };\n output_token_details: {\n text_tokens: number;\n audio_tokens: number;\n };\n}\n\nexport interface ResponseResource {\n id: string;\n object: 'realtime.response';\n status: ResponseStatus;\n status_details: ResponseStatusDetails;\n output: ItemResource[];\n usage?: ModelUsage;\n metadata?: Record<string, string>;\n}\n\n// Client Events\ninterface BaseClientEvent {\n event_id?: string;\n type: ClientEventType;\n}\n\nexport interface SessionUpdateEvent extends BaseClientEvent {\n type: 'session.update';\n session: Partial<{\n model: Model;\n modalities: ['text', 'audio'] | ['text'];\n instructions: string;\n voice: Voice;\n input_audio_format: AudioFormat;\n output_audio_format: AudioFormat;\n input_audio_transcription: InputAudioTranscription | null;\n turn_detection: TurnDetectionType | null;\n tools: Tool[];\n tool_choice: ToolChoice;\n temperature: number;\n 
max_response_output_tokens?: number | 'inf';\n speed?: number;\n }>;\n}\n\nexport interface InputAudioBufferAppendEvent extends BaseClientEvent {\n type: 'input_audio_buffer.append';\n audio: AudioBase64Bytes;\n}\n\nexport interface InputAudioBufferCommitEvent extends BaseClientEvent {\n type: 'input_audio_buffer.commit';\n}\n\nexport interface InputAudioBufferClearEvent extends BaseClientEvent {\n type: 'input_audio_buffer.clear';\n}\n\nexport interface UserItemCreate {\n id: string;\n type: 'message';\n role: 'user';\n content: (InputTextContent | InputAudioContent)[];\n}\n\nexport interface AssistantItemCreate {\n id: string;\n type: 'message';\n role: 'assistant';\n content: TextContent[];\n}\n\nexport interface SystemItemCreate {\n id: string;\n type: 'message';\n role: 'system';\n content: InputTextContent[];\n}\n\nexport interface FunctionCallOutputItemCreate {\n id: string;\n type: 'function_call_output';\n call_id: string;\n output: string;\n}\n\nexport type ConversationItemCreateContent =\n | UserItemCreate\n | AssistantItemCreate\n | SystemItemCreate\n | FunctionCallOutputItemCreate;\n\nexport interface ConversationItemCreateEvent extends BaseClientEvent {\n type: 'conversation.item.create';\n previous_item_id?: string;\n item: ConversationItemCreateContent;\n}\n\nexport interface ConversationItemTruncateEvent extends BaseClientEvent {\n type: 'conversation.item.truncate';\n item_id: string;\n content_index: number;\n audio_end_ms: number;\n}\n\nexport interface ConversationItemDeleteEvent extends BaseClientEvent {\n type: 'conversation.item.delete';\n item_id: string;\n}\n\nexport interface ResponseCreateEvent extends BaseClientEvent {\n type: 'response.create';\n response?: Partial<{\n modalities: ['text', 'audio'] | ['text'];\n instructions: string;\n voice: Voice;\n output_audio_format: AudioFormat;\n tools?: Tool[];\n tool_choice: ToolChoice;\n temperature: number;\n max_output_tokens: number | 'inf';\n metadata?: Record<string, string>;\n }>;\n}\n\nexport interface ResponseCancelEvent extends BaseClientEvent {\n type: 'response.cancel';\n}\n\nexport type ClientEvent =\n | SessionUpdateEvent\n | InputAudioBufferAppendEvent\n | InputAudioBufferCommitEvent\n | InputAudioBufferClearEvent\n | ConversationItemCreateEvent\n | ConversationItemTruncateEvent\n | ConversationItemDeleteEvent\n | ResponseCreateEvent\n | ResponseCancelEvent;\n\ninterface BaseServerEvent {\n event_id: string;\n type: ServerEventType;\n}\n\nexport interface ErrorEvent extends BaseServerEvent {\n type: 'error';\n error: {\n type: 'invalid_request_error' | 'server_error' | string;\n code?: string;\n message: string;\n param: string;\n event_id: string;\n };\n}\n\nexport interface SessionCreatedEvent extends BaseServerEvent {\n type: 'session.created';\n session: SessionResource;\n}\n\nexport interface SessionUpdatedEvent extends BaseServerEvent {\n type: 'session.updated';\n session: SessionResource;\n}\n\nexport interface ConversationCreatedEvent extends BaseServerEvent {\n type: 'conversation.created';\n conversation: ConversationResource;\n}\n\nexport interface InputAudioBufferCommittedEvent extends BaseServerEvent {\n type: 'input_audio_buffer.committed';\n item_id: string;\n}\n\nexport interface InputAudioBufferClearedEvent extends BaseServerEvent {\n type: 'input_audio_buffer.cleared';\n}\n\nexport interface InputAudioBufferSpeechStartedEvent extends BaseServerEvent {\n type: 'input_audio_buffer.speech_started';\n audio_start_ms: number;\n item_id: string;\n}\n\nexport interface 
InputAudioBufferSpeechStoppedEvent extends BaseServerEvent {\n type: 'input_audio_buffer.speech_stopped';\n audio_end_ms: number;\n item_id: string;\n}\n\nexport interface ConversationItemCreatedEvent extends BaseServerEvent {\n type: 'conversation.item.created';\n previous_item_id: string;\n item: ItemResource;\n}\n\nexport interface ConversationItemInputAudioTranscriptionCompletedEvent extends BaseServerEvent {\n type: 'conversation.item.input_audio_transcription.completed';\n item_id: string;\n content_index: number;\n transcript: string;\n}\n\nexport interface ConversationItemInputAudioTranscriptionFailedEvent extends BaseServerEvent {\n type: 'conversation.item.input_audio_transcription.failed';\n item_id: string;\n content_index: number;\n error: {\n type: string;\n code?: string;\n message: string;\n param: null;\n };\n}\n\nexport interface ConversationItemTruncatedEvent extends BaseServerEvent {\n type: 'conversation.item.truncated';\n item_id: string;\n content_index: number;\n audio_end_ms: number;\n}\n\nexport interface ConversationItemDeletedEvent extends BaseServerEvent {\n type: 'conversation.item.deleted';\n item_id: string;\n}\n\nexport interface ResponseCreatedEvent extends BaseServerEvent {\n type: 'response.created';\n response: ResponseResource;\n}\n\nexport interface ResponseDoneEvent extends BaseServerEvent {\n type: 'response.done';\n response: ResponseResource;\n}\n\nexport interface ResponseOutputItemAddedEvent extends BaseServerEvent {\n type: 'response.output_item.added';\n response_id: string;\n output_index: number;\n item: ItemResource;\n}\n\nexport interface ResponseOutputItemDoneEvent extends BaseServerEvent {\n type: 'response.output_item.done';\n response_id: string;\n output_index: number;\n item: ItemResource;\n}\n\nexport interface ResponseContentPartAddedEvent extends BaseServerEvent {\n type: 'response.content_part.added';\n response_id: string;\n item_id: string;\n output_index: number;\n content_index: number;\n part: ContentPart;\n}\n\nexport interface ResponseContentPartDoneEvent extends BaseServerEvent {\n type: 'response.content_part.done';\n response_id: string;\n item_id: string;\n output_index: number;\n content_index: number;\n part: ContentPart;\n}\n\nexport interface ResponseTextDeltaEvent extends BaseServerEvent {\n type: 'response.text.delta';\n response_id: string;\n output_index: number;\n content_index: number;\n delta: string;\n}\n\nexport interface ResponseTextDoneEvent extends BaseServerEvent {\n type: 'response.text.done';\n response_id: string;\n output_index: number;\n content_index: number;\n text: string;\n}\n\nexport interface ResponseAudioTranscriptDeltaEvent extends BaseServerEvent {\n type: 'response.audio_transcript.delta';\n response_id: string;\n item_id: string;\n output_index: number;\n content_index: number;\n delta: string;\n}\n\nexport interface ResponseAudioTranscriptDoneEvent extends BaseServerEvent {\n type: 'response.audio_transcript.done';\n response_id: string;\n output_index: number;\n content_index: number;\n transcript: string;\n}\n\nexport interface ResponseAudioDeltaEvent extends BaseServerEvent {\n type: 'response.audio.delta';\n response_id: string;\n item_id: string;\n output_index: number;\n content_index: number;\n delta: AudioBase64Bytes;\n}\n\nexport interface ResponseAudioDoneEvent extends BaseServerEvent {\n type: 'response.audio.done';\n response_id: string;\n output_index: number;\n content_index: number;\n}\n\nexport interface ResponseFunctionCallArgumentsDeltaEvent extends BaseServerEvent 
{\n type: 'response.function_call_arguments.delta';\n response_id: string;\n output_index: number;\n delta: string;\n}\n\nexport interface ResponseFunctionCallArgumentsDoneEvent extends BaseServerEvent {\n type: 'response.function_call_arguments.done';\n response_id: string;\n output_index: number;\n arguments: string;\n}\n\nexport interface RateLimitsUpdatedEvent extends BaseServerEvent {\n type: 'rate_limits.updated';\n rate_limits: {\n name: 'requests' | 'tokens' | 'input_tokens' | 'output_tokens' | string;\n limit: number;\n remaining: number;\n reset_seconds: number;\n }[];\n}\n\nexport type ServerEvent =\n | ErrorEvent\n | SessionCreatedEvent\n | SessionUpdatedEvent\n | ConversationCreatedEvent\n | InputAudioBufferCommittedEvent\n | InputAudioBufferClearedEvent\n | InputAudioBufferSpeechStartedEvent\n | InputAudioBufferSpeechStoppedEvent\n | ConversationItemCreatedEvent\n | ConversationItemInputAudioTranscriptionCompletedEvent\n | ConversationItemInputAudioTranscriptionFailedEvent\n | ConversationItemTruncatedEvent\n | ConversationItemDeletedEvent\n | ResponseCreatedEvent\n | ResponseDoneEvent\n | ResponseOutputItemAddedEvent\n | ResponseOutputItemDoneEvent\n | ResponseContentPartAddedEvent\n | ResponseContentPartDoneEvent\n | ResponseTextDeltaEvent\n | ResponseTextDoneEvent\n | ResponseAudioTranscriptDeltaEvent\n | ResponseAudioTranscriptDoneEvent\n | ResponseAudioDeltaEvent\n | ResponseAudioDoneEvent\n | ResponseFunctionCallArgumentsDeltaEvent\n | ResponseFunctionCallArgumentsDoneEvent\n | RateLimitsUpdatedEvent;\n"],"mappings":"AAIO,MAAM,cAAc;AACpB,MAAM,eAAe;AACrB,MAAM,gBAAgB;AACtB,MAAM,iBAAiB;AAEvB,MAAM,WAAW;","names":[]}
+ {"version":3,"sources":["../../src/realtime/api_proto.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\n\nexport const SAMPLE_RATE = 24000;\nexport const NUM_CHANNELS = 1;\nexport const IN_FRAME_SIZE = 2400; // 100ms\nexport const OUT_FRAME_SIZE = 1200; // 50ms\n\nexport const BASE_URL = 'wss://api.openai.com/v1';\n\nexport type Model = 'gpt-4o-realtime-preview-2024-10-01' | string; // Open-ended, for future models\nexport type Voice =\n | 'alloy'\n | 'shimmer'\n | 'echo'\n | 'ash'\n | 'ballad'\n | 'coral'\n | 'sage'\n | 'verse'\n | string;\nexport type AudioFormat = 'pcm16'; // TODO: 'g711-ulaw' | 'g711-alaw'\nexport type Role = 'system' | 'assistant' | 'user' | 'tool';\nexport type GenerationFinishedReason = 'stop' | 'max_tokens' | 'content_filter' | 'interrupt';\nexport type InputTranscriptionModel = 'whisper-1' | string; // Open-ended, for future models\nexport type Modality = 'text' | 'audio';\nexport type ToolChoice = 'auto' | 'none' | 'required' | string;\nexport type State = 'initializing' | 'listening' | 'thinking' | 'speaking' | string;\nexport type ResponseStatus =\n | 'in_progress'\n | 'completed'\n | 'incomplete'\n | 'cancelled'\n | 'failed'\n | string;\nexport type ClientEventType =\n | 'session.update'\n | 'input_audio_buffer.append'\n | 'input_audio_buffer.commit'\n | 'input_audio_buffer.clear'\n | 'conversation.item.create'\n | 'conversation.item.truncate'\n | 'conversation.item.delete'\n | 'response.create'\n | 'response.cancel';\nexport type ServerEventType =\n | 'error'\n | 'session.created'\n | 'session.updated'\n | 'conversation.created'\n | 'input_audio_buffer.committed'\n | 'input_audio_buffer.cleared'\n | 'input_audio_buffer.speech_started'\n | 'input_audio_buffer.speech_stopped'\n | 'conversation.item.created'\n | 'conversation.item.input_audio_transcription.completed'\n | 'conversation.item.input_audio_transcription.failed'\n | 'conversation.item.truncated'\n | 'conversation.item.deleted'\n | 'response.created'\n | 'response.done'\n | 'response.output_item.added'\n | 'response.output_item.done'\n | 'response.content_part.added'\n | 'response.content_part.done'\n | 'response.text.delta'\n | 'response.text.done'\n | 'response.audio_transcript.delta'\n | 'response.audio_transcript.done'\n | 'response.audio.delta'\n | 'response.audio.done'\n | 'response.function_call_arguments.delta'\n | 'response.function_call_arguments.done'\n | 'rate_limits.updated';\n\nexport type AudioBase64Bytes = string;\n\nexport interface Tool {\n type: 'function';\n name: string;\n description?: string;\n parameters: {\n type: 'object';\n properties: {\n [prop: string]: {\n [prop: string]: any;\n };\n };\n required: string[];\n };\n}\n\nexport type TurnDetectionType =\n | {\n type: 'semantic_vad';\n eagerness?: 'auto' | 'low' | 'medium' | 'high'; // default: auto\n create_response?: boolean; // default: true\n interrupt_response?: boolean; // default: true\n }\n | {\n type: 'server_vad';\n threshold?: number; // 0.0 to 1.0, default: 0.5\n prefix_padding_ms?: number; // default: 300\n silence_duration_ms?: number; // default: 200\n create_response?: boolean; // default: true\n interrupt_response?: boolean; // default: true\n };\n\nexport type InputAudioTranscription = {\n model: InputTranscriptionModel;\n language?: string;\n prompt?: string;\n};\n\nexport interface InputTextContent {\n type: 'input_text';\n text: string;\n}\n\nexport interface InputAudioContent {\n type: 'input_audio';\n audio: AudioBase64Bytes;\n}\n\nexport 
interface TextContent {\n type: 'text';\n text: string;\n}\n\nexport interface AudioContent {\n type: 'audio';\n audio: AudioBase64Bytes;\n transcript: string;\n}\n\nexport type Content = InputTextContent | InputAudioContent | TextContent | AudioContent;\nexport type ContentPart = {\n type: 'text' | 'audio';\n audio?: AudioBase64Bytes;\n transcript?: string;\n};\n\nexport interface BaseItem {\n id: string;\n object: 'realtime.item';\n type: string;\n}\n\nexport interface SystemItem extends BaseItem {\n type: 'message';\n role: 'system';\n content: InputTextContent;\n}\n\nexport interface UserItem extends BaseItem {\n type: 'message';\n role: 'user';\n content: (InputTextContent | InputAudioContent)[];\n}\n\nexport interface AssistantItem extends BaseItem {\n type: 'message';\n role: 'assistant';\n content: (TextContent | AudioContent)[];\n}\n\nexport interface FunctionCallItem extends BaseItem {\n type: 'function_call';\n call_id: string;\n name: string;\n arguments: string;\n}\n\nexport interface FunctionCallOutputItem extends BaseItem {\n type: 'function_call_output';\n call_id: string;\n output: string;\n}\n\nexport type ItemResource =\n | SystemItem\n | UserItem\n | AssistantItem\n | FunctionCallItem\n | FunctionCallOutputItem;\n\n// Session Resource\nexport interface SessionResource {\n id: string;\n object: 'realtime.session';\n model: string;\n modalities: Modality[]; // default: [\"text\", \"audio\"]\n instructions: string;\n voice: Voice; // default: \"alloy\"\n input_audio_format: AudioFormat; // default: \"pcm16\"\n output_audio_format: AudioFormat; // default: \"pcm16\"\n input_audio_transcription: InputAudioTranscription | null;\n turn_detection: TurnDetectionType | null;\n tools: Tool[];\n tool_choice: ToolChoice; // default: \"auto\"\n temperature: number; // default: 0.8\n max_response_output_tokens: number | 'inf';\n expires_at: number;\n}\n\n// Conversation Resource\nexport interface ConversationResource {\n id: string;\n object: 'realtime.conversation';\n}\n\nexport type ResponseStatusDetails =\n | {\n type: 'incomplete';\n reason: 'max_output_tokens' | 'content_filter' | string;\n }\n | {\n type: 'failed';\n error?: {\n code: 'server_error' | 'rate_limit_exceeded' | string;\n message: string;\n };\n }\n | {\n type: 'cancelled';\n reason: 'turn_detected' | 'client_cancelled' | string;\n };\n\nexport interface ModelUsage {\n total_tokens: number;\n input_tokens: number;\n output_tokens: number;\n input_token_details: {\n text_tokens: number;\n audio_tokens: number;\n cached_tokens: number;\n cached_tokens_details: {\n text_tokens: number;\n audio_tokens: number;\n image_tokens: number;\n };\n };\n output_token_details: {\n text_tokens: number;\n audio_tokens: number;\n };\n}\n\nexport interface ResponseResource {\n id: string;\n object: 'realtime.response';\n status: ResponseStatus;\n status_details: ResponseStatusDetails;\n output: ItemResource[];\n usage?: ModelUsage;\n metadata?: Record<string, string>;\n}\n\n// Client Events\ninterface BaseClientEvent {\n event_id?: string;\n type: ClientEventType;\n}\n\nexport interface SessionUpdateEvent extends BaseClientEvent {\n type: 'session.update';\n session: Partial<{\n model: Model;\n modalities: Modality[];\n instructions: string;\n voice: Voice;\n input_audio_format: AudioFormat;\n output_audio_format: AudioFormat;\n input_audio_transcription: InputAudioTranscription | null;\n turn_detection: TurnDetectionType | null;\n tools: Tool[];\n tool_choice: ToolChoice;\n temperature: number;\n max_response_output_tokens?: number 
| 'inf';\n speed?: number;\n }>;\n}\n\nexport interface InputAudioBufferAppendEvent extends BaseClientEvent {\n type: 'input_audio_buffer.append';\n audio: AudioBase64Bytes;\n}\n\nexport interface InputAudioBufferCommitEvent extends BaseClientEvent {\n type: 'input_audio_buffer.commit';\n}\n\nexport interface InputAudioBufferClearEvent extends BaseClientEvent {\n type: 'input_audio_buffer.clear';\n}\n\nexport interface UserItemCreate {\n id: string;\n type: 'message';\n role: 'user';\n content: (InputTextContent | InputAudioContent)[];\n}\n\nexport interface AssistantItemCreate {\n id: string;\n type: 'message';\n role: 'assistant';\n content: TextContent[];\n}\n\nexport interface SystemItemCreate {\n id: string;\n type: 'message';\n role: 'system';\n content: InputTextContent[];\n}\n\nexport interface FunctionCallOutputItemCreate {\n id: string;\n type: 'function_call_output';\n call_id: string;\n output: string;\n}\n\nexport type ConversationItemCreateContent =\n | UserItemCreate\n | AssistantItemCreate\n | SystemItemCreate\n | FunctionCallOutputItemCreate;\n\nexport interface ConversationItemCreateEvent extends BaseClientEvent {\n type: 'conversation.item.create';\n previous_item_id?: string;\n item: ConversationItemCreateContent;\n}\n\nexport interface ConversationItemTruncateEvent extends BaseClientEvent {\n type: 'conversation.item.truncate';\n item_id: string;\n content_index: number;\n audio_end_ms: number;\n}\n\nexport interface ConversationItemDeleteEvent extends BaseClientEvent {\n type: 'conversation.item.delete';\n item_id: string;\n}\n\nexport interface ResponseCreateEvent extends BaseClientEvent {\n type: 'response.create';\n response?: Partial<{\n modalities: Modality[];\n instructions: string;\n voice: Voice;\n output_audio_format: AudioFormat;\n tools?: Tool[];\n tool_choice: ToolChoice;\n temperature: number;\n max_output_tokens: number | 'inf';\n metadata?: Record<string, string>;\n }>;\n}\n\nexport interface ResponseCancelEvent extends BaseClientEvent {\n type: 'response.cancel';\n}\n\nexport type ClientEvent =\n | SessionUpdateEvent\n | InputAudioBufferAppendEvent\n | InputAudioBufferCommitEvent\n | InputAudioBufferClearEvent\n | ConversationItemCreateEvent\n | ConversationItemTruncateEvent\n | ConversationItemDeleteEvent\n | ResponseCreateEvent\n | ResponseCancelEvent;\n\ninterface BaseServerEvent {\n event_id: string;\n type: ServerEventType;\n}\n\nexport interface ErrorEvent extends BaseServerEvent {\n type: 'error';\n error: {\n type: 'invalid_request_error' | 'server_error' | string;\n code?: string;\n message: string;\n param: string;\n event_id: string;\n };\n}\n\nexport interface SessionCreatedEvent extends BaseServerEvent {\n type: 'session.created';\n session: SessionResource;\n}\n\nexport interface SessionUpdatedEvent extends BaseServerEvent {\n type: 'session.updated';\n session: SessionResource;\n}\n\nexport interface ConversationCreatedEvent extends BaseServerEvent {\n type: 'conversation.created';\n conversation: ConversationResource;\n}\n\nexport interface InputAudioBufferCommittedEvent extends BaseServerEvent {\n type: 'input_audio_buffer.committed';\n item_id: string;\n}\n\nexport interface InputAudioBufferClearedEvent extends BaseServerEvent {\n type: 'input_audio_buffer.cleared';\n}\n\nexport interface InputAudioBufferSpeechStartedEvent extends BaseServerEvent {\n type: 'input_audio_buffer.speech_started';\n audio_start_ms: number;\n item_id: string;\n}\n\nexport interface InputAudioBufferSpeechStoppedEvent extends BaseServerEvent {\n type: 
'input_audio_buffer.speech_stopped';\n audio_end_ms: number;\n item_id: string;\n}\n\nexport interface ConversationItemCreatedEvent extends BaseServerEvent {\n type: 'conversation.item.created';\n previous_item_id: string;\n item: ItemResource;\n}\n\nexport interface ConversationItemInputAudioTranscriptionCompletedEvent extends BaseServerEvent {\n type: 'conversation.item.input_audio_transcription.completed';\n item_id: string;\n content_index: number;\n transcript: string;\n}\n\nexport interface ConversationItemInputAudioTranscriptionFailedEvent extends BaseServerEvent {\n type: 'conversation.item.input_audio_transcription.failed';\n item_id: string;\n content_index: number;\n error: {\n type: string;\n code?: string;\n message: string;\n param: null;\n };\n}\n\nexport interface ConversationItemTruncatedEvent extends BaseServerEvent {\n type: 'conversation.item.truncated';\n item_id: string;\n content_index: number;\n audio_end_ms: number;\n}\n\nexport interface ConversationItemDeletedEvent extends BaseServerEvent {\n type: 'conversation.item.deleted';\n item_id: string;\n}\n\nexport interface ResponseCreatedEvent extends BaseServerEvent {\n type: 'response.created';\n response: ResponseResource;\n}\n\nexport interface ResponseDoneEvent extends BaseServerEvent {\n type: 'response.done';\n response: ResponseResource;\n}\n\nexport interface ResponseOutputItemAddedEvent extends BaseServerEvent {\n type: 'response.output_item.added';\n response_id: string;\n output_index: number;\n item: ItemResource;\n}\n\nexport interface ResponseOutputItemDoneEvent extends BaseServerEvent {\n type: 'response.output_item.done';\n response_id: string;\n output_index: number;\n item: ItemResource;\n}\n\nexport interface ResponseContentPartAddedEvent extends BaseServerEvent {\n type: 'response.content_part.added';\n response_id: string;\n item_id: string;\n output_index: number;\n content_index: number;\n part: ContentPart;\n}\n\nexport interface ResponseContentPartDoneEvent extends BaseServerEvent {\n type: 'response.content_part.done';\n response_id: string;\n item_id: string;\n output_index: number;\n content_index: number;\n part: ContentPart;\n}\n\nexport interface ResponseTextDeltaEvent extends BaseServerEvent {\n type: 'response.text.delta';\n response_id: string;\n item_id: string;\n output_index: number;\n content_index: number;\n delta: string;\n}\n\nexport interface ResponseTextDoneEvent extends BaseServerEvent {\n type: 'response.text.done';\n response_id: string;\n item_id: string;\n output_index: number;\n content_index: number;\n text: string;\n}\n\nexport interface ResponseAudioTranscriptDeltaEvent extends BaseServerEvent {\n type: 'response.audio_transcript.delta';\n response_id: string;\n item_id: string;\n output_index: number;\n content_index: number;\n delta: string;\n}\n\nexport interface ResponseAudioTranscriptDoneEvent extends BaseServerEvent {\n type: 'response.audio_transcript.done';\n response_id: string;\n output_index: number;\n content_index: number;\n transcript: string;\n}\n\nexport interface ResponseAudioDeltaEvent extends BaseServerEvent {\n type: 'response.audio.delta';\n response_id: string;\n item_id: string;\n output_index: number;\n content_index: number;\n delta: AudioBase64Bytes;\n}\n\nexport interface ResponseAudioDoneEvent extends BaseServerEvent {\n type: 'response.audio.done';\n response_id: string;\n output_index: number;\n content_index: number;\n}\n\nexport interface ResponseFunctionCallArgumentsDeltaEvent extends BaseServerEvent {\n type: 
'response.function_call_arguments.delta';\n response_id: string;\n output_index: number;\n delta: string;\n}\n\nexport interface ResponseFunctionCallArgumentsDoneEvent extends BaseServerEvent {\n type: 'response.function_call_arguments.done';\n response_id: string;\n output_index: number;\n arguments: string;\n}\n\nexport interface RateLimitsUpdatedEvent extends BaseServerEvent {\n type: 'rate_limits.updated';\n rate_limits: {\n name: 'requests' | 'tokens' | 'input_tokens' | 'output_tokens' | string;\n limit: number;\n remaining: number;\n reset_seconds: number;\n }[];\n}\n\nexport type ServerEvent =\n | ErrorEvent\n | SessionCreatedEvent\n | SessionUpdatedEvent\n | ConversationCreatedEvent\n | InputAudioBufferCommittedEvent\n | InputAudioBufferClearedEvent\n | InputAudioBufferSpeechStartedEvent\n | InputAudioBufferSpeechStoppedEvent\n | ConversationItemCreatedEvent\n | ConversationItemInputAudioTranscriptionCompletedEvent\n | ConversationItemInputAudioTranscriptionFailedEvent\n | ConversationItemTruncatedEvent\n | ConversationItemDeletedEvent\n | ResponseCreatedEvent\n | ResponseDoneEvent\n | ResponseOutputItemAddedEvent\n | ResponseOutputItemDoneEvent\n | ResponseContentPartAddedEvent\n | ResponseContentPartDoneEvent\n | ResponseTextDeltaEvent\n | ResponseTextDoneEvent\n | ResponseAudioTranscriptDeltaEvent\n | ResponseAudioTranscriptDoneEvent\n | ResponseAudioDeltaEvent\n | ResponseAudioDoneEvent\n | ResponseFunctionCallArgumentsDeltaEvent\n | ResponseFunctionCallArgumentsDoneEvent\n | RateLimitsUpdatedEvent;\n"],"mappings":"AAIO,MAAM,cAAc;AACpB,MAAM,eAAe;AACrB,MAAM,gBAAgB;AACtB,MAAM,iBAAiB;AAEvB,MAAM,WAAW;","names":[]}
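In the sourcemap hunk above, the embedded TypeScript source changes in two ways: the `modalities` fields on `SessionResource`, `SessionUpdateEvent`, and `ResponseCreateEvent` widen from the closed tuple union `['text', 'audio'] | ['text']` to `Modality[]`, and `ResponseTextDeltaEvent`/`ResponseTextDoneEvent` gain an `item_id` field so text deltas can be routed to the message they belong to. A minimal before/after sketch of the type change:

type Modality = 'text' | 'audio';

// 1.0.17: only these two exact shapes were accepted.
type ModalitiesOld = ['text', 'audio'] | ['text'];

// 1.0.19: any combination of modality values is accepted.
type ModalitiesNew = Modality[];

// 1.0.19 also adds item_id to the text streaming events:
interface ResponseTextDeltaEvent {
  type: 'response.text.delta';
  response_id: string;
  item_id: string; // new in 1.0.19
  output_index: number;
  content_index: number;
  delta: string;
}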
@@ -84,7 +84,8 @@ const DEFAULT_REALTIME_MODEL_OPTIONS = {
  toolChoice: DEFAULT_TOOL_CHOICE,
  maxResponseOutputTokens: DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS,
  maxSessionDuration: DEFAULT_MAX_SESSION_DURATION,
- connOptions: import_agents.DEFAULT_API_CONNECT_OPTIONS
+ connOptions: import_agents.DEFAULT_API_CONNECT_OPTIONS,
+ modalities: ["text", "audio"]
  };
  class RealtimeModel extends import_agents.llm.RealtimeModel {
  sampleRate = api_proto.SAMPLE_RATE;
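The hunks that follow make output modalities a first-class option on `RealtimeModel`: a new `modalities` default of `["text", "audio"]` is added, the constructor threads it into the model's capabilities, and the session honors it instead of hardcoding audio output. A usage sketch of the new option; the import style and `realtime` namespace are assumptions based on how the package has been exposed in earlier releases:

import * as openai from '@livekit/agents-plugin-openai';

// Assumption: RealtimeModel is reachable via the plugin's realtime
// namespace, as in previous releases of this package.
const textOnlyModel = new openai.realtime.RealtimeModel({
  modalities: ['text'], // opt out of audio output; default is ['text', 'audio']
});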
@@ -94,11 +95,13 @@ class RealtimeModel extends import_agents.llm.RealtimeModel {
  /* @internal */
  _options;
  constructor(options = {}) {
+ const modalities = options.modalities || DEFAULT_REALTIME_MODEL_OPTIONS.modalities;
  super({
  messageTruncation: true,
  turnDetection: options.turnDetection !== null,
  userTranscription: options.inputAudioTranscription !== null,
- autoToolReplyGeneration: false
+ autoToolReplyGeneration: false,
+ audioOutput: modalities.includes("audio")
  });
  const isAzure = !!(options.apiVersion || options.entraToken || options.azureDeployment);
  if (options.apiKey === "" && !isAzure) {
@@ -121,13 +124,15 @@ class RealtimeModel extends import_agents.llm.RealtimeModel {
  }
  options.baseURL = `${azureEndpoint.replace(/\/$/, "")}/openai`;
  }
+ const { modalities: _, ...optionsWithoutModalities } = options;
  this._options = {
  ...DEFAULT_REALTIME_MODEL_OPTIONS,
- ...options,
+ ...optionsWithoutModalities,
  baseURL: options.baseURL || BASE_URL,
  apiKey,
  isAzure,
- model: options.model || DEFAULT_REALTIME_MODEL_OPTIONS.model
+ model: options.model || DEFAULT_REALTIME_MODEL_OPTIONS.model,
+ modalities
  };
  }
  /**
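The destructuring above strips `modalities` out of the user options before they are spread, presumably because an explicitly passed `modalities: undefined` would otherwise override the default during the spread; the already-resolved `modalities` constant is then re-added at the end. A standalone sketch of the pitfall:

const defaults = { modalities: ['text', 'audio'] };
const options: { modalities?: string[] } = { modalities: undefined };

// An own property set to undefined still wins in a spread merge:
const naive = { ...defaults, ...options };
console.log(naive.modalities); // undefined

// Stripping the key first preserves the resolved value:
const { modalities: _, ...rest } = options;
const resolved = options.modalities || defaults.modalities;
const safe = { ...defaults, ...rest, modalities: resolved };
console.log(safe.modalities); // ['text', 'audio']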
@@ -267,6 +272,7 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
  this.messageChannel.put(command);
  }
  createSessionUpdateEvent() {
+ const modalities = this.oaiRealtimeModel._options.modalities.includes("audio") ? ["text", "audio"] : ["text"];
  return {
  type: "session.update",
  session: {
@@ -274,7 +280,7 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
  voice: this.oaiRealtimeModel._options.voice,
  input_audio_format: "pcm16",
  output_audio_format: "pcm16",
- modalities: ["text", "audio"],
+ modalities,
  turn_detection: this.oaiRealtimeModel._options.turnDetection,
  input_audio_transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
  // TODO(shubhra): add inputAudioNoiseReduction
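With the two hunks above, `session.update` now reflects the configured option: `['text', 'audio']` when audio output is enabled, `['text']` otherwise. A sketch of the resulting payload for a text-only session, with illustrative values for the other fields:

const sessionUpdate = {
  type: 'session.update',
  session: {
    voice: 'alloy',
    input_audio_format: 'pcm16',
    output_audio_format: 'pcm16',
    modalities: ['text'], // previously always hardcoded to ['text', 'audio']
    // ...turn_detection, input_audio_transcription, tools, etc.
  },
};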
@@ -462,12 +468,31 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
  });
  }
  async truncate(_options) {
- this.sendEvent({
- type: "conversation.item.truncate",
- content_index: 0,
- item_id: _options.messageId,
- audio_end_ms: _options.audioEndMs
- });
+ if (!_options.modalities || _options.modalities.includes("audio")) {
+ this.sendEvent({
+ type: "conversation.item.truncate",
+ content_index: 0,
+ item_id: _options.messageId,
+ audio_end_ms: _options.audioEndMs
+ });
+ } else if (_options.audioTranscript !== void 0) {
+ const chatCtx = this.chatCtx.copy();
+ const idx = chatCtx.indexById(_options.messageId);
+ if (idx !== void 0) {
+ const item = chatCtx.items[idx];
+ if (item && item.type === "message") {
+ const newItem = import_agents.llm.ChatMessage.create({
+ ...item,
+ content: [_options.audioTranscript]
+ });
+ chatCtx.items[idx] = newItem;
+ const events = this.createChatCtxUpdateEvents(chatCtx);
+ for (const ev of events) {
+ this.sendEvent(ev);
+ }
+ }
+ }
+ }
  }
  loggableEvent(event) {
  const untypedEvent = {};
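The rewritten `truncate` handles the text-only case: there is no server-side audio to cut, so when the caller passes text-only `modalities` together with an `audioTranscript`, the session instead rewrites the stored chat message to contain only the transcript that was actually played and syncs that edit to the server. A sketch of the decision, with option names taken from the diff above:

// Which branch truncate() takes (sketch):
function truncateMode(opts: { modalities?: string[]; audioTranscript?: string }): string {
  if (!opts.modalities || opts.modalities.includes('audio')) {
    return 'send conversation.item.truncate'; // audio sessions: server-side cut
  }
  if (opts.audioTranscript !== undefined) {
    return 'rewrite chat item with truncated transcript'; // text-only sessions
  }
  return 'no-op';
}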
@@ -683,6 +708,12 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
  case "response.content_part.done":
  this.handleResponseContentPartDone(event);
  break;
+ case "response.text.delta":
+ this.handleResponseTextDelta(event);
+ break;
+ case "response.text.done":
+ this.handleResponseTextDone(event);
+ break;
  case "response.audio_transcript.delta":
  this.handleResponseAudioTranscriptDelta(event);
  break;
@@ -799,6 +830,29 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
  this.textModeRecoveryRetries = 0;
  return;
  }
+ const itemId = event.item.id;
+ if (!itemId) {
+ throw new Error("item.id is not set");
+ }
+ const modalitiesFut = new import_agents.Future();
+ const itemGeneration = {
+ messageId: itemId,
+ textChannel: import_agents.stream.createStreamChannel(),
+ audioChannel: import_agents.stream.createStreamChannel(),
+ audioTranscript: "",
+ modalities: modalitiesFut
+ };
+ if (!this.oaiRealtimeModel.capabilities.audioOutput) {
+ itemGeneration.audioChannel.close();
+ modalitiesFut.resolve(["text"]);
+ }
+ this.currentGeneration.messageChannel.write({
+ messageId: itemId,
+ textStream: itemGeneration.textChannel.stream(),
+ audioStream: itemGeneration.audioChannel.stream(),
+ modalities: modalitiesFut.await
+ });
+ this.currentGeneration.messages.set(itemId, itemGeneration);
  }
  handleConversationItemCreated(event) {
  if (!event.item.id) {
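Message streams are now created as soon as `response.output_item.added` arrives (previously this happened later, in `response.content_part.added`), and each item carries a `modalities` future that downstream consumers can await before deciding whether to read the audio stream. A minimal equivalent of how `import_agents.Future` is used here, inferred purely from the `.resolve()` / `.done` / `.await` usage visible in the diff:

class Future<T> {
  done = false;
  await: Promise<T>;
  private resolvePromise!: (value: T) => void;

  constructor() {
    this.await = new Promise<T>((resolve) => {
      this.resolvePromise = resolve;
    });
  }

  resolve(value: T): void {
    if (this.done) return; // resolve at most once
    this.done = true;
    this.resolvePromise(value);
  }
}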
@@ -859,35 +913,20 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
  }
  const itemId = event.item_id;
  const itemType = event.part.type;
- const responseId = event.response_id;
- if (itemType === "audio") {
- this.resolveGeneration(responseId);
- if (this.textModeRecoveryRetries > 0) {
- this.#logger.info(
- { retries: this.textModeRecoveryRetries },
- "recovered from text-only response"
- );
- this.textModeRecoveryRetries = 0;
- }
- const itemGeneration = {
- messageId: itemId,
- textChannel: import_agents.stream.createStreamChannel(),
- audioChannel: import_agents.stream.createStreamChannel(),
- audioTranscript: ""
- };
- this.currentGeneration.messageChannel.write({
- messageId: itemId,
- textStream: itemGeneration.textChannel.stream(),
- audioStream: itemGeneration.audioChannel.stream()
- });
- this.currentGeneration.messages.set(itemId, itemGeneration);
- this.currentGeneration._firstTokenTimestamp = Date.now();
+ const itemGeneration = this.currentGeneration.messages.get(itemId);
+ if (!itemGeneration) {
+ this.#logger.warn(`itemGeneration not found for itemId=${itemId}`);
  return;
- } else {
- this.interrupt();
- if (this.textModeRecoveryRetries === 0) {
- this.#logger.warn({ responseId }, "received text-only response from OpenAI Realtime API");
- }
+ }
+ if (itemType === "text" && this.oaiRealtimeModel.capabilities.audioOutput) {
+ this.#logger.warn("Text response received from OpenAI Realtime API in audio modality.");
+ }
+ if (!itemGeneration.modalities.done) {
+ const modalityResult = itemType === "text" ? ["text"] : ["audio", "text"];
+ itemGeneration.modalities.resolve(modalityResult);
+ }
+ if (this.currentGeneration._firstTokenTimestamp === void 0) {
+ this.currentGeneration._firstTokenTimestamp = Date.now();
  }
  }
  handleResponseContentPartDone(event) {
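`handleResponseContentPartAdded` no longer creates streams or runs text-mode recovery; it only looks up the generation created at `output_item.added`, resolves its modalities from the first content part, and stamps the time to first token. The resolution rule, as a sketch:

// The first content part fixes the item's modalities:
type Modality = 'text' | 'audio';
function resolveModalities(partType: 'text' | 'audio'): Modality[] {
  return partType === 'text' ? ['text'] : ['audio', 'text'];
}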
@@ -898,6 +937,25 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
  throw new Error("currentGeneration is not set");
  }
  }
+ handleResponseTextDelta(event) {
+ if (!this.currentGeneration) {
+ throw new Error("currentGeneration is not set");
+ }
+ const itemGeneration = this.currentGeneration.messages.get(event.item_id);
+ if (!itemGeneration) {
+ throw new Error("itemGeneration is not set");
+ }
+ if (!this.oaiRealtimeModel.capabilities.audioOutput && !this.currentGeneration._firstTokenTimestamp) {
+ this.currentGeneration._firstTokenTimestamp = Date.now();
+ }
+ itemGeneration.textChannel.write(event.delta);
+ itemGeneration.audioTranscript += event.delta;
+ }
+ handleResponseTextDone(_event) {
+ if (!this.currentGeneration) {
+ throw new Error("currentGeneration is not set");
+ }
+ }
  handleResponseAudioTranscriptDelta(event) {
  if (!this.currentGeneration) {
  throw new Error("currentGeneration is not set");
@@ -920,6 +978,12 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
  if (!itemGeneration) {
  throw new Error("itemGeneration is not set");
  }
+ if (this.currentGeneration._firstTokenTimestamp === void 0) {
+ this.currentGeneration._firstTokenTimestamp = Date.now();
+ }
+ if (!itemGeneration.modalities.done) {
+ itemGeneration.modalities.resolve(["audio", "text"]);
+ }
  const binaryString = atob(event.delta);
  const len = binaryString.length;
  const bytes = new Uint8Array(len);
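The context lines above begin the standard base64-to-bytes decode for audio deltas; completed, the pattern looks like this (a generic sketch, not the exact continuation of the file):

function base64ToBytes(b64: string): Uint8Array {
  const binaryString = atob(b64); // base64 payload to a binary string
  const bytes = new Uint8Array(binaryString.length);
  for (let i = 0; i < binaryString.length; i++) {
    bytes[i] = binaryString.charCodeAt(i); // one byte per char code
  }
  return bytes;
}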
@@ -968,6 +1032,9 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
  }
  itemGeneration.textChannel.close();
  itemGeneration.audioChannel.close();
+ if (!itemGeneration.modalities.done) {
+ itemGeneration.modalities.resolve(this.oaiRealtimeModel._options.modalities);
+ }
  }
  }
  handleResponseDone(_event) {
@@ -986,6 +1053,9 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
  for (const generation of this.currentGeneration.messages.values()) {
  generation.textChannel.close();
  generation.audioChannel.close();
+ if (!generation.modalities.done) {
+ generation.modalities.resolve(this.oaiRealtimeModel._options.modalities);
+ }
  }
  this.currentGeneration.functionChannel.close();
  this.currentGeneration.messageChannel.close();
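Both close paths (`output_item.done` above and the response-level cleanup here) apply the same guard: any modalities future still pending when its generation ends is resolved with the session's configured modalities, so nothing awaiting it can hang. A self-contained sketch of the pattern:

type Modality = 'text' | 'audio';
interface ModalitiesFuture { done: boolean; resolve(value: Modality[]): void; }

function closeGeneration(gen: { modalities: ModalitiesFuture }, configured: Modality[]): void {
  if (!gen.modalities.done) {
    gen.modalities.resolve(configured); // fallback so awaiters never hang
  }
}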
@@ -1145,6 +1215,8 @@ function livekitItemToOpenAIItem(item) {
  role,
  content: contentList
  };
+ default:
+ throw new Error(`Unsupported item type: ${item.type}`);
  }
  }
  function openAIItemToLivekitItem(item) {
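The new `default` case in `livekitItemToOpenAIItem` turns a silently-undefined return for unknown chat item types into a loud error. In TypeScript source this is the usual exhaustiveness pattern; a hedged sketch with a hypothetical item union:

type Item =
  | { type: 'message' }
  | { type: 'function_call' }
  | { type: 'function_call_output' };

function toOpenAIItem(item: Item): object {
  switch (item.type) {
    case 'message':
      return { /* ... */ };
    case 'function_call':
      return { /* ... */ };
    case 'function_call_output':
      return { /* ... */ };
    default:
      // Unreachable while the union stays exhaustive; throws at runtime if a
      // new item type is added without a matching case.
      throw new Error(`Unsupported item type: ${(item as { type: string }).type}`);
  }
}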