@livekit/agents-plugin-openai 1.0.17 → 1.0.19
This diff compares the published contents of two package versions as they appear in their public registry. It is provided for informational purposes only and reflects the packages exactly as released.
- package/dist/llm.cjs +5 -2
- package/dist/llm.cjs.map +1 -1
- package/dist/llm.d.cts +2 -1
- package/dist/llm.d.ts +2 -1
- package/dist/llm.d.ts.map +1 -1
- package/dist/llm.js +5 -2
- package/dist/llm.js.map +1 -1
- package/dist/llm.test.cjs +9 -0
- package/dist/llm.test.cjs.map +1 -1
- package/dist/llm.test.js +10 -1
- package/dist/llm.test.js.map +1 -1
- package/dist/realtime/api_proto.cjs.map +1 -1
- package/dist/realtime/api_proto.d.cts +5 -3
- package/dist/realtime/api_proto.d.ts +5 -3
- package/dist/realtime/api_proto.d.ts.map +1 -1
- package/dist/realtime/api_proto.js.map +1 -1
- package/dist/realtime/realtime_model.cjs +111 -39
- package/dist/realtime/realtime_model.cjs.map +1 -1
- package/dist/realtime/realtime_model.d.cts +7 -0
- package/dist/realtime/realtime_model.d.ts +7 -0
- package/dist/realtime/realtime_model.d.ts.map +1 -1
- package/dist/realtime/realtime_model.js +111 -39
- package/dist/realtime/realtime_model.js.map +1 -1
- package/package.json +7 -7
- package/src/llm.test.ts +11 -1
- package/src/llm.ts +6 -2
- package/src/realtime/api_proto.ts +5 -3
- package/src/realtime/realtime_model.ts +146 -39
@@ -1 +1 @@
[api_proto sourcemap: single-line generated JSON, before and after. The embedded `sourcesContent` mirrors the api_proto.ts source change: `modalities` widens from the tuple union `['text', 'audio'] | ['text']` to `Modality[]` in `SessionResource`, `SessionUpdateEvent`, and `ResponseCreateEvent`, and `item_id: string` is added to `ResponseTextDeltaEvent` and `ResponseTextDoneEvent`; the `mappings` payload is unchanged.]
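The api_proto change this sourcemap mirrors is a type-level widening. A minimal standalone sketch (the type names are copied from api_proto.ts in the diff above):

```ts
// Before: only the exact tuples ['text', 'audio'] or ['text'] type-checked.
// After: any Modality[] is accepted, e.g. ['audio', 'text'] or [].
type Modality = 'text' | 'audio';

type ModalitiesBefore = ['text', 'audio'] | ['text'];
type ModalitiesAfter = Modality[];

const ok: ModalitiesAfter = ['audio', 'text']; // fine after the change
// const bad: ModalitiesBefore = ['audio', 'text']; // rejected before: wrong tuple order
```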
@@ -84,7 +84,8 @@ const DEFAULT_REALTIME_MODEL_OPTIONS = {
   toolChoice: DEFAULT_TOOL_CHOICE,
   maxResponseOutputTokens: DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS,
   maxSessionDuration: DEFAULT_MAX_SESSION_DURATION,
-  connOptions: import_agents.DEFAULT_API_CONNECT_OPTIONS
+  connOptions: import_agents.DEFAULT_API_CONNECT_OPTIONS,
+  modalities: ["text", "audio"]
 };
 class RealtimeModel extends import_agents.llm.RealtimeModel {
   sampleRate = api_proto.SAMPLE_RATE;
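`modalities` becomes a first-class, defaulted option. A hedged usage sketch — the `realtime.RealtimeModel` export path is assumed from the plugin's public API; only the `modalities` key comes from this diff:

```ts
import * as openai from '@livekit/agents-plugin-openai';

// Text-only session: overrides the new default of ["text", "audio"].
const model = new openai.realtime.RealtimeModel({
  modalities: ['text'],
});
```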
@@ -94,11 +95,13 @@ class RealtimeModel extends import_agents.llm.RealtimeModel {
   /* @internal */
   _options;
   constructor(options = {}) {
+    const modalities = options.modalities || DEFAULT_REALTIME_MODEL_OPTIONS.modalities;
     super({
       messageTruncation: true,
       turnDetection: options.turnDetection !== null,
       userTranscription: options.inputAudioTranscription !== null,
-      autoToolReplyGeneration: false
+      autoToolReplyGeneration: false,
+      audioOutput: modalities.includes("audio")
     });
     const isAzure = !!(options.apiVersion || options.entraToken || options.azureDeployment);
     if (options.apiKey === "" && !isAzure) {
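The constructor now derives the session's `audioOutput` capability from the normalized modalities before calling `super`. A minimal standalone sketch of that derivation:

```ts
const DEFAULT_MODALITIES: ('text' | 'audio')[] = ['text', 'audio'];

function audioOutputFor(modalities?: ('text' | 'audio')[]): boolean {
  const resolved = modalities || DEFAULT_MODALITIES; // mirrors the `||` fallback above
  return resolved.includes('audio');
}

console.log(audioOutputFor(['text'])); // false → capabilities.audioOutput is off
```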
@@ -121,13 +124,15 @@ class RealtimeModel extends import_agents.llm.RealtimeModel {
       }
       options.baseURL = `${azureEndpoint.replace(/\/$/, "")}/openai`;
     }
+    const { modalities: _, ...optionsWithoutModalities } = options;
     this._options = {
       ...DEFAULT_REALTIME_MODEL_OPTIONS,
-      ...options,
+      ...optionsWithoutModalities,
       baseURL: options.baseURL || BASE_URL,
       apiKey,
       isAzure,
-      model: options.model || DEFAULT_REALTIME_MODEL_OPTIONS.model
+      model: options.model || DEFAULT_REALTIME_MODEL_OPTIONS.model,
+      modalities
   };
   }
   /**
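The added destructuring strips `modalities` from the user options before they are spread, so the normalized value computed in the constructor cannot be clobbered by the raw option. The idiom, isolated:

```ts
const options = { model: 'gpt-4o-realtime-preview', modalities: ['text'] };

// `_` captures and discards the key; the rest pattern carries everything else.
const { modalities: _, ...optionsWithoutModalities } = options;

console.log(optionsWithoutModalities); // { model: 'gpt-4o-realtime-preview' }
```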
@@ -267,6 +272,7 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
     this.messageChannel.put(command);
   }
   createSessionUpdateEvent() {
+    const modalities = this.oaiRealtimeModel._options.modalities.includes("audio") ? ["text", "audio"] : ["text"];
     return {
       type: "session.update",
       session: {
@@ -274,7 +280,7 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
         voice: this.oaiRealtimeModel._options.voice,
         input_audio_format: "pcm16",
         output_audio_format: "pcm16",
-        modalities
+        modalities,
         turn_detection: this.oaiRealtimeModel._options.turnDetection,
         input_audio_transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
         // TODO(shubhra): add inputAudioNoiseReduction
@@ -462,12 +468,31 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
     });
   }
   async truncate(_options) {
-    this.sendEvent({
-      type: "conversation.item.truncate",
-      content_index: 0,
-      item_id: _options.messageId,
-      audio_end_ms: _options.audioEndMs
-    });
+    if (!_options.modalities || _options.modalities.includes("audio")) {
+      this.sendEvent({
+        type: "conversation.item.truncate",
+        content_index: 0,
+        item_id: _options.messageId,
+        audio_end_ms: _options.audioEndMs
+      });
+    } else if (_options.audioTranscript !== void 0) {
+      const chatCtx = this.chatCtx.copy();
+      const idx = chatCtx.indexById(_options.messageId);
+      if (idx !== void 0) {
+        const item = chatCtx.items[idx];
+        if (item && item.type === "message") {
+          const newItem = import_agents.llm.ChatMessage.create({
+            ...item,
+            content: [_options.audioTranscript]
+          });
+          chatCtx.items[idx] = newItem;
+          const events = this.createChatCtxUpdateEvents(chatCtx);
+          for (const ev of events) {
+            this.sendEvent(ev);
+          }
+        }
+      }
+    }
   }
   loggableEvent(event) {
     const untypedEvent = {};
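`truncate` now branches on modalities: audio generations are truncated server-side via `conversation.item.truncate`, while text-only generations (which have no server-side audio to cut) rewrite the stored message from the supplied transcript. A hedged caller-side sketch — field names are taken from this diff, but the `session` object and item id are assumed:

```ts
await session.truncate({
  messageId: 'item_abc123',              // assumed item id
  audioEndMs: 0,                         // unused on the text-only path
  modalities: ['text'],                  // routes to the chat-context rewrite branch
  audioTranscript: 'Hello! How can I help?',
});
```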
@@ -683,6 +708,12 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
       case "response.content_part.done":
         this.handleResponseContentPartDone(event);
         break;
+      case "response.text.delta":
+        this.handleResponseTextDelta(event);
+        break;
+      case "response.text.done":
+        this.handleResponseTextDone(event);
+        break;
       case "response.audio_transcript.delta":
         this.handleResponseAudioTranscriptDelta(event);
         break;
@@ -799,6 +830,29 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
       this.textModeRecoveryRetries = 0;
       return;
     }
+    const itemId = event.item.id;
+    if (!itemId) {
+      throw new Error("item.id is not set");
+    }
+    const modalitiesFut = new import_agents.Future();
+    const itemGeneration = {
+      messageId: itemId,
+      textChannel: import_agents.stream.createStreamChannel(),
+      audioChannel: import_agents.stream.createStreamChannel(),
+      audioTranscript: "",
+      modalities: modalitiesFut
+    };
+    if (!this.oaiRealtimeModel.capabilities.audioOutput) {
+      itemGeneration.audioChannel.close();
+      modalitiesFut.resolve(["text"]);
+    }
+    this.currentGeneration.messageChannel.write({
+      messageId: itemId,
+      textStream: itemGeneration.textChannel.stream(),
+      audioStream: itemGeneration.audioChannel.stream(),
+      modalities: modalitiesFut.await
+    });
+    this.currentGeneration.messages.set(itemId, itemGeneration);
   }
   handleConversationItemCreated(event) {
     if (!event.item.id) {
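Each message generation now carries a `modalities` future: consumers receive a promise immediately, and the session resolves it once the first content part (or a missing audio capability) reveals whether the response is text-only. A minimal sketch of the pattern, under the assumption that `import_agents.Future` exposes `resolve`, `done`, and an `await` promise:

```ts
class Future<T> {
  done = false;
  resolve!: (value: T) => void;
  // The executor runs synchronously, so `resolve` is assigned at construction.
  readonly await = new Promise<T>((res) => {
    this.resolve = (value) => {
      this.done = true;
      res(value);
    };
  });
}

const modalitiesFut = new Future<('text' | 'audio')[]>();
modalitiesFut.await.then((m) => console.log('generation modalities:', m));
if (!modalitiesFut.done) modalitiesFut.resolve(['text']); // e.g. no audio capability
```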
@@ -859,35 +913,20 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
     }
     const itemId = event.item_id;
     const itemType = event.part.type;
-    const
-    if (
-    this.
-    if (this.textModeRecoveryRetries > 0) {
-      this.#logger.info(
-        { retries: this.textModeRecoveryRetries },
-        "recovered from text-only response"
-      );
-      this.textModeRecoveryRetries = 0;
-    }
-    const itemGeneration = {
-      messageId: itemId,
-      textChannel: import_agents.stream.createStreamChannel(),
-      audioChannel: import_agents.stream.createStreamChannel(),
-      audioTranscript: ""
-    };
-    this.currentGeneration.messageChannel.write({
-      messageId: itemId,
-      textStream: itemGeneration.textChannel.stream(),
-      audioStream: itemGeneration.audioChannel.stream()
-    });
-    this.currentGeneration.messages.set(itemId, itemGeneration);
-    this.currentGeneration._firstTokenTimestamp = Date.now();
+    const itemGeneration = this.currentGeneration.messages.get(itemId);
+    if (!itemGeneration) {
+      this.#logger.warn(`itemGeneration not found for itemId=${itemId}`);
       return;
-    }
-
-
-
-
+    }
+    if (itemType === "text" && this.oaiRealtimeModel.capabilities.audioOutput) {
+      this.#logger.warn("Text response received from OpenAI Realtime API in audio modality.");
+    }
+    if (!itemGeneration.modalities.done) {
+      const modalityResult = itemType === "text" ? ["text"] : ["audio", "text"];
+      itemGeneration.modalities.resolve(modalityResult);
+    }
+    if (this.currentGeneration._firstTokenTimestamp === void 0) {
+      this.currentGeneration._firstTokenTimestamp = Date.now();
     }
   }
   handleResponseContentPartDone(event) {
@@ -898,6 +937,25 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
       throw new Error("currentGeneration is not set");
     }
   }
+  handleResponseTextDelta(event) {
+    if (!this.currentGeneration) {
+      throw new Error("currentGeneration is not set");
+    }
+    const itemGeneration = this.currentGeneration.messages.get(event.item_id);
+    if (!itemGeneration) {
+      throw new Error("itemGeneration is not set");
+    }
+    if (!this.oaiRealtimeModel.capabilities.audioOutput && !this.currentGeneration._firstTokenTimestamp) {
+      this.currentGeneration._firstTokenTimestamp = Date.now();
+    }
+    itemGeneration.textChannel.write(event.delta);
+    itemGeneration.audioTranscript += event.delta;
+  }
+  handleResponseTextDone(_event) {
+    if (!this.currentGeneration) {
+      throw new Error("currentGeneration is not set");
+    }
+  }
   handleResponseAudioTranscriptDelta(event) {
     if (!this.currentGeneration) {
       throw new Error("currentGeneration is not set");
@@ -920,6 +978,12 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
     if (!itemGeneration) {
       throw new Error("itemGeneration is not set");
     }
+    if (this.currentGeneration._firstTokenTimestamp === void 0) {
+      this.currentGeneration._firstTokenTimestamp = Date.now();
+    }
+    if (!itemGeneration.modalities.done) {
+      itemGeneration.modalities.resolve(["audio", "text"]);
+    }
     const binaryString = atob(event.delta);
     const len = binaryString.length;
     const bytes = new Uint8Array(len);
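Both the new text handler and the audio handler perform the same per-delta bookkeeping: stamp `_firstTokenTimestamp` exactly once, and resolve the generation's modalities on the first audio delta if nothing resolved them earlier. Isolated, with the Future shape sketched above assumed:

```ts
function onAudioDelta(gen: {
  firstTokenTimestamp?: number;
  modalities: { done: boolean; resolve: (m: ('text' | 'audio')[]) => void };
}): void {
  if (gen.firstTokenTimestamp === undefined) {
    gen.firstTokenTimestamp = Date.now(); // time-to-first-token, set once
  }
  if (!gen.modalities.done) {
    gen.modalities.resolve(['audio', 'text']); // audio observed → full modality set
  }
}
```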
@@ -968,6 +1032,9 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
       }
       itemGeneration.textChannel.close();
       itemGeneration.audioChannel.close();
+      if (!itemGeneration.modalities.done) {
+        itemGeneration.modalities.resolve(this.oaiRealtimeModel._options.modalities);
+      }
     }
   }
   handleResponseDone(_event) {
@@ -986,6 +1053,9 @@ class RealtimeSession extends import_agents.llm.RealtimeSession {
     for (const generation of this.currentGeneration.messages.values()) {
       generation.textChannel.close();
       generation.audioChannel.close();
+      if (!generation.modalities.done) {
+        generation.modalities.resolve(this.oaiRealtimeModel._options.modalities);
+      }
     }
     this.currentGeneration.functionChannel.close();
     this.currentGeneration.messageChannel.close();
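Both cleanup paths above guard against a generation finishing before any delta resolved its modalities; falling back to the configured session modalities keeps every awaiting consumer from hanging on the promise. Isolated:

```ts
function resolveIfPending<T>(
  fut: { done: boolean; resolve: (value: T) => void },
  fallback: T,
): void {
  if (!fut.done) {
    fut.resolve(fallback); // e.g. the session's configured modalities
  }
}
```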
@@ -1145,6 +1215,8 @@ function livekitItemToOpenAIItem(item) {
         role,
         content: contentList
       };
+    default:
+      throw new Error(`Unsupported item type: ${item.type}`);
   }
 }
 function openAIItemToLivekitItem(item) {
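With the added `default` arm, converting an unrecognized chat item now fails fast instead of falling through and returning `undefined`. A hedged sketch of the observable behavior — the helper is module-private, so it is shown standalone with its supported cases elided:

```ts
function toOpenAIItem(item: { type: string }): object {
  switch (item.type) {
    // ...supported cases elided in this sketch...
    default:
      throw new Error(`Unsupported item type: ${item.type}`);
  }
}

try {
  toOpenAIItem({ type: 'unknown_kind' });
} catch (err) {
  console.error((err as Error).message); // "Unsupported item type: unknown_kind"
}
```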