videosdk-plugins-openai 0.0.1 (tar.gz)
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Potentially problematic release.
This version of videosdk-plugins-openai might be problematic.
- videosdk_plugins_openai-0.0.1/.gitignore +7 -0
- videosdk_plugins_openai-0.0.1/PKG-INFO +27 -0
- videosdk_plugins_openai-0.0.1/README.md +9 -0
- videosdk_plugins_openai-0.0.1/pyproject.toml +35 -0
- videosdk_plugins_openai-0.0.1/videosdk/plugins/openai/__init__.py +6 -0
- videosdk_plugins_openai-0.0.1/videosdk/plugins/openai/realtime_api.py +524 -0
- videosdk_plugins_openai-0.0.1/videosdk/plugins/openai/version.py +1 -0
--- /dev/null
+++ videosdk_plugins_openai-0.0.1/PKG-INFO
@@ -0,0 +1,27 @@
+Metadata-Version: 2.4
+Name: videosdk-plugins-openai
+Version: 0.0.1
+Summary: VideoSDK Agent Framework plugin for OpenAI services
+Author: videosdk
+License-Expression: Apache-2.0
+Keywords: ai,audio,openai,video,videosdk
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Topic :: Communications :: Conferencing
+Classifier: Topic :: Multimedia :: Sound/Audio
+Classifier: Topic :: Multimedia :: Video
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.9.0
+Requires-Dist: openai[realtime]>=1.68.2
+Requires-Dist: videosdk-agents>=0.0.1
+Description-Content-Type: text/markdown
+
+VideoSDK OpenAI Plugin
+
+Agent Framework plugin for realtime services from OpenAI.
+
+## Installation
+
+```bash
+pip install videosdk-plugins-openai
+```
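Taken together with the realtime_api.py source below, a minimal usage sketch might look like the following. This is an illustration, not documented API: it assumes the package's `__init__.py` re-exports `OpenAIRealtime` and `OpenAIRealtimeConfig`, and the model id is a placeholder.

```python
import asyncio

# Assumed re-exports from videosdk/plugins/openai/__init__.py.
from videosdk.plugins.openai import OpenAIRealtime, OpenAIRealtimeConfig

async def main() -> None:
    # api_key is omitted, so OPENAI_API_KEY is read from the environment.
    model = OpenAIRealtime(
        model="gpt-4o-realtime-preview",  # placeholder realtime model id
        config=OpenAIRealtimeConfig(voice="alloy", temperature=0.8),
    )
    await model.connect()  # opens the WebSocket and sends the first session.update
    try:
        await model.send_message("Hello from VideoSDK!")
    finally:
        await model.aclose()  # cancels loops, closes the socket and HTTP session

asyncio.run(main())
```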
--- /dev/null
+++ videosdk_plugins_openai-0.0.1/pyproject.toml
@@ -0,0 +1,35 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "videosdk-plugins-openai"
+dynamic = ["version"]
+description = "VideoSDK Agent Framework plugin for OpenAI services"
+readme = "README.md"
+license = "Apache-2.0"
+requires-python = ">=3.9.0"
+authors = [{ name = "videosdk"}]
+keywords = ["video", "audio", "ai", "openai", "videosdk"]
+classifiers = [
+    "Intended Audience :: Developers",
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "Topic :: Communications :: Conferencing",
+    "Topic :: Multimedia :: Sound/Audio",
+    "Topic :: Multimedia :: Video",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+dependencies = [
+    "videosdk-agents>=0.0.1",
+    "openai[realtime]>=1.68.2",
+]
+
+[tool.hatch.version]
+path = "videosdk/plugins/openai/version.py"
+
+[tool.hatch.build.targets.wheel]
+packages = ["videosdk"]
+
+[tool.hatch.build.targets.sdist]
+include = ["/videosdk"]
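Note that `version` is declared dynamic, so hatchling resolves it at build time from the `__version__` attribute in the file named under `[tool.hatch.version]` (shown at the end of this diff). A quick sanity check after installation:

```python
# Confirms the version hatchling baked into the built distribution.
from importlib.metadata import version

print(version("videosdk-plugins-openai"))  # expected: 0.0.1
```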
--- /dev/null
+++ videosdk_plugins_openai-0.0.1/videosdk/plugins/openai/realtime_api.py
@@ -0,0 +1,524 @@
+from __future__ import annotations
+
+import asyncio
+import json
+import os
+from typing import Any, Dict, Optional, Literal, List
+from dataclasses import dataclass, field
+from urllib.parse import parse_qs, urlencode, urlparse, urlunparse
+from dotenv import load_dotenv
+import uuid
+import base64
+import aiohttp
+import traceback
+from agent import (
+    FunctionTool,
+    is_function_tool,
+    get_tool_info,
+    build_openai_schema,
+    CustomAudioStreamTrack,
+    ToolChoice
+)
+
+load_dotenv()
+
+from agent.realtime_base_model import RealtimeBaseModel
+from openai.types.beta.realtime.session import InputAudioTranscription, TurnDetection
+
+OPENAI_BASE_URL = "https://api.openai.com/v1"
+SAMPLE_RATE = 24000
+NUM_CHANNELS = 1
+
+DEFAULT_TEMPERATURE = 0.8
+DEFAULT_TURN_DETECTION = TurnDetection(
+    type="server_vad",
+    threshold=0.5,
+    prefix_padding_ms=300,
+    silence_duration_ms=200,
+    create_response=True,
+    interrupt_response=True,
+)
+DEFAULT_INPUT_AUDIO_TRANSCRIPTION = InputAudioTranscription(
+    model="gpt-4o-mini-transcribe",
+)
+DEFAULT_TOOL_CHOICE = "auto"
+
+OpenAIEventTypes = Literal[
+    "instructions_updated",
+    "tools_updated"
+]
+DEFAULT_VOICE = "alloy"
+DEFAULT_INPUT_AUDIO_FORMAT = "pcm16"
+DEFAULT_OUTPUT_AUDIO_FORMAT = "pcm16"
+
+@dataclass
+class OpenAIRealtimeConfig:
+    """Configuration for the OpenAI realtime API
+
+    Args:
+        voice: Voice ID for audio output. Default is 'alloy'
+        temperature: Controls randomness in response generation. Higher values (e.g. 0.8) make output more random,
+            lower values make it more deterministic. Default is 0.8
+        turn_detection: Configuration for detecting user speech turns. Contains settings for:
+            - type: Detection type ('server_vad')
+            - threshold: Voice activity detection threshold (0.0-1.0)
+            - prefix_padding_ms: Padding before speech start (ms)
+            - silence_duration_ms: Silence duration to mark end (ms)
+            - create_response: Whether to generate response on turn
+            - interrupt_response: Whether to allow interruption
+        input_audio_transcription: Configuration for audio transcription. Contains:
+            - model: Model to use for transcription
+        tool_choice: How tools should be selected ('auto' or 'none'). Default is 'auto'
+        modalities: List of enabled response types ["text", "audio"]. Default includes both
+    """
+    voice: str = DEFAULT_VOICE
+    temperature: float = DEFAULT_TEMPERATURE
+    turn_detection: TurnDetection | None = field(default_factory=lambda: DEFAULT_TURN_DETECTION)
+    input_audio_transcription: InputAudioTranscription | None = field(default_factory=lambda: DEFAULT_INPUT_AUDIO_TRANSCRIPTION)
+    tool_choice: ToolChoice | None = DEFAULT_TOOL_CHOICE
+    modalities: list[str] = field(default_factory=lambda: ["text", "audio"])
+
+@dataclass
+class OpenAISession:
+    """Represents an OpenAI WebSocket session"""
+    ws: aiohttp.ClientWebSocketResponse
+    msg_queue: asyncio.Queue[Dict[str, Any]]
+    tasks: list[asyncio.Task]
+
+class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
+    """OpenAI's realtime model implementation."""
+
+    def __init__(
+        self,
+        *,
+        model: str,
+        config: OpenAIRealtimeConfig | None = None,
+        api_key: str | None = None,
+        base_url: str | None = None,
+    ) -> None:
+        """
+        Initialize OpenAI realtime model.
+
+        Args:
+            model: The OpenAI model identifier to use (e.g. 'gpt-4', 'gpt-3.5-turbo')
+            config: Optional configuration object for customizing model behavior. Contains settings for:
+                - voice: Voice ID to use for audio output
+                - temperature: Sampling temperature for responses
+                - turn_detection: Settings for detecting user speech turns
+                - input_audio_transcription: Settings for audio transcription
+                - tool_choice: How tools should be selected ('auto' or 'none')
+                - modalities: List of enabled modalities ('text', 'audio')
+            api_key: OpenAI API key. If not provided, will attempt to read from OPENAI_API_KEY env var
+            base_url: Base URL for OpenAI API. Defaults to 'https://api.openai.com/v1'
+
+        Raises:
+            ValueError: If no API key is provided and none found in environment variables
+        """
+        super().__init__()
+        self.model = model
+        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+        self.base_url = base_url or OPENAI_BASE_URL
+        if not self.api_key:
+            raise ValueError("OpenAI API key must be provided or set in OPENAI_API_KEY environment variable")
+        self._http_session: Optional[aiohttp.ClientSession] = None
+        self._session: Optional[OpenAISession] = None
+        self._closing = False
+        self._instructions: Optional[str] = None
+        self._tools: Optional[List[FunctionTool]] = None
+        self.loop = None
+        self.audio_track: Optional[CustomAudioStreamTrack] = None
+        self._formatted_tools: Optional[List[Dict[str, Any]]] = None
+        self.config: OpenAIRealtimeConfig = config or OpenAIRealtimeConfig()
+        self.on("instructions_updated", self._handle_instructions_updated)
+        self.on("tools_updated", self._handle_tools_updated)
+
+    async def connect(self) -> None:
+        headers = {"Agent": "VideoSDK Agents"}
+        headers["Authorization"] = f"Bearer {self.api_key}"
+        headers["OpenAI-Beta"] = "realtime=v1"
+
+        url = self.process_base_url(self.base_url, self.model)
+
+        self._session = await self._create_session(url, headers)
+        await self._handle_websocket(self._session)
+        await self.send_first_session_update()
+
+    async def handle_audio_input(self, audio_data: bytes) -> None:
+        """Handle incoming audio data from the user"""
+        if self._session and not self._closing:
+            base64_audio_data = base64.b64encode(audio_data).decode("utf-8")
+            audio_event = {
+                "type": "input_audio_buffer.append",
+                "audio": base64_audio_data
+            }
+            await self.send_event(audio_event)
+
+    async def _ensure_http_session(self) -> aiohttp.ClientSession:
+        """Ensure we have an HTTP session"""
+        if not self._http_session:
+            self._http_session = aiohttp.ClientSession()
+        return self._http_session
+
+    async def _create_session(self, url: str, headers: dict) -> OpenAISession:
+        """Create a new WebSocket session"""
+
+        http_session = await self._ensure_http_session()
+        ws = await http_session.ws_connect(url, headers=headers, autoping=True, heartbeat=10, autoclose=False, timeout=30)
+        msg_queue: asyncio.Queue = asyncio.Queue()
+        tasks: list[asyncio.Task] = []
+
+        self._closing = False
+
+        return OpenAISession(ws=ws, msg_queue=msg_queue, tasks=tasks)
+
+    async def send_message(self, message: str) -> None:
+        """Send a message to the OpenAI realtime API"""
+        await self.send_event({
+            "type": "conversation.item.create",
+            "item": {
+                "type": "message",
+                "role": "assistant",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "Repeat the user's exact message back to them:" + message + "DO NOT ADD ANYTHING ELSE",
+                    }
+                ]
+            }
+        })
+        await self.create_response()
+
+    async def create_response(self) -> None:
+        """Create a response to the OpenAI realtime API"""
+        if not self._session:
+            raise RuntimeError("No active WebSocket session")
+
+        # Create response event
+        response_event = {
+            "type": "response.create",
+            "event_id": str(uuid.uuid4()),
+            "response": {
+                "instructions": self._instructions,
+                "metadata": {
+                    "client_event_id": str(uuid.uuid4())
+                }
+            }
+        }
+
+        # Send the event through our message queue
+        await self.send_event(response_event)
+
+        # session_update = {
+        #     "type": "session.update",
+        #     "session": {
+        #         "instructions": self._instructions
+        #     }
+        # }
+
+        # await self.send_event(session_update)
+
+    async def _handle_websocket(self, session: OpenAISession) -> None:
+        """Start WebSocket send/receive tasks"""
+        session.tasks.extend([
+            asyncio.create_task(self._send_loop(session), name="send_loop"),
+            asyncio.create_task(self._receive_loop(session), name="receive_loop")
+        ])
+
+    async def _send_loop(self, session: OpenAISession) -> None:
+        """Send messages from queue to WebSocket"""
+        try:
+            while not self._closing:
+                msg = await session.msg_queue.get()
+                if isinstance(msg, dict):
+                    await session.ws.send_json(msg)
+                else:
+                    await session.ws.send_str(str(msg))
+        except asyncio.CancelledError:
+            pass
+        finally:
+            await self._cleanup_session(session)
+
+    async def _receive_loop(self, session: OpenAISession) -> None:
+        """Receive and process WebSocket messages"""
+        try:
+            while not self._closing:
+                msg = await session.ws.receive()
+
+                if msg.type == aiohttp.WSMsgType.CLOSED:
+                    print("WebSocket closed with reason:", msg.extra)
+                    break
+                elif msg.type == aiohttp.WSMsgType.ERROR:
+                    print("WebSocket error:", msg.data)
+                    break
+                elif msg.type == aiohttp.WSMsgType.TEXT:
+                    await self._handle_message(json.loads(msg.data))
+        except Exception as e:
+            print("WebSocket receive error:", str(e))
+        finally:
+            await self._cleanup_session(session)
+
+    async def _handle_message(self, data: dict) -> None:
+        """Handle incoming WebSocket messages"""
+        try:
+            event_type = data.get('type')
+
+            if event_type == "input_audio_buffer.speech_started":
+                await self._handle_speech_started(data)
+
+            elif event_type == "input_audio_buffer.speech_stopped":
+                await self._handle_speech_stopped(data)
+
+            elif event_type == "response.created":
+                await self._handle_response_created(data)
+
+            elif event_type == "response.output_item.added":
+                await self._handle_output_item_added(data)
+
+            elif event_type == "response.content_part.added":
+                await self._handle_content_part_added(data)
+
+            elif event_type == "response.audio.delta":
+                await self._handle_audio_delta(data)
+
+            elif event_type == "response.audio_transcript.delta":
+                await self._handle_transcript_delta(data)
+
+            elif event_type == "response.done":
+                await self._handle_response_done(data)
+
+            elif event_type == "error":
+                await self._handle_error(data)
+
+            elif event_type == "response.function_call_arguments.delta":
+                await self._handle_function_call_arguments_delta(data)
+
+            elif event_type == "response.function_call_arguments.done":
+                await self._handle_function_call_arguments_done(data)
+
+            elif event_type == "response.output_item.done":
+                await self._handle_output_item_done(data)
+
+            elif event_type == "conversation.item.input_audio_transcription.completed":
+                await self._handle_input_audio_transcription_completed(data)
+
+        except Exception as e:
+            self.emit_error(f"Error handling event {event_type}: {str(e)}")
+
+    async def _handle_speech_started(self, data: dict) -> None:
+        """Handle speech detection start"""
+        await self.interrupt()
+        self.audio_track.interrupt()
+
+    async def _handle_speech_stopped(self, data: dict) -> None:
+        """Handle speech detection end"""
+
+    async def _handle_response_created(self, data: dict) -> None:
+        """Handle initial response creation"""
+        response_id = data.get("response", {}).get("id")
+
+        self.emit("response_created", {"response_id": response_id})
+
+    async def _handle_output_item_added(self, data: dict) -> None:
+        """Handle new output item addition"""
+
+    async def _handle_output_item_done(self, data: dict) -> None:
+        """Handle output item done"""
+        try:
+            item = data.get("item", {})
+            if item.get("type") == "function_call" and item.get("status") == "completed":
+                name = item.get("name")
+                arguments = json.loads(item.get("arguments", "{}"))
+
+                if name and self._tools:
+                    for tool in self._tools:
+                        tool_info = get_tool_info(tool)
+                        if tool_info.name == name:
+                            try:
+                                result = await tool(**arguments)
+                                await self.send_event({
+                                    "type": "conversation.item.create",
+                                    "item": {
+                                        "type": "function_call_output",
+                                        "call_id": item.get("call_id"),
+                                        "output": json.dumps(result)
+                                    }
+                                })
+
+                                await self.send_event({
+                                    "type": "response.create",
+                                    "event_id": str(uuid.uuid4()),
+                                    "response": {
+                                        "instructions": self._instructions,
+                                        "metadata": {
+                                            "client_event_id": str(uuid.uuid4())
+                                        }
+                                    }
+                                })
+
+                            except Exception as e:
+                                print(f"Error executing function {name}: {e}")
+                            break
+        except Exception as e:
+            print(f"Error handling output item done: {e}")
+
+    async def _handle_content_part_added(self, data: dict) -> None:
+        """Handle new content part"""
+
+    async def _handle_audio_delta(self, data: dict) -> None:
+        """Handle audio chunk"""
+        try:
+            base64_audio_data = base64.b64decode(data.get("delta"))
+            if base64_audio_data:
+                if self.audio_track and self.loop:
+                    self.loop.create_task(self.audio_track.add_new_bytes(base64_audio_data))
+        except Exception as e:
+            print(f"[ERROR] Error handling audio delta: {e}")
+            traceback.print_exc()
+
+    async def interrupt(self) -> None:
+        """Interrupt the current response and flush audio"""
+        if self._session and not self._closing:
+            cancel_event = {
+                "type": "response.cancel",
+                "event_id": str(uuid.uuid4())
+            }
+            await self.send_event(cancel_event)
+
+    async def _handle_transcript_delta(self, data: dict) -> None:
+        """Handle transcript chunk"""
+
+    async def _handle_input_audio_transcription_completed(self, data: dict) -> None:
+        """Handle input audio transcription completion"""
+        # if "transcript" in data:
+        #     self.emit("transcription_event", {"text": data["transcript"]})
+
+    async def _handle_response_done(self, data: dict) -> None:
+        """Handle response completion"""
+
+    async def _handle_function_call_arguments_delta(self, data: dict) -> None:
+        """Handle function call arguments delta"""
+
+    async def _handle_function_call_arguments_done(self, data: dict) -> None:
+        """Handle function call arguments done"""
+
+    async def _handle_error(self, data: dict) -> None:
+        """Handle error events"""
+
+    async def _cleanup_session(self, session: OpenAISession) -> None:
+        """Clean up session resources"""
+        if self._closing:
+            return
+
+        self._closing = True
+
+        for task in session.tasks:
+            if not task.done():
+                task.cancel()
+                try:
+                    await asyncio.wait_for(task, timeout=1.0)  # Add timeout
+                except (asyncio.CancelledError, asyncio.TimeoutError):
+                    pass
+
+        # Close WebSocket
+        if not session.ws.closed:
+            try:
+                await session.ws.close()
+            except Exception:
+                pass
+
+    async def send_event(self, event: Dict[str, Any]) -> None:
+        """Send an event to the WebSocket"""
+        if self._session and not self._closing:
+            await self._session.msg_queue.put(event)
+
+    async def aclose(self) -> None:
+        """Cleanup all resources"""
+        if self._closing:
+            return
+
+        self._closing = True
+
+        if self._session:
+            await self._cleanup_session(self._session)
+
+        if self._http_session and not self._http_session.closed:
+            await self._http_session.close()
+
+    async def send_first_session_update(self) -> None:
+        """Send initial session update with default values after connection"""
+        if not self._session:
+            return
+
+        session_update = {
+            "type": "session.update",
+            "session": {
+                "model": self.model,
+                "voice": self.config.voice,
+                "instructions": self._instructions or "You are a helpful voice assistant that can answer questions and help with tasks.",
+                "temperature": self.config.temperature,
+                "turn_detection": self.config.turn_detection.model_dump(
+                    by_alias=True,
+                    exclude_unset=True,
+                    exclude_defaults=True,
+                ),
+                "input_audio_transcription": self.config.input_audio_transcription.model_dump(
+                    by_alias=True,
+                    exclude_unset=True,
+                    exclude_defaults=True,
+                ),
+                "tool_choice": self.config.tool_choice,
+                "tools": self._formatted_tools or [],
+                "modalities": self.config.modalities,
+                "input_audio_format": DEFAULT_INPUT_AUDIO_FORMAT,
+                "output_audio_format": DEFAULT_OUTPUT_AUDIO_FORMAT,
+                "max_response_output_tokens": "inf"
+            }
+        }
+        # Send the event
+        await self.send_event(session_update)
+
+    def process_base_url(self, url: str, model: str) -> str:
+        if url.startswith("http"):
+            url = url.replace("http", "ws", 1)
+
+        parsed_url = urlparse(url)
+        query_params = parse_qs(parsed_url.query)
+
+        if not parsed_url.path or parsed_url.path.rstrip("/") in ["", "/v1", "/openai"]:
+            path = parsed_url.path.rstrip("/") + "/realtime"
+        else:
+            path = parsed_url.path
+
+        if "model" not in query_params:
+            query_params["model"] = [model]
+
+        new_query = urlencode(query_params, doseq=True)
+        new_url = urlunparse((parsed_url.scheme, parsed_url.netloc, path, "", new_query, ""))
+
+        return new_url
+
+    def _handle_instructions_updated(self, data: Dict[str, Any]) -> None:
+        """Handle instructions_updated event"""
+        self._instructions = data.get("instructions")
+
+    def _format_tools_for_session(self, tools: List[FunctionTool]) -> List[Dict[str, Any]]:
+        """Format tools for OpenAI session update"""
+        oai_tools = []
+        for tool in tools:
+            if not is_function_tool(tool):
+                continue
+
+            try:
+                tool_schema = build_openai_schema(tool)
+                oai_tools.append(tool_schema)
+            except Exception as e:
+                print(f"Failed to format tool {tool}: {e}")
+                continue
+
+        return oai_tools
+
+    def _handle_tools_updated(self, data: Dict[str, Any]) -> None:
+        """Handle tools_updated event"""
+        tools = data.get("tools", [])
+        self._tools = tools
+        self._formatted_tools = self._format_tools_for_session(tools)  # Format for OpenAI
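For reference, `process_base_url` above turns the REST base URL into the realtime WebSocket endpoint. A standalone sketch that mirrors the same logic, useful for checking what URL the plugin will actually dial (the model id is a placeholder):

```python
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

def process_base_url(url: str, model: str) -> str:
    # Mirrors OpenAIRealtime.process_base_url for demonstration only.
    if url.startswith("http"):
        url = url.replace("http", "ws", 1)  # https -> wss, http -> ws
    parsed = urlparse(url)
    query = parse_qs(parsed.query)
    # Bare "", "/v1", and "/openai" paths gain the "/realtime" suffix.
    if not parsed.path or parsed.path.rstrip("/") in ["", "/v1", "/openai"]:
        path = parsed.path.rstrip("/") + "/realtime"
    else:
        path = parsed.path
    if "model" not in query:
        query["model"] = [model]
    return urlunparse((parsed.scheme, parsed.netloc, path, "", urlencode(query, doseq=True), ""))

print(process_base_url("https://api.openai.com/v1", "gpt-4o-realtime-preview"))
# -> wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview
```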
--- /dev/null
+++ videosdk_plugins_openai-0.0.1/videosdk/plugins/openai/version.py
@@ -0,0 +1 @@
+__version__ = "0.0.1"