videosdk-plugins-openai 0.0.1.tar.gz

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of videosdk-plugins-openai might be problematic.

--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+ myenv/
+ venv/
+ env/
+ __pycache__
+
+ .env
+ .env.local
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,27 @@
+ Metadata-Version: 2.4
+ Name: videosdk-plugins-openai
+ Version: 0.0.1
+ Summary: VideoSDK Agent Framework plugin for OpenAI services
+ Author: videosdk
+ License-Expression: Apache-2.0
+ Keywords: ai,audio,openai,video,videosdk
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Topic :: Communications :: Conferencing
+ Classifier: Topic :: Multimedia :: Sound/Audio
+ Classifier: Topic :: Multimedia :: Video
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Requires-Python: >=3.9.0
+ Requires-Dist: openai[realtime]>=1.68.2
+ Requires-Dist: videosdk-agents>=0.0.1
+ Description-Content-Type: text/markdown
+
+ VideoSDK OpenAI Plugin
+
+ Agent Framework plugin for realtime services from OpenAI.
+
+ ## Installation
+
+ ```bash
+ pip install videosdk-plugins-openai
+ ```
--- /dev/null
+++ b/README.md
@@ -0,0 +1,9 @@
+ VideoSDK OpenAI Plugin
+
+ Agent Framework plugin for realtime services from OpenAI.
+
+ ## Installation
+
+ ```bash
+ pip install videosdk-plugins-openai
+ ```
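
Beyond installation, the plugin is driven through the `OpenAIRealtime` class defined in `realtime_api.py` below. A minimal usage sketch, assuming the import path `videosdk.plugins.openai` implied by the package layout and a realtime-capable model id; the VideoSDK agent wiring (audio track, event loop, tools) is omitted:

```python
import asyncio

# Import path inferred from the package layout in this diff (videosdk/plugins/openai/).
from videosdk.plugins.openai import OpenAIRealtime, OpenAIRealtimeConfig

async def main() -> None:
    # api_key falls back to the OPENAI_API_KEY environment variable.
    model = OpenAIRealtime(
        model="gpt-4o-realtime-preview",  # hypothetical model id; any realtime-capable model works
        config=OpenAIRealtimeConfig(voice="alloy", temperature=0.8),
    )
    await model.connect()                              # opens the realtime WebSocket and sends the first session.update
    await model.send_message("Hello from VideoSDK!")   # asks the model to repeat the message back
    await asyncio.sleep(5)                             # give the response events time to arrive
    await model.aclose()                               # closes the WebSocket and the underlying HTTP session

asyncio.run(main())
```

In a real agent the framework supplies the audio track and event loop that `_handle_audio_delta` writes to; without them the audio deltas are simply dropped.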
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,35 @@
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [project]
+ name = "videosdk-plugins-openai"
+ dynamic = ["version"]
+ description = "VideoSDK Agent Framework plugin for OpenAI services"
+ readme = "README.md"
+ license = "Apache-2.0"
+ requires-python = ">=3.9.0"
+ authors = [{ name = "videosdk"}]
+ keywords = ["video", "audio", "ai", "openai", "videosdk"]
+ classifiers = [
+     "Intended Audience :: Developers",
+     "Development Status :: 4 - Beta",
+     "Intended Audience :: Developers",
+     "Topic :: Communications :: Conferencing",
+     "Topic :: Multimedia :: Sound/Audio",
+     "Topic :: Multimedia :: Video",
+     "Topic :: Scientific/Engineering :: Artificial Intelligence",
+ ]
+ dependencies = [
+     "videosdk-agents>=0.0.1",
+     "openai[realtime]>=1.68.2",
+ ]
+
+ [tool.hatch.version]
+ path = "videosdk/plugins/openai/version.py"
+
+ [tool.hatch.build.targets.wheel]
+ packages = ["videosdk"]
+
+ [tool.hatch.build.targets.sdist]
+ include = ["/videosdk"]
--- /dev/null
+++ b/videosdk/plugins/openai/__init__.py
@@ -0,0 +1,6 @@
+ from .realtime_api import OpenAIRealtime, OpenAIRealtimeConfig
+
+ __all__ = [
+     'OpenAIRealtime',
+     'OpenAIRealtimeConfig'
+ ]
--- /dev/null
+++ b/videosdk/plugins/openai/realtime_api.py
@@ -0,0 +1,524 @@
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ import os
+ from typing import Any, Dict, Optional, Literal, List
+ from dataclasses import dataclass, field
+ from urllib.parse import parse_qs, urlencode, urlparse, urlunparse
+ from dotenv import load_dotenv
+ import uuid
+ import base64
+ import aiohttp
+ import traceback
+ from agent import (
+     FunctionTool,
+     is_function_tool,
+     get_tool_info,
+     build_openai_schema,
+     CustomAudioStreamTrack,
+     ToolChoice
+ )
+
+ load_dotenv()
+
+ from agent.realtime_base_model import RealtimeBaseModel
+ from openai.types.beta.realtime.session import InputAudioTranscription, TurnDetection
+
+ OPENAI_BASE_URL = "https://api.openai.com/v1"
+ SAMPLE_RATE = 24000
+ NUM_CHANNELS = 1
+
+ DEFAULT_TEMPERATURE = 0.8
+ DEFAULT_TURN_DETECTION = TurnDetection(
+     type="server_vad",
+     threshold=0.5,
+     prefix_padding_ms=300,
+     silence_duration_ms=200,
+     create_response=True,
+     interrupt_response=True,
+ )
+ DEFAULT_INPUT_AUDIO_TRANSCRIPTION = InputAudioTranscription(
+     model="gpt-4o-mini-transcribe",
+ )
+ DEFAULT_TOOL_CHOICE = "auto"
+
+ OpenAIEventTypes = Literal[
+     "instructions_updated",
+     "tools_updated"
+ ]
+ DEFAULT_VOICE = "alloy"
+ DEFAULT_INPUT_AUDIO_FORMAT = "pcm16"
+ DEFAULT_OUTPUT_AUDIO_FORMAT = "pcm16"
+
+ @dataclass
+ class OpenAIRealtimeConfig:
+     """Configuration for the OpenAI realtime API
+
+     Args:
+         voice: Voice ID for audio output. Default is 'alloy'
+         temperature: Controls randomness in response generation. Higher values (e.g. 0.8) make output more random,
+             lower values make it more deterministic. Default is 0.8
+         turn_detection: Configuration for detecting user speech turns. Contains settings for:
+             - type: Detection type ('server_vad')
+             - threshold: Voice activity detection threshold (0.0-1.0)
+             - prefix_padding_ms: Padding before speech start (ms)
+             - silence_duration_ms: Silence duration to mark end (ms)
+             - create_response: Whether to generate response on turn
+             - interrupt_response: Whether to allow interruption
+         input_audio_transcription: Configuration for audio transcription. Contains:
+             - model: Model to use for transcription
+         tool_choice: How tools should be selected ('auto' or 'none'). Default is 'auto'
+         modalities: List of enabled response types ["text", "audio"]. Default includes both
+     """
+     voice: str = DEFAULT_VOICE
+     temperature: float = DEFAULT_TEMPERATURE
+     turn_detection: TurnDetection | None = field(default_factory=lambda: DEFAULT_TURN_DETECTION)
+     input_audio_transcription: InputAudioTranscription | None = field(default_factory=lambda: DEFAULT_INPUT_AUDIO_TRANSCRIPTION)
+     tool_choice: ToolChoice | None = DEFAULT_TOOL_CHOICE
+     modalities: list[str] = field(default_factory=lambda: ["text", "audio"])
+
+ @dataclass
+ class OpenAISession:
+     """Represents an OpenAI WebSocket session"""
+     ws: aiohttp.ClientWebSocketResponse
+     msg_queue: asyncio.Queue[Dict[str, Any]]
+     tasks: list[asyncio.Task]
+
+ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
+     """OpenAI's realtime model implementation."""
+
+     def __init__(
+         self,
+         *,
+         model: str,
+         config: OpenAIRealtimeConfig | None = None,
+         api_key: str | None = None,
+         base_url: str | None = None,
+     ) -> None:
+         """
+         Initialize OpenAI realtime model.
+
+         Args:
+             model: The OpenAI model identifier to use (e.g. 'gpt-4o-realtime-preview')
+             config: Optional configuration object for customizing model behavior. Contains settings for:
+                 - voice: Voice ID to use for audio output
+                 - temperature: Sampling temperature for responses
+                 - turn_detection: Settings for detecting user speech turns
+                 - input_audio_transcription: Settings for audio transcription
+                 - tool_choice: How tools should be selected ('auto' or 'none')
+                 - modalities: List of enabled modalities ('text', 'audio')
+             api_key: OpenAI API key. If not provided, will attempt to read from OPENAI_API_KEY env var
+             base_url: Base URL for OpenAI API. Defaults to 'https://api.openai.com/v1'
+
+         Raises:
+             ValueError: If no API key is provided and none found in environment variables
+         """
+         super().__init__()
+         self.model = model
+         self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+         self.base_url = base_url or OPENAI_BASE_URL
+         if not self.api_key:
+             raise ValueError("OpenAI API key must be provided or set in OPENAI_API_KEY environment variable")
+         self._http_session: Optional[aiohttp.ClientSession] = None
+         self._session: Optional[OpenAISession] = None
+         self._closing = False
+         self._instructions: Optional[str] = None
+         self._tools: Optional[List[FunctionTool]] = None
+         self.loop = None
+         self.audio_track: Optional[CustomAudioStreamTrack] = None
+         self._formatted_tools: Optional[List[Dict[str, Any]]] = None
+         self.config: OpenAIRealtimeConfig = config or OpenAIRealtimeConfig()
+         self.on("instructions_updated", self._handle_instructions_updated)
+         self.on("tools_updated", self._handle_tools_updated)
+
+     async def connect(self) -> None:
+         headers = {"Agent": "VideoSDK Agents"}
+         headers["Authorization"] = f"Bearer {self.api_key}"
+         headers["OpenAI-Beta"] = "realtime=v1"
+
+         url = self.process_base_url(self.base_url, self.model)
+
+         self._session = await self._create_session(url, headers)
+         await self._handle_websocket(self._session)
+         await self.send_first_session_update()
+
+     async def handle_audio_input(self, audio_data: bytes) -> None:
+         """Handle incoming audio data from the user"""
+         if self._session and not self._closing:
+             base64_audio_data = base64.b64encode(audio_data).decode("utf-8")
+             audio_event = {
+                 "type": "input_audio_buffer.append",
+                 "audio": base64_audio_data
+             }
+             await self.send_event(audio_event)
+
+     async def _ensure_http_session(self) -> aiohttp.ClientSession:
+         """Ensure we have an HTTP session"""
+         if not self._http_session:
+             self._http_session = aiohttp.ClientSession()
+         return self._http_session
+
+     async def _create_session(self, url: str, headers: dict) -> OpenAISession:
+         """Create a new WebSocket session"""
+
+         http_session = await self._ensure_http_session()
+         ws = await http_session.ws_connect(url, headers=headers, autoping=True, heartbeat=10, autoclose=False, timeout=30)
+         msg_queue: asyncio.Queue = asyncio.Queue()
+         tasks: list[asyncio.Task] = []
+
+         self._closing = False
+
+         return OpenAISession(ws=ws, msg_queue=msg_queue, tasks=tasks)
+
+     async def send_message(self, message: str) -> None:
+         """Send a message to the OpenAI realtime API"""
+         await self.send_event({
+             "type": "conversation.item.create",
+             "item": {
+                 "type": "message",
+                 "role": "assistant",
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": "Repeat the user's exact message back to them: " + message + " DO NOT ADD ANYTHING ELSE",
+                     }
+                 ]
+             }
+         })
+         await self.create_response()
+
+     async def create_response(self) -> None:
+         """Create a response to the OpenAI realtime API"""
+         if not self._session:
+             raise RuntimeError("No active WebSocket session")
+
+         # Create response event
+         response_event = {
+             "type": "response.create",
+             "event_id": str(uuid.uuid4()),
+             "response": {
+                 "instructions": self._instructions,
+                 "metadata": {
+                     "client_event_id": str(uuid.uuid4())
+                 }
+             }
+         }
+
+         # Send the event through our message queue
+         await self.send_event(response_event)
+
+         # session_update = {
+         #     "type": "session.update",
+         #     "session": {
+         #         "instructions": self._instructions
+         #     }
+         # }
+
+         # await self.send_event(session_update)
+
+     async def _handle_websocket(self, session: OpenAISession) -> None:
+         """Start WebSocket send/receive tasks"""
+         session.tasks.extend([
+             asyncio.create_task(self._send_loop(session), name="send_loop"),
+             asyncio.create_task(self._receive_loop(session), name="receive_loop")
+         ])
+
+     async def _send_loop(self, session: OpenAISession) -> None:
+         """Send messages from queue to WebSocket"""
+         try:
+             while not self._closing:
+                 msg = await session.msg_queue.get()
+                 if isinstance(msg, dict):
+                     await session.ws.send_json(msg)
+                 else:
+                     await session.ws.send_str(str(msg))
+         except asyncio.CancelledError:
+             pass
+         finally:
+             await self._cleanup_session(session)
+
+     async def _receive_loop(self, session: OpenAISession) -> None:
+         """Receive and process WebSocket messages"""
+         try:
+             while not self._closing:
+                 msg = await session.ws.receive()
+
+                 if msg.type == aiohttp.WSMsgType.CLOSED:
+                     print("WebSocket closed with reason:", msg.extra)
+                     break
+                 elif msg.type == aiohttp.WSMsgType.ERROR:
+                     print("WebSocket error:", msg.data)
+                     break
+                 elif msg.type == aiohttp.WSMsgType.TEXT:
+                     await self._handle_message(json.loads(msg.data))
+         except Exception as e:
+             print("WebSocket receive error:", str(e))
+         finally:
+             await self._cleanup_session(session)
+
+     async def _handle_message(self, data: dict) -> None:
+         """Handle incoming WebSocket messages"""
+         try:
+             event_type = data.get('type')
+
+             if event_type == "input_audio_buffer.speech_started":
+                 await self._handle_speech_started(data)
+
+             elif event_type == "input_audio_buffer.speech_stopped":
+                 await self._handle_speech_stopped(data)
+
+             elif event_type == "response.created":
+                 await self._handle_response_created(data)
+
+             elif event_type == "response.output_item.added":
+                 await self._handle_output_item_added(data)
+
+             elif event_type == "response.content_part.added":
+                 await self._handle_content_part_added(data)
+
+             elif event_type == "response.audio.delta":
+                 await self._handle_audio_delta(data)
+
+             elif event_type == "response.audio_transcript.delta":
+                 await self._handle_transcript_delta(data)
+
+             elif event_type == "response.done":
+                 await self._handle_response_done(data)
+
+             elif event_type == "error":
+                 await self._handle_error(data)
+
+             elif event_type == "response.function_call_arguments.delta":
+                 await self._handle_function_call_arguments_delta(data)
+
+             elif event_type == "response.function_call_arguments.done":
+                 await self._handle_function_call_arguments_done(data)
+
+             elif event_type == "response.output_item.done":
+                 await self._handle_output_item_done(data)
+
+             elif event_type == "conversation.item.input_audio_transcription.completed":
+                 await self._handle_input_audio_transcription_completed(data)
+
+         except Exception as e:
+             self.emit_error(f"Error handling event {event_type}: {str(e)}")
+
+     async def _handle_speech_started(self, data: dict) -> None:
+         """Handle speech detection start"""
+         await self.interrupt()
+         self.audio_track.interrupt()
+
+     async def _handle_speech_stopped(self, data: dict) -> None:
+         """Handle speech detection end"""
+
+     async def _handle_response_created(self, data: dict) -> None:
+         """Handle initial response creation"""
+         response_id = data.get("response", {}).get("id")
+
+         self.emit("response_created", {"response_id": response_id})
+
+     async def _handle_output_item_added(self, data: dict) -> None:
+         """Handle new output item addition"""
+
+     async def _handle_output_item_done(self, data: dict) -> None:
+         """Handle output item done"""
+         try:
+             item = data.get("item", {})
+             if item.get("type") == "function_call" and item.get("status") == "completed":
+                 name = item.get("name")
+                 arguments = json.loads(item.get("arguments", "{}"))
+
+                 if name and self._tools:
+                     for tool in self._tools:
+                         tool_info = get_tool_info(tool)
+                         if tool_info.name == name:
+                             try:
+                                 result = await tool(**arguments)
+                                 await self.send_event({
+                                     "type": "conversation.item.create",
+                                     "item": {
+                                         "type": "function_call_output",
+                                         "call_id": item.get("call_id"),
+                                         "output": json.dumps(result)
+                                     }
+                                 })
+
+                                 await self.send_event({
+                                     "type": "response.create",
+                                     "event_id": str(uuid.uuid4()),
+                                     "response": {
+                                         "instructions": self._instructions,
+                                         "metadata": {
+                                             "client_event_id": str(uuid.uuid4())
+                                         }
+                                     }
+                                 })
+
+                             except Exception as e:
+                                 print(f"Error executing function {name}: {e}")
+                             break
+         except Exception as e:
+             print(f"Error handling output item done: {e}")
+
+     async def _handle_content_part_added(self, data: dict) -> None:
+         """Handle new content part"""
+
+     async def _handle_audio_delta(self, data: dict) -> None:
+         """Handle audio chunk"""
+         try:
+             base64_audio_data = base64.b64decode(data.get("delta"))
+             if base64_audio_data:
+                 if self.audio_track and self.loop:
+                     self.loop.create_task(self.audio_track.add_new_bytes(base64_audio_data))
+         except Exception as e:
+             print(f"[ERROR] Error handling audio delta: {e}")
+             traceback.print_exc()
+
+     async def interrupt(self) -> None:
+         """Interrupt the current response and flush audio"""
+         if self._session and not self._closing:
+             cancel_event = {
+                 "type": "response.cancel",
+                 "event_id": str(uuid.uuid4())
+             }
+             await self.send_event(cancel_event)
+
+     async def _handle_transcript_delta(self, data: dict) -> None:
+         """Handle transcript chunk"""
+
+     async def _handle_input_audio_transcription_completed(self, data: dict) -> None:
+         """Handle input audio transcription completion"""
+         # if "transcript" in data:
+         #     self.emit("transcription_event", {"text": data["transcript"]})
+
+     async def _handle_response_done(self, data: dict) -> None:
+         """Handle response completion"""
+
+     async def _handle_function_call_arguments_delta(self, data: dict) -> None:
+         """Handle function call arguments delta"""
+
+     async def _handle_function_call_arguments_done(self, data: dict) -> None:
+         """Handle function call arguments done"""
+
+     async def _handle_error(self, data: dict) -> None:
+         """Handle error events"""
+
+     async def _cleanup_session(self, session: OpenAISession) -> None:
+         """Clean up session resources"""
+         if self._closing:
+             return
+
+         self._closing = True
+
+         for task in session.tasks:
+             if not task.done():
+                 task.cancel()
+                 try:
+                     await asyncio.wait_for(task, timeout=1.0)  # Add timeout
+                 except (asyncio.CancelledError, asyncio.TimeoutError):
+                     pass
+
+         # Close WebSocket
+         if not session.ws.closed:
+             try:
+                 await session.ws.close()
+             except Exception:
+                 pass
+
+     async def send_event(self, event: Dict[str, Any]) -> None:
+         """Send an event to the WebSocket"""
+         if self._session and not self._closing:
+             await self._session.msg_queue.put(event)
+
+     async def aclose(self) -> None:
+         """Cleanup all resources"""
+         if self._closing:
+             return
+
+         self._closing = True
+
+         if self._session:
+             await self._cleanup_session(self._session)
+
+         if self._http_session and not self._http_session.closed:
+             await self._http_session.close()
+
+     async def send_first_session_update(self) -> None:
+         """Send initial session update with default values after connection"""
+         if not self._session:
+             return
+
+         session_update = {
+             "type": "session.update",
+             "session": {
+                 "model": self.model,
+                 "voice": self.config.voice,
+                 "instructions": self._instructions or "You are a helpful voice assistant that can answer questions and help with tasks.",
+                 "temperature": self.config.temperature,
+                 "turn_detection": self.config.turn_detection.model_dump(
+                     by_alias=True,
+                     exclude_unset=True,
+                     exclude_defaults=True,
+                 ),
+                 "input_audio_transcription": self.config.input_audio_transcription.model_dump(
+                     by_alias=True,
+                     exclude_unset=True,
+                     exclude_defaults=True,
+                 ),
+                 "tool_choice": self.config.tool_choice,
+                 "tools": self._formatted_tools or [],
+                 "modalities": self.config.modalities,
+                 "input_audio_format": DEFAULT_INPUT_AUDIO_FORMAT,
+                 "output_audio_format": DEFAULT_OUTPUT_AUDIO_FORMAT,
+                 "max_response_output_tokens": "inf"
+             }
+         }
+         # Send the event
+         await self.send_event(session_update)
+
+     def process_base_url(self, url: str, model: str) -> str:
+         if url.startswith("http"):
+             url = url.replace("http", "ws", 1)
+
+         parsed_url = urlparse(url)
+         query_params = parse_qs(parsed_url.query)
+
+         if not parsed_url.path or parsed_url.path.rstrip("/") in ["", "/v1", "/openai"]:
+             path = parsed_url.path.rstrip("/") + "/realtime"
+         else:
+             path = parsed_url.path
+
+         if "model" not in query_params:
+             query_params["model"] = [model]
+
+         new_query = urlencode(query_params, doseq=True)
+         new_url = urlunparse((parsed_url.scheme, parsed_url.netloc, path, "", new_query, ""))
+
+         return new_url
+
+     def _handle_instructions_updated(self, data: Dict[str, Any]) -> None:
+         """Handle instructions_updated event"""
+         self._instructions = data.get("instructions")
+
+     def _format_tools_for_session(self, tools: List[FunctionTool]) -> List[Dict[str, Any]]:
+         """Format tools for OpenAI session update"""
+         oai_tools = []
+         for tool in tools:
+             if not is_function_tool(tool):
+                 continue
+
+             try:
+                 tool_schema = build_openai_schema(tool)
+                 oai_tools.append(tool_schema)
+             except Exception as e:
+                 print(f"Failed to format tool {tool}: {e}")
+                 continue
+
+         return oai_tools
+
+     def _handle_tools_updated(self, data: Dict[str, Any]) -> None:
+         """Handle tools_updated event"""
+         tools = data.get("tools", [])
+         self._tools = tools
+         self._formatted_tools = self._format_tools_for_session(tools)  # Format for OpenAI
--- /dev/null
+++ b/videosdk/plugins/openai/version.py
@@ -0,0 +1 @@
+ __version__ = "0.0.1"
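
The `OpenAIRealtimeConfig` docstring in `realtime_api.py` above describes the turn-detection, transcription, and modality knobs. A configuration sketch under the same assumptions as the earlier usage example (import path inferred from the package layout; values here are illustrative, not package defaults):

```python
from openai.types.beta.realtime.session import InputAudioTranscription, TurnDetection

# Import path inferred from the package layout in this diff.
from videosdk.plugins.openai import OpenAIRealtime, OpenAIRealtimeConfig

config = OpenAIRealtimeConfig(
    voice="verse",                      # any OpenAI realtime voice id
    temperature=0.6,
    turn_detection=TurnDetection(       # server-side VAD, tuned tighter than the plugin defaults
        type="server_vad",
        threshold=0.6,
        prefix_padding_ms=200,
        silence_duration_ms=300,
        create_response=True,
        interrupt_response=True,
    ),
    input_audio_transcription=InputAudioTranscription(model="gpt-4o-mini-transcribe"),
    tool_choice="auto",
    modalities=["text", "audio"],
)

model = OpenAIRealtime(
    model="gpt-4o-realtime-preview",    # hypothetical model id
    config=config,
)
```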