videosdk-plugins-openai 0.0.4__tar.gz → 0.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of videosdk-plugins-openai might be problematic.
- {videosdk_plugins_openai-0.0.4 → videosdk_plugins_openai-0.0.5}/.gitignore +1 -0
- {videosdk_plugins_openai-0.0.4 → videosdk_plugins_openai-0.0.5}/PKG-INFO +2 -2
- {videosdk_plugins_openai-0.0.4 → videosdk_plugins_openai-0.0.5}/pyproject.toml +1 -1
- {videosdk_plugins_openai-0.0.4 → videosdk_plugins_openai-0.0.5}/videosdk/plugins/openai/realtime_api.py +82 -22
- videosdk_plugins_openai-0.0.5/videosdk/plugins/openai/version.py +1 -0
- videosdk_plugins_openai-0.0.4/videosdk/plugins/openai/version.py +0 -1
- {videosdk_plugins_openai-0.0.4 → videosdk_plugins_openai-0.0.5}/README.md +0 -0
- {videosdk_plugins_openai-0.0.4 → videosdk_plugins_openai-0.0.5}/videosdk/plugins/openai/__init__.py +0 -0
{videosdk_plugins_openai-0.0.4 → videosdk_plugins_openai-0.0.5}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videosdk-plugins-openai
-Version: 0.0.4
+Version: 0.0.5
 Summary: VideoSDK Agent Framework plugin for OpenAI services
 Author: videosdk
 Keywords: ai,audio,openai,video,videosdk
@@ -12,7 +12,7 @@ Classifier: Topic :: Multimedia :: Video
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.11
 Requires-Dist: openai[realtime]>=1.68.2
-Requires-Dist: videosdk-agents>=0.0.
+Requires-Dist: videosdk-agents>=0.0.8
 Description-Content-Type: text/markdown
 
 VideoSDK OpenAI Plugin
{videosdk_plugins_openai-0.0.4 → videosdk_plugins_openai-0.0.5}/videosdk/plugins/openai/realtime_api.py

@@ -18,7 +18,8 @@ from videosdk.agents import (
     build_openai_schema,
     CustomAudioStreamTrack,
     ToolChoice,
-    RealtimeBaseModel
+    RealtimeBaseModel,
+    Agent
 )
 
 load_dotenv()
@@ -44,7 +45,8 @@ DEFAULT_TOOL_CHOICE = "auto"
 
 OpenAIEventTypes = Literal[
     "instructions_updated",
-    "tools_updated"
+    "tools_updated",
+    "text_response"
 ]
 DEFAULT_VOICE = "alloy"
 DEFAULT_INPUT_AUDIO_FORMAT = "pcm16"
@@ -128,8 +130,14 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         self.audio_track: Optional[CustomAudioStreamTrack] = None
         self._formatted_tools: Optional[List[Dict[str, Any]]] = None
         self.config: OpenAIRealtimeConfig = config or OpenAIRealtimeConfig()
-        self.on("instructions_updated", self._handle_instructions_updated)
-        self.on("tools_updated", self._handle_tools_updated)
+        # self.on("instructions_updated", self._handle_instructions_updated)
+        # self.on("tools_updated", self._handle_tools_updated)
+
+    def set_agent(self, agent: Agent) -> None:
+        self._instructions = agent.instructions
+        self._tools = agent.tools
+        self.tools_formatted = self._format_tools_for_session(self._tools)
+        self._formatted_tools = self.tools_formatted
 
     async def connect(self) -> None:
         headers = {"Agent": "VideoSDK Agents"}
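The new set_agent hook replaces the removed event wiring: instructions and tools now come straight from an Agent before the session is configured. A minimal wiring sketch, assuming a default-constructible OpenAIRealtime; the Agent constructor arguments below are illustrative, not taken from this diff:

import asyncio
from videosdk.agents import Agent
from videosdk.plugins.openai import OpenAIRealtime

async def main() -> None:
    # Hypothetical Agent construction; only .instructions and .tools are
    # known from the diff, since set_agent reads exactly those attributes.
    agent = Agent(instructions="You are a concise assistant.", tools=[])
    model = OpenAIRealtime()   # assumed defaults; the real constructor may need a model/config
    model.set_agent(agent)     # copies instructions and pre-formats tools
    await model.connect()      # session.update then uses the agent's values

asyncio.run(main())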
@@ -144,7 +152,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
 
     async def handle_audio_input(self, audio_data: bytes) -> None:
         """Handle incoming audio data from the user"""
-        if self._session and not self._closing:
+        if self._session and not self._closing and "audio" in self.config.modalities:
             base64_audio_data = base64.b64encode(audio_data).decode("utf-8")
             audio_event = {
                 "type": "input_audio_buffer.append",
@@ -299,17 +307,23 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
 
             elif event_type == "conversation.item.input_audio_transcription.completed":
                 await self._handle_input_audio_transcription_completed(data)
+
+            elif event_type == "response.text.done":
+                await self._handle_text_done(data)
 
         except Exception as e:
             self.emit_error(f"Error handling event {event_type}: {str(e)}")
 
     async def _handle_speech_started(self, data: dict) -> None:
         """Handle speech detection start"""
-
-
+        if "audio" in self.config.modalities:
+            await self.interrupt()
+            if self.audio_track:
+                self.audio_track.interrupt()
 
     async def _handle_speech_stopped(self, data: dict) -> None:
         """Handle speech detection end"""
+        pass
 
     async def _handle_response_created(self, data: dict) -> None:
         """Handle initial response creation"""
@@ -365,6 +379,9 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
 
     async def _handle_audio_delta(self, data: dict) -> None:
         """Handle audio chunk"""
+        if "audio" not in self.config.modalities:
+            return
+
         try:
             base64_audio_data = base64.b64decode(data.get("delta"))
             if base64_audio_data:
@@ -448,31 +465,45 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         if not self._session:
             return
 
+        # Conditionally set turn detection and audio transcription based on modalities
+        turn_detection = None
+        input_audio_transcription = None
+
+        if "audio" in self.config.modalities:
+            turn_detection = self.config.turn_detection.model_dump(
+                by_alias=True,
+                exclude_unset=True,
+                exclude_defaults=True,
+            ) if self.config.turn_detection else None
+            input_audio_transcription = self.config.input_audio_transcription.model_dump(
+                by_alias=True,
+                exclude_unset=True,
+                exclude_defaults=True,
+            ) if self.config.input_audio_transcription else None
+
         session_update = {
             "type": "session.update",
             "session": {
                 "model": self.model,
-                "voice": self.config.voice,
-                "instructions": self._instructions or "You are a helpful voice assistant that can answer questions and help with tasks.",
+                "instructions": self._instructions or "You are a helpful assistant that can answer questions and help with tasks.",
                 "temperature": self.config.temperature,
-                "turn_detection": self.config.turn_detection.model_dump(
-                    by_alias=True,
-                    exclude_unset=True,
-                    exclude_defaults=True,
-                ),
-                "input_audio_transcription": self.config.input_audio_transcription.model_dump(
-                    by_alias=True,
-                    exclude_unset=True,
-                    exclude_defaults=True,
-                ),
                 "tool_choice": self.config.tool_choice,
                 "tools": self._formatted_tools or [],
                 "modalities": self.config.modalities,
-                "input_audio_format": DEFAULT_INPUT_AUDIO_FORMAT,
-                "output_audio_format": DEFAULT_OUTPUT_AUDIO_FORMAT,
                 "max_response_output_tokens": "inf"
             }
         }
+
+        # Only add audio-related configurations if audio modality is enabled
+        if "audio" in self.config.modalities:
+            session_update["session"]["voice"] = self.config.voice
+            session_update["session"]["input_audio_format"] = DEFAULT_INPUT_AUDIO_FORMAT
+            session_update["session"]["output_audio_format"] = DEFAULT_OUTPUT_AUDIO_FORMAT
+            if turn_detection:
+                session_update["session"]["turn_detection"] = turn_detection
+            if input_audio_transcription:
+                session_update["session"]["input_audio_transcription"] = input_audio_transcription
+
         # Send the event
         await self.send_event(session_update)
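The session payload is now assembled per modality: voice, audio formats, turn detection, and transcription are attached only when "audio" is requested. A standalone sketch of that gating logic, independent of the plugin's classes (function name and parameters below are illustrative):

from typing import Any, Dict, List, Optional

def build_session(modalities: List[str],
                  voice: str = "alloy",
                  turn_detection: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    # Base session: always present, regardless of modality.
    session: Dict[str, Any] = {
        "modalities": modalities,
        "max_response_output_tokens": "inf",
    }
    # Audio-only keys are added exactly when "audio" is requested,
    # mirroring the conditional blocks in the diff above.
    if "audio" in modalities:
        session["voice"] = voice
        session["input_audio_format"] = "pcm16"
        session["output_audio_format"] = "pcm16"
        if turn_detection:
            session["turn_detection"] = turn_detection
    return {"type": "session.update", "session": session}

# Text-only sessions carry no voice or audio formats:
assert "voice" not in build_session(["text"])["session"]
assert build_session(["text", "audio"])["session"]["voice"] == "alloy"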
@@ -521,4 +552,33 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         tools = data.get("tools", [])
         self._tools = tools
         self.tools_formatted = self._format_tools_for_session(tools)
-        self._formatted_tools = self.tools_formatted
+        self._formatted_tools = self.tools_formatted
+
+    async def send_text_message(self, message: str) -> None:
+        """Send a text message to the OpenAI realtime API"""
+        if not self._session:
+            raise RuntimeError("No active WebSocket session")
+
+        await self.send_event({
+            "type": "conversation.item.create",
+            "item": {
+                "type": "message",
+                "role": "user",
+                "content": [
+                    {
+                        "type": "input_text",
+                        "text": message
+                    }
+                ]
+            }
+        })
+        await self.create_response()
+
+    async def _handle_text_done(self, data: dict) -> None:
+        """Handle text response completion"""
+        try:
+            text_content = data.get("text", "")
+            if text_content:
+                self.emit("text_response", {"text": text_content, "type": "done"})
+        except Exception as e:
+            print(f"[ERROR] Error handling text done: {e}")
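Together with the response.text.done dispatch added earlier, this gives a request/response text path. A hypothetical usage sketch, reusing the model from the first sketch; the on(event, handler) subscription is assumed from the RealtimeBaseModel event wiring visible in this diff:

def print_text(payload: dict) -> None:
    # payload shape comes from _handle_text_done: {"text": ..., "type": "done"}
    print("assistant:", payload["text"])

model.on("text_response", print_text)                  # emitted on response.text.done
await model.send_text_message("Summarize the last call.")  # inside an async context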
videosdk_plugins_openai-0.0.5/videosdk/plugins/openai/version.py (added)

@@ -0,0 +1 @@
+__version__ = "0.0.5"

videosdk_plugins_openai-0.0.4/videosdk/plugins/openai/version.py (removed)

@@ -1 +0,0 @@
-__version__ = "0.0.4"
{videosdk_plugins_openai-0.0.4 → videosdk_plugins_openai-0.0.5}/README.md
RENAMED — file without changes

{videosdk_plugins_openai-0.0.4 → videosdk_plugins_openai-0.0.5}/videosdk/plugins/openai/__init__.py
RENAMED — file without changes