videosdk-plugins-openai 0.0.4__tar.gz → 0.0.5__tar.gz

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of videosdk-plugins-openai might be problematic.

@@ -7,3 +7,4 @@ __pycache__
 .env.local
 test_env/
 dist/
+.DS_Store
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videosdk-plugins-openai
-Version: 0.0.4
+Version: 0.0.5
 Summary: VideoSDK Agent Framework plugin for OpenAI services
 Author: videosdk
 Keywords: ai,audio,openai,video,videosdk
@@ -12,7 +12,7 @@ Classifier: Topic :: Multimedia :: Video
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.11
 Requires-Dist: openai[realtime]>=1.68.2
-Requires-Dist: videosdk-agents>=0.0.6
+Requires-Dist: videosdk-agents>=0.0.8
 Description-Content-Type: text/markdown

 VideoSDK OpenAI Plugin
@@ -20,7 +20,7 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
 ]
 dependencies = [
-    "videosdk-agents>=0.0.6",
+    "videosdk-agents>=0.0.8",
     "openai[realtime]>=1.68.2",
 ]

@@ -18,7 +18,8 @@ from videosdk.agents import (
     build_openai_schema,
     CustomAudioStreamTrack,
     ToolChoice,
-    RealtimeBaseModel
+    RealtimeBaseModel,
+    Agent
 )

 load_dotenv()
@@ -44,7 +45,8 @@ DEFAULT_TOOL_CHOICE = "auto"

 OpenAIEventTypes = Literal[
     "instructions_updated",
-    "tools_updated"
+    "tools_updated",
+    "text_response"
 ]
 DEFAULT_VOICE = "alloy"
 DEFAULT_INPUT_AUDIO_FORMAT = "pcm16"
@@ -128,8 +130,14 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         self.audio_track: Optional[CustomAudioStreamTrack] = None
         self._formatted_tools: Optional[List[Dict[str, Any]]] = None
         self.config: OpenAIRealtimeConfig = config or OpenAIRealtimeConfig()
-        self.on("instructions_updated", self._handle_instructions_updated)
-        self.on("tools_updated", self._handle_tools_updated)
+        # self.on("instructions_updated", self._handle_instructions_updated)
+        # self.on("tools_updated", self._handle_tools_updated)
+
+    def set_agent(self, agent: Agent) -> None:
+        self._instructions = agent.instructions
+        self._tools = agent.tools
+        self.tools_formatted = self._format_tools_for_session(self._tools)
+        self._formatted_tools = self.tools_formatted

     async def connect(self) -> None:
         headers = {"Agent": "VideoSDK Agents"}
@@ -144,7 +152,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):

     async def handle_audio_input(self, audio_data: bytes) -> None:
         """Handle incoming audio data from the user"""
-        if self._session and not self._closing:
+        if self._session and not self._closing and "audio" in self.config.modalities:
             base64_audio_data = base64.b64encode(audio_data).decode("utf-8")
             audio_event = {
                 "type": "input_audio_buffer.append",
@@ -299,17 +307,23 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):

             elif event_type == "conversation.item.input_audio_transcription.completed":
                 await self._handle_input_audio_transcription_completed(data)
+
+            elif event_type == "response.text.done":
+                await self._handle_text_done(data)

         except Exception as e:
             self.emit_error(f"Error handling event {event_type}: {str(e)}")

     async def _handle_speech_started(self, data: dict) -> None:
         """Handle speech detection start"""
-        await self.interrupt()
-        self.audio_track.interrupt()
+        if "audio" in self.config.modalities:
+            await self.interrupt()
+            if self.audio_track:
+                self.audio_track.interrupt()

     async def _handle_speech_stopped(self, data: dict) -> None:
         """Handle speech detection end"""
+        pass

     async def _handle_response_created(self, data: dict) -> None:
         """Handle initial response creation"""
@@ -365,6 +379,9 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):

     async def _handle_audio_delta(self, data: dict) -> None:
         """Handle audio chunk"""
+        if "audio" not in self.config.modalities:
+            return
+
         try:
             base64_audio_data = base64.b64decode(data.get("delta"))
             if base64_audio_data:
@@ -448,31 +465,45 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         if not self._session:
             return

+        # Conditionally set turn detection and audio transcription based on modalities
+        turn_detection = None
+        input_audio_transcription = None
+
+        if "audio" in self.config.modalities:
+            turn_detection = self.config.turn_detection.model_dump(
+                by_alias=True,
+                exclude_unset=True,
+                exclude_defaults=True,
+            ) if self.config.turn_detection else None
+            input_audio_transcription = self.config.input_audio_transcription.model_dump(
+                by_alias=True,
+                exclude_unset=True,
+                exclude_defaults=True,
+            ) if self.config.input_audio_transcription else None
+
         session_update = {
             "type": "session.update",
             "session": {
                 "model": self.model,
-                "voice": self.config.voice,
-                "instructions": self._instructions or "You are a helpful voice assistant that can answer questions and help with tasks.",
+                "instructions": self._instructions or "You are a helpful assistant that can answer questions and help with tasks.",
                 "temperature": self.config.temperature,
-                "turn_detection": self.config.turn_detection.model_dump(
-                    by_alias=True,
-                    exclude_unset=True,
-                    exclude_defaults=True,
-                ),
-                "input_audio_transcription": self.config.input_audio_transcription.model_dump(
-                    by_alias=True,
-                    exclude_unset=True,
-                    exclude_defaults=True,
-                ),
                 "tool_choice": self.config.tool_choice,
                 "tools": self._formatted_tools or [],
                 "modalities": self.config.modalities,
-                "input_audio_format": DEFAULT_INPUT_AUDIO_FORMAT,
-                "output_audio_format": DEFAULT_OUTPUT_AUDIO_FORMAT,
                 "max_response_output_tokens": "inf"
             }
         }
+
+        # Only add audio-related configurations if audio modality is enabled
+        if "audio" in self.config.modalities:
+            session_update["session"]["voice"] = self.config.voice
+            session_update["session"]["input_audio_format"] = DEFAULT_INPUT_AUDIO_FORMAT
+            session_update["session"]["output_audio_format"] = DEFAULT_OUTPUT_AUDIO_FORMAT
+            if turn_detection:
+                session_update["session"]["turn_detection"] = turn_detection
+            if input_audio_transcription:
+                session_update["session"]["input_audio_transcription"] = input_audio_transcription
+
         # Send the event
         await self.send_event(session_update)

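The net effect of this hunk is that voice, audio formats, turn detection, and input transcription are only added to the session.update payload when "audio" is listed in config.modalities, so a text-only session sends none of the audio-specific keys. A rough configuration sketch under that reading; only the modalities and voice fields appear in this diff, any other constructor arguments would be assumptions:

    from videosdk.plugins.openai import OpenAIRealtimeConfig  # import path assumed

    # Text-only session: the session.update built above omits voice,
    # input/output audio formats, turn_detection and input_audio_transcription.
    text_only_config = OpenAIRealtimeConfig(modalities=["text"])

    # Audio session: keeps the previous behaviour and sends the audio keys.
    audio_config = OpenAIRealtimeConfig(modalities=["text", "audio"], voice="alloy")
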
@@ -521,4 +552,33 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         tools = data.get("tools", [])
         self._tools = tools
         self.tools_formatted = self._format_tools_for_session(tools)
-        self._formatted_tools = self.tools_formatted
+        self._formatted_tools = self.tools_formatted
+
+    async def send_text_message(self, message: str) -> None:
+        """Send a text message to the OpenAI realtime API"""
+        if not self._session:
+            raise RuntimeError("No active WebSocket session")
+
+        await self.send_event({
+            "type": "conversation.item.create",
+            "item": {
+                "type": "message",
+                "role": "user",
+                "content": [
+                    {
+                        "type": "input_text",
+                        "text": message
+                    }
+                ]
+            }
+        })
+        await self.create_response()
+
+    async def _handle_text_done(self, data: dict) -> None:
+        """Handle text response completion"""
+        try:
+            text_content = data.get("text", "")
+            if text_content:
+                self.emit("text_response", {"text": text_content, "type": "done"})
+        except Exception as e:
+            print(f"[ERROR] Error handling text done: {e}")
@@ -0,0 +1 @@
+__version__ = "0.0.5"
@@ -1 +0,0 @@
-__version__ = "0.0.4"