videosdk-plugins-openai 0.0.21__tar.gz → 0.0.23__tar.gz

This diff compares the contents of two package versions publicly released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.

This version of videosdk-plugins-openai has been flagged as potentially problematic.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: videosdk-plugins-openai
- Version: 0.0.21
+ Version: 0.0.23
  Summary: VideoSDK Agent Framework plugin for OpenAI services
  Author: videosdk
  License-Expression: Apache-2.0
@@ -13,7 +13,7 @@ Classifier: Topic :: Multimedia :: Video
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Requires-Python: >=3.11
  Requires-Dist: openai[realtime]>=1.68.2
- Requires-Dist: videosdk-agents>=0.0.21
+ Requires-Dist: videosdk-agents>=0.0.23
  Description-Content-Type: text/markdown

  # VideoSDK OpenAI Plugin
@@ -21,7 +21,7 @@ classifiers = [
  "Topic :: Scientific/Engineering :: Artificial Intelligence",
  ]
  dependencies = [
- "videosdk-agents>=0.0.21",
+ "videosdk-agents>=0.0.23",
  "openai[realtime]>=1.68.2",
  ]

@@ -133,7 +133,7 @@ class OpenAILLM(LLM):
  tool_schema = build_openai_schema(tool)
  formatted_tools.append(tool_schema)
  except Exception as e:
- print(f"Failed to format tool {tool}: {e}")
+ self.emit("error", f"Failed to format tool {tool}: {e}")
  continue

  if formatted_tools:
@@ -167,7 +167,7 @@ class OpenAILLM(LLM):
  args = json.loads(current_function_call["arguments"])
  current_function_call["arguments"] = args
  except json.JSONDecodeError:
- print(f"Failed to parse function arguments: {current_function_call['arguments']}")
+ self.emit("error", f"Failed to parse function arguments: {current_function_call['arguments']}")
  current_function_call["arguments"] = {}

  yield LLMResponse(
@@ -24,6 +24,8 @@ from videosdk.agents import (
  global_event_emitter,
  Agent
  )
+ from videosdk.agents import realtime_metrics_collector
+

  load_dotenv()
  from openai.types.beta.realtime.session import InputAudioTranscription, TurnDetection
@@ -45,9 +47,9 @@ DEFAULT_INPUT_AUDIO_TRANSCRIPTION = InputAudioTranscription(
  DEFAULT_TOOL_CHOICE = "auto"

  OpenAIEventTypes = Literal[
- "instructions_updated",
- "tools_updated",
- "text_response"
+ "user_speech_started",
+ "text_response",
+ "error"
  ]
  DEFAULT_VOICE = "alloy"
  DEFAULT_INPUT_AUDIO_FORMAT = "pcm16"
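
Where 0.0.21 announced configuration changes ("instructions_updated", "tools_updated"), 0.0.23 narrows the surface to runtime signals: "user_speech_started", "text_response", and an "error" channel that the print()-to-emit() conversions throughout this diff feed. A minimal consumer sketch follows; the on(...) registration method and the handler signatures are assumptions inferred from the emit() call sites, not API confirmed by this diff.

    # Sketch only: subscribing to the event surface introduced in 0.0.23.
    # The on(...) registration method and payload shapes are assumptions
    # inferred from the emit() calls visible in this diff.
    from videosdk.plugins.openai import OpenAIRealtime

    model = OpenAIRealtime()  # constructor arguments elided; reads OPENAI_API_KEY

    def on_error(message: str) -> None:
        # 0.0.23 routes failures that 0.0.21 print()ed (tool formatting,
        # WebSocket close/error, handler exceptions) through this event.
        print(f"[openai-realtime] error: {message}")

    def on_user_speech_started(payload: dict) -> None:
        # Emitted from _handle_speech_started with {"type": "done"}.
        print("user speech started:", payload)

    model.on("error", on_error)
    model.on("user_speech_started", on_user_speech_started)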
@@ -121,6 +123,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
  self.api_key = api_key or os.getenv("OPENAI_API_KEY")
  self.base_url = base_url or OPENAI_BASE_URL
  if not self.api_key:
+ self.emit("error", "OpenAI API key must be provided or set in OPENAI_API_KEY environment variable")
  raise ValueError("OpenAI API key must be provided or set in OPENAI_API_KEY environment variable")
  self._http_session: Optional[aiohttp.ClientSession] = None
  self._session: Optional[OpenAISession] = None
@@ -133,6 +136,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
  self.config: OpenAIRealtimeConfig = config or OpenAIRealtimeConfig()
  self.input_sample_rate = 48000
  self.target_sample_rate = 16000
+ self._agent_speaking = False

  def set_agent(self, agent: Agent) -> None:
  self._instructions = agent.instructions
@@ -202,6 +206,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
  async def create_response(self) -> None:
  """Create a response to the OpenAI realtime API"""
  if not self._session:
+ self.emit("error", "No active WebSocket session")
  raise RuntimeError("No active WebSocket session")

  response_event = {
@@ -245,15 +250,15 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
  msg = await session.ws.receive()

  if msg.type == aiohttp.WSMsgType.CLOSED:
- print("WebSocket closed with reason:", msg.extra)
+ self.emit("error", f"WebSocket closed with reason: {msg.extra}")
  break
  elif msg.type == aiohttp.WSMsgType.ERROR:
- print("WebSocket error:", msg.data)
+ self.emit("error", f"WebSocket error: {msg.data}")
  break
  elif msg.type == aiohttp.WSMsgType.TEXT:
  await self._handle_message(json.loads(msg.data))
  except Exception as e:
- print("WebSocket receive error:", str(e))
+ self.emit("error", f"WebSocket receive error: {str(e)}")
  finally:
  await self._cleanup_session(session)

@@ -277,11 +282,14 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
  elif event_type == "response.content_part.added":
  await self._handle_content_part_added(data)

+ elif event_type == "response.text.delta":
+ await self._handle_text_delta(data)
+
  elif event_type == "response.audio.delta":
  await self._handle_audio_delta(data)

  elif event_type == "response.audio_transcript.delta":
- await self._handle_transcript_delta(data)
+ await self._handle_audio_transcript_delta(data)

  elif event_type == "response.done":
  await self._handle_response_done(data)
@@ -305,18 +313,20 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
  await self._handle_text_done(data)

  except Exception as e:
- self.emit_error(f"Error handling event {event_type}: {str(e)}")
+ self.emit("error", f"Error handling event {event_type}: {str(e)}")

  async def _handle_speech_started(self, data: dict) -> None:
  """Handle speech detection start"""
  if "audio" in self.config.modalities:
+ self.emit("user_speech_started", {"type": "done"})
  await self.interrupt()
  if self.audio_track:
  self.audio_track.interrupt()
+ await realtime_metrics_collector.set_user_speech_start()

  async def _handle_speech_stopped(self, data: dict) -> None:
  """Handle speech detection end"""
- pass
+ await realtime_metrics_collector.set_user_speech_end()

  async def _handle_response_created(self, data: dict) -> None:
  """Handle initial response creation"""
@@ -338,6 +348,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
  tool_info = get_tool_info(tool)
  if tool_info.name == name:
  try:
+ await realtime_metrics_collector.add_tool_call(name)
  result = await tool(**arguments)
  await self.send_event({
  "type": "conversation.item.create",
@@ -360,26 +371,33 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
  })

  except Exception as e:
- print(f"Error executing function {name}: {e}")
+ self.emit("error", f"Error executing function {name}: {e}")
  break
  except Exception as e:
- print(f"Error handling output item done: {e}")
+ self.emit("error", f"Error handling output item done: {e}")

  async def _handle_content_part_added(self, data: dict) -> None:
  """Handle new content part"""

+ async def _handle_text_delta(self, data: dict) -> None:
+ """Handle text delta chunk"""
+ pass
+
  async def _handle_audio_delta(self, data: dict) -> None:
  """Handle audio chunk"""
  if "audio" not in self.config.modalities:
  return

  try:
+ if not self._agent_speaking:
+ await realtime_metrics_collector.set_agent_speech_start()
+ self._agent_speaking = True
  base64_audio_data = base64.b64decode(data.get("delta"))
  if base64_audio_data:
  if self.audio_track and self.loop:
  self.loop.create_task(self.audio_track.add_new_bytes(base64_audio_data))
  except Exception as e:
- print(f"[ERROR] Error handling audio delta: {e}")
+ self.emit("error", f"Error handling audio delta: {e}")
  traceback.print_exc()

  async def interrupt(self) -> None:
@@ -390,18 +408,36 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
  "event_id": str(uuid.uuid4())
  }
  await self.send_event(cancel_event)
+ await realtime_metrics_collector.set_interrupted()
  if self.audio_track:
  self.audio_track.interrupt()
+ if self._agent_speaking:
+ await realtime_metrics_collector.set_agent_speech_end(timeout=1.0)
+ self._agent_speaking = False

- async def _handle_transcript_delta(self, data: dict) -> None:
+ async def _handle_audio_transcript_delta(self, data: dict) -> None:
  """Handle transcript chunk"""
-
+ delta_content = data.get("delta", "")
+ if not hasattr(self, '_current_audio_transcript'):
+ self._current_audio_transcript = ""
+ self._current_audio_transcript += delta_content
+
  async def _handle_input_audio_transcription_completed(self, data: dict) -> None:
- """Handle input audio transcription completion"""
+ """Handle input audio transcription completion for user transcript"""
+ transcript = data.get("transcript", "")
+ if transcript:
+ await realtime_metrics_collector.set_user_transcript(transcript)

  async def _handle_response_done(self, data: dict) -> None:
- """Handle response completion"""
-
+ """Handle response completion for agent transcript"""
+ if hasattr(self, '_current_audio_transcript') and self._current_audio_transcript:
+ await realtime_metrics_collector.set_agent_response(self._current_audio_transcript)
+ global_event_emitter.emit("text_response", {"text": self._current_audio_transcript, "type": "done"})
+ self._current_audio_transcript = ""
+ await realtime_metrics_collector.set_agent_speech_end(timeout=1.0)
+ self._agent_speaking = False
+ pass
+
  async def _handle_function_call_arguments_delta(self, data: dict) -> None:
  """Handle function call arguments delta"""

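Transcript handling is an accumulate-then-flush pattern: each response.audio_transcript.delta appends to _current_audio_transcript, and response.done publishes the completed string via set_agent_response() and a global "text_response" event before clearing the buffer. A hedged consumer sketch; the payload shape is copied from the emit() call above, but the on(...) registration method is an assumption.

    from videosdk.agents import global_event_emitter

    def on_text_response(payload: dict) -> None:
        # Shape from _handle_response_done:
        # {"text": <full agent transcript>, "type": "done"}
        if payload.get("type") == "done":
            print("agent transcript:", payload["text"])

    # Registration name is an assumption; only emit() appears in this diff.
    global_event_emitter.on("text_response", on_text_response)
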
@@ -526,7 +562,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
  tool_schema = build_openai_schema(tool)
  oai_tools.append(tool_schema)
  except Exception as e:
- print(f"Failed to format tool {tool}: {e}")
+ self.emit("error", f"Failed to format tool {tool}: {e}")
  continue

  return oai_tools
@@ -534,6 +570,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
  async def send_text_message(self, message: str) -> None:
  """Send a text message to the OpenAI realtime API"""
  if not self._session:
+ self.emit("error", "No active WebSocket session")
  raise RuntimeError("No active WebSocket session")

  await self.send_event({
@@ -551,11 +588,3 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
  })
  await self.create_response()

- async def _handle_text_done(self, data: dict) -> None:
- """Handle text response completion"""
- try:
- text_content = data.get("text", "")
- if text_content:
- global_event_emitter.emit("text_response", {"text": text_content, "type": "done"})
- except Exception as e:
- print(f"[ERROR] Error handling text done: {e}")
@@ -36,6 +36,7 @@ class OpenAITTS(TTS):
  self.audio_track = None
  self.loop = None
  self.response_format = response_format
+ self._first_chunk_sent = False

  self.api_key = api_key or os.getenv("OPENAI_API_KEY")
  if not self.api_key:
@@ -55,6 +56,10 @@
  ),
  ),
  )
+
+ def reset_first_audio_tracking(self) -> None:
+ """Reset the first audio tracking state for next TTS task"""
+ self._first_chunk_sent = False

  async def synthesize(
  self,
@@ -95,7 +100,6 @@
  if chunk:
  audio_data += chunk

-
  if audio_data:
  await self._stream_audio_chunks(audio_data)

@@ -116,6 +120,10 @@
  chunk += b'\x00' * padding_needed

  if len(chunk) == chunk_size:
+ if not self._first_chunk_sent and self._first_audio_callback:
+ self._first_chunk_sent = True
+ await self._first_audio_callback()
+
  self.loop.create_task(self.audio_track.add_new_bytes(chunk))
  await asyncio.sleep(0.001)

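The new branch fires a one-shot callback as the first full-size audio chunk is queued, the natural hook for time-to-first-audio measurement; reset_first_audio_tracking() re-arms it between synthesis tasks. The _first_audio_callback attribute appears in this diff but its assignment path does not, so the direct wiring in this sketch is an assumption.

    import time

    from videosdk.plugins.openai import OpenAITTS  # module path assumed

    tts = OpenAITTS()  # constructor arguments elided; reads OPENAI_API_KEY

    request_started_at = time.monotonic()

    async def on_first_audio() -> None:
        # Runs once per synthesis task, when the first padded chunk is queued.
        ttfa_ms = (time.monotonic() - request_started_at) * 1000
        print(f"time to first audio: {ttfa_ms:.0f} ms")

    tts._first_audio_callback = on_first_audio  # assumed wiring
    tts.reset_first_audio_tracking()  # re-arm before the next synthesize() call
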
@@ -0,0 +1 @@
+ __version__ = "0.0.23"
@@ -1 +0,0 @@
- __version__ = "0.0.21"