videosdk-plugins-openai 0.0.30__tar.gz → 0.0.32__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of videosdk-plugins-openai might be problematic.
- {videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/.gitignore +3 -2
- {videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/PKG-INFO +2 -2
- {videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/pyproject.toml +1 -1
- {videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/videosdk/plugins/openai/llm.py +83 -2
- {videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/videosdk/plugins/openai/stt.py +184 -7
- {videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/videosdk/plugins/openai/tts.py +85 -2
- videosdk_plugins_openai-0.0.32/videosdk/plugins/openai/version.py +1 -0
- videosdk_plugins_openai-0.0.30/videosdk/plugins/openai/version.py +0 -1
- {videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/README.md +0 -0
- {videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/videosdk/plugins/openai/__init__.py +0 -0
- {videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/videosdk/plugins/openai/realtime_api.py +2 -2
{videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/.gitignore
RENAMED

@@ -2,13 +2,12 @@ myenv/
 venv/
 env/
 __pycache__/
-
+.venv/
 .env
 .env.local
 test_env/
 dist/
 .DS_Store
-
 node_modules/
 credentials.json
 .Python
@@ -16,3 +15,5 @@ build/
 eggs/
 sdist/
 wheels/
+docs/
+agent-sdk-reference/
{videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/PKG-INFO
RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videosdk-plugins-openai
-Version: 0.0.30
+Version: 0.0.32
 Summary: VideoSDK Agent Framework plugin for OpenAI services
 Author: videosdk
 License-Expression: Apache-2.0
@@ -13,7 +13,7 @@ Classifier: Topic :: Multimedia :: Video
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.11
 Requires-Dist: openai[realtime]>=1.68.2
-Requires-Dist: videosdk-agents>=0.0.30
+Requires-Dist: videosdk-agents>=0.0.32
 Description-Content-Type: text/markdown
 
 # VideoSDK OpenAI Plugin
{videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/pyproject.toml
RENAMED

@@ -20,7 +20,7 @@ classifiers = [
     "Topic :: Multimedia :: Video",
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
 ]
-dependencies = ["videosdk-agents>=0.0.30", "openai[realtime]>=1.68.2"]
+dependencies = ["videosdk-agents>=0.0.32", "openai[realtime]>=1.68.2"]
 
 [tool.hatch.version]
 path = "videosdk/plugins/openai/version.py"
{videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/videosdk/plugins/openai/llm.py
RENAMED

@@ -26,13 +26,23 @@ class OpenAILLM(LLM):
     def __init__(
         self,
         *,
-        model: str = "gpt-4o",
         api_key: str | None = None,
+        model: str = "gpt-4o-mini",
         base_url: str | None = None,
         temperature: float = 0.7,
         tool_choice: ToolChoice = "auto",
         max_completion_tokens: int | None = None,
     ) -> None:
+        """Initialize the OpenAI LLM plugin.
+
+        Args:
+            api_key (Optional[str], optional): OpenAI API key. Defaults to None.
+            model (str): The model to use for the LLM plugin. Defaults to "gpt-4o-mini".
+            base_url (Optional[str], optional): The base URL for the OpenAI API. Defaults to None.
+            temperature (float): The temperature to use for the LLM plugin. Defaults to 0.7.
+            tool_choice (ToolChoice): The tool choice to use for the LLM plugin. Defaults to "auto".
+            max_completion_tokens (Optional[int], optional): The maximum number of completion tokens to generate. Defaults to None.
+        """
         super().__init__()
         self.api_key = api_key or os.getenv("OPENAI_API_KEY")
         if not self.api_key:
@@ -59,6 +69,77 @@ class OpenAILLM(LLM):
             ),
         )
 
+    @staticmethod
+    def azure(
+        *,
+        model: str = "gpt-4o-mini",
+        azure_endpoint: str | None = None,
+        azure_deployment: str | None = None,
+        api_version: str | None = None,
+        api_key: str | None = None,
+        azure_ad_token: str | None = None,
+        organization: str | None = None,
+        project: str | None = None,
+        base_url: str | None = None,
+        temperature: float = 0.7,
+        tool_choice: ToolChoice = "auto",
+        max_completion_tokens: int | None = None,
+        timeout: httpx.Timeout | None = None,
+    ) -> "OpenAILLM":
+        """
+        Create a new instance of Azure OpenAI LLM.
+
+        This automatically infers the following arguments from their corresponding environment variables if they are not provided:
+        - `api_key` from `AZURE_OPENAI_API_KEY`
+        - `organization` from `OPENAI_ORG_ID`
+        - `project` from `OPENAI_PROJECT_ID`
+        - `azure_ad_token` from `AZURE_OPENAI_AD_TOKEN`
+        - `api_version` from `OPENAI_API_VERSION`
+        - `azure_endpoint` from `AZURE_OPENAI_ENDPOINT`
+        - `azure_deployment` from `AZURE_OPENAI_DEPLOYMENT` (if not provided, uses `model` as the deployment name)
+        """
+
+        azure_endpoint = azure_endpoint or os.getenv("AZURE_OPENAI_ENDPOINT")
+        azure_deployment = azure_deployment or os.getenv("AZURE_OPENAI_DEPLOYMENT")
+        api_version = api_version or os.getenv("OPENAI_API_VERSION")
+        api_key = api_key or os.getenv("AZURE_OPENAI_API_KEY")
+        azure_ad_token = azure_ad_token or os.getenv("AZURE_OPENAI_AD_TOKEN")
+        organization = organization or os.getenv("OPENAI_ORG_ID")
+        project = project or os.getenv("OPENAI_PROJECT_ID")
+
+        if not azure_deployment:
+            azure_deployment = model
+
+        if not azure_endpoint:
+            raise ValueError("Azure endpoint must be provided either through the azure_endpoint parameter or the AZURE_OPENAI_ENDPOINT environment variable")
+
+        if not api_key and not azure_ad_token:
+            raise ValueError("Either API key or Azure AD token must be provided")
+
+        azure_client = openai.AsyncAzureOpenAI(
+            max_retries=0,
+            azure_endpoint=azure_endpoint,
+            azure_deployment=azure_deployment,
+            api_version=api_version,
+            api_key=api_key,
+            azure_ad_token=azure_ad_token,
+            organization=organization,
+            project=project,
+            base_url=base_url,
+            timeout=timeout
+            if timeout
+            else httpx.Timeout(connect=15.0, read=5.0, write=5.0, pool=5.0),
+        )
+
+        instance = OpenAILLM(
+            model=model,
+            temperature=temperature,
+            tool_choice=tool_choice,
+            max_completion_tokens=max_completion_tokens,
+        )
+        instance._client = azure_client
+        return instance
+
     async def chat(
         self,
         messages: ChatContext,
@@ -202,4 +283,4 @@ class OpenAILLM(LLM):
         """Cleanup resources by closing the HTTP client"""
         await self.cancel_current_generation()
         if self._client:
-            await self._client.close()
+            await self._client.close()
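The new `azure()` factory builds an `openai.AsyncAzureOpenAI` client and swaps it into an ordinary `OpenAILLM` instance. A minimal usage sketch, assuming the class is exported from `videosdk.plugins.openai`; the endpoint, API version, and key below are placeholders, not real values:

```python
from videosdk.plugins.openai import OpenAILLM  # assumes the package exports this name

# Each argument can instead come from its environment variable:
# AZURE_OPENAI_ENDPOINT, OPENAI_API_VERSION, AZURE_OPENAI_API_KEY.
llm = OpenAILLM.azure(
    model="gpt-4o-mini",  # also used as the Azure deployment name when none is given
    azure_endpoint="https://my-resource.openai.azure.com",  # placeholder
    api_version="2024-06-01",                               # placeholder
    api_key="<azure-openai-key>",                           # placeholder
    temperature=0.7,
)
```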
{videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/videosdk/plugins/openai/stt.py
RENAMED

@@ -3,8 +3,11 @@ from __future__ import annotations
 import asyncio
 import base64
 import os
+import time
 from typing import Any, Optional
 from urllib.parse import urlencode
+import io
+import wave
 from scipy import signal
 import aiohttp
 import httpx
@@ -17,12 +20,25 @@ class OpenAISTT(BaseSTT):
         self,
         *,
         api_key: str | None = None,
-        model: str = "whisper-1",
+        model: str = "gpt-4o-mini-transcribe",
         base_url: str | None = None,
         prompt: str | None = None,
         language: str = "en",
         turn_detection: dict | None = None,
+        enable_streaming: bool = True,
+        silence_threshold: float = 0.01,
+        silence_duration: float = 0.8,
     ) -> None:
+        """Initialize the OpenAI STT plugin.
+
+        Args:
+            api_key (Optional[str], optional): OpenAI API key. Defaults to None.
+            model (str): The model to use for the STT plugin. Defaults to "gpt-4o-mini-transcribe".
+            base_url (Optional[str], optional): The base URL for the OpenAI API. Defaults to None.
+            prompt (Optional[str], optional): The prompt for the STT plugin. Defaults to None.
+            language (str): The language to use for the STT plugin. Defaults to "en".
+            turn_detection (dict | None): Turn-detection settings for the STT plugin. Defaults to None.
+        """
         super().__init__()
 
         self.api_key = api_key or os.getenv("OPENAI_API_KEY")
@@ -38,6 +54,11 @@ class OpenAISTT(BaseSTT):
             "prefix_padding_ms": 300,
             "silence_duration_ms": 500,
         }
+        self.enable_streaming = enable_streaming
+
+        # Custom VAD parameters for non-streaming mode
+        self.silence_threshold_bytes = int(silence_threshold * 32767)
+        self.silence_duration_frames = int(silence_duration * 48000)  # input_sample_rate
 
         self.client = openai.AsyncClient(
             max_retries=0,
@@ -59,9 +80,88 @@ class OpenAISTT(BaseSTT):
         self._ws_task: Optional[asyncio.Task] = None
         self._current_text = ""
         self._last_interim_at = 0
-
         self.input_sample_rate = 48000
         self.target_sample_rate = 16000
+        self._audio_buffer = bytearray()
+
+        # Custom VAD state for non-streaming mode
+        self._is_speaking = False
+        self._silence_frames = 0
+
+    @staticmethod
+    def azure(
+        *,
+        model: str = "gpt-4o-mini-transcribe",
+        language: str = "en",
+        prompt: str | None = None,
+        turn_detection: dict | None = None,
+        azure_endpoint: str | None = None,
+        azure_deployment: str | None = None,
+        api_version: str | None = None,
+        api_key: str | None = None,
+        azure_ad_token: str | None = None,
+        organization: str | None = None,
+        project: str | None = None,
+        base_url: str | None = None,
+        enable_streaming: bool = False,
+        timeout: httpx.Timeout | None = None,
+    ) -> "OpenAISTT":
+        """
+        Create a new instance of Azure OpenAI STT.
+
+        This automatically infers the following arguments from their corresponding environment variables if they are not provided:
+        - `api_key` from `AZURE_OPENAI_API_KEY`
+        - `organization` from `OPENAI_ORG_ID`
+        - `project` from `OPENAI_PROJECT_ID`
+        - `azure_ad_token` from `AZURE_OPENAI_AD_TOKEN`
+        - `api_version` from `OPENAI_API_VERSION`
+        - `azure_endpoint` from `AZURE_OPENAI_ENDPOINT`
+        - `azure_deployment` from `AZURE_OPENAI_DEPLOYMENT` (if not provided, uses `model` as the deployment name)
+        """
+
+        # Get values from environment variables if not provided
+        azure_endpoint = azure_endpoint or os.getenv("AZURE_OPENAI_ENDPOINT")
+        azure_deployment = azure_deployment or os.getenv("AZURE_OPENAI_DEPLOYMENT")
+        api_version = api_version or os.getenv("OPENAI_API_VERSION")
+        api_key = api_key or os.getenv("AZURE_OPENAI_API_KEY")
+        azure_ad_token = azure_ad_token or os.getenv("AZURE_OPENAI_AD_TOKEN")
+        organization = organization or os.getenv("OPENAI_ORG_ID")
+        project = project or os.getenv("OPENAI_PROJECT_ID")
+
+        # If azure_deployment is not provided, use model as the deployment name
+        if not azure_deployment:
+            azure_deployment = model
+
+        if not azure_endpoint:
+            raise ValueError("Azure endpoint must be provided either through the azure_endpoint parameter or the AZURE_OPENAI_ENDPOINT environment variable")
+
+        if not api_key and not azure_ad_token:
+            raise ValueError("Either API key or Azure AD token must be provided")
+
+        azure_client = openai.AsyncAzureOpenAI(
+            max_retries=0,
+            azure_endpoint=azure_endpoint,
+            azure_deployment=azure_deployment,
+            api_version=api_version,
+            api_key=api_key,
+            azure_ad_token=azure_ad_token,
+            organization=organization,
+            project=project,
+            base_url=base_url,
+            timeout=timeout
+            if timeout
+            else httpx.Timeout(connect=15.0, read=5.0, write=5.0, pool=5.0),
+        )
+
+        instance = OpenAISTT(
+            model=model,
+            language=language,
+            prompt=prompt,
+            turn_detection=turn_detection,
+            enable_streaming=enable_streaming,
+        )
+        instance.client = azure_client
+        return instance
 
     async def process_audio(
         self,
@@ -69,7 +169,11 @@ class OpenAISTT(BaseSTT):
         language: Optional[str] = None,
         **kwargs: Any
    ) -> None:
-        """Process audio frames and send to OpenAI
+        """Process audio frames and send to OpenAI based on enabled mode"""
+
+        if not self.enable_streaming:
+            await self._transcribe_non_streaming(audio_frames)
+            return
 
         if not self._ws:
             await self._connect_ws()
@@ -95,6 +199,80 @@ class OpenAISTT(BaseSTT):
             self._ws_task.cancel()
             self._ws_task = None
 
+    async def _transcribe_non_streaming(self, audio_frames: bytes) -> None:
+        """HTTP-based transcription using OpenAI audio/transcriptions API with custom VAD"""
+        if not audio_frames:
+            return
+
+        self._audio_buffer.extend(audio_frames)
+
+        # Custom VAD logic similar to other STT implementations
+        is_silent_chunk = self._is_silent(audio_frames)
+
+        if not is_silent_chunk:
+            if not self._is_speaking:
+                self._is_speaking = True
+                global_event_emitter.emit("speech_started")
+            self._silence_frames = 0
+        else:
+            if self._is_speaking:
+                self._silence_frames += len(audio_frames) // 4  # Approximate frame count
+                if self._silence_frames > self.silence_duration_frames:
+                    global_event_emitter.emit("speech_stopped")
+                    await self._process_audio_buffer()
+                    self._is_speaking = False
+                    self._silence_frames = 0
+
+    def _is_silent(self, audio_chunk: bytes) -> bool:
+        """Simple VAD: check if the max amplitude is below a threshold."""
+        audio_data = np.frombuffer(audio_chunk, dtype=np.int16)
+        return np.max(np.abs(audio_data)) < self.silence_threshold_bytes
+
+
+
+    async def _process_audio_buffer(self) -> None:
+        """Process the accumulated audio buffer with OpenAI transcription"""
+        if not self._audio_buffer:
+            return
+
+        audio_data = bytes(self._audio_buffer)
+        self._audio_buffer.clear()
+
+        wav_bytes = self._audio_frames_to_wav_bytes(audio_data)
+
+        try:
+            resp = await self.client.audio.transcriptions.create(
+                file=("audio.wav", wav_bytes, "audio/wav"),
+                model=self.model,
+                language=self.language,
+                prompt=self.prompt or openai.NOT_GIVEN,
+            )
+            text = getattr(resp, "text", "")
+            if text and self._transcript_callback:
+                await self._transcript_callback(STTResponse(
+                    event_type=SpeechEventType.FINAL,
+                    data=SpeechData(text=text, language=self.language),
+                    metadata={"model": self.model}
+                ))
+        except Exception as e:
+            print(f"OpenAI transcription error: {str(e)}")
+            self.emit("error", str(e))
+
+    def _audio_frames_to_wav_bytes(self, audio_frames: bytes) -> bytes:
+        """Convert audio frames to WAV bytes"""
+        pcm = np.frombuffer(audio_frames, dtype=np.int16)
+        resampled = signal.resample(pcm, int(len(pcm) * self.target_sample_rate / self.input_sample_rate))
+        resampled = resampled.astype(np.int16)
+
+        buf = io.BytesIO()
+        with wave.open(buf, "wb") as wf:
+            wf.setnchannels(1)  # Mono
+            wf.setsampwidth(2)  # 16-bit PCM
+            wf.setframerate(self.target_sample_rate)
+            wf.writeframes(resampled.tobytes())
+
+        return buf.getvalue()
+
     async def _listen_for_responses(self) -> None:
         """Background task to listen for WebSocket responses"""
         if not self._ws:
@@ -233,6 +411,8 @@ class OpenAISTT(BaseSTT):
 
     async def aclose(self) -> None:
         """Cleanup resources"""
+        self._audio_buffer.clear()
+
         if self._ws_task:
             self._ws_task.cancel()
             try:
@@ -254,7 +434,4 @@ class OpenAISTT(BaseSTT):
     async def _ensure_ws_connection(self):
         """Ensure WebSocket is connected, reconnect if necessary"""
         if not self._ws or self._ws.closed:
-            await self._connect_ws()
-
-
-
+            await self._connect_ws()
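Two details of the new non-streaming path are worth spelling out: the VAD thresholds are precomputed from the constructor arguments (with the defaults, `int(0.01 * 32767) = 327` amplitude units on the int16 scale and `int(0.8 * 48000) = 38400` samples at the 48 kHz input rate), and the silence counter approximates the sample count as `len(chunk_bytes) // 4`. A hedged usage sketch, assuming the class is exported from `videosdk.plugins.openai`:

```python
from videosdk.plugins.openai import OpenAISTT  # assumes the package exports this name

# Non-streaming mode buffers audio with the custom VAD, then posts one WAV per
# utterance to the audio/transcriptions endpoint instead of holding a WebSocket.
stt = OpenAISTT(
    model="gpt-4o-mini-transcribe",
    enable_streaming=False,
    silence_threshold=0.01,  # -> int(0.01 * 32767) = 327 on the int16 amplitude scale
    silence_duration=0.8,    # -> int(0.8 * 48000) = 38400 samples of trailing silence
)
```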
{videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/videosdk/plugins/openai/tts.py
RENAMED

@@ -17,19 +17,29 @@ _RESPONSE_FORMATS = Union[Literal["mp3",
                                   "opus", "aac", "flac", "wav", "pcm"], str]
 
 
-
 class OpenAITTS(TTS):
     def __init__(
         self,
         *,
+        api_key: str | None = None,
         model: str = DEFAULT_MODEL,
         voice: str = DEFAULT_VOICE,
         speed: float = 1.0,
         instructions: str | None = None,
-        api_key: str | None = None,
         base_url: str | None = None,
         response_format: str = "pcm",
     ) -> None:
+        """Initialize the OpenAI TTS plugin.
+
+        Args:
+            api_key (Optional[str], optional): OpenAI API key. Defaults to None.
+            model (str): The model to use for the TTS plugin. Defaults to "gpt-4o-mini-tts".
+            voice (str): The voice to use for the TTS plugin. Defaults to "ash".
+            speed (float): The speed to use for the TTS plugin. Defaults to 1.0.
+            instructions (Optional[str], optional): Additional instructions for the TTS plugin. Defaults to None.
+            base_url (Optional[str], optional): Custom base URL for the OpenAI API. Defaults to None.
+            response_format (str): The response format to use for the TTS plugin. Defaults to "pcm".
+        """
         super().__init__(sample_rate=OPENAI_TTS_SAMPLE_RATE, num_channels=OPENAI_TTS_CHANNELS)
 
         self.model = model
@@ -64,6 +74,79 @@ class OpenAITTS(TTS):
             ),
         )
 
+    @staticmethod
+    def azure(
+        *,
+        model: str = DEFAULT_MODEL,
+        voice: str = DEFAULT_VOICE,
+        speed: float = 1.0,
+        instructions: str | None = None,
+        azure_endpoint: str | None = None,
+        azure_deployment: str | None = None,
+        api_version: str | None = None,
+        api_key: str | None = None,
+        azure_ad_token: str | None = None,
+        organization: str | None = None,
+        project: str | None = None,
+        base_url: str | None = None,
+        response_format: str = "pcm",
+        timeout: httpx.Timeout | None = None,
+    ) -> "OpenAITTS":
+        """
+        Create a new instance of Azure OpenAI TTS.
+
+        This automatically infers the following arguments from their corresponding environment variables if they are not provided:
+        - `api_key` from `AZURE_OPENAI_API_KEY`
+        - `organization` from `OPENAI_ORG_ID`
+        - `project` from `OPENAI_PROJECT_ID`
+        - `azure_ad_token` from `AZURE_OPENAI_AD_TOKEN`
+        - `api_version` from `OPENAI_API_VERSION`
+        - `azure_endpoint` from `AZURE_OPENAI_ENDPOINT`
+        - `azure_deployment` from `AZURE_OPENAI_DEPLOYMENT` (if not provided, uses `model` as the deployment name)
+        """
+
+        azure_endpoint = azure_endpoint or os.getenv("AZURE_OPENAI_ENDPOINT")
+        azure_deployment = azure_deployment or os.getenv("AZURE_OPENAI_DEPLOYMENT")
+        api_version = api_version or os.getenv("OPENAI_API_VERSION")
+        api_key = api_key or os.getenv("AZURE_OPENAI_API_KEY")
+        azure_ad_token = azure_ad_token or os.getenv("AZURE_OPENAI_AD_TOKEN")
+        organization = organization or os.getenv("OPENAI_ORG_ID")
+        project = project or os.getenv("OPENAI_PROJECT_ID")
+
+        if not azure_deployment:
+            azure_deployment = model
+
+        if not azure_endpoint:
+            raise ValueError("Azure endpoint must be provided either through the azure_endpoint parameter or the AZURE_OPENAI_ENDPOINT environment variable")
+
+        if not api_key and not azure_ad_token:
+            raise ValueError("Either API key or Azure AD token must be provided")
+
+        azure_client = openai.AsyncAzureOpenAI(
+            max_retries=0,
+            azure_endpoint=azure_endpoint,
+            azure_deployment=azure_deployment,
+            api_version=api_version,
+            api_key=api_key,
+            azure_ad_token=azure_ad_token,
+            organization=organization,
+            project=project,
+            base_url=base_url,
+            timeout=timeout
+            if timeout
+            else httpx.Timeout(connect=15.0, read=5.0, write=5.0, pool=5.0),
+        )
+
+        instance = OpenAITTS(
+            model=model,
+            voice=voice,
+            speed=speed,
+            instructions=instructions,
+            response_format=response_format,
+        )
+        instance._client = azure_client
+        return instance
+
     def reset_first_audio_tracking(self) -> None:
         """Reset the first audio tracking state for next TTS task"""
         self._first_chunk_sent = False
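The TTS factory mirrors the LLM and STT ones. A hedged usage sketch, again assuming the class is exported from `videosdk.plugins.openai`, with placeholder Azure values:

```python
from videosdk.plugins.openai import OpenAITTS  # assumes the package exports this name

# "pcm" keeps raw 16-bit audio flowing straight into the agent pipeline;
# endpoint and key are placeholders (or AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_API_KEY).
tts = OpenAITTS.azure(
    voice="ash",
    response_format="pcm",
    azure_endpoint="https://my-resource.openai.azure.com",  # placeholder
    api_key="<azure-openai-key>",                           # placeholder
)
```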
videosdk_plugins_openai-0.0.32/videosdk/plugins/openai/version.py
ADDED

@@ -0,0 +1 @@
+__version__ = "0.0.32"

videosdk_plugins_openai-0.0.30/videosdk/plugins/openai/version.py
DELETED

@@ -1 +0,0 @@
-__version__ = "0.0.30"

{videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/README.md
File without changes

{videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/videosdk/plugins/openai/__init__.py
File without changes
{videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/videosdk/plugins/openai/realtime_api.py
RENAMED

@@ -100,15 +100,16 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
     def __init__(
         self,
         *,
+        api_key: str | None = None,
         model: str,
         config: OpenAIRealtimeConfig | None = None,
-        api_key: str | None = None,
         base_url: str | None = None,
     ) -> None:
         """
         Initialize OpenAI realtime model.
 
         Args:
+            api_key: OpenAI API key. If not provided, will attempt to read from OPENAI_API_KEY env var
             model: The OpenAI model identifier to use (e.g. 'gpt-4', 'gpt-3.5-turbo')
             config: Optional configuration object for customizing model behavior. Contains settings for:
                 - voice: Voice ID to use for audio output
@@ -117,7 +118,6 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
                 - input_audio_transcription: Settings for audio transcription
                 - tool_choice: How tools should be selected ('auto' or 'none')
                 - modalities: List of enabled modalities ('text', 'audio')
-            api_key: OpenAI API key. If not provided, will attempt to read from OPENAI_API_KEY env var
             base_url: Base URL for OpenAI API. Defaults to 'https://api.openai.com/v1'
 
         Raises: