videosdk-plugins-openai 0.0.5__tar.gz → 0.0.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of videosdk-plugins-openai might be problematic.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: videosdk-plugins-openai
- Version: 0.0.5
+ Version: 0.0.6
  Summary: VideoSDK Agent Framework plugin for OpenAI services
  Author: videosdk
  Keywords: ai,audio,openai,video,videosdk
@@ -12,7 +12,7 @@ Classifier: Topic :: Multimedia :: Video
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Requires-Python: >=3.11
  Requires-Dist: openai[realtime]>=1.68.2
- Requires-Dist: videosdk-agents>=0.0.8
+ Requires-Dist: videosdk-agents>=0.0.9
  Description-Content-Type: text/markdown

  VideoSDK OpenAI Plugin
@@ -20,7 +20,7 @@ classifiers = [
      "Topic :: Scientific/Engineering :: Artificial Intelligence",
  ]
  dependencies = [
-     "videosdk-agents>=0.0.8",
+     "videosdk-agents>=0.0.9",
      "openai[realtime]>=1.68.2",
  ]

@@ -0,0 +1,12 @@
+ from .realtime_api import OpenAIRealtime, OpenAIRealtimeConfig
+ from .llm import OpenAILLM
+ from .stt import OpenAISTT
+ from .tts import OpenAITTS
+
+ __all__ = [
+     'OpenAIRealtime',
+     'OpenAIRealtimeConfig',
+     'OpenAILLM',
+     'OpenAISTT',
+     'OpenAITTS',
+ ]
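The new __init__.py re-exports all four provider classes. A minimal usage sketch follows (not part of the diff; it assumes the wheel installs under the videosdk.plugins.openai namespace and that OPENAI_API_KEY is set in the environment):

# Hypothetical wiring of the exported classes; adjust the import path to
# whatever the published package actually exposes.
import os

from videosdk.plugins.openai import OpenAILLM, OpenAISTT, OpenAITTS

llm = OpenAILLM(model="gpt-4o", temperature=0.7)
stt = OpenAISTT(api_key=os.getenv("OPENAI_API_KEY"), model="whisper-1")
tts = OpenAITTS(model="gpt-4o-mini-tts", voice="ash")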
@@ -0,0 +1,161 @@
+ from __future__ import annotations
+
+ import os
+ from typing import Any, AsyncIterator
+ import json
+
+ import httpx
+ import openai
+ from videosdk.agents import LLM, LLMResponse, ChatContext, ChatRole, ChatMessage, FunctionCall, FunctionCallOutput, ToolChoice, FunctionTool, is_function_tool, build_openai_schema
+
+ class OpenAILLM(LLM):
+
+     def __init__(
+         self,
+         *,
+         model: str = "gpt-4o",
+         api_key: str | None = None,
+         base_url: str | None = None,
+         temperature: float = 0.7,
+         tool_choice: ToolChoice = "auto",
+         max_completion_tokens: int | None = None,
+     ) -> None:
+         super().__init__()
+         self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+         if not self.api_key:
+             raise ValueError("OpenAI API key must be provided either through api_key parameter or OPENAI_API_KEY environment variable")
+
+         self.model = model
+         self.temperature = temperature
+         self.tool_choice = tool_choice
+         self.max_completion_tokens = max_completion_tokens
+
+         self._client = openai.AsyncOpenAI(
+             api_key=self.api_key,
+             base_url=base_url or None,
+             max_retries=0,
+             http_client=httpx.AsyncClient(
+                 timeout=httpx.Timeout(connect=15.0, read=5.0, write=5.0, pool=5.0),
+                 follow_redirects=True,
+                 limits=httpx.Limits(
+                     max_connections=50,
+                     max_keepalive_connections=50,
+                     keepalive_expiry=120,
+                 ),
+             ),
+         )
+
+     async def chat(
+         self,
+         messages: ChatContext,
+         tools: list[FunctionTool] | None = None,
+         **kwargs: Any
+     ) -> AsyncIterator[LLMResponse]:
+         """
+         Implement chat functionality using OpenAI's chat completion API
+
+         Args:
+             messages: ChatContext containing conversation history
+             tools: Optional list of function tools available to the model
+             **kwargs: Additional arguments passed to the OpenAI API
+
+         Yields:
+             LLMResponse objects containing the model's responses
+         """
+         completion_params = {
+             "model": self.model,
+             "messages": [
+                 {
+                     "role": msg.role.value,
+                     "content": msg.content,
+                     **({"name": msg.name} if hasattr(msg, 'name') else {})
+                 } if isinstance(msg, ChatMessage) else
+                 {
+                     "role": "assistant",
+                     "content": None,
+                     "function_call": {
+                         "name": msg.name,
+                         "arguments": msg.arguments
+                     }
+                 } if isinstance(msg, FunctionCall) else
+                 {
+                     "role": "function",
+                     "name": msg.name,
+                     "content": msg.output
+                 } if isinstance(msg, FunctionCallOutput) else None
+                 for msg in messages.items
+                 if msg is not None
+             ],
+             "temperature": self.temperature,
+             "stream": True,
+             "max_tokens": self.max_completion_tokens,
+         }
+
+         if tools:
+             formatted_tools = []
+             for tool in tools:
+                 if not is_function_tool(tool):
+                     continue
+                 try:
+                     tool_schema = build_openai_schema(tool)
+                     formatted_tools.append(tool_schema)
+                 except Exception as e:
+                     print(f"Failed to format tool {tool}: {e}")
+                     continue
+
+             if formatted_tools:
+                 completion_params["functions"] = formatted_tools
+                 completion_params["function_call"] = self.tool_choice
+
+         completion_params.update(kwargs)
+         try:
+             response_stream = await self._client.chat.completions.create(**completion_params)
+             current_content = ""
+             current_function_call = None
+
+             async for chunk in response_stream:
+                 if not chunk.choices:
+                     continue
+
+                 delta = chunk.choices[0].delta
+                 if delta.function_call:
+                     if current_function_call is None:
+                         current_function_call = {
+                             "name": delta.function_call.name or "",
+                             "arguments": delta.function_call.arguments or ""
+                         }
+                     else:
+                         if delta.function_call.name:
+                             current_function_call["name"] += delta.function_call.name
+                         if delta.function_call.arguments:
+                             current_function_call["arguments"] += delta.function_call.arguments
+                 elif current_function_call is not None:
+                     try:
+                         args = json.loads(current_function_call["arguments"])
+                         current_function_call["arguments"] = args
+                     except json.JSONDecodeError:
+                         print(f"Failed to parse function arguments: {current_function_call['arguments']}")
+                         current_function_call["arguments"] = {}
+
+                     yield LLMResponse(
+                         content="",
+                         role=ChatRole.ASSISTANT,
+                         metadata={"function_call": current_function_call}
+                     )
+                     current_function_call = None
+
+                 elif delta.content is not None:
+                     current_content += delta.content
+                     yield LLMResponse(
+                         content=current_content,
+                         role=ChatRole.ASSISTANT
+                     )
+
+         except Exception as e:
+             self.emit("error", e)
+             raise
+
+     async def aclose(self) -> None:
+         """Cleanup resources by closing the HTTP client"""
+         if self._client:
+             await self._client.close()
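The chat() coroutine above streams LLMResponse objects: it re-yields the accumulated text on every content delta and emits a separate response carrying a function_call payload once a tool call finishes streaming. A consumption sketch (not from the diff; chat_ctx stands in for a ChatContext built with the videosdk-agents API, which this file only imports):

# llm is an OpenAILLM instance as defined above; chat_ctx is a hypothetical,
# pre-populated ChatContext from videosdk-agents.
async def run_chat(llm, chat_ctx) -> None:
    async for response in llm.chat(messages=chat_ctx):
        call = (getattr(response, "metadata", None) or {}).get("function_call")
        if call:
            print("tool call requested:", call["name"], call["arguments"])
        else:
            print(response.content)  # accumulated assistant text so far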
@@ -10,6 +10,8 @@ from dotenv import load_dotenv
  import uuid
  import base64
  import aiohttp
+ import numpy as np
+ from scipy import signal
  import traceback
  from videosdk.agents import (
      FunctionTool,
@@ -26,8 +28,6 @@ load_dotenv()
  from openai.types.beta.realtime.session import InputAudioTranscription, TurnDetection

  OPENAI_BASE_URL = "https://api.openai.com/v1"
- SAMPLE_RATE = 24000
- NUM_CHANNELS = 1

  DEFAULT_TEMPERATURE = 0.8
  DEFAULT_TURN_DETECTION = TurnDetection(
@@ -130,9 +130,12 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
          self.audio_track: Optional[CustomAudioStreamTrack] = None
          self._formatted_tools: Optional[List[Dict[str, Any]]] = None
          self.config: OpenAIRealtimeConfig = config or OpenAIRealtimeConfig()
-         # self.on("instructions_updated", self._handle_instructions_updated)
-         # self.on("tools_updated", self._handle_tools_updated)
-
+         # global_event_emitter.on("instructions_updated", self._handle_instructions_updated)
+         # global_event_emitter.on("tools_updated", self._handle_tools_updated)
+
+         self.input_sample_rate = 48000
+         self.target_sample_rate = 16000
+
      def set_agent(self, agent: Agent) -> None:
          self._instructions = agent.instructions
          self._tools = agent.tools
@@ -153,6 +156,9 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
      async def handle_audio_input(self, audio_data: bytes) -> None:
          """Handle incoming audio data from the user"""
          if self._session and not self._closing and "audio" in self.config.modalities:
+             audio_data = np.frombuffer(audio_data, dtype=np.int16)
+             audio_data = signal.resample(audio_data, int(len(audio_data) * self.target_sample_rate / self.input_sample_rate))
+             audio_data = audio_data.astype(np.int16).tobytes()
              base64_audio_data = base64.b64encode(audio_data).decode("utf-8")
              audio_event = {
                  "type": "input_audio_buffer.append",
@@ -0,0 +1,260 @@
+ from __future__ import annotations
+
+ import asyncio
+ import base64
+ import os
+ from typing import Any, Optional
+ from urllib.parse import urlencode
+ from scipy import signal
+ import aiohttp
+ import httpx
+ import openai
+ import numpy as np
+ from videosdk.agents import STT as BaseSTT, STTResponse, SpeechEventType, SpeechData, global_event_emitter
+
+ class OpenAISTT(BaseSTT):
+     def __init__(
+         self,
+         *,
+         api_key: str,
+         model: str = "whisper-1",
+         base_url: str | None = None,
+         prompt: str | None = None,
+         language: str = "en",
+         turn_detection: dict | None = None,
+     ) -> None:
+         super().__init__()
+
+         self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+         if not self.api_key:
+             raise ValueError("OpenAI API key must be provided either through api_key parameter or OPENAI_API_KEY environment variable")
+
+         self.model = model
+         self.language = language
+         self.prompt = prompt
+         self.turn_detection = turn_detection or {
+             "type": "server_vad",
+             "threshold": 0.5,
+             "prefix_padding_ms": 300,
+             "silence_duration_ms": 500,
+         }
+
+         self.client = openai.AsyncClient(
+             max_retries=0,
+             api_key=api_key,
+             base_url=base_url or None,
+             http_client=httpx.AsyncClient(
+                 timeout=httpx.Timeout(connect=15.0, read=5.0, write=5.0, pool=5.0),
+                 follow_redirects=True,
+                 limits=httpx.Limits(
+                     max_connections=50,
+                     max_keepalive_connections=50,
+                     keepalive_expiry=120,
+                 ),
+             ),
+         )
+
+         self._session: Optional[aiohttp.ClientSession] = None
+         self._ws: Optional[aiohttp.ClientWebSocketResponse] = None
+         self._ws_task: Optional[asyncio.Task] = None
+         self._current_text = ""
+         self._last_interim_at = 0
+
+         self.input_sample_rate = 48000
+         self.target_sample_rate = 16000
+
+     async def process_audio(
+         self,
+         audio_frames: bytes,
+         language: Optional[str] = None,
+         **kwargs: Any
+     ) -> None:
+         """Process audio frames and send to OpenAI's Realtime API"""
+
+         if not self._ws:
+             await self._connect_ws()
+             self._ws_task = asyncio.create_task(self._listen_for_responses())
+
+         try:
+             audio_data = np.frombuffer(audio_frames, dtype=np.int16)
+             audio_data = signal.resample(audio_data, int(len(audio_data) * self.target_sample_rate / self.input_sample_rate))
+             audio_data = audio_data.astype(np.int16).tobytes()
+             audio_data = base64.b64encode(audio_data).decode("utf-8")
+             message = {
+                 "type": "input_audio_buffer.append",
+                 "audio": audio_data,
+             }
+             await self._ws.send_json(message)
+         except Exception as e:
+             print(f"Error in process_audio: {str(e)}")
+             self.emit("error", str(e))
+             if self._ws:
+                 await self._ws.close()
+                 self._ws = None
+             if self._ws_task:
+                 self._ws_task.cancel()
+                 self._ws_task = None
+
+     async def _listen_for_responses(self) -> None:
+         """Background task to listen for WebSocket responses"""
+         if not self._ws:
+             return
+
+         try:
+             async for msg in self._ws:
+                 if msg.type == aiohttp.WSMsgType.TEXT:
+                     data = msg.json()
+                     responses = self._handle_ws_message(data)
+                     for response in responses:
+                         if self._transcript_callback:
+                             await self._transcript_callback(response)
+                 elif msg.type == aiohttp.WSMsgType.ERROR:
+                     error = f"WebSocket error: {self._ws.exception()}"
+                     print(error)
+                     self.emit("error", error)
+                     break
+                 elif msg.type == aiohttp.WSMsgType.CLOSED:
+                     print("WebSocket connection closed")
+                     break
+         except Exception as e:
+             error = f"Error in WebSocket listener: {str(e)}"
+             print(error)
+             self.emit("error", error)
+         finally:
+             if self._ws:
+                 await self._ws.close()
+                 self._ws = None
+
+     async def _connect_ws(self) -> None:
+         """Establish WebSocket connection with OpenAI's Realtime API"""
+
+         if not self._session:
+             self._session = aiohttp.ClientSession()
+
+         config = {
+             "type": "transcription_session.update",
+             "session": {
+                 "input_audio_format": "pcm16",
+                 "input_audio_transcription": {
+                     "model": self.model,
+                     "prompt": self.prompt or "",
+                     "language": self.language if self.language else None,
+                 },
+                 "turn_detection": self.turn_detection,
+                 "input_audio_noise_reduction": {
+                     "type": "near_field"
+                 },
+                 "include": ["item.input_audio_transcription.logprobs"]
+             }
+         }
+
+         query_params = {
+             "intent": "transcription",
+         }
+         headers = {
+             "User-Agent": "VideoSDK",
+             "Authorization": f"Bearer {self.api_key}",
+             "OpenAI-Beta": "realtime=v1",
+         }
+
+         base_url = str(self.client.base_url).rstrip('/')
+         ws_url = f"{base_url}/realtime?{urlencode(query_params)}"
+         if ws_url.startswith("http"):
+             ws_url = ws_url.replace("http", "ws", 1)
+
+         try:
+             self._ws = await self._session.ws_connect(ws_url, headers=headers)
+
+             initial_response = await self._ws.receive_json()
+
+             if initial_response.get("type") != "transcription_session.created":
+                 raise Exception(f"Expected session creation, got: {initial_response}")
+
+             await self._ws.send_json(config)
+
+             update_response = await self._ws.receive_json()
+
+             if update_response.get("type") != "transcription_session.updated":
+                 raise Exception(f"Configuration update failed: {update_response}")
+
+         except Exception as e:
+             print(f"Error connecting to WebSocket: {str(e)}")
+             if self._ws:
+                 await self._ws.close()
+                 self._ws = None
+             raise
+
+     def _handle_ws_message(self, msg: dict) -> list[STTResponse]:
+         """Handle incoming WebSocket messages and generate STT responses"""
+         responses = []
+
+         try:
+             msg_type = msg.get("type")
+             if msg_type == "conversation.item.input_audio_transcription.delta":
+                 delta = msg.get("delta", "")
+                 if delta:
+                     self._current_text += delta
+                     current_time = asyncio.get_event_loop().time()
+
+                     if current_time - self._last_interim_at > 0.5:
+                         responses.append(STTResponse(
+                             event_type=SpeechEventType.INTERIM,
+                             data=SpeechData(
+                                 text=self._current_text,
+                                 language=self.language,
+                             ),
+                             metadata={"model": self.model}
+                         ))
+                         self._last_interim_at = current_time
+
+             elif msg_type == "conversation.item.input_audio_transcription.completed":
+                 transcript = msg.get("transcript", "")
+                 if transcript:
+                     responses.append(STTResponse(
+                         event_type=SpeechEventType.FINAL,
+                         data=SpeechData(
+                             text=transcript,
+                             language=self.language,
+                         ),
+                         metadata={"model": self.model}
+                     ))
+                     self._current_text = ""
+
+             elif msg_type == "input_audio_buffer.speech_started":
+                 global_event_emitter.emit("speech_started")
+
+             elif msg_type == "input_audio_buffer.speech_stopped":
+                 global_event_emitter.emit("speech_stopped")
+
+         except Exception as e:
+             print(f"Error handling WebSocket message: {str(e)}")
+
+         return responses
+
+     async def aclose(self) -> None:
+         """Cleanup resources"""
+         if self._ws_task:
+             self._ws_task.cancel()
+             try:
+                 await self._ws_task
+             except asyncio.CancelledError:
+                 pass
+             self._ws_task = None
+
+         if self._ws:
+             await self._ws.close()
+             self._ws = None
+
+         if self._session:
+             await self._session.close()
+             self._session = None
+
+         await self.client.close()
+
+     async def _ensure_ws_connection(self):
+         """Ensure WebSocket is connected, reconnect if necessary"""
+         if not self._ws or self._ws.closed:
+             await self._connect_ws()
+
+
+
@@ -0,0 +1,109 @@
+ from __future__ import annotations
+
+ from typing import Any, AsyncIterator, Literal, Optional, Union
+ import httpx
+ import os
+ import openai
+
+ from videosdk.agents import TTS
+
+ OPENAI_TTS_SAMPLE_RATE = 24000
+ OPENAI_TTS_CHANNELS = 1
+
+ DEFAULT_MODEL = "gpt-4o-mini-tts"
+ DEFAULT_VOICE = "ash"
+ _RESPONSE_FORMATS = Union[Literal["mp3", "opus", "aac", "flac", "wav", "pcm"], str]
+
+ class OpenAITTS(TTS):
+     def __init__(
+         self,
+         *,
+         model: str = DEFAULT_MODEL,
+         voice: str = DEFAULT_VOICE,
+         speed: float = 1.0,
+         instructions: str | None = None,
+         api_key: str | None = None,
+         base_url: str | None = None,
+         response_format: str = "pcm"
+     ) -> None:
+         super().__init__(sample_rate=OPENAI_TTS_SAMPLE_RATE, num_channels=OPENAI_TTS_CHANNELS)
+
+         self.model = model
+         self.voice = voice
+         self.speed = speed
+         self.instructions = instructions
+         self.audio_track = None
+         self.loop = None
+         self.response_format = response_format
+
+         self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+         if not self.api_key:
+             raise ValueError("OpenAI API key must be provided either through api_key parameter or OPENAI_API_KEY environment variable")
+
+         self._client = openai.AsyncClient(
+             max_retries=0,
+             api_key=self.api_key,
+             base_url=base_url or None,
+             http_client=httpx.AsyncClient(
+                 timeout=httpx.Timeout(connect=15.0, read=5.0, write=5.0, pool=5.0),
+                 follow_redirects=True,
+                 limits=httpx.Limits(
+                     max_connections=50,
+                     max_keepalive_connections=50,
+                     keepalive_expiry=120,
+                 ),
+             ),
+         )
+
+     async def synthesize(
+         self,
+         text: AsyncIterator[str] | str,
+         voice_id: Optional[str] = None,
+         **kwargs: Any
+     ) -> None:
+         """
+         Convert text to speech using OpenAI's TTS API and stream to audio track
+
+         Args:
+             text: Text to convert to speech
+             voice_id: Optional voice override
+             **kwargs: Additional provider-specific arguments
+         """
+         try:
+             if isinstance(text, AsyncIterator):
+                 full_text = ""
+                 async for chunk in text:
+                     full_text += chunk
+             else:
+                 full_text = text
+
+             if not self.audio_track or not self.loop:
+                 self.emit("error", "Audio track or event loop not set")
+                 return
+
+             async with self._client.audio.speech.with_streaming_response.create(
+                 model=self.model,
+                 voice=voice_id or self.voice,
+                 input=full_text,
+                 speed=self.speed,
+                 response_format=self.response_format,
+                 **({"instructions": self.instructions} if self.instructions else {})
+             ) as response:
+                 async for chunk in response.iter_bytes():
+                     if chunk:
+                         self.loop.create_task(self.audio_track.add_new_bytes(chunk))
+
+         except openai.APIError as e:
+             self.emit("error", str(e))
+         except Exception as e:
+             self.emit("error", f"TTS synthesis failed: {str(e)}")
+
+     async def aclose(self) -> None:
+         """Cleanup resources"""
+         await self._client.close()
+         await super().aclose()
+
+     async def interrupt(self) -> None:
+         """Interrupt the TTS process"""
+         if self.audio_track:
+             self.audio_track.interrupt()
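synthesize() refuses to run until both audio_track and loop are assigned, so the caller is expected to attach them before requesting speech. A wiring sketch (not from the diff; the import path and my_audio_track are assumptions, the latter standing in for whatever CustomAudioStreamTrack-style object the agent framework supplies):

import asyncio

from videosdk.plugins.openai import OpenAITTS  # assumed install path

async def speak(my_audio_track) -> None:
    # my_audio_track is a hypothetical track exposing add_new_bytes() and interrupt().
    tts = OpenAITTS()  # reads OPENAI_API_KEY from the environment
    tts.loop = asyncio.get_event_loop()
    tts.audio_track = my_audio_track
    await tts.synthesize("Hello from the VideoSDK OpenAI plugin")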
@@ -0,0 +1 @@
+ __version__ = "0.0.6"
@@ -1,6 +0,0 @@
- from .realtime_api import OpenAIRealtime, OpenAIRealtimeConfig
-
- __all__ = [
-     'OpenAIRealtime',
-     'OpenAIRealtimeConfig'
- ]
@@ -1 +0,0 @@
- __version__ = "0.0.5"