videosdk-plugins-openai 0.0.4.tar.gz → 0.0.6.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -7,3 +7,4 @@ __pycache__
 .env.local
 test_env/
 dist/
+.DS_Store
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videosdk-plugins-openai
-Version: 0.0.4
+Version: 0.0.6
 Summary: VideoSDK Agent Framework plugin for OpenAI services
 Author: videosdk
 Keywords: ai,audio,openai,video,videosdk
@@ -12,7 +12,7 @@ Classifier: Topic :: Multimedia :: Video
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.11
 Requires-Dist: openai[realtime]>=1.68.2
-Requires-Dist: videosdk-agents>=0.0.6
+Requires-Dist: videosdk-agents>=0.0.9
 Description-Content-Type: text/markdown

 VideoSDK OpenAI Plugin
@@ -20,7 +20,7 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
 ]
 dependencies = [
-    "videosdk-agents>=0.0.6",
+    "videosdk-agents>=0.0.9",
     "openai[realtime]>=1.68.2",
 ]

@@ -0,0 +1,12 @@
+from .realtime_api import OpenAIRealtime, OpenAIRealtimeConfig
+from .llm import OpenAILLM
+from .stt import OpenAISTT
+from .tts import OpenAITTS
+
+__all__ = [
+    'OpenAIRealtime',
+    'OpenAIRealtimeConfig',
+    'OpenAILLM',
+    'OpenAISTT',
+    'OpenAITTS',
+]
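The package __init__ now re-exports the three new component classes alongside the realtime API. For orientation, a minimal import sketch; the top-level module path (videosdk.plugins.openai) is an assumption based on the distribution name and is not shown in this diff:

# Import path assumed from the distribution name; the diff only shows the package-internal __init__.py.
from videosdk.plugins.openai import (
    OpenAIRealtime,        # realtime speech-to-speech session (already present in 0.0.4)
    OpenAIRealtimeConfig,
    OpenAILLM,             # new: streaming chat completions wrapper
    OpenAISTT,             # new: realtime transcription over WebSocket
    OpenAITTS,             # new: streaming text-to-speech
)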
@@ -0,0 +1,161 @@
+from __future__ import annotations
+
+import os
+from typing import Any, AsyncIterator
+import json
+
+import httpx
+import openai
+from videosdk.agents import LLM, LLMResponse, ChatContext, ChatRole, ChatMessage, FunctionCall, FunctionCallOutput, ToolChoice, FunctionTool, is_function_tool, build_openai_schema
+
+class OpenAILLM(LLM):
+
+    def __init__(
+        self,
+        *,
+        model: str = "gpt-4o",
+        api_key: str | None = None,
+        base_url: str | None = None,
+        temperature: float = 0.7,
+        tool_choice: ToolChoice = "auto",
+        max_completion_tokens: int | None = None,
+    ) -> None:
+        super().__init__()
+        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+        if not self.api_key:
+            raise ValueError("OpenAI API key must be provided either through api_key parameter or OPENAI_API_KEY environment variable")
+
+        self.model = model
+        self.temperature = temperature
+        self.tool_choice = tool_choice
+        self.max_completion_tokens = max_completion_tokens
+
+        self._client = openai.AsyncOpenAI(
+            api_key=self.api_key,
+            base_url=base_url or None,
+            max_retries=0,
+            http_client=httpx.AsyncClient(
+                timeout=httpx.Timeout(connect=15.0, read=5.0, write=5.0, pool=5.0),
+                follow_redirects=True,
+                limits=httpx.Limits(
+                    max_connections=50,
+                    max_keepalive_connections=50,
+                    keepalive_expiry=120,
+                ),
+            ),
+        )
+
+    async def chat(
+        self,
+        messages: ChatContext,
+        tools: list[FunctionTool] | None = None,
+        **kwargs: Any
+    ) -> AsyncIterator[LLMResponse]:
+        """
+        Implement chat functionality using OpenAI's chat completion API
+
+        Args:
+            messages: ChatContext containing conversation history
+            tools: Optional list of function tools available to the model
+            **kwargs: Additional arguments passed to the OpenAI API
+
+        Yields:
+            LLMResponse objects containing the model's responses
+        """
+        completion_params = {
+            "model": self.model,
+            "messages": [
+                {
+                    "role": msg.role.value,
+                    "content": msg.content,
+                    **({"name": msg.name} if hasattr(msg, 'name') else {})
+                } if isinstance(msg, ChatMessage) else
+                {
+                    "role": "assistant",
+                    "content": None,
+                    "function_call": {
+                        "name": msg.name,
+                        "arguments": msg.arguments
+                    }
+                } if isinstance(msg, FunctionCall) else
+                {
+                    "role": "function",
+                    "name": msg.name,
+                    "content": msg.output
+                } if isinstance(msg, FunctionCallOutput) else None
+                for msg in messages.items
+                if msg is not None
+            ],
+            "temperature": self.temperature,
+            "stream": True,
+            "max_tokens": self.max_completion_tokens,
+        }
+
+        if tools:
+            formatted_tools = []
+            for tool in tools:
+                if not is_function_tool(tool):
+                    continue
+                try:
+                    tool_schema = build_openai_schema(tool)
+                    formatted_tools.append(tool_schema)
+                except Exception as e:
+                    print(f"Failed to format tool {tool}: {e}")
+                    continue
+
+            if formatted_tools:
+                completion_params["functions"] = formatted_tools
+                completion_params["function_call"] = self.tool_choice
+
+        completion_params.update(kwargs)
+        try:
+            response_stream = await self._client.chat.completions.create(**completion_params)
+            current_content = ""
+            current_function_call = None
+
+            async for chunk in response_stream:
+                if not chunk.choices:
+                    continue

+                delta = chunk.choices[0].delta
+                if delta.function_call:
+                    if current_function_call is None:
+                        current_function_call = {
+                            "name": delta.function_call.name or "",
+                            "arguments": delta.function_call.arguments or ""
+                        }
+                    else:
+                        if delta.function_call.name:
+                            current_function_call["name"] += delta.function_call.name
+                        if delta.function_call.arguments:
+                            current_function_call["arguments"] += delta.function_call.arguments
+                elif current_function_call is not None:
+                    try:
+                        args = json.loads(current_function_call["arguments"])
+                        current_function_call["arguments"] = args
+                    except json.JSONDecodeError:
+                        print(f"Failed to parse function arguments: {current_function_call['arguments']}")
+                        current_function_call["arguments"] = {}
+
+                    yield LLMResponse(
+                        content="",
+                        role=ChatRole.ASSISTANT,
+                        metadata={"function_call": current_function_call}
+                    )
+                    current_function_call = None
+
+                elif delta.content is not None:
+                    current_content += delta.content
+                    yield LLMResponse(
+                        content=current_content,
+                        role=ChatRole.ASSISTANT
+                    )
+
+        except Exception as e:
+            self.emit("error", e)
+            raise
+
+    async def aclose(self) -> None:
+        """Cleanup resources by closing the HTTP client"""
+        if self._client:
+            await self._client.close()
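A minimal usage sketch for the new streaming LLM wrapper. The constructor arguments and the chat() signature are taken from the code above; how a ChatContext is assembled belongs to videosdk-agents and is not part of this diff, so the helper below is hypothetical:

import asyncio
from videosdk.plugins.openai import OpenAILLM   # import path assumed, as noted earlier

async def main() -> None:
    # Reads OPENAI_API_KEY from the environment when api_key is omitted.
    llm = OpenAILLM(model="gpt-4o", temperature=0.7)
    ctx = build_chat_context()   # hypothetical helper returning a videosdk.agents.ChatContext
    async for response in llm.chat(ctx):
        # Text arrives as cumulative assistant content; completed function calls arrive via response.metadata.
        print(response.content)
    await llm.aclose()

asyncio.run(main())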
@@ -10,6 +10,8 @@ from dotenv import load_dotenv
 import uuid
 import base64
 import aiohttp
+import numpy as np
+from scipy import signal
 import traceback
 from videosdk.agents import (
     FunctionTool,
@@ -18,15 +20,14 @@ from videosdk.agents import (
     build_openai_schema,
     CustomAudioStreamTrack,
     ToolChoice,
-    RealtimeBaseModel
+    RealtimeBaseModel,
+    Agent
 )

 load_dotenv()
 from openai.types.beta.realtime.session import InputAudioTranscription, TurnDetection

 OPENAI_BASE_URL = "https://api.openai.com/v1"
-SAMPLE_RATE = 24000
-NUM_CHANNELS = 1

 DEFAULT_TEMPERATURE = 0.8
 DEFAULT_TURN_DETECTION = TurnDetection(
@@ -44,7 +45,8 @@ DEFAULT_TOOL_CHOICE = "auto"

 OpenAIEventTypes = Literal[
     "instructions_updated",
-    "tools_updated"
+    "tools_updated",
+    "text_response"
 ]
 DEFAULT_VOICE = "alloy"
 DEFAULT_INPUT_AUDIO_FORMAT = "pcm16"
@@ -128,8 +130,17 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         self.audio_track: Optional[CustomAudioStreamTrack] = None
         self._formatted_tools: Optional[List[Dict[str, Any]]] = None
         self.config: OpenAIRealtimeConfig = config or OpenAIRealtimeConfig()
-        self.on("instructions_updated", self._handle_instructions_updated)
-        self.on("tools_updated", self._handle_tools_updated)
+        # global_event_emitter.on("instructions_updated", self._handle_instructions_updated)
+        # global_event_emitter.on("tools_updated", self._handle_tools_updated)
+
+        self.input_sample_rate = 48000
+        self.target_sample_rate = 16000
+
+    def set_agent(self, agent: Agent) -> None:
+        self._instructions = agent.instructions
+        self._tools = agent.tools
+        self.tools_formatted = self._format_tools_for_session(self._tools)
+        self._formatted_tools = self.tools_formatted

     async def connect(self) -> None:
         headers = {"Agent": "VideoSDK Agents"}
@@ -144,7 +155,10 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):

     async def handle_audio_input(self, audio_data: bytes) -> None:
         """Handle incoming audio data from the user"""
-        if self._session and not self._closing:
+        if self._session and not self._closing and "audio" in self.config.modalities:
+            audio_data = np.frombuffer(audio_data, dtype=np.int16)
+            audio_data = signal.resample(audio_data, int(len(audio_data) * self.target_sample_rate / self.input_sample_rate))
+            audio_data = audio_data.astype(np.int16).tobytes()
             base64_audio_data = base64.b64encode(audio_data).decode("utf-8")
             audio_event = {
                 "type": "input_audio_buffer.append",
@@ -299,17 +313,23 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):

            elif event_type == "conversation.item.input_audio_transcription.completed":
                await self._handle_input_audio_transcription_completed(data)
+
+            elif event_type == "response.text.done":
+                await self._handle_text_done(data)

        except Exception as e:
            self.emit_error(f"Error handling event {event_type}: {str(e)}")

    async def _handle_speech_started(self, data: dict) -> None:
        """Handle speech detection start"""
-        await self.interrupt()
-        self.audio_track.interrupt()
+        if "audio" in self.config.modalities:
+            await self.interrupt()
+            if self.audio_track:
+                self.audio_track.interrupt()

    async def _handle_speech_stopped(self, data: dict) -> None:
        """Handle speech detection end"""
+        pass

    async def _handle_response_created(self, data: dict) -> None:
        """Handle initial response creation"""
@@ -365,6 +385,9 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):

    async def _handle_audio_delta(self, data: dict) -> None:
        """Handle audio chunk"""
+        if "audio" not in self.config.modalities:
+            return
+
        try:
            base64_audio_data = base64.b64decode(data.get("delta"))
            if base64_audio_data:
@@ -448,31 +471,45 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
        if not self._session:
            return

+        # Conditionally set turn detection and audio transcription based on modalities
+        turn_detection = None
+        input_audio_transcription = None
+
+        if "audio" in self.config.modalities:
+            turn_detection = self.config.turn_detection.model_dump(
+                by_alias=True,
+                exclude_unset=True,
+                exclude_defaults=True,
+            ) if self.config.turn_detection else None
+            input_audio_transcription = self.config.input_audio_transcription.model_dump(
+                by_alias=True,
+                exclude_unset=True,
+                exclude_defaults=True,
+            ) if self.config.input_audio_transcription else None
+
        session_update = {
            "type": "session.update",
            "session": {
                "model": self.model,
-                "voice": self.config.voice,
-                "instructions": self._instructions or "You are a helpful voice assistant that can answer questions and help with tasks.",
+                "instructions": self._instructions or "You are a helpful assistant that can answer questions and help with tasks.",
                "temperature": self.config.temperature,
-                "turn_detection": self.config.turn_detection.model_dump(
-                    by_alias=True,
-                    exclude_unset=True,
-                    exclude_defaults=True,
-                ),
-                "input_audio_transcription": self.config.input_audio_transcription.model_dump(
-                    by_alias=True,
-                    exclude_unset=True,
-                    exclude_defaults=True,
-                ),
                "tool_choice": self.config.tool_choice,
                "tools": self._formatted_tools or [],
                "modalities": self.config.modalities,
-                "input_audio_format": DEFAULT_INPUT_AUDIO_FORMAT,
-                "output_audio_format": DEFAULT_OUTPUT_AUDIO_FORMAT,
                "max_response_output_tokens": "inf"
            }
        }
+
+        # Only add audio-related configurations if audio modality is enabled
+        if "audio" in self.config.modalities:
+            session_update["session"]["voice"] = self.config.voice
+            session_update["session"]["input_audio_format"] = DEFAULT_INPUT_AUDIO_FORMAT
+            session_update["session"]["output_audio_format"] = DEFAULT_OUTPUT_AUDIO_FORMAT
+            if turn_detection:
+                session_update["session"]["turn_detection"] = turn_detection
+            if input_audio_transcription:
+                session_update["session"]["input_audio_transcription"] = input_audio_transcription
+
        # Send the event
        await self.send_event(session_update)

@@ -521,4 +558,33 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
        tools = data.get("tools", [])
        self._tools = tools
        self.tools_formatted = self._format_tools_for_session(tools)
-        self._formatted_tools = self.tools_formatted
+        self._formatted_tools = self.tools_formatted
+
+    async def send_text_message(self, message: str) -> None:
+        """Send a text message to the OpenAI realtime API"""
+        if not self._session:
+            raise RuntimeError("No active WebSocket session")
+
+        await self.send_event({
+            "type": "conversation.item.create",
+            "item": {
+                "type": "message",
+                "role": "user",
+                "content": [
+                    {
+                        "type": "input_text",
+                        "text": message
+                    }
+                ]
+            }
+        })
+        await self.create_response()
+
+    async def _handle_text_done(self, data: dict) -> None:
+        """Handle text response completion"""
+        try:
+            text_content = data.get("text", "")
+            if text_content:
+                self.emit("text_response", {"text": text_content, "type": "done"})
+        except Exception as e:
+            print(f"[ERROR] Error handling text done: {e}")
@@ -0,0 +1,260 @@
+from __future__ import annotations
+
+import asyncio
+import base64
+import os
+from typing import Any, Optional
+from urllib.parse import urlencode
+from scipy import signal
+import aiohttp
+import httpx
+import openai
+import numpy as np
+from videosdk.agents import STT as BaseSTT, STTResponse, SpeechEventType, SpeechData, global_event_emitter
+
+class OpenAISTT(BaseSTT):
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        model: str = "whisper-1",
+        base_url: str | None = None,
+        prompt: str | None = None,
+        language: str = "en",
+        turn_detection: dict | None = None,
+    ) -> None:
+        super().__init__()
+
+        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+        if not self.api_key:
+            raise ValueError("OpenAI API key must be provided either through api_key parameter or OPENAI_API_KEY environment variable")
+
+        self.model = model
+        self.language = language
+        self.prompt = prompt
+        self.turn_detection = turn_detection or {
+            "type": "server_vad",
+            "threshold": 0.5,
+            "prefix_padding_ms": 300,
+            "silence_duration_ms": 500,
+        }
+
+        self.client = openai.AsyncClient(
+            max_retries=0,
+            api_key=api_key,
+            base_url=base_url or None,
+            http_client=httpx.AsyncClient(
+                timeout=httpx.Timeout(connect=15.0, read=5.0, write=5.0, pool=5.0),
+                follow_redirects=True,
+                limits=httpx.Limits(
+                    max_connections=50,
+                    max_keepalive_connections=50,
+                    keepalive_expiry=120,
+                ),
+            ),
+        )
+
+        self._session: Optional[aiohttp.ClientSession] = None
+        self._ws: Optional[aiohttp.ClientWebSocketResponse] = None
+        self._ws_task: Optional[asyncio.Task] = None
+        self._current_text = ""
+        self._last_interim_at = 0
+
+        self.input_sample_rate = 48000
+        self.target_sample_rate = 16000
+
+    async def process_audio(
+        self,
+        audio_frames: bytes,
+        language: Optional[str] = None,
+        **kwargs: Any
+    ) -> None:
+        """Process audio frames and send to OpenAI's Realtime API"""
+
+        if not self._ws:
+            await self._connect_ws()
+            self._ws_task = asyncio.create_task(self._listen_for_responses())
+
+        try:
+            audio_data = np.frombuffer(audio_frames, dtype=np.int16)
+            audio_data = signal.resample(audio_data, int(len(audio_data) * self.target_sample_rate / self.input_sample_rate))
+            audio_data = audio_data.astype(np.int16).tobytes()
+            audio_data = base64.b64encode(audio_data).decode("utf-8")
+            message = {
+                "type": "input_audio_buffer.append",
+                "audio": audio_data,
+            }
+            await self._ws.send_json(message)
+        except Exception as e:
+            print(f"Error in process_audio: {str(e)}")
+            self.emit("error", str(e))
+            if self._ws:
+                await self._ws.close()
+                self._ws = None
+            if self._ws_task:
+                self._ws_task.cancel()
+                self._ws_task = None
+
+    async def _listen_for_responses(self) -> None:
+        """Background task to listen for WebSocket responses"""
+        if not self._ws:
+            return
+
+        try:
+            async for msg in self._ws:
+                if msg.type == aiohttp.WSMsgType.TEXT:
+                    data = msg.json()
+                    responses = self._handle_ws_message(data)
+                    for response in responses:
+                        if self._transcript_callback:
+                            await self._transcript_callback(response)
+                elif msg.type == aiohttp.WSMsgType.ERROR:
+                    error = f"WebSocket error: {self._ws.exception()}"
+                    print(error)
+                    self.emit("error", error)
+                    break
+                elif msg.type == aiohttp.WSMsgType.CLOSED:
+                    print("WebSocket connection closed")
+                    break
+        except Exception as e:
+            error = f"Error in WebSocket listener: {str(e)}"
+            print(error)
+            self.emit("error", error)
+        finally:
+            if self._ws:
+                await self._ws.close()
+                self._ws = None
+
+    async def _connect_ws(self) -> None:
+        """Establish WebSocket connection with OpenAI's Realtime API"""
+
+        if not self._session:
+            self._session = aiohttp.ClientSession()
+
+        config = {
+            "type": "transcription_session.update",
+            "session": {
+                "input_audio_format": "pcm16",
+                "input_audio_transcription": {
+                    "model": self.model,
+                    "prompt": self.prompt or "",
+                    "language": self.language if self.language else None,
+                },
+                "turn_detection": self.turn_detection,
+                "input_audio_noise_reduction": {
+                    "type": "near_field"
+                },
+                "include": ["item.input_audio_transcription.logprobs"]
+            }
+        }
+
+        query_params = {
+            "intent": "transcription",
+        }
+        headers = {
+            "User-Agent": "VideoSDK",
+            "Authorization": f"Bearer {self.api_key}",
+            "OpenAI-Beta": "realtime=v1",
+        }
+
+        base_url = str(self.client.base_url).rstrip('/')
+        ws_url = f"{base_url}/realtime?{urlencode(query_params)}"
+        if ws_url.startswith("http"):
+            ws_url = ws_url.replace("http", "ws", 1)
+
+        try:
+            self._ws = await self._session.ws_connect(ws_url, headers=headers)
+
+            initial_response = await self._ws.receive_json()
+
+            if initial_response.get("type") != "transcription_session.created":
+                raise Exception(f"Expected session creation, got: {initial_response}")
+
+            await self._ws.send_json(config)
+
+            update_response = await self._ws.receive_json()
+
+            if update_response.get("type") != "transcription_session.updated":
+                raise Exception(f"Configuration update failed: {update_response}")
+
+        except Exception as e:
+            print(f"Error connecting to WebSocket: {str(e)}")
+            if self._ws:
+                await self._ws.close()
+                self._ws = None
+            raise
+
+    def _handle_ws_message(self, msg: dict) -> list[STTResponse]:
+        """Handle incoming WebSocket messages and generate STT responses"""
+        responses = []
+
+        try:
+            msg_type = msg.get("type")
+            if msg_type == "conversation.item.input_audio_transcription.delta":
+                delta = msg.get("delta", "")
+                if delta:
+                    self._current_text += delta
+                    current_time = asyncio.get_event_loop().time()
+
+                    if current_time - self._last_interim_at > 0.5:
+                        responses.append(STTResponse(
+                            event_type=SpeechEventType.INTERIM,
+                            data=SpeechData(
+                                text=self._current_text,
+                                language=self.language,
+                            ),
+                            metadata={"model": self.model}
+                        ))
+                        self._last_interim_at = current_time
+
+            elif msg_type == "conversation.item.input_audio_transcription.completed":
+                transcript = msg.get("transcript", "")
+                if transcript:
+                    responses.append(STTResponse(
+                        event_type=SpeechEventType.FINAL,
+                        data=SpeechData(
+                            text=transcript,
+                            language=self.language,
+                        ),
+                        metadata={"model": self.model}
+                    ))
+                    self._current_text = ""
+
+            elif msg_type == "input_audio_buffer.speech_started":
+                global_event_emitter.emit("speech_started")
+
+            elif msg_type == "input_audio_buffer.speech_stopped":
+                global_event_emitter.emit("speech_stopped")
+
+        except Exception as e:
+            print(f"Error handling WebSocket message: {str(e)}")
+
+        return responses
+
+    async def aclose(self) -> None:
+        """Cleanup resources"""
+        if self._ws_task:
+            self._ws_task.cancel()
+            try:
+                await self._ws_task
+            except asyncio.CancelledError:
+                pass
+            self._ws_task = None
+
+        if self._ws:
+            await self._ws.close()
+            self._ws = None
+
+        if self._session:
+            await self._session.close()
+            self._session = None
+
+        await self.client.close()
+
+    async def _ensure_ws_connection(self):
+        """Ensure WebSocket is connected, reconnect if necessary"""
+        if not self._ws or self._ws.closed:
+            await self._connect_ws()
+
+
+
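A minimal transcription sketch using only the members shown above (the constructor, process_audio(), and aclose()). The 48 kHz PCM16 input expectation comes from input_sample_rate; interim and final transcripts are delivered through the base-class transcript callback, whose registration API lives in videosdk-agents and is not shown here:

import asyncio
from videosdk.plugins.openai import OpenAISTT   # import path assumed

async def main() -> None:
    stt = OpenAISTT(api_key="sk-...")   # whisper-1 and language="en" by default
    frame_48k = bytes(960 * 2)          # 20 ms of silent 48 kHz mono PCM16, a stand-in for real audio
    await stt.process_audio(frame_48k)  # resampled to 16 kHz and appended to the transcription buffer
    await asyncio.sleep(1.0)            # let the background listener task drain any responses
    await stt.aclose()

asyncio.run(main())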
@@ -0,0 +1,109 @@
+from __future__ import annotations
+
+from typing import Any, AsyncIterator, Literal, Optional, Union
+import httpx
+import os
+import openai
+
+from videosdk.agents import TTS
+
+OPENAI_TTS_SAMPLE_RATE = 24000
+OPENAI_TTS_CHANNELS = 1
+
+DEFAULT_MODEL = "gpt-4o-mini-tts"
+DEFAULT_VOICE = "ash"
+_RESPONSE_FORMATS = Union[Literal["mp3", "opus", "aac", "flac", "wav", "pcm"], str]
+
+class OpenAITTS(TTS):
+    def __init__(
+        self,
+        *,
+        model: str = DEFAULT_MODEL,
+        voice: str = DEFAULT_VOICE,
+        speed: float = 1.0,
+        instructions: str | None = None,
+        api_key: str | None = None,
+        base_url: str | None = None,
+        response_format: str = "pcm"
+    ) -> None:
+        super().__init__(sample_rate=OPENAI_TTS_SAMPLE_RATE, num_channels=OPENAI_TTS_CHANNELS)
+
+        self.model = model
+        self.voice = voice
+        self.speed = speed
+        self.instructions = instructions
+        self.audio_track = None
+        self.loop = None
+        self.response_format = response_format
+
+        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+        if not self.api_key:
+            raise ValueError("OpenAI API key must be provided either through api_key parameter or OPENAI_API_KEY environment variable")
+
+        self._client = openai.AsyncClient(
+            max_retries=0,
+            api_key=self.api_key,
+            base_url=base_url or None,
+            http_client=httpx.AsyncClient(
+                timeout=httpx.Timeout(connect=15.0, read=5.0, write=5.0, pool=5.0),
+                follow_redirects=True,
+                limits=httpx.Limits(
+                    max_connections=50,
+                    max_keepalive_connections=50,
+                    keepalive_expiry=120,
+                ),
+            ),
+        )
+
+    async def synthesize(
+        self,
+        text: AsyncIterator[str] | str,
+        voice_id: Optional[str] = None,
+        **kwargs: Any
+    ) -> None:
+        """
+        Convert text to speech using OpenAI's TTS API and stream to audio track
+
+        Args:
+            text: Text to convert to speech
+            voice_id: Optional voice override
+            **kwargs: Additional provider-specific arguments
+        """
+        try:
+            if isinstance(text, AsyncIterator):
+                full_text = ""
+                async for chunk in text:
+                    full_text += chunk
+            else:
+                full_text = text
+
+            if not self.audio_track or not self.loop:
+                self.emit("error", "Audio track or event loop not set")
+                return
+
+            async with self._client.audio.speech.with_streaming_response.create(
+                model=self.model,
+                voice=voice_id or self.voice,
+                input=full_text,
+                speed=self.speed,
+                response_format=self.response_format,
+                **({"instructions": self.instructions} if self.instructions else {})
+            ) as response:
+                async for chunk in response.iter_bytes():
+                    if chunk:
+                        self.loop.create_task(self.audio_track.add_new_bytes(chunk))
+
+        except openai.APIError as e:
+            self.emit("error", str(e))
+        except Exception as e:
+            self.emit("error", f"TTS synthesis failed: {str(e)}")
+
+    async def aclose(self) -> None:
+        """Cleanup resources"""
+        await self._client.close()
+        await super().aclose()
+
+    async def interrupt(self) -> None:
+        """Interrupt the TTS process"""
+        if self.audio_track:
+            self.audio_track.interrupt()
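A sketch of using the new TTS class. synthesize() streams PCM chunks into self.audio_track on self.loop; in the framework both are wired up by the agent pipeline, so the assignments below are illustrative assumptions made only to satisfy the guard in synthesize():

import asyncio
from videosdk.plugins.openai import OpenAITTS   # import path assumed

async def main() -> None:
    tts = OpenAITTS(voice="ash", response_format="pcm")   # 24 kHz mono PCM output
    tts.loop = asyncio.get_running_loop()
    tts.audio_track = my_audio_track   # hypothetical CustomAudioStreamTrack provided by the pipeline
    await tts.synthesize("Hello from the VideoSDK OpenAI plugin.")
    await tts.aclose()

asyncio.run(main())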
@@ -0,0 +1 @@
+__version__ = "0.0.6"
@@ -1,6 +0,0 @@
-from .realtime_api import OpenAIRealtime, OpenAIRealtimeConfig
-
-__all__ = [
-    'OpenAIRealtime',
-    'OpenAIRealtimeConfig'
-]
@@ -1 +0,0 @@
-__version__ = "0.0.4"