videosdk-plugins-openai 0.0.21__tar.gz → 0.0.23__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of videosdk-plugins-openai might be problematic.
- {videosdk_plugins_openai-0.0.21 → videosdk_plugins_openai-0.0.23}/PKG-INFO +2 -2
- {videosdk_plugins_openai-0.0.21 → videosdk_plugins_openai-0.0.23}/pyproject.toml +1 -1
- {videosdk_plugins_openai-0.0.21 → videosdk_plugins_openai-0.0.23}/videosdk/plugins/openai/llm.py +2 -2
- {videosdk_plugins_openai-0.0.21 → videosdk_plugins_openai-0.0.23}/videosdk/plugins/openai/realtime_api.py +55 -26
- {videosdk_plugins_openai-0.0.21 → videosdk_plugins_openai-0.0.23}/videosdk/plugins/openai/tts.py +9 -1
- videosdk_plugins_openai-0.0.23/videosdk/plugins/openai/version.py +1 -0
- videosdk_plugins_openai-0.0.21/videosdk/plugins/openai/version.py +0 -1
- {videosdk_plugins_openai-0.0.21 → videosdk_plugins_openai-0.0.23}/.gitignore +0 -0
- {videosdk_plugins_openai-0.0.21 → videosdk_plugins_openai-0.0.23}/README.md +0 -0
- {videosdk_plugins_openai-0.0.21 → videosdk_plugins_openai-0.0.23}/videosdk/plugins/openai/__init__.py +0 -0
- {videosdk_plugins_openai-0.0.21 → videosdk_plugins_openai-0.0.23}/videosdk/plugins/openai/stt.py +0 -0
{videosdk_plugins_openai-0.0.21 → videosdk_plugins_openai-0.0.23}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videosdk-plugins-openai
-Version: 0.0.21
+Version: 0.0.23
 Summary: VideoSDK Agent Framework plugin for OpenAI services
 Author: videosdk
 License-Expression: Apache-2.0
@@ -13,7 +13,7 @@ Classifier: Topic :: Multimedia :: Video
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.11
 Requires-Dist: openai[realtime]>=1.68.2
-Requires-Dist: videosdk-agents>=0.0.
+Requires-Dist: videosdk-agents>=0.0.23
 Description-Content-Type: text/markdown

 # VideoSDK OpenAI Plugin
{videosdk_plugins_openai-0.0.21 → videosdk_plugins_openai-0.0.23}/videosdk/plugins/openai/llm.py
RENAMED
@@ -133,7 +133,7 @@ class OpenAILLM(LLM):
                 tool_schema = build_openai_schema(tool)
                 formatted_tools.append(tool_schema)
             except Exception as e:
-
+                self.emit("error", f"Failed to format tool {tool}: {e}")
                 continue

         if formatted_tools:
@@ -167,7 +167,7 @@ class OpenAILLM(LLM):
                     args = json.loads(current_function_call["arguments"])
                     current_function_call["arguments"] = args
                 except json.JSONDecodeError:
-
+                    self.emit("error", f"Failed to parse function arguments: {current_function_call['arguments']}")
                     current_function_call["arguments"] = {}

                 yield LLMResponse(
{videosdk_plugins_openai-0.0.21 → videosdk_plugins_openai-0.0.23}/videosdk/plugins/openai/realtime_api.py
RENAMED
@@ -24,6 +24,8 @@ from videosdk.agents import (
     global_event_emitter,
     Agent
 )
+from videosdk.agents import realtime_metrics_collector
+

 load_dotenv()
 from openai.types.beta.realtime.session import InputAudioTranscription, TurnDetection
@@ -45,9 +47,9 @@ DEFAULT_INPUT_AUDIO_TRANSCRIPTION = InputAudioTranscription(
 DEFAULT_TOOL_CHOICE = "auto"

 OpenAIEventTypes = Literal[
-    "
-    "
-    "
+    "user_speech_started",
+    "text_response",
+    "error"
 ]
 DEFAULT_VOICE = "alloy"
 DEFAULT_INPUT_AUDIO_FORMAT = "pcm16"
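The three names in `OpenAIEventTypes` now match what the plugin actually emits: "error" wherever a failure was previously swallowed, "user_speech_started" when server-side VAD fires, and "text_response" with the full agent transcript via `global_event_emitter`. A minimal wiring sketch, assuming the `RealtimeBaseModel` emitter base and `global_event_emitter` expose a pyee-style `on(event, handler)` registration; that API and the export path are not shown in this diff:

# Hypothetical consumer sketch -- the .on() registration method and the
# OpenAIRealtime constructor arguments are assumptions, not part of this diff.
from videosdk.agents import global_event_emitter
from videosdk.plugins.openai import OpenAIRealtime  # export path assumed

model = OpenAIRealtime(api_key="sk-...")  # other required arguments omitted

# "error" replaces the silent failures removed in 0.0.23
model.on("error", lambda msg: print(f"[openai realtime error] {msg}"))

# emitted when server-side VAD detects the user starting to speak
model.on("user_speech_started", lambda payload: print("user speaking", payload))

# emitted from response.done with the accumulated agent transcript
global_event_emitter.on("text_response", lambda payload: print("agent:", payload["text"]))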
@@ -121,6 +123,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         self.api_key = api_key or os.getenv("OPENAI_API_KEY")
         self.base_url = base_url or OPENAI_BASE_URL
         if not self.api_key:
+            self.emit("error", "OpenAI API key must be provided or set in OPENAI_API_KEY environment variable")
             raise ValueError("OpenAI API key must be provided or set in OPENAI_API_KEY environment variable")
         self._http_session: Optional[aiohttp.ClientSession] = None
         self._session: Optional[OpenAISession] = None
@@ -133,6 +136,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         self.config: OpenAIRealtimeConfig = config or OpenAIRealtimeConfig()
         self.input_sample_rate = 48000
         self.target_sample_rate = 16000
+        self._agent_speaking = False

     def set_agent(self, agent: Agent) -> None:
         self._instructions = agent.instructions
@@ -202,6 +206,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
     async def create_response(self) -> None:
         """Create a response to the OpenAI realtime API"""
         if not self._session:
+            self.emit("error", "No active WebSocket session")
             raise RuntimeError("No active WebSocket session")

         response_event = {
@@ -245,15 +250,15 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
                 msg = await session.ws.receive()

                 if msg.type == aiohttp.WSMsgType.CLOSED:
-
+                    self.emit("error", f"WebSocket closed with reason: {msg.extra}")
                     break
                 elif msg.type == aiohttp.WSMsgType.ERROR:
-
+                    self.emit("error", f"WebSocket error: {msg.data}")
                     break
                 elif msg.type == aiohttp.WSMsgType.TEXT:
                     await self._handle_message(json.loads(msg.data))
         except Exception as e:
-
+            self.emit("error", f"WebSocket receive error: {str(e)}")
         finally:
             await self._cleanup_session(session)

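The receive loop now reports why it stopped instead of breaking silently. For reference, a standalone sketch of the same aiohttp receive-loop pattern, using only public aiohttp APIs; the URL and callbacks here are placeholders, not the plugin's actual connection code:

# Illustrative receive loop -- mirrors the CLOSED/ERROR/TEXT handling above.
import json
import aiohttp

async def receive_loop(url: str, handle_message, report_error) -> None:
    async with aiohttp.ClientSession() as http:
        async with http.ws_connect(url) as ws:
            while True:
                msg = await ws.receive()
                if msg.type == aiohttp.WSMsgType.CLOSED:
                    report_error(f"WebSocket closed with reason: {msg.extra}")
                    break
                elif msg.type == aiohttp.WSMsgType.ERROR:
                    report_error(f"WebSocket error: {msg.data}")
                    break
                elif msg.type == aiohttp.WSMsgType.TEXT:
                    await handle_message(json.loads(msg.data))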
@@ -277,11 +282,14 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
             elif event_type == "response.content_part.added":
                 await self._handle_content_part_added(data)

+            elif event_type == "response.text.delta":
+                await self._handle_text_delta(data)
+
             elif event_type == "response.audio.delta":
                 await self._handle_audio_delta(data)

             elif event_type == "response.audio_transcript.delta":
-                await self.
+                await self._handle_audio_transcript_delta(data)

             elif event_type == "response.done":
                 await self._handle_response_done(data)
@@ -305,18 +313,20 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
                 await self._handle_text_done(data)

         except Exception as e:
-            self.
+            self.emit("error", f"Error handling event {event_type}: {str(e)}")

     async def _handle_speech_started(self, data: dict) -> None:
         """Handle speech detection start"""
         if "audio" in self.config.modalities:
+            self.emit("user_speech_started", {"type": "done"})
             await self.interrupt()
             if self.audio_track:
                 self.audio_track.interrupt()
+        await realtime_metrics_collector.set_user_speech_start()

     async def _handle_speech_stopped(self, data: dict) -> None:
         """Handle speech detection end"""
-
+        await realtime_metrics_collector.set_user_speech_end()

     async def _handle_response_created(self, data: dict) -> None:
         """Handle initial response creation"""
@@ -338,6 +348,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
             tool_info = get_tool_info(tool)
             if tool_info.name == name:
                 try:
+                    await realtime_metrics_collector.add_tool_call(name)
                     result = await tool(**arguments)
                     await self.send_event({
                         "type": "conversation.item.create",
@@ -360,26 +371,33 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
                     })

                 except Exception as e:
-
+                    self.emit("error", f"Error executing function {name}: {e}")
                 break
         except Exception as e:
-
+            self.emit("error", f"Error handling output item done: {e}")

     async def _handle_content_part_added(self, data: dict) -> None:
         """Handle new content part"""

+    async def _handle_text_delta(self, data: dict) -> None:
+        """Handle text delta chunk"""
+        pass
+
     async def _handle_audio_delta(self, data: dict) -> None:
         """Handle audio chunk"""
         if "audio" not in self.config.modalities:
             return

         try:
+            if not self._agent_speaking:
+                await realtime_metrics_collector.set_agent_speech_start()
+                self._agent_speaking = True
             base64_audio_data = base64.b64decode(data.get("delta"))
             if base64_audio_data:
                 if self.audio_track and self.loop:
                     self.loop.create_task(self.audio_track.add_new_bytes(base64_audio_data))
         except Exception as e:
-
+            self.emit("error", f"Error handling audio delta: {e}")
             traceback.print_exc()

     async def interrupt(self) -> None:
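The new `_agent_speaking` flag is an edge trigger: agent speech-start is recorded only on the first audio delta of a response, and the flag is cleared when the response finishes or is interrupted. A reduced sketch of that latch, with `collector` standing in for `realtime_metrics_collector`:

# Reduced latch sketch; `collector` is a stand-in for realtime_metrics_collector.
class AgentSpeechLatch:
    def __init__(self, collector):
        self.collector = collector
        self.speaking = False

    async def on_audio_delta(self) -> None:
        # Only the first audio chunk of a response marks speech start.
        if not self.speaking:
            await self.collector.set_agent_speech_start()
            self.speaking = True

    async def on_done_or_interrupt(self) -> None:
        if self.speaking:
            await self.collector.set_agent_speech_end(timeout=1.0)
            self.speaking = False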
@@ -390,18 +408,36 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
             "event_id": str(uuid.uuid4())
         }
         await self.send_event(cancel_event)
+        await realtime_metrics_collector.set_interrupted()
         if self.audio_track:
             self.audio_track.interrupt()
+        if self._agent_speaking:
+            await realtime_metrics_collector.set_agent_speech_end(timeout=1.0)
+            self._agent_speaking = False

-    async def
+    async def _handle_audio_transcript_delta(self, data: dict) -> None:
         """Handle transcript chunk"""
-
+        delta_content = data.get("delta", "")
+        if not hasattr(self, '_current_audio_transcript'):
+            self._current_audio_transcript = ""
+        self._current_audio_transcript += delta_content
+
     async def _handle_input_audio_transcription_completed(self, data: dict) -> None:
-        """Handle input audio transcription completion"""
+        """Handle input audio transcription completion for user transcript"""
+        transcript = data.get("transcript", "")
+        if transcript:
+            await realtime_metrics_collector.set_user_transcript(transcript)

     async def _handle_response_done(self, data: dict) -> None:
-        """Handle response completion"""
-
+        """Handle response completion for agent transcript"""
+        if hasattr(self, '_current_audio_transcript') and self._current_audio_transcript:
+            await realtime_metrics_collector.set_agent_response(self._current_audio_transcript)
+            global_event_emitter.emit("text_response", {"text": self._current_audio_transcript, "type": "done"})
+            self._current_audio_transcript = ""
+        await realtime_metrics_collector.set_agent_speech_end(timeout=1.0)
+        self._agent_speaking = False
+        pass
+
     async def _handle_function_call_arguments_delta(self, data: dict) -> None:
         """Handle function call arguments delta"""

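Agent transcripts are now assembled from `response.audio_transcript.delta` chunks and flushed once in `_handle_response_done`, which is also where `text_response` is emitted (the old `_handle_text_done` helper, removed in the last hunk of this file, previously emitted it). A small standalone sketch of that accumulate-then-flush pattern; `on_complete` stands in for the metrics and emit calls:

# Accumulate-then-flush sketch of the transcript handling above.
class TranscriptAccumulator:
    def __init__(self, on_complete):
        self.buffer = ""
        self.on_complete = on_complete

    def handle_delta(self, data: dict) -> None:
        # response.audio_transcript.delta carries incremental text in "delta"
        self.buffer += data.get("delta", "")

    def handle_done(self) -> None:
        # response.done flushes the accumulated transcript exactly once
        if self.buffer:
            self.on_complete(self.buffer)
            self.buffer = ""

acc = TranscriptAccumulator(on_complete=lambda text: print("agent:", text))
acc.handle_delta({"delta": "Hello"})
acc.handle_delta({"delta": ", world"})
acc.handle_done()  # prints: agent: Hello, world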
@@ -526,7 +562,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
                 tool_schema = build_openai_schema(tool)
                 oai_tools.append(tool_schema)
             except Exception as e:
-
+                self.emit("error", f"Failed to format tool {tool}: {e}")
                 continue

         return oai_tools
@@ -534,6 +570,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
     async def send_text_message(self, message: str) -> None:
         """Send a text message to the OpenAI realtime API"""
         if not self._session:
+            self.emit("error", "No active WebSocket session")
             raise RuntimeError("No active WebSocket session")

         await self.send_event({
@@ -551,11 +588,3 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         })
         await self.create_response()

-    async def _handle_text_done(self, data: dict) -> None:
-        """Handle text response completion"""
-        try:
-            text_content = data.get("text", "")
-            if text_content:
-                global_event_emitter.emit("text_response", {"text": text_content, "type": "done"})
-        except Exception as e:
-            print(f"[ERROR] Error handling text done: {e}")
{videosdk_plugins_openai-0.0.21 → videosdk_plugins_openai-0.0.23}/videosdk/plugins/openai/tts.py
RENAMED
@@ -36,6 +36,7 @@ class OpenAITTS(TTS):
         self.audio_track = None
         self.loop = None
         self.response_format = response_format
+        self._first_chunk_sent = False

         self.api_key = api_key or os.getenv("OPENAI_API_KEY")
         if not self.api_key:
@@ -55,6 +56,10 @@ class OpenAITTS(TTS):
             ),
         ),
     )
+
+    def reset_first_audio_tracking(self) -> None:
+        """Reset the first audio tracking state for next TTS task"""
+        self._first_chunk_sent = False

     async def synthesize(
         self,
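`reset_first_audio_tracking()` pairs with the `_first_audio_callback` hook used in the streaming loop below: the callback fires once, on the first full audio chunk of a synthesis, and the flag must be cleared before the next utterance. A hedged usage sketch; how `_first_audio_callback` is normally assigned lives in videosdk-agents, so the direct assignment and import path here are assumptions:

# Hypothetical wiring -- the framework normally installs _first_audio_callback;
# it is assigned directly here only to illustrate the first-chunk timing hook.
import time
from videosdk.plugins.openai import OpenAITTS  # export path assumed

tts = OpenAITTS(api_key="sk-...")  # other constructor arguments omitted
start = time.monotonic()

async def on_first_audio():
    # Fires once per synthesis, on the first full chunk queued to the audio track.
    print(f"first audio after {time.monotonic() - start:.3f}s")

tts._first_audio_callback = on_first_audio

# ... after a synthesis completes, clear the latch before the next TTS task:
tts.reset_first_audio_tracking()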
@@ -95,7 +100,6 @@ class OpenAITTS(TTS):
                 if chunk:
                     audio_data += chunk

-
             if audio_data:
                 await self._stream_audio_chunks(audio_data)

@@ -116,6 +120,10 @@ class OpenAITTS(TTS):
                 chunk += b'\x00' * padding_needed

             if len(chunk) == chunk_size:
+                if not self._first_chunk_sent and self._first_audio_callback:
+                    self._first_chunk_sent = True
+                    await self._first_audio_callback()
+
                 self.loop.create_task(self.audio_track.add_new_bytes(chunk))
                 await asyncio.sleep(0.001)

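The loop above sends fixed-size chunks to the audio track and zero-pads the final one so every write is exactly `chunk_size` bytes. A standalone sketch of that chunking; the 960-byte size is an assumed example value, since the real `chunk_size` is computed elsewhere in tts.py:

def iter_fixed_chunks(audio_data: bytes, chunk_size: int = 960):
    """Yield fixed-size chunks, zero-padding the last one (chunk_size is assumed)."""
    for start in range(0, len(audio_data), chunk_size):
        chunk = audio_data[start:start + chunk_size]
        if len(chunk) < chunk_size:
            chunk += b"\x00" * (chunk_size - len(chunk))  # pad final chunk with silence
        yield chunk

chunks = list(iter_fixed_chunks(b"\x01" * 2500, chunk_size=960))
assert all(len(c) == 960 for c in chunks) and len(chunks) == 3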
videosdk_plugins_openai-0.0.23/videosdk/plugins/openai/version.py
@@ -0,0 +1 @@
+__version__ = "0.0.23"

videosdk_plugins_openai-0.0.21/videosdk/plugins/openai/version.py
@@ -1 +0,0 @@
-__version__ = "0.0.21"
{videosdk_plugins_openai-0.0.21 → videosdk_plugins_openai-0.0.23}/.gitignore
File without changes

{videosdk_plugins_openai-0.0.21 → videosdk_plugins_openai-0.0.23}/README.md
File without changes

{videosdk_plugins_openai-0.0.21 → videosdk_plugins_openai-0.0.23}/videosdk/plugins/openai/__init__.py
File without changes
{videosdk_plugins_openai-0.0.21 → videosdk_plugins_openai-0.0.23}/videosdk/plugins/openai/stt.py
RENAMED
File without changes