videosdk-plugins-openai 0.0.22__py3-none-any.whl → 0.0.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of videosdk-plugins-openai might be problematic.
- videosdk/plugins/openai/llm.py +2 -2
- videosdk/plugins/openai/realtime_api.py +50 -22
- videosdk/plugins/openai/tts.py +9 -1
- videosdk/plugins/openai/version.py +1 -1
- {videosdk_plugins_openai-0.0.22.dist-info → videosdk_plugins_openai-0.0.23.dist-info}/METADATA +2 -2
- videosdk_plugins_openai-0.0.23.dist-info/RECORD +9 -0
- videosdk_plugins_openai-0.0.22.dist-info/RECORD +0 -9
- {videosdk_plugins_openai-0.0.22.dist-info → videosdk_plugins_openai-0.0.23.dist-info}/WHEEL +0 -0
videosdk/plugins/openai/llm.py
CHANGED
@@ -133,7 +133,7 @@ class OpenAILLM(LLM):
                 tool_schema = build_openai_schema(tool)
                 formatted_tools.append(tool_schema)
             except Exception as e:
-                …
+                self.emit("error", f"Failed to format tool {tool}: {e}")
                 continue

         if formatted_tools:
@@ -167,7 +167,7 @@ class OpenAILLM(LLM):
                 args = json.loads(current_function_call["arguments"])
                 current_function_call["arguments"] = args
             except json.JSONDecodeError:
-                …
+                self.emit("error", f"Failed to parse function arguments: {current_function_call['arguments']}")
                 current_function_call["arguments"] = {}

         yield LLMResponse(
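Both llm.py hunks replace swallowed failures with emitted "error" events. A minimal sketch of subscribing to them, assuming OpenAILLM exposes the framework's usual EventEmitter-style .on() registration:

    from videosdk.plugins.openai import OpenAILLM

    llm = OpenAILLM(api_key="sk-...")  # placeholder key for illustration

    # Tool-formatting and argument-parsing failures now surface here
    # instead of being dropped silently.
    llm.on("error", lambda message: print(f"[openai-llm] {message}"))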
videosdk/plugins/openai/realtime_api.py
CHANGED

@@ -24,6 +24,8 @@ from videosdk.agents import (
     global_event_emitter,
     Agent
 )
+from videosdk.agents import realtime_metrics_collector
+

 load_dotenv()
 from openai.types.beta.realtime.session import InputAudioTranscription, TurnDetection
@@ -121,6 +123,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         self.api_key = api_key or os.getenv("OPENAI_API_KEY")
         self.base_url = base_url or OPENAI_BASE_URL
         if not self.api_key:
+            self.emit("error", "OpenAI API key must be provided or set in OPENAI_API_KEY environment variable")
             raise ValueError("OpenAI API key must be provided or set in OPENAI_API_KEY environment variable")
         self._http_session: Optional[aiohttp.ClientSession] = None
         self._session: Optional[OpenAISession] = None
@@ -133,6 +136,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         self.config: OpenAIRealtimeConfig = config or OpenAIRealtimeConfig()
         self.input_sample_rate = 48000
         self.target_sample_rate = 16000
+        self._agent_speaking = False

     def set_agent(self, agent: Agent) -> None:
         self._instructions = agent.instructions
@@ -202,6 +206,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
     async def create_response(self) -> None:
         """Create a response to the OpenAI realtime API"""
         if not self._session:
+            self.emit("error", "No active WebSocket session")
             raise RuntimeError("No active WebSocket session")

         response_event = {
@@ -245,15 +250,15 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
                 msg = await session.ws.receive()

                 if msg.type == aiohttp.WSMsgType.CLOSED:
-                    …
+                    self.emit("error", f"WebSocket closed with reason: {msg.extra}")
                     break
                 elif msg.type == aiohttp.WSMsgType.ERROR:
-                    …
+                    self.emit("error", f"WebSocket error: {msg.data}")
                     break
                 elif msg.type == aiohttp.WSMsgType.TEXT:
                     await self._handle_message(json.loads(msg.data))
         except Exception as e:
-            …
+            self.emit("error", f"WebSocket receive error: {str(e)}")
         finally:
             await self._cleanup_session(session)

@@ -277,11 +282,14 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         elif event_type == "response.content_part.added":
             await self._handle_content_part_added(data)

+        elif event_type == "response.text.delta":
+            await self._handle_text_delta(data)
+
         elif event_type == "response.audio.delta":
             await self._handle_audio_delta(data)

         elif event_type == "response.audio_transcript.delta":
-            await self.…
+            await self._handle_audio_transcript_delta(data)

         elif event_type == "response.done":
             await self._handle_response_done(data)
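The dispatcher now routes response.text.delta and completes the previously truncated response.audio_transcript.delta branch. For orientation, a hedged sketch of the payloads these handlers receive; values are made up, the "type"/"delta" shape follows the OpenAI Realtime API delta events:

    # Illustrative server events as parsed from the WebSocket by _handle_message.
    events = [
        {"type": "response.text.delta", "delta": "Sure, "},
        {"type": "response.audio_transcript.delta", "delta": "Sure, "},
    ]

    async def demo(realtime) -> None:
        # Each event is routed on its "type" field, mirroring the branches above.
        for event in events:
            await realtime._handle_message(event)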
@@ -314,10 +322,11 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         await self.interrupt()
         if self.audio_track:
             self.audio_track.interrupt()
+        await realtime_metrics_collector.set_user_speech_start()

     async def _handle_speech_stopped(self, data: dict) -> None:
         """Handle speech detection end"""
-        …
+        await realtime_metrics_collector.set_user_speech_end()

     async def _handle_response_created(self, data: dict) -> None:
         """Handle initial response creation"""
@@ -339,6 +348,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
                 tool_info = get_tool_info(tool)
                 if tool_info.name == name:
                     try:
+                        await realtime_metrics_collector.add_tool_call(name)
                         result = await tool(**arguments)
                         await self.send_event({
                             "type": "conversation.item.create",
@@ -361,26 +371,33 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
                         })

                     except Exception as e:
-                        …
+                        self.emit("error", f"Error executing function {name}: {e}")
                         break
         except Exception as e:
-            …
+            self.emit("error", f"Error handling output item done: {e}")

     async def _handle_content_part_added(self, data: dict) -> None:
         """Handle new content part"""

+    async def _handle_text_delta(self, data: dict) -> None:
+        """Handle text delta chunk"""
+        pass
+
     async def _handle_audio_delta(self, data: dict) -> None:
         """Handle audio chunk"""
         if "audio" not in self.config.modalities:
             return

         try:
+            if not self._agent_speaking:
+                await realtime_metrics_collector.set_agent_speech_start()
+                self._agent_speaking = True
             base64_audio_data = base64.b64decode(data.get("delta"))
             if base64_audio_data:
                 if self.audio_track and self.loop:
                     self.loop.create_task(self.audio_track.add_new_bytes(base64_audio_data))
         except Exception as e:
-            …
+            self.emit("error", f"Error handling audio delta: {e}")
             traceback.print_exc()

     async def interrupt(self) -> None:
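The new _agent_speaking flag is an edge trigger: only the first audio delta of a response records agent speech start, and the flag stays latched until the response finishes or is interrupted. A standalone sketch of the same latch pattern (names here are illustrative, not from the plugin):

    import asyncio

    class SpeechLatch:
        """Run a start hook once on the first audio chunk; re-arm via reset()."""

        def __init__(self, on_start) -> None:
            self._speaking = False
            self._on_start = on_start

        async def feed(self, chunk: bytes) -> None:
            if not self._speaking:          # fires only on the first chunk
                self._speaking = True
                await self._on_start()
            # ...forward chunk to the audio track here...

        def reset(self) -> None:            # called at speech end / interrupt
            self._speaking = False

    async def main() -> None:
        async def mark_start() -> None:
            print("agent speech started")

        latch = SpeechLatch(mark_start)
        await latch.feed(b"\x00\x01")       # prints once
        await latch.feed(b"\x00\x02")       # no-op

    asyncio.run(main())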
@@ -391,18 +408,36 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
             "event_id": str(uuid.uuid4())
         }
         await self.send_event(cancel_event)
+        await realtime_metrics_collector.set_interrupted()
         if self.audio_track:
             self.audio_track.interrupt()
+        if self._agent_speaking:
+            await realtime_metrics_collector.set_agent_speech_end(timeout=1.0)
+            self._agent_speaking = False

-    async def …
+    async def _handle_audio_transcript_delta(self, data: dict) -> None:
         """Handle transcript chunk"""
-        …
+        delta_content = data.get("delta", "")
+        if not hasattr(self, '_current_audio_transcript'):
+            self._current_audio_transcript = ""
+        self._current_audio_transcript += delta_content

     async def _handle_input_audio_transcription_completed(self, data: dict) -> None:
-        """Handle input audio transcription completion"""
+        """Handle input audio transcription completion for user transcript"""
+        transcript = data.get("transcript", "")
+        if transcript:
+            await realtime_metrics_collector.set_user_transcript(transcript)

     async def _handle_response_done(self, data: dict) -> None:
-        """Handle response completion"""
-        …
+        """Handle response completion for agent transcript"""
+        if hasattr(self, '_current_audio_transcript') and self._current_audio_transcript:
+            await realtime_metrics_collector.set_agent_response(self._current_audio_transcript)
+            global_event_emitter.emit("text_response", {"text": self._current_audio_transcript, "type": "done"})
+            self._current_audio_transcript = ""
+        await realtime_metrics_collector.set_agent_speech_end(timeout=1.0)
+        self._agent_speaking = False
+        pass

     async def _handle_function_call_arguments_delta(self, data: dict) -> None:
         """Handle function call arguments delta"""

@@ -527,7 +562,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
                 tool_schema = build_openai_schema(tool)
                 oai_tools.append(tool_schema)
             except Exception as e:
-                …
+                self.emit("error", f"Failed to format tool {tool}: {e}")
                 continue

         return oai_tools
@@ -535,6 +570,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
     async def send_text_message(self, message: str) -> None:
         """Send a text message to the OpenAI realtime API"""
         if not self._session:
+            self.emit("error", "No active WebSocket session")
             raise RuntimeError("No active WebSocket session")

         await self.send_event({
@@ -552,11 +588,3 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         })
         await self.create_response()

-    async def _handle_text_done(self, data: dict) -> None:
-        """Handle text response completion"""
-        try:
-            text_content = data.get("text", "")
-            if text_content:
-                global_event_emitter.emit("text_response", {"text": text_content, "type": "done"})
-        except Exception as e:
-            print(f"[ERROR] Error handling text done: {e}")
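With _handle_text_done removed, agent text now flows through a single path: deltas accumulate in _current_audio_transcript and response.done flushes the full transcript as a "text_response" event. A minimal consumer sketch, assuming global_event_emitter supports the usual .on() registration:

    from videosdk.agents import global_event_emitter

    def on_text_response(payload: dict) -> None:
        # Payload shape per the flush above: {"text": "<full transcript>", "type": "done"}
        if payload.get("type") == "done":
            print("agent said:", payload["text"])

    global_event_emitter.on("text_response", on_text_response)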
videosdk/plugins/openai/tts.py
CHANGED
@@ -36,6 +36,7 @@ class OpenAITTS(TTS):
         self.audio_track = None
         self.loop = None
         self.response_format = response_format
+        self._first_chunk_sent = False

         self.api_key = api_key or os.getenv("OPENAI_API_KEY")
         if not self.api_key:
@@ -55,6 +56,10 @@ class OpenAITTS(TTS):
             ),
         ),
     )
+
+    def reset_first_audio_tracking(self) -> None:
+        """Reset the first audio tracking state for next TTS task"""
+        self._first_chunk_sent = False

     async def synthesize(
         self,
@@ -95,7 +100,6 @@ class OpenAITTS(TTS):
                 if chunk:
                     audio_data += chunk
-
             if audio_data:
                 await self._stream_audio_chunks(audio_data)

@@ -116,6 +120,10 @@ class OpenAITTS(TTS):
                 chunk += b'\x00' * padding_needed

             if len(chunk) == chunk_size:
+                if not self._first_chunk_sent and self._first_audio_callback:
+                    self._first_chunk_sent = True
+                    await self._first_audio_callback()
+
                 self.loop.create_task(self.audio_track.add_new_bytes(chunk))
                 await asyncio.sleep(0.001)
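The tts.py changes let callers measure time-to-first-audio: the first full chunk of each synthesis awaits _first_audio_callback once, and reset_first_audio_tracking() re-arms the flag for the next utterance. How the callback gets assigned isn't shown in this diff; a hedged sketch assuming it is set directly on the instance:

    import time
    from videosdk.plugins.openai import OpenAITTS

    tts = OpenAITTS(api_key="sk-...")            # placeholder key for illustration
    t0 = time.monotonic()

    async def on_first_audio() -> None:
        print(f"first audio after {time.monotonic() - t0:.3f}s")

    tts._first_audio_callback = on_first_audio   # assumed assignment point
    # ...run a synthesis...
    tts.reset_first_audio_tracking()             # re-arm before the next TTS task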
videosdk/plugins/openai/version.py
CHANGED

@@ -1 +1 @@
-__version__ = "0.0.22"
+__version__ = "0.0.23"
{videosdk_plugins_openai-0.0.22.dist-info → videosdk_plugins_openai-0.0.23.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videosdk-plugins-openai
-Version: 0.0.22
+Version: 0.0.23
 Summary: VideoSDK Agent Framework plugin for OpenAI services
 Author: videosdk
 License-Expression: Apache-2.0
@@ -13,7 +13,7 @@ Classifier: Topic :: Multimedia :: Video
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.11
 Requires-Dist: openai[realtime]>=1.68.2
-Requires-Dist: videosdk-agents>=0.0.22
+Requires-Dist: videosdk-agents>=0.0.23
 Description-Content-Type: text/markdown

 # VideoSDK OpenAI Plugin
videosdk_plugins_openai-0.0.23.dist-info/RECORD
ADDED

@@ -0,0 +1,9 @@
+videosdk/plugins/openai/__init__.py,sha256=1jbc4HOYxkLeruM9RAqmZYSBdnr74gnPHmCNMKXEPrg,259
+videosdk/plugins/openai/llm.py,sha256=igKq1LRrJfgrIbhVFik8aJp1Cux5069sAX-tusfCg6k,7148
+videosdk/plugins/openai/realtime_api.py,sha256=G-1Rn0QpKeVozvrbzh7KExVoYXKgTejBNtfyTBRBP1g,24313
+videosdk/plugins/openai/stt.py,sha256=YZROX-BjTqtWiT6ouMZacLkMYbmao3emB-88ewN93jg,9492
+videosdk/plugins/openai/tts.py,sha256=m-15GslICL9dOa_H7YqIHP5ifif2OL-7DeTRQunQs9A,4814
+videosdk/plugins/openai/version.py,sha256=AZGfcffmm2o30wrhu9YNi-7Caw1Fg2eavlgdTcC0Ml0,22
+videosdk_plugins_openai-0.0.23.dist-info/METADATA,sha256=v58yrMuzRPg7-CRE4ZDLcXP5dAbrH2GiWXWGKGkQh8M,827
+videosdk_plugins_openai-0.0.23.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+videosdk_plugins_openai-0.0.23.dist-info/RECORD,,
videosdk_plugins_openai-0.0.22.dist-info/RECORD
DELETED

@@ -1,9 +0,0 @@
-videosdk/plugins/openai/__init__.py,sha256=1jbc4HOYxkLeruM9RAqmZYSBdnr74gnPHmCNMKXEPrg,259
-videosdk/plugins/openai/llm.py,sha256=h6xuJmyjg6InL9tr5pKBGt_5bNMpJ4XqnO72OtmCJ0c,7122
-videosdk/plugins/openai/realtime_api.py,sha256=WSzDWHcCQC8QsKLDmA5mm_oSN8UIHYMplesNliV5eUc,22611
-videosdk/plugins/openai/stt.py,sha256=YZROX-BjTqtWiT6ouMZacLkMYbmao3emB-88ewN93jg,9492
-videosdk/plugins/openai/tts.py,sha256=o5ktMUzjPkj64L5qqRaKPTWq7Na56TshMnLfU-sK36k,4417
-videosdk/plugins/openai/version.py,sha256=NoiGDztYD4fsDDnfSPiSzRkknkNHhFUtKZj0mhQiTYM,22
-videosdk_plugins_openai-0.0.22.dist-info/METADATA,sha256=9BJRuTdobykpCbIf5Gwr33z074lZjp-tCjdgBn5GUqg,827
-videosdk_plugins_openai-0.0.22.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-videosdk_plugins_openai-0.0.22.dist-info/RECORD,,
{videosdk_plugins_openai-0.0.22.dist-info → videosdk_plugins_openai-0.0.23.dist-info}/WHEEL
RENAMED

File without changes