videosdk-plugins-openai 0.0.22__py3-none-any.whl → 0.0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

--- a/videosdk/plugins/openai/llm.py
+++ b/videosdk/plugins/openai/llm.py
@@ -133,7 +133,7 @@ class OpenAILLM(LLM):
                 tool_schema = build_openai_schema(tool)
                 formatted_tools.append(tool_schema)
             except Exception as e:
-                print(f"Failed to format tool {tool}: {e}")
+                self.emit("error", f"Failed to format tool {tool}: {e}")
                 continue

         if formatted_tools:
@@ -167,7 +167,7 @@ class OpenAILLM(LLM):
                         args = json.loads(current_function_call["arguments"])
                         current_function_call["arguments"] = args
                     except json.JSONDecodeError:
-                        print(f"Failed to parse function arguments: {current_function_call['arguments']}")
+                        self.emit("error", f"Failed to parse function arguments: {current_function_call['arguments']}")
                         current_function_call["arguments"] = {}

                     yield LLMResponse(
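
Both hunks above make the same change: tool-formatting and argument-parsing failures in OpenAILLM are now reported through the plugin's "error" event instead of print. A minimal consumer-side sketch, assuming the emitter exposes an EventEmitter-style on(event, handler) registration; this diff only confirms the emitting side (self.emit("error", ...)):

    # Hedged sketch: route OpenAILLM "error" events into application logging.
    # .on(...) is an assumed API; constructor arguments are illustrative.
    import logging

    from videosdk.plugins.openai import OpenAILLM

    logger = logging.getLogger("agent.llm")
    llm = OpenAILLM(api_key="sk-...")  # placeholder key

    def on_llm_error(message: str) -> None:
        # Errors now arrive as events rather than being lost on stdout.
        logger.error("OpenAILLM error: %s", message)

    llm.on("error", on_llm_error)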

--- a/videosdk/plugins/openai/realtime_api.py
+++ b/videosdk/plugins/openai/realtime_api.py
@@ -24,6 +24,8 @@ from videosdk.agents import (
     global_event_emitter,
     Agent
 )
+from videosdk.agents import realtime_metrics_collector
+

 load_dotenv()
 from openai.types.beta.realtime.session import InputAudioTranscription, TurnDetection
@@ -121,6 +123,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         self.api_key = api_key or os.getenv("OPENAI_API_KEY")
         self.base_url = base_url or OPENAI_BASE_URL
         if not self.api_key:
+            self.emit("error", "OpenAI API key must be provided or set in OPENAI_API_KEY environment variable")
             raise ValueError("OpenAI API key must be provided or set in OPENAI_API_KEY environment variable")
         self._http_session: Optional[aiohttp.ClientSession] = None
         self._session: Optional[OpenAISession] = None
@@ -133,6 +136,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         self.config: OpenAIRealtimeConfig = config or OpenAIRealtimeConfig()
         self.input_sample_rate = 48000
         self.target_sample_rate = 16000
+        self._agent_speaking = False

     def set_agent(self, agent: Agent) -> None:
         self._instructions = agent.instructions
@@ -202,6 +206,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
     async def create_response(self) -> None:
         """Create a response to the OpenAI realtime API"""
         if not self._session:
+            self.emit("error", "No active WebSocket session")
             raise RuntimeError("No active WebSocket session")

         response_event = {
@@ -245,15 +250,15 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
                 msg = await session.ws.receive()

                 if msg.type == aiohttp.WSMsgType.CLOSED:
-                    print("WebSocket closed with reason:", msg.extra)
+                    self.emit("error", f"WebSocket closed with reason: {msg.extra}")
                     break
                 elif msg.type == aiohttp.WSMsgType.ERROR:
-                    print("WebSocket error:", msg.data)
+                    self.emit("error", f"WebSocket error: {msg.data}")
                     break
                 elif msg.type == aiohttp.WSMsgType.TEXT:
                     await self._handle_message(json.loads(msg.data))
         except Exception as e:
-            print("WebSocket receive error:", str(e))
+            self.emit("error", f"WebSocket receive error: {str(e)}")
         finally:
             await self._cleanup_session(session)

@@ -277,11 +282,14 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         elif event_type == "response.content_part.added":
             await self._handle_content_part_added(data)

+        elif event_type == "response.text.delta":
+            await self._handle_text_delta(data)
+
         elif event_type == "response.audio.delta":
             await self._handle_audio_delta(data)

         elif event_type == "response.audio_transcript.delta":
-            await self._handle_transcript_delta(data)
+            await self._handle_audio_transcript_delta(data)

         elif event_type == "response.done":
             await self._handle_response_done(data)
@@ -314,10 +322,11 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         await self.interrupt()
         if self.audio_track:
             self.audio_track.interrupt()
+        await realtime_metrics_collector.set_user_speech_start()

     async def _handle_speech_stopped(self, data: dict) -> None:
         """Handle speech detection end"""
-        pass
+        await realtime_metrics_collector.set_user_speech_end()

     async def _handle_response_created(self, data: dict) -> None:
         """Handle initial response creation"""
@@ -339,6 +348,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
                 tool_info = get_tool_info(tool)
                 if tool_info.name == name:
                     try:
+                        await realtime_metrics_collector.add_tool_call(name)
                         result = await tool(**arguments)
                         await self.send_event({
                             "type": "conversation.item.create",
@@ -361,26 +371,33 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
                         })

                     except Exception as e:
-                        print(f"Error executing function {name}: {e}")
+                        self.emit("error", f"Error executing function {name}: {e}")
                     break
         except Exception as e:
-            print(f"Error handling output item done: {e}")
+            self.emit("error", f"Error handling output item done: {e}")

     async def _handle_content_part_added(self, data: dict) -> None:
         """Handle new content part"""

+    async def _handle_text_delta(self, data: dict) -> None:
+        """Handle text delta chunk"""
+        pass
+
     async def _handle_audio_delta(self, data: dict) -> None:
         """Handle audio chunk"""
         if "audio" not in self.config.modalities:
             return

         try:
+            if not self._agent_speaking:
+                await realtime_metrics_collector.set_agent_speech_start()
+                self._agent_speaking = True
             base64_audio_data = base64.b64decode(data.get("delta"))
             if base64_audio_data:
                 if self.audio_track and self.loop:
                     self.loop.create_task(self.audio_track.add_new_bytes(base64_audio_data))
         except Exception as e:
-            print(f"[ERROR] Error handling audio delta: {e}")
+            self.emit("error", f"Error handling audio delta: {e}")
             traceback.print_exc()

     async def interrupt(self) -> None:
@@ -391,18 +408,52 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
             "event_id": str(uuid.uuid4())
         }
         await self.send_event(cancel_event)
+        await realtime_metrics_collector.set_interrupted()
         if self.audio_track:
             self.audio_track.interrupt()
+        if self._agent_speaking:
+            await realtime_metrics_collector.set_agent_speech_end(timeout=1.0)
+            self._agent_speaking = False

-    async def _handle_transcript_delta(self, data: dict) -> None:
+    async def _handle_audio_transcript_delta(self, data: dict) -> None:
         """Handle transcript chunk"""
-
+        delta_content = data.get("delta", "")
+        if not hasattr(self, '_current_audio_transcript'):
+            self._current_audio_transcript = ""
+        self._current_audio_transcript += delta_content
+
     async def _handle_input_audio_transcription_completed(self, data: dict) -> None:
-        """Handle input audio transcription completion"""
+        """Handle input audio transcription completion for user transcript"""
+        transcript = data.get("transcript", "")
+        if transcript:
+            await realtime_metrics_collector.set_user_transcript(transcript)
+            try:
+                self.emit("realtime_model_transcription", {
+                    "role": "user",
+                    "text": transcript,
+                    "is_final": True
+                })
+            except Exception:
+                pass

     async def _handle_response_done(self, data: dict) -> None:
-        """Handle response completion"""
-
+        """Handle response completion for agent transcript"""
+        if hasattr(self, '_current_audio_transcript') and self._current_audio_transcript:
+            await realtime_metrics_collector.set_agent_response(self._current_audio_transcript)
+            global_event_emitter.emit("text_response", {"text": self._current_audio_transcript, "type": "done"})
+            try:
+                self.emit("realtime_model_transcription", {
+                    "role": "agent",
+                    "text": self._current_audio_transcript,
+                    "is_final": True
+                })
+            except Exception:
+                pass
+            self._current_audio_transcript = ""
+        await realtime_metrics_collector.set_agent_speech_end(timeout=1.0)
+        self._agent_speaking = False
+        pass
+
     async def _handle_function_call_arguments_delta(self, data: dict) -> None:
         """Handle function call arguments delta"""

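
_handle_audio_transcript_delta now accumulates "delta" fragments into _current_audio_transcript, and _handle_response_done flushes the buffer as the agent's final utterance, feeding both set_agent_response and the realtime_model_transcription event. The accumulate-then-flush pattern in isolation, as a sketch; only the "delta" payload key is taken from the diff:

    # Sketch of the accumulate-then-flush transcript pattern above.
    class TranscriptBuffer:
        def __init__(self) -> None:
            self._parts: list[str] = []

        def on_delta(self, data: dict) -> None:
            # response.audio_transcript.delta carries incremental text.
            self._parts.append(data.get("delta", ""))

        def on_done(self) -> str:
            # response.done: hand back the full utterance and reset
            # for the next turn, as _handle_response_done does.
            text = "".join(self._parts)
            self._parts.clear()
            return text

Note that the released code initializes the buffer lazily via hasattr rather than in __init__; that works here because both the delta handler and the done handler guard the attribute before touching it.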
@@ -527,7 +578,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
                 tool_schema = build_openai_schema(tool)
                 oai_tools.append(tool_schema)
             except Exception as e:
-                print(f"Failed to format tool {tool}: {e}")
+                self.emit("error", f"Failed to format tool {tool}: {e}")
                 continue

         return oai_tools
@@ -535,6 +586,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
     async def send_text_message(self, message: str) -> None:
         """Send a text message to the OpenAI realtime API"""
         if not self._session:
+            self.emit("error", "No active WebSocket session")
             raise RuntimeError("No active WebSocket session")

         await self.send_event({
@@ -552,11 +604,3 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         })
         await self.create_response()

-    async def _handle_text_done(self, data: dict) -> None:
-        """Handle text response completion"""
-        try:
-            text_content = data.get("text", "")
-            if text_content:
-                global_event_emitter.emit("text_response", {"text": text_content, "type": "done"})
-        except Exception as e:
-            print(f"[ERROR] Error handling text done: {e}")

--- a/videosdk/plugins/openai/tts.py
+++ b/videosdk/plugins/openai/tts.py
@@ -36,6 +36,7 @@ class OpenAITTS(TTS):
         self.audio_track = None
         self.loop = None
         self.response_format = response_format
+        self._first_chunk_sent = False

         self.api_key = api_key or os.getenv("OPENAI_API_KEY")
         if not self.api_key:
@@ -55,6 +56,10 @@ class OpenAITTS(TTS):
             ),
         ),
     )
+
+    def reset_first_audio_tracking(self) -> None:
+        """Reset the first audio tracking state for next TTS task"""
+        self._first_chunk_sent = False

     async def synthesize(
         self,
@@ -95,7 +100,6 @@ class OpenAITTS(TTS):
                     if chunk:
                         audio_data += chunk

-
             if audio_data:
                 await self._stream_audio_chunks(audio_data)

@@ -116,6 +120,10 @@ class OpenAITTS(TTS):
                 chunk += b'\x00' * padding_needed

             if len(chunk) == chunk_size:
+                if not self._first_chunk_sent and self._first_audio_callback:
+                    self._first_chunk_sent = True
+                    await self._first_audio_callback()
+
                 self.loop.create_task(self.audio_track.add_new_bytes(chunk))
                 await asyncio.sleep(0.001)

--- a/videosdk/plugins/openai/version.py
+++ b/videosdk/plugins/openai/version.py
@@ -1 +1 @@
-__version__ = "0.0.22"
+__version__ = "0.0.24"

--- a/videosdk_plugins_openai-0.0.22.dist-info/METADATA
+++ b/videosdk_plugins_openai-0.0.24.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videosdk-plugins-openai
-Version: 0.0.22
+Version: 0.0.24
 Summary: VideoSDK Agent Framework plugin for OpenAI services
 Author: videosdk
 License-Expression: Apache-2.0
@@ -13,7 +13,7 @@ Classifier: Topic :: Multimedia :: Video
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.11
 Requires-Dist: openai[realtime]>=1.68.2
-Requires-Dist: videosdk-agents>=0.0.22
+Requires-Dist: videosdk-agents>=0.0.24
 Description-Content-Type: text/markdown

 # VideoSDK OpenAI Plugin

--- /dev/null
+++ b/videosdk_plugins_openai-0.0.24.dist-info/RECORD
@@ -0,0 +1,9 @@
+videosdk/plugins/openai/__init__.py,sha256=1jbc4HOYxkLeruM9RAqmZYSBdnr74gnPHmCNMKXEPrg,259
+videosdk/plugins/openai/llm.py,sha256=igKq1LRrJfgrIbhVFik8aJp1Cux5069sAX-tusfCg6k,7148
+videosdk/plugins/openai/realtime_api.py,sha256=s73iBlZE5bo1vDdnYOYw9VVE_0aliFJwUv4yEjxDBhE,24854
+videosdk/plugins/openai/stt.py,sha256=YZROX-BjTqtWiT6ouMZacLkMYbmao3emB-88ewN93jg,9492
+videosdk/plugins/openai/tts.py,sha256=m-15GslICL9dOa_H7YqIHP5ifif2OL-7DeTRQunQs9A,4814
+videosdk/plugins/openai/version.py,sha256=sE45w-zCTIpVQyXYRbP5E390wIU6bNzRFuAh6ySoc1w,22
+videosdk_plugins_openai-0.0.24.dist-info/METADATA,sha256=P7ontFkdhIuKtS_JjxdTTGtQWdtFoxphiRC66k4CtKA,827
+videosdk_plugins_openai-0.0.24.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+videosdk_plugins_openai-0.0.24.dist-info/RECORD,,

--- a/videosdk_plugins_openai-0.0.22.dist-info/RECORD
+++ /dev/null
@@ -1,9 +0,0 @@
-videosdk/plugins/openai/__init__.py,sha256=1jbc4HOYxkLeruM9RAqmZYSBdnr74gnPHmCNMKXEPrg,259
-videosdk/plugins/openai/llm.py,sha256=h6xuJmyjg6InL9tr5pKBGt_5bNMpJ4XqnO72OtmCJ0c,7122
-videosdk/plugins/openai/realtime_api.py,sha256=WSzDWHcCQC8QsKLDmA5mm_oSN8UIHYMplesNliV5eUc,22611
-videosdk/plugins/openai/stt.py,sha256=YZROX-BjTqtWiT6ouMZacLkMYbmao3emB-88ewN93jg,9492
-videosdk/plugins/openai/tts.py,sha256=o5ktMUzjPkj64L5qqRaKPTWq7Na56TshMnLfU-sK36k,4417
-videosdk/plugins/openai/version.py,sha256=NoiGDztYD4fsDDnfSPiSzRkknkNHhFUtKZj0mhQiTYM,22
-videosdk_plugins_openai-0.0.22.dist-info/METADATA,sha256=9BJRuTdobykpCbIf5Gwr33z074lZjp-tCjdgBn5GUqg,827
-videosdk_plugins_openai-0.0.22.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-videosdk_plugins_openai-0.0.22.dist-info/RECORD,,