videosdk-plugins-openai 0.0.21__tar.gz → 0.0.23__tar.gz

This diff compares the contents of two package versions publicly released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.

This version of videosdk-plugins-openai has been flagged as potentially problematic.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: videosdk-plugins-openai
- Version: 0.0.21
+ Version: 0.0.23
  Summary: VideoSDK Agent Framework plugin for OpenAI services
  Author: videosdk
  License-Expression: Apache-2.0
@@ -13,7 +13,7 @@ Classifier: Topic :: Multimedia :: Video
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Requires-Python: >=3.11
  Requires-Dist: openai[realtime]>=1.68.2
- Requires-Dist: videosdk-agents>=0.0.21
+ Requires-Dist: videosdk-agents>=0.0.23
  Description-Content-Type: text/markdown

  # VideoSDK OpenAI Plugin
@@ -21,7 +21,7 @@ classifiers = [
  "Topic :: Scientific/Engineering :: Artificial Intelligence",
  ]
  dependencies = [
- "videosdk-agents>=0.0.21",
+ "videosdk-agents>=0.0.23",
  "openai[realtime]>=1.68.2",
  ]

@@ -133,7 +133,7 @@ class OpenAILLM(LLM):
  tool_schema = build_openai_schema(tool)
  formatted_tools.append(tool_schema)
  except Exception as e:
- print(f"Failed to format tool {tool}: {e}")
+ self.emit("error", f"Failed to format tool {tool}: {e}")
  continue

  if formatted_tools:
@@ -167,7 +167,7 @@ class OpenAILLM(LLM):
  args = json.loads(current_function_call["arguments"])
  current_function_call["arguments"] = args
  except json.JSONDecodeError:
- print(f"Failed to parse function arguments: {current_function_call['arguments']}")
+ self.emit("error", f"Failed to parse function arguments: {current_function_call['arguments']}")
  current_function_call["arguments"] = {}

  yield LLMResponse(
@@ -24,6 +24,8 @@ from videosdk.agents import (
  global_event_emitter,
  Agent
  )
+ from videosdk.agents import realtime_metrics_collector
+

  load_dotenv()
  from openai.types.beta.realtime.session import InputAudioTranscription, TurnDetection
@@ -45,9 +47,9 @@ DEFAULT_INPUT_AUDIO_TRANSCRIPTION = InputAudioTranscription(
  DEFAULT_TOOL_CHOICE = "auto"

  OpenAIEventTypes = Literal[
- "instructions_updated",
- "tools_updated",
- "text_response"
+ "user_speech_started",
+ "text_response",
+ "error"
  ]
  DEFAULT_VOICE = "alloy"
  DEFAULT_INPUT_AUDIO_FORMAT = "pcm16"
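
Where 0.0.21 announced configuration changes ("instructions_updated", "tools_updated"), 0.0.23 narrows the surface to runtime signals: "user_speech_started", "text_response", and an "error" channel that the print()-to-emit() conversions throughout this diff feed. A minimal consumer sketch follows; the on(...) registration method and the handler signatures are assumptions inferred from the emit() call sites, not API confirmed by this diff.

    # Sketch only: subscribing to the event surface introduced in 0.0.23.
    # The on(...) registration method and payload shapes are assumptions
    # inferred from the emit() calls visible in this diff.
    from videosdk.plugins.openai import OpenAIRealtime

    model = OpenAIRealtime()  # constructor arguments elided; reads OPENAI_API_KEY

    def on_error(message: str) -> None:
        # 0.0.23 routes failures that 0.0.21 print()ed (tool formatting,
        # WebSocket close/error, handler exceptions) through this event.
        print(f"[openai-realtime] error: {message}")

    def on_user_speech_started(payload: dict) -> None:
        # Emitted from _handle_speech_started with {"type": "done"}.
        print("user speech started:", payload)

    model.on("error", on_error)
    model.on("user_speech_started", on_user_speech_started)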
@@ -121,6 +123,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
  self.api_key = api_key or os.getenv("OPENAI_API_KEY")
  self.base_url = base_url or OPENAI_BASE_URL
  if not self.api_key:
+ self.emit("error", "OpenAI API key must be provided or set in OPENAI_API_KEY environment variable")
  raise ValueError("OpenAI API key must be provided or set in OPENAI_API_KEY environment variable")
  self._http_session: Optional[aiohttp.ClientSession] = None
  self._session: Optional[OpenAISession] = None
@@ -133,6 +136,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
  self.config: OpenAIRealtimeConfig = config or OpenAIRealtimeConfig()
  self.input_sample_rate = 48000
  self.target_sample_rate = 16000
+ self._agent_speaking = False

  def set_agent(self, agent: Agent) -> None:
  self._instructions = agent.instructions
@@ -202,6 +206,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
  async def create_response(self) -> None:
  """Create a response to the OpenAI realtime API"""
  if not self._session:
+ self.emit("error", "No active WebSocket session")
  raise RuntimeError("No active WebSocket session")

  response_event = {
@@ -245,15 +250,15 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
  msg = await session.ws.receive()

  if msg.type == aiohttp.WSMsgType.CLOSED:
- print("WebSocket closed with reason:", msg.extra)
+ self.emit("error", f"WebSocket closed with reason: {msg.extra}")
  break
  elif msg.type == aiohttp.WSMsgType.ERROR:
- print("WebSocket error:", msg.data)
+ self.emit("error", f"WebSocket error: {msg.data}")
  break
  elif msg.type == aiohttp.WSMsgType.TEXT:
  await self._handle_message(json.loads(msg.data))
  except Exception as e:
- print("WebSocket receive error:", str(e))
+ self.emit("error", f"WebSocket receive error: {str(e)}")
  finally:
  await self._cleanup_session(session)

@@ -277,11 +282,14 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
  elif event_type == "response.content_part.added":
  await self._handle_content_part_added(data)

+ elif event_type == "response.text.delta":
+ await self._handle_text_delta(data)
+
  elif event_type == "response.audio.delta":
  await self._handle_audio_delta(data)

  elif event_type == "response.audio_transcript.delta":
- await self._handle_transcript_delta(data)
+ await self._handle_audio_transcript_delta(data)

  elif event_type == "response.done":
  await self._handle_response_done(data)
@@ -305,18 +313,20 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
  await self._handle_text_done(data)

  except Exception as e:
- self.emit_error(f"Error handling event {event_type}: {str(e)}")
+ self.emit("error", f"Error handling event {event_type}: {str(e)}")

  async def _handle_speech_started(self, data: dict) -> None:
  """Handle speech detection start"""
  if "audio" in self.config.modalities:
+ self.emit("user_speech_started", {"type": "done"})
  await self.interrupt()
  if self.audio_track:
  self.audio_track.interrupt()
+ await realtime_metrics_collector.set_user_speech_start()

  async def _handle_speech_stopped(self, data: dict) -> None:
  """Handle speech detection end"""
- pass
+ await realtime_metrics_collector.set_user_speech_end()

  async def _handle_response_created(self, data: dict) -> None:
  """Handle initial response creation"""
@@ -338,6 +348,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
  tool_info = get_tool_info(tool)
  if tool_info.name == name:
  try:
+ await realtime_metrics_collector.add_tool_call(name)
  result = await tool(**arguments)
  await self.send_event({
  "type": "conversation.item.create",
@@ -360,26 +371,33 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
  })

  except Exception as e:
- print(f"Error executing function {name}: {e}")
+ self.emit("error", f"Error executing function {name}: {e}")
  break
  except Exception as e:
- print(f"Error handling output item done: {e}")
+ self.emit("error", f"Error handling output item done: {e}")

  async def _handle_content_part_added(self, data: dict) -> None:
  """Handle new content part"""

+ async def _handle_text_delta(self, data: dict) -> None:
+ """Handle text delta chunk"""
+ pass
+
  async def _handle_audio_delta(self, data: dict) -> None:
  """Handle audio chunk"""
  if "audio" not in self.config.modalities:
  return

  try:
+ if not self._agent_speaking:
+ await realtime_metrics_collector.set_agent_speech_start()
+ self._agent_speaking = True
  base64_audio_data = base64.b64decode(data.get("delta"))
  if base64_audio_data:
  if self.audio_track and self.loop:
  self.loop.create_task(self.audio_track.add_new_bytes(base64_audio_data))
  except Exception as e:
- print(f"[ERROR] Error handling audio delta: {e}")
+ self.emit("error", f"Error handling audio delta: {e}")
  traceback.print_exc()

  async def interrupt(self) -> None:
@@ -390,18 +408,36 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
  "event_id": str(uuid.uuid4())
  }
  await self.send_event(cancel_event)
+ await realtime_metrics_collector.set_interrupted()
  if self.audio_track:
  self.audio_track.interrupt()
+ if self._agent_speaking:
+ await realtime_metrics_collector.set_agent_speech_end(timeout=1.0)
+ self._agent_speaking = False

- async def _handle_transcript_delta(self, data: dict) -> None:
+ async def _handle_audio_transcript_delta(self, data: dict) -> None:
  """Handle transcript chunk"""
-
+ delta_content = data.get("delta", "")
+ if not hasattr(self, '_current_audio_transcript'):
+ self._current_audio_transcript = ""
+ self._current_audio_transcript += delta_content
+
  async def _handle_input_audio_transcription_completed(self, data: dict) -> None:
- """Handle input audio transcription completion"""
+ """Handle input audio transcription completion for user transcript"""
+ transcript = data.get("transcript", "")
+ if transcript:
+ await realtime_metrics_collector.set_user_transcript(transcript)

  async def _handle_response_done(self, data: dict) -> None:
- """Handle response completion"""
-
+ """Handle response completion for agent transcript"""
+ if hasattr(self, '_current_audio_transcript') and self._current_audio_transcript:
+ await realtime_metrics_collector.set_agent_response(self._current_audio_transcript)
+ global_event_emitter.emit("text_response", {"text": self._current_audio_transcript, "type": "done"})
+ self._current_audio_transcript = ""
+ await realtime_metrics_collector.set_agent_speech_end(timeout=1.0)
+ self._agent_speaking = False
+ pass
+
  async def _handle_function_call_arguments_delta(self, data: dict) -> None:
  """Handle function call arguments delta"""

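Transcript handling is an accumulate-then-flush pattern: each response.audio_transcript.delta appends to _current_audio_transcript, and response.done publishes the completed string via set_agent_response() and a global "text_response" event before clearing the buffer. A hedged consumer sketch; the payload shape is copied from the emit() call above, but the on(...) registration method is an assumption.

    from videosdk.agents import global_event_emitter

    def on_text_response(payload: dict) -> None:
        # Shape from _handle_response_done:
        # {"text": <full agent transcript>, "type": "done"}
        if payload.get("type") == "done":
            print("agent transcript:", payload["text"])

    # Registration name is an assumption; only emit() appears in this diff.
    global_event_emitter.on("text_response", on_text_response)
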
@@ -526,7 +562,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
  tool_schema = build_openai_schema(tool)
  oai_tools.append(tool_schema)
  except Exception as e:
- print(f"Failed to format tool {tool}: {e}")
+ self.emit("error", f"Failed to format tool {tool}: {e}")
  continue

  return oai_tools
@@ -534,6 +570,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
  async def send_text_message(self, message: str) -> None:
  """Send a text message to the OpenAI realtime API"""
  if not self._session:
+ self.emit("error", "No active WebSocket session")
  raise RuntimeError("No active WebSocket session")

  await self.send_event({
@@ -551,11 +588,3 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
  })
  await self.create_response()

- async def _handle_text_done(self, data: dict) -> None:
- """Handle text response completion"""
- try:
- text_content = data.get("text", "")
- if text_content:
- global_event_emitter.emit("text_response", {"text": text_content, "type": "done"})
- except Exception as e:
- print(f"[ERROR] Error handling text done: {e}")
@@ -36,6 +36,7 @@ class OpenAITTS(TTS):
  self.audio_track = None
  self.loop = None
  self.response_format = response_format
+ self._first_chunk_sent = False

  self.api_key = api_key or os.getenv("OPENAI_API_KEY")
  if not self.api_key:
@@ -55,6 +56,10 @@
  ),
  ),
  )
+
+ def reset_first_audio_tracking(self) -> None:
+ """Reset the first audio tracking state for next TTS task"""
+ self._first_chunk_sent = False

  async def synthesize(
  self,
@@ -95,7 +100,6 @@
  if chunk:
  audio_data += chunk

-
  if audio_data:
  await self._stream_audio_chunks(audio_data)

@@ -116,6 +120,10 @@
  chunk += b'\x00' * padding_needed

  if len(chunk) == chunk_size:
+ if not self._first_chunk_sent and self._first_audio_callback:
+ self._first_chunk_sent = True
+ await self._first_audio_callback()
+
  self.loop.create_task(self.audio_track.add_new_bytes(chunk))
  await asyncio.sleep(0.001)

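The new branch fires a one-shot callback as the first full-size audio chunk is queued, the natural hook for time-to-first-audio measurement; reset_first_audio_tracking() re-arms it between synthesis tasks. The _first_audio_callback attribute appears in this diff but its assignment path does not, so the direct wiring in this sketch is an assumption.

    import time

    from videosdk.plugins.openai import OpenAITTS  # module path assumed

    tts = OpenAITTS()  # constructor arguments elided; reads OPENAI_API_KEY

    request_started_at = time.monotonic()

    async def on_first_audio() -> None:
        # Runs once per synthesis task, when the first padded chunk is queued.
        ttfa_ms = (time.monotonic() - request_started_at) * 1000
        print(f"time to first audio: {ttfa_ms:.0f} ms")

    tts._first_audio_callback = on_first_audio  # assumed wiring
    tts.reset_first_audio_tracking()  # re-arm before the next synthesize() call
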
@@ -0,0 +1 @@
+ __version__ = "0.0.23"
@@ -1 +0,0 @@
- __version__ = "0.0.21"