videosdk-plugins-openai 0.0.4__tar.gz → 0.0.5__tar.gz

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of videosdk-plugins-openai might be problematic.

@@ -7,3 +7,4 @@ __pycache__
 .env.local
 test_env/
 dist/
+.DS_Store
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videosdk-plugins-openai
-Version: 0.0.4
+Version: 0.0.5
 Summary: VideoSDK Agent Framework plugin for OpenAI services
 Author: videosdk
 Keywords: ai,audio,openai,video,videosdk
@@ -12,7 +12,7 @@ Classifier: Topic :: Multimedia :: Video
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.11
 Requires-Dist: openai[realtime]>=1.68.2
-Requires-Dist: videosdk-agents>=0.0.6
+Requires-Dist: videosdk-agents>=0.0.8
 Description-Content-Type: text/markdown

 VideoSDK OpenAI Plugin
@@ -20,7 +20,7 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
 ]
 dependencies = [
-    "videosdk-agents>=0.0.6",
+    "videosdk-agents>=0.0.8",
     "openai[realtime]>=1.68.2",
 ]

@@ -18,7 +18,8 @@ from videosdk.agents import (
     build_openai_schema,
     CustomAudioStreamTrack,
     ToolChoice,
-    RealtimeBaseModel
+    RealtimeBaseModel,
+    Agent
 )

 load_dotenv()
@@ -44,7 +45,8 @@ DEFAULT_TOOL_CHOICE = "auto"

 OpenAIEventTypes = Literal[
     "instructions_updated",
-    "tools_updated"
+    "tools_updated",
+    "text_response"
 ]
 DEFAULT_VOICE = "alloy"
 DEFAULT_INPUT_AUDIO_FORMAT = "pcm16"
@@ -128,8 +130,14 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         self.audio_track: Optional[CustomAudioStreamTrack] = None
         self._formatted_tools: Optional[List[Dict[str, Any]]] = None
         self.config: OpenAIRealtimeConfig = config or OpenAIRealtimeConfig()
-        self.on("instructions_updated", self._handle_instructions_updated)
-        self.on("tools_updated", self._handle_tools_updated)
+        # self.on("instructions_updated", self._handle_instructions_updated)
+        # self.on("tools_updated", self._handle_tools_updated)
+
+    def set_agent(self, agent: Agent) -> None:
+        self._instructions = agent.instructions
+        self._tools = agent.tools
+        self.tools_formatted = self._format_tools_for_session(self._tools)
+        self._formatted_tools = self.tools_formatted

     async def connect(self) -> None:
         headers = {"Agent": "VideoSDK Agents"}
@@ -144,7 +152,7 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):

     async def handle_audio_input(self, audio_data: bytes) -> None:
         """Handle incoming audio data from the user"""
-        if self._session and not self._closing:
+        if self._session and not self._closing and "audio" in self.config.modalities:
             base64_audio_data = base64.b64encode(audio_data).decode("utf-8")
             audio_event = {
                 "type": "input_audio_buffer.append",
@@ -299,17 +307,23 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):

             elif event_type == "conversation.item.input_audio_transcription.completed":
                 await self._handle_input_audio_transcription_completed(data)
+
+            elif event_type == "response.text.done":
+                await self._handle_text_done(data)

         except Exception as e:
             self.emit_error(f"Error handling event {event_type}: {str(e)}")

     async def _handle_speech_started(self, data: dict) -> None:
         """Handle speech detection start"""
-        await self.interrupt()
-        self.audio_track.interrupt()
+        if "audio" in self.config.modalities:
+            await self.interrupt()
+            if self.audio_track:
+                self.audio_track.interrupt()

     async def _handle_speech_stopped(self, data: dict) -> None:
         """Handle speech detection end"""
+        pass

     async def _handle_response_created(self, data: dict) -> None:
         """Handle initial response creation"""
@@ -365,6 +379,9 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):

     async def _handle_audio_delta(self, data: dict) -> None:
         """Handle audio chunk"""
+        if "audio" not in self.config.modalities:
+            return
+
         try:
             base64_audio_data = base64.b64decode(data.get("delta"))
             if base64_audio_data:
@@ -448,31 +465,45 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         if not self._session:
             return

+        # Conditionally set turn detection and audio transcription based on modalities
+        turn_detection = None
+        input_audio_transcription = None
+
+        if "audio" in self.config.modalities:
+            turn_detection = self.config.turn_detection.model_dump(
+                by_alias=True,
+                exclude_unset=True,
+                exclude_defaults=True,
+            ) if self.config.turn_detection else None
+            input_audio_transcription = self.config.input_audio_transcription.model_dump(
+                by_alias=True,
+                exclude_unset=True,
+                exclude_defaults=True,
+            ) if self.config.input_audio_transcription else None
+
         session_update = {
             "type": "session.update",
             "session": {
                 "model": self.model,
-                "voice": self.config.voice,
-                "instructions": self._instructions or "You are a helpful voice assistant that can answer questions and help with tasks.",
+                "instructions": self._instructions or "You are a helpful assistant that can answer questions and help with tasks.",
                 "temperature": self.config.temperature,
-                "turn_detection": self.config.turn_detection.model_dump(
-                    by_alias=True,
-                    exclude_unset=True,
-                    exclude_defaults=True,
-                ),
-                "input_audio_transcription": self.config.input_audio_transcription.model_dump(
-                    by_alias=True,
-                    exclude_unset=True,
-                    exclude_defaults=True,
-                ),
                 "tool_choice": self.config.tool_choice,
                 "tools": self._formatted_tools or [],
                 "modalities": self.config.modalities,
-                "input_audio_format": DEFAULT_INPUT_AUDIO_FORMAT,
-                "output_audio_format": DEFAULT_OUTPUT_AUDIO_FORMAT,
                 "max_response_output_tokens": "inf"
             }
         }
+
+        # Only add audio-related configurations if audio modality is enabled
+        if "audio" in self.config.modalities:
+            session_update["session"]["voice"] = self.config.voice
+            session_update["session"]["input_audio_format"] = DEFAULT_INPUT_AUDIO_FORMAT
+            session_update["session"]["output_audio_format"] = DEFAULT_OUTPUT_AUDIO_FORMAT
+            if turn_detection:
+                session_update["session"]["turn_detection"] = turn_detection
+            if input_audio_transcription:
+                session_update["session"]["input_audio_transcription"] = input_audio_transcription
+
         # Send the event
         await self.send_event(session_update)

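The net effect of this hunk is that voice, audio formats, turn detection, and input transcription are only added to the session.update payload when "audio" is listed in config.modalities, so a text-only session sends none of the audio-specific keys. A rough configuration sketch under that reading; only the modalities and voice fields appear in this diff, any other constructor arguments would be assumptions:

    from videosdk.plugins.openai import OpenAIRealtimeConfig  # import path assumed

    # Text-only session: the session.update built above omits voice,
    # input/output audio formats, turn_detection and input_audio_transcription.
    text_only_config = OpenAIRealtimeConfig(modalities=["text"])

    # Audio session: keeps the previous behaviour and sends the audio keys.
    audio_config = OpenAIRealtimeConfig(modalities=["text", "audio"], voice="alloy")
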
@@ -521,4 +552,33 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         tools = data.get("tools", [])
         self._tools = tools
         self.tools_formatted = self._format_tools_for_session(tools)
-        self._formatted_tools = self.tools_formatted
+        self._formatted_tools = self.tools_formatted
+
+    async def send_text_message(self, message: str) -> None:
+        """Send a text message to the OpenAI realtime API"""
+        if not self._session:
+            raise RuntimeError("No active WebSocket session")
+
+        await self.send_event({
+            "type": "conversation.item.create",
+            "item": {
+                "type": "message",
+                "role": "user",
+                "content": [
+                    {
+                        "type": "input_text",
+                        "text": message
+                    }
+                ]
+            }
+        })
+        await self.create_response()
+
+    async def _handle_text_done(self, data: dict) -> None:
+        """Handle text response completion"""
+        try:
+            text_content = data.get("text", "")
+            if text_content:
+                self.emit("text_response", {"text": text_content, "type": "done"})
+        except Exception as e:
+            print(f"[ERROR] Error handling text done: {e}")
@@ -0,0 +1 @@
+__version__ = "0.0.5"
@@ -1 +0,0 @@
-__version__ = "0.0.4"