videosdk-plugins-openai 0.0.4__tar.gz → 0.0.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of videosdk-plugins-openai might be problematic.
- {videosdk_plugins_openai-0.0.4 → videosdk_plugins_openai-0.0.6}/.gitignore +1 -0
- {videosdk_plugins_openai-0.0.4 → videosdk_plugins_openai-0.0.6}/PKG-INFO +2 -2
- {videosdk_plugins_openai-0.0.4 → videosdk_plugins_openai-0.0.6}/pyproject.toml +1 -1
- videosdk_plugins_openai-0.0.6/videosdk/plugins/openai/__init__.py +12 -0
- videosdk_plugins_openai-0.0.6/videosdk/plugins/openai/llm.py +161 -0
- {videosdk_plugins_openai-0.0.4 → videosdk_plugins_openai-0.0.6}/videosdk/plugins/openai/realtime_api.py +90 -24
- videosdk_plugins_openai-0.0.6/videosdk/plugins/openai/stt.py +260 -0
- videosdk_plugins_openai-0.0.6/videosdk/plugins/openai/tts.py +109 -0
- videosdk_plugins_openai-0.0.6/videosdk/plugins/openai/version.py +1 -0
- videosdk_plugins_openai-0.0.4/videosdk/plugins/openai/__init__.py +0 -6
- videosdk_plugins_openai-0.0.4/videosdk/plugins/openai/version.py +0 -1
- {videosdk_plugins_openai-0.0.4 → videosdk_plugins_openai-0.0.6}/README.md +0 -0
{videosdk_plugins_openai-0.0.4 → videosdk_plugins_openai-0.0.6}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videosdk-plugins-openai
-Version: 0.0.4
+Version: 0.0.6
 Summary: VideoSDK Agent Framework plugin for OpenAI services
 Author: videosdk
 Keywords: ai,audio,openai,video,videosdk
@@ -12,7 +12,7 @@ Classifier: Topic :: Multimedia :: Video
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.11
 Requires-Dist: openai[realtime]>=1.68.2
-Requires-Dist: videosdk-agents>=0.0.
+Requires-Dist: videosdk-agents>=0.0.9
 Description-Content-Type: text/markdown

 VideoSDK OpenAI Plugin
videosdk_plugins_openai-0.0.6/videosdk/plugins/openai/__init__.py (new file)
@@ -0,0 +1,12 @@
+from .realtime_api import OpenAIRealtime, OpenAIRealtimeConfig
+from .llm import OpenAILLM
+from .stt import OpenAISTT
+from .tts import OpenAITTS
+
+__all__ = [
+    'OpenAIRealtime',
+    'OpenAIRealtimeConfig',
+    'OpenAILLM',
+    'OpenAISTT',
+    'OpenAITTS',
+]
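The new `__init__.py` exposes all four integrations from a single import path. A minimal wiring sketch; the class names and constructor parameters are taken from the new modules in this release, everything else (environment variable, chosen defaults) is an assumption:

# Minimal wiring sketch; constructor parameters come from the new modules in
# this release, the API key source is an assumption.
import os

from videosdk.plugins.openai import OpenAILLM, OpenAISTT, OpenAITTS

api_key = os.environ["OPENAI_API_KEY"]  # each class also falls back to this env var

llm = OpenAILLM(model="gpt-4o", api_key=api_key)                        # streaming chat completions
stt = OpenAISTT(api_key=api_key, model="whisper-1")                     # realtime transcription
tts = OpenAITTS(model="gpt-4o-mini-tts", voice="ash", api_key=api_key)  # streamed speech synthesis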
videosdk_plugins_openai-0.0.6/videosdk/plugins/openai/llm.py (new file)
@@ -0,0 +1,161 @@
+from __future__ import annotations
+
+import os
+from typing import Any, AsyncIterator
+import json
+
+import httpx
+import openai
+from videosdk.agents import LLM, LLMResponse, ChatContext, ChatRole, ChatMessage, FunctionCall, FunctionCallOutput, ToolChoice, FunctionTool, is_function_tool, build_openai_schema
+
+class OpenAILLM(LLM):
+
+    def __init__(
+        self,
+        *,
+        model: str = "gpt-4o",
+        api_key: str | None = None,
+        base_url: str | None = None,
+        temperature: float = 0.7,
+        tool_choice: ToolChoice = "auto",
+        max_completion_tokens: int | None = None,
+    ) -> None:
+        super().__init__()
+        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+        if not self.api_key:
+            raise ValueError("OpenAI API key must be provided either through api_key parameter or OPENAI_API_KEY environment variable")
+
+        self.model = model
+        self.temperature = temperature
+        self.tool_choice = tool_choice
+        self.max_completion_tokens = max_completion_tokens
+
+        self._client = openai.AsyncOpenAI(
+            api_key=self.api_key,
+            base_url=base_url or None,
+            max_retries=0,
+            http_client=httpx.AsyncClient(
+                timeout=httpx.Timeout(connect=15.0, read=5.0, write=5.0, pool=5.0),
+                follow_redirects=True,
+                limits=httpx.Limits(
+                    max_connections=50,
+                    max_keepalive_connections=50,
+                    keepalive_expiry=120,
+                ),
+            ),
+        )
+
+    async def chat(
+        self,
+        messages: ChatContext,
+        tools: list[FunctionTool] | None = None,
+        **kwargs: Any
+    ) -> AsyncIterator[LLMResponse]:
+        """
+        Implement chat functionality using OpenAI's chat completion API
+
+        Args:
+            messages: ChatContext containing conversation history
+            tools: Optional list of function tools available to the model
+            **kwargs: Additional arguments passed to the OpenAI API
+
+        Yields:
+            LLMResponse objects containing the model's responses
+        """
+        completion_params = {
+            "model": self.model,
+            "messages": [
+                {
+                    "role": msg.role.value,
+                    "content": msg.content,
+                    **({"name": msg.name} if hasattr(msg, 'name') else {})
+                } if isinstance(msg, ChatMessage) else
+                {
+                    "role": "assistant",
+                    "content": None,
+                    "function_call": {
+                        "name": msg.name,
+                        "arguments": msg.arguments
+                    }
+                } if isinstance(msg, FunctionCall) else
+                {
+                    "role": "function",
+                    "name": msg.name,
+                    "content": msg.output
+                } if isinstance(msg, FunctionCallOutput) else None
+                for msg in messages.items
+                if msg is not None
+            ],
+            "temperature": self.temperature,
+            "stream": True,
+            "max_tokens": self.max_completion_tokens,
+        }
+
+        if tools:
+            formatted_tools = []
+            for tool in tools:
+                if not is_function_tool(tool):
+                    continue
+                try:
+                    tool_schema = build_openai_schema(tool)
+                    formatted_tools.append(tool_schema)
+                except Exception as e:
+                    print(f"Failed to format tool {tool}: {e}")
+                    continue
+
+            if formatted_tools:
+                completion_params["functions"] = formatted_tools
+                completion_params["function_call"] = self.tool_choice
+
+        completion_params.update(kwargs)
+        try:
+            response_stream = await self._client.chat.completions.create(**completion_params)
+            current_content = ""
+            current_function_call = None
+
+            async for chunk in response_stream:
+                if not chunk.choices:
+                    continue
+
+                delta = chunk.choices[0].delta
+                if delta.function_call:
+                    if current_function_call is None:
+                        current_function_call = {
+                            "name": delta.function_call.name or "",
+                            "arguments": delta.function_call.arguments or ""
+                        }
+                    else:
+                        if delta.function_call.name:
+                            current_function_call["name"] += delta.function_call.name
+                        if delta.function_call.arguments:
+                            current_function_call["arguments"] += delta.function_call.arguments
+                elif current_function_call is not None:
+                    try:
+                        args = json.loads(current_function_call["arguments"])
+                        current_function_call["arguments"] = args
+                    except json.JSONDecodeError:
+                        print(f"Failed to parse function arguments: {current_function_call['arguments']}")
+                        current_function_call["arguments"] = {}
+
+                    yield LLMResponse(
+                        content="",
+                        role=ChatRole.ASSISTANT,
+                        metadata={"function_call": current_function_call}
+                    )
+                    current_function_call = None
+
+                elif delta.content is not None:
+                    current_content += delta.content
+                    yield LLMResponse(
+                        content=current_content,
+                        role=ChatRole.ASSISTANT
+                    )
+
+        except Exception as e:
+            self.emit("error", e)
+            raise
+
+    async def aclose(self) -> None:
+        """Cleanup resources by closing the HTTP client"""
+        if self._client:
+            await self._client.close()
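`OpenAILLM.chat()` streams `LLMResponse` objects whose `content` grows as deltas arrive and surfaces completed function calls through `metadata`. A hedged consumption sketch; `OpenAILLM` comes from the diff above, while the way a `ChatContext` is assembled here is an assumption about the videosdk.agents API:

# Consumption sketch for the streaming chat API added in llm.py above.
# ChatContext()/ChatMessage(...) construction is assumed, not shown in this diff.
import asyncio

from videosdk.agents import ChatContext, ChatMessage, ChatRole
from videosdk.plugins.openai import OpenAILLM

async def main() -> None:
    llm = OpenAILLM(model="gpt-4o")
    ctx = ChatContext()  # assumed empty-context constructor
    ctx.items.append(ChatMessage(role=ChatRole.USER, content="Summarize the 0.0.6 release."))

    async for response in llm.chat(messages=ctx):
        call = getattr(response, "metadata", None) or {}
        if "function_call" in call:
            print("tool call:", call["function_call"])
        else:
            print(response.content)  # cumulative text so far

    await llm.aclose()

asyncio.run(main())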
{videosdk_plugins_openai-0.0.4 → videosdk_plugins_openai-0.0.6}/videosdk/plugins/openai/realtime_api.py
@@ -10,6 +10,8 @@ from dotenv import load_dotenv
 import uuid
 import base64
 import aiohttp
+import numpy as np
+from scipy import signal
 import traceback
 from videosdk.agents import (
     FunctionTool,
@@ -18,15 +20,14 @@ from videosdk.agents import (
     build_openai_schema,
     CustomAudioStreamTrack,
     ToolChoice,
-    RealtimeBaseModel
+    RealtimeBaseModel,
+    Agent
 )

 load_dotenv()
 from openai.types.beta.realtime.session import InputAudioTranscription, TurnDetection

 OPENAI_BASE_URL = "https://api.openai.com/v1"
-SAMPLE_RATE = 24000
-NUM_CHANNELS = 1

 DEFAULT_TEMPERATURE = 0.8
 DEFAULT_TURN_DETECTION = TurnDetection(
@@ -44,7 +45,8 @@ DEFAULT_TOOL_CHOICE = "auto"

 OpenAIEventTypes = Literal[
     "instructions_updated",
-    "tools_updated"
+    "tools_updated",
+    "text_response"
 ]
 DEFAULT_VOICE = "alloy"
 DEFAULT_INPUT_AUDIO_FORMAT = "pcm16"
@@ -128,8 +130,17 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         self.audio_track: Optional[CustomAudioStreamTrack] = None
         self._formatted_tools: Optional[List[Dict[str, Any]]] = None
         self.config: OpenAIRealtimeConfig = config or OpenAIRealtimeConfig()
-
-
+        # global_event_emitter.on("instructions_updated", self._handle_instructions_updated)
+        # global_event_emitter.on("tools_updated", self._handle_tools_updated)
+
+        self.input_sample_rate = 48000
+        self.target_sample_rate = 16000
+
+    def set_agent(self, agent: Agent) -> None:
+        self._instructions = agent.instructions
+        self._tools = agent.tools
+        self.tools_formatted = self._format_tools_for_session(self._tools)
+        self._formatted_tools = self.tools_formatted

     async def connect(self) -> None:
         headers = {"Agent": "VideoSDK Agents"}
@@ -144,7 +155,10 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):

     async def handle_audio_input(self, audio_data: bytes) -> None:
         """Handle incoming audio data from the user"""
-        if self._session and not self._closing:
+        if self._session and not self._closing and "audio" in self.config.modalities:
+            audio_data = np.frombuffer(audio_data, dtype=np.int16)
+            audio_data = signal.resample(audio_data, int(len(audio_data) * self.target_sample_rate / self.input_sample_rate))
+            audio_data = audio_data.astype(np.int16).tobytes()
             base64_audio_data = base64.b64encode(audio_data).decode("utf-8")
             audio_event = {
                 "type": "input_audio_buffer.append",
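`handle_audio_input` now downsamples the 48 kHz client audio to 16 kHz before base64-encoding it for `input_audio_buffer.append`. A standalone sketch of just that resampling step; the synthetic sine frame stands in for a real PCM16 frame from the client:

# Standalone illustration of the resampling added above; the input frame is synthetic.
import base64

import numpy as np
from scipy import signal

input_sample_rate = 48000
target_sample_rate = 16000

pcm_48k = (np.sin(np.linspace(0, 440 * 2 * np.pi, input_sample_rate // 100)) * 32767).astype(np.int16)
raw_bytes = pcm_48k.tobytes()  # one 10 ms frame of 48 kHz PCM16 audio

samples = np.frombuffer(raw_bytes, dtype=np.int16)
resampled = signal.resample(samples, int(len(samples) * target_sample_rate / input_sample_rate))
payload = base64.b64encode(resampled.astype(np.int16).tobytes()).decode("utf-8")

print(len(samples), "->", len(resampled), "samples;", len(payload), "base64 chars")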
@@ -299,17 +313,23 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):

             elif event_type == "conversation.item.input_audio_transcription.completed":
                 await self._handle_input_audio_transcription_completed(data)
+
+            elif event_type == "response.text.done":
+                await self._handle_text_done(data)

         except Exception as e:
             self.emit_error(f"Error handling event {event_type}: {str(e)}")

     async def _handle_speech_started(self, data: dict) -> None:
         """Handle speech detection start"""
-
-
+        if "audio" in self.config.modalities:
+            await self.interrupt()
+        if self.audio_track:
+            self.audio_track.interrupt()

     async def _handle_speech_stopped(self, data: dict) -> None:
         """Handle speech detection end"""
+        pass

     async def _handle_response_created(self, data: dict) -> None:
         """Handle initial response creation"""
@@ -365,6 +385,9 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):

     async def _handle_audio_delta(self, data: dict) -> None:
         """Handle audio chunk"""
+        if "audio" not in self.config.modalities:
+            return
+
         try:
             base64_audio_data = base64.b64decode(data.get("delta"))
             if base64_audio_data:
@@ -448,31 +471,45 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         if not self._session:
             return

+        # Conditionally set turn detection and audio transcription based on modalities
+        turn_detection = None
+        input_audio_transcription = None
+
+        if "audio" in self.config.modalities:
+            turn_detection = self.config.turn_detection.model_dump(
+                by_alias=True,
+                exclude_unset=True,
+                exclude_defaults=True,
+            ) if self.config.turn_detection else None
+            input_audio_transcription = self.config.input_audio_transcription.model_dump(
+                by_alias=True,
+                exclude_unset=True,
+                exclude_defaults=True,
+            ) if self.config.input_audio_transcription else None
+
         session_update = {
             "type": "session.update",
             "session": {
                 "model": self.model,
-                "
-                "instructions": self._instructions or "You are a helpful voice assistant that can answer questions and help with tasks.",
+                "instructions": self._instructions or "You are a helpful assistant that can answer questions and help with tasks.",
                 "temperature": self.config.temperature,
-                "turn_detection": self.config.turn_detection.model_dump(
-                    by_alias=True,
-                    exclude_unset=True,
-                    exclude_defaults=True,
-                ),
-                "input_audio_transcription": self.config.input_audio_transcription.model_dump(
-                    by_alias=True,
-                    exclude_unset=True,
-                    exclude_defaults=True,
-                ),
                 "tool_choice": self.config.tool_choice,
                 "tools": self._formatted_tools or [],
                 "modalities": self.config.modalities,
-                "input_audio_format": DEFAULT_INPUT_AUDIO_FORMAT,
-                "output_audio_format": DEFAULT_OUTPUT_AUDIO_FORMAT,
                 "max_response_output_tokens": "inf"
             }
         }
+
+        # Only add audio-related configurations if audio modality is enabled
+        if "audio" in self.config.modalities:
+            session_update["session"]["voice"] = self.config.voice
+            session_update["session"]["input_audio_format"] = DEFAULT_INPUT_AUDIO_FORMAT
+            session_update["session"]["output_audio_format"] = DEFAULT_OUTPUT_AUDIO_FORMAT
+            if turn_detection:
+                session_update["session"]["turn_detection"] = turn_detection
+            if input_audio_transcription:
+                session_update["session"]["input_audio_transcription"] = input_audio_transcription
+
         # Send the event
         await self.send_event(session_update)

@@ -521,4 +558,33 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
         tools = data.get("tools", [])
         self._tools = tools
         self.tools_formatted = self._format_tools_for_session(tools)
-        self._formatted_tools = self.tools_formatted
+        self._formatted_tools = self.tools_formatted
+
+    async def send_text_message(self, message: str) -> None:
+        """Send a text message to the OpenAI realtime API"""
+        if not self._session:
+            raise RuntimeError("No active WebSocket session")
+
+        await self.send_event({
+            "type": "conversation.item.create",
+            "item": {
+                "type": "message",
+                "role": "user",
+                "content": [
+                    {
+                        "type": "input_text",
+                        "text": message
+                    }
+                ]
+            }
+        })
+        await self.create_response()
+
+    async def _handle_text_done(self, data: dict) -> None:
+        """Handle text response completion"""
+        try:
+            text_content = data.get("text", "")
+            if text_content:
+                self.emit("text_response", {"text": text_content, "type": "done"})
+        except Exception as e:
+            print(f"[ERROR] Error handling text done: {e}")
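Together, `send_text_message`, the `response.text.done` handler, and the new `text_response` event let a realtime session run in text-only mode. A hedged sketch of that flow; how `OpenAIRealtime` is constructed, connected, and subscribed to beyond what this diff shows is an assumption:

# Flow sketch; send_text_message, the "text_response" event and the modalities
# field come from the diff above. The constructor arguments, the model name and
# the .on() subscription are assumptions about the rest of the class.
import asyncio

from videosdk.plugins.openai import OpenAIRealtime, OpenAIRealtimeConfig

async def main() -> None:
    config = OpenAIRealtimeConfig(modalities=["text"])  # skip voice/turn-detection setup
    session = OpenAIRealtime(model="gpt-4o-realtime-preview", config=config)  # assumed signature

    session.on("text_response", lambda data: print("assistant:", data["text"]))  # assumed emitter hook

    await session.connect()
    await session.send_text_message("Hello from the VideoSDK agent!")
    await asyncio.sleep(5)  # give the text response time to arrive

asyncio.run(main())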
videosdk_plugins_openai-0.0.6/videosdk/plugins/openai/stt.py (new file)
@@ -0,0 +1,260 @@
+from __future__ import annotations
+
+import asyncio
+import base64
+import os
+from typing import Any, Optional
+from urllib.parse import urlencode
+from scipy import signal
+import aiohttp
+import httpx
+import openai
+import numpy as np
+from videosdk.agents import STT as BaseSTT, STTResponse, SpeechEventType, SpeechData, global_event_emitter
+
+class OpenAISTT(BaseSTT):
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        model: str = "whisper-1",
+        base_url: str | None = None,
+        prompt: str | None = None,
+        language: str = "en",
+        turn_detection: dict | None = None,
+    ) -> None:
+        super().__init__()
+
+        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+        if not self.api_key:
+            raise ValueError("OpenAI API key must be provided either through api_key parameter or OPENAI_API_KEY environment variable")
+
+        self.model = model
+        self.language = language
+        self.prompt = prompt
+        self.turn_detection = turn_detection or {
+            "type": "server_vad",
+            "threshold": 0.5,
+            "prefix_padding_ms": 300,
+            "silence_duration_ms": 500,
+        }
+
+        self.client = openai.AsyncClient(
+            max_retries=0,
+            api_key=api_key,
+            base_url=base_url or None,
+            http_client=httpx.AsyncClient(
+                timeout=httpx.Timeout(connect=15.0, read=5.0, write=5.0, pool=5.0),
+                follow_redirects=True,
+                limits=httpx.Limits(
+                    max_connections=50,
+                    max_keepalive_connections=50,
+                    keepalive_expiry=120,
+                ),
+            ),
+        )
+
+        self._session: Optional[aiohttp.ClientSession] = None
+        self._ws: Optional[aiohttp.ClientWebSocketResponse] = None
+        self._ws_task: Optional[asyncio.Task] = None
+        self._current_text = ""
+        self._last_interim_at = 0
+
+        self.input_sample_rate = 48000
+        self.target_sample_rate = 16000
+
+    async def process_audio(
+        self,
+        audio_frames: bytes,
+        language: Optional[str] = None,
+        **kwargs: Any
+    ) -> None:
+        """Process audio frames and send to OpenAI's Realtime API"""
+
+        if not self._ws:
+            await self._connect_ws()
+            self._ws_task = asyncio.create_task(self._listen_for_responses())
+
+        try:
+            audio_data = np.frombuffer(audio_frames, dtype=np.int16)
+            audio_data = signal.resample(audio_data, int(len(audio_data) * self.target_sample_rate / self.input_sample_rate))
+            audio_data = audio_data.astype(np.int16).tobytes()
+            audio_data = base64.b64encode(audio_data).decode("utf-8")
+            message = {
+                "type": "input_audio_buffer.append",
+                "audio": audio_data,
+            }
+            await self._ws.send_json(message)
+        except Exception as e:
+            print(f"Error in process_audio: {str(e)}")
+            self.emit("error", str(e))
+            if self._ws:
+                await self._ws.close()
+                self._ws = None
+            if self._ws_task:
+                self._ws_task.cancel()
+                self._ws_task = None
+
+    async def _listen_for_responses(self) -> None:
+        """Background task to listen for WebSocket responses"""
+        if not self._ws:
+            return
+
+        try:
+            async for msg in self._ws:
+                if msg.type == aiohttp.WSMsgType.TEXT:
+                    data = msg.json()
+                    responses = self._handle_ws_message(data)
+                    for response in responses:
+                        if self._transcript_callback:
+                            await self._transcript_callback(response)
+                elif msg.type == aiohttp.WSMsgType.ERROR:
+                    error = f"WebSocket error: {self._ws.exception()}"
+                    print(error)
+                    self.emit("error", error)
+                    break
+                elif msg.type == aiohttp.WSMsgType.CLOSED:
+                    print("WebSocket connection closed")
+                    break
+        except Exception as e:
+            error = f"Error in WebSocket listener: {str(e)}"
+            print(error)
+            self.emit("error", error)
+        finally:
+            if self._ws:
+                await self._ws.close()
+                self._ws = None
+
+    async def _connect_ws(self) -> None:
+        """Establish WebSocket connection with OpenAI's Realtime API"""
+
+        if not self._session:
+            self._session = aiohttp.ClientSession()
+
+        config = {
+            "type": "transcription_session.update",
+            "session": {
+                "input_audio_format": "pcm16",
+                "input_audio_transcription": {
+                    "model": self.model,
+                    "prompt": self.prompt or "",
+                    "language": self.language if self.language else None,
+                },
+                "turn_detection": self.turn_detection,
+                "input_audio_noise_reduction": {
+                    "type": "near_field"
+                },
+                "include": ["item.input_audio_transcription.logprobs"]
+            }
+        }
+
+        query_params = {
+            "intent": "transcription",
+        }
+        headers = {
+            "User-Agent": "VideoSDK",
+            "Authorization": f"Bearer {self.api_key}",
+            "OpenAI-Beta": "realtime=v1",
+        }
+
+        base_url = str(self.client.base_url).rstrip('/')
+        ws_url = f"{base_url}/realtime?{urlencode(query_params)}"
+        if ws_url.startswith("http"):
+            ws_url = ws_url.replace("http", "ws", 1)
+
+        try:
+            self._ws = await self._session.ws_connect(ws_url, headers=headers)
+
+            initial_response = await self._ws.receive_json()
+
+            if initial_response.get("type") != "transcription_session.created":
+                raise Exception(f"Expected session creation, got: {initial_response}")
+
+            await self._ws.send_json(config)
+
+            update_response = await self._ws.receive_json()
+
+            if update_response.get("type") != "transcription_session.updated":
+                raise Exception(f"Configuration update failed: {update_response}")
+
+        except Exception as e:
+            print(f"Error connecting to WebSocket: {str(e)}")
+            if self._ws:
+                await self._ws.close()
+                self._ws = None
+            raise
+
+    def _handle_ws_message(self, msg: dict) -> list[STTResponse]:
+        """Handle incoming WebSocket messages and generate STT responses"""
+        responses = []
+
+        try:
+            msg_type = msg.get("type")
+            if msg_type == "conversation.item.input_audio_transcription.delta":
+                delta = msg.get("delta", "")
+                if delta:
+                    self._current_text += delta
+                    current_time = asyncio.get_event_loop().time()
+
+                    if current_time - self._last_interim_at > 0.5:
+                        responses.append(STTResponse(
+                            event_type=SpeechEventType.INTERIM,
+                            data=SpeechData(
+                                text=self._current_text,
+                                language=self.language,
+                            ),
+                            metadata={"model": self.model}
+                        ))
+                        self._last_interim_at = current_time
+
+            elif msg_type == "conversation.item.input_audio_transcription.completed":
+                transcript = msg.get("transcript", "")
+                if transcript:
+                    responses.append(STTResponse(
+                        event_type=SpeechEventType.FINAL,
+                        data=SpeechData(
+                            text=transcript,
+                            language=self.language,
+                        ),
+                        metadata={"model": self.model}
+                    ))
+                    self._current_text = ""
+
+            elif msg_type == "input_audio_buffer.speech_started":
+                global_event_emitter.emit("speech_started")
+
+            elif msg_type == "input_audio_buffer.speech_stopped":
+                global_event_emitter.emit("speech_stopped")
+
+        except Exception as e:
+            print(f"Error handling WebSocket message: {str(e)}")
+
+        return responses
+
+    async def aclose(self) -> None:
+        """Cleanup resources"""
+        if self._ws_task:
+            self._ws_task.cancel()
+            try:
+                await self._ws_task
+            except asyncio.CancelledError:
+                pass
+            self._ws_task = None
+
+        if self._ws:
+            await self._ws.close()
+            self._ws = None
+
+        if self._session:
+            await self._session.close()
+            self._session = None
+
+        await self.client.close()
+
+    async def _ensure_ws_connection(self):
+        """Ensure WebSocket is connected, reconnect if necessary"""
+        if not self._ws or self._ws.closed:
+            await self._connect_ws()
+
+
+
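`OpenAISTT` opens the realtime transcription WebSocket lazily on the first `process_audio()` call and pushes interim and final `STTResponse` objects to a transcript callback. A hedged usage sketch; the diff references the `_transcript_callback` attribute but not how the base `STT` class registers it, so the direct assignment below is a shortcut, not the documented API:

# Usage sketch; process_audio, aclose and _transcript_callback appear in the
# diff above. Assigning the private attribute directly is an assumption --
# the base STT class presumably exposes a public hook for this.
import asyncio
import os

from videosdk.agents import STTResponse
from videosdk.plugins.openai import OpenAISTT

async def on_transcript(response: STTResponse) -> None:
    print(response.event_type, response.data.text)

async def main() -> None:
    stt = OpenAISTT(api_key=os.environ["OPENAI_API_KEY"])
    stt._transcript_callback = on_transcript  # see caveat in the lead-in

    silence_48k = b"\x00\x00" * 480            # one 10 ms frame of 48 kHz PCM16 silence
    for _ in range(100):                       # roughly one second of audio
        await stt.process_audio(silence_48k)

    await asyncio.sleep(2)                     # let final transcripts arrive
    await stt.aclose()

asyncio.run(main())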
videosdk_plugins_openai-0.0.6/videosdk/plugins/openai/tts.py (new file)
@@ -0,0 +1,109 @@
+from __future__ import annotations
+
+from typing import Any, AsyncIterator, Literal, Optional, Union
+import httpx
+import os
+import openai
+
+from videosdk.agents import TTS
+
+OPENAI_TTS_SAMPLE_RATE = 24000
+OPENAI_TTS_CHANNELS = 1
+
+DEFAULT_MODEL = "gpt-4o-mini-tts"
+DEFAULT_VOICE = "ash"
+_RESPONSE_FORMATS = Union[Literal["mp3", "opus", "aac", "flac", "wav", "pcm"], str]
+
+class OpenAITTS(TTS):
+    def __init__(
+        self,
+        *,
+        model: str = DEFAULT_MODEL,
+        voice: str = DEFAULT_VOICE,
+        speed: float = 1.0,
+        instructions: str | None = None,
+        api_key: str | None = None,
+        base_url: str | None = None,
+        response_format: str = "pcm"
+    ) -> None:
+        super().__init__(sample_rate=OPENAI_TTS_SAMPLE_RATE, num_channels=OPENAI_TTS_CHANNELS)
+
+        self.model = model
+        self.voice = voice
+        self.speed = speed
+        self.instructions = instructions
+        self.audio_track = None
+        self.loop = None
+        self.response_format = response_format
+
+        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
+        if not self.api_key:
+            raise ValueError("OpenAI API key must be provided either through api_key parameter or OPENAI_API_KEY environment variable")
+
+        self._client = openai.AsyncClient(
+            max_retries=0,
+            api_key=self.api_key,
+            base_url=base_url or None,
+            http_client=httpx.AsyncClient(
+                timeout=httpx.Timeout(connect=15.0, read=5.0, write=5.0, pool=5.0),
+                follow_redirects=True,
+                limits=httpx.Limits(
+                    max_connections=50,
+                    max_keepalive_connections=50,
+                    keepalive_expiry=120,
+                ),
+            ),
+        )
+
+    async def synthesize(
+        self,
+        text: AsyncIterator[str] | str,
+        voice_id: Optional[str] = None,
+        **kwargs: Any
+    ) -> None:
+        """
+        Convert text to speech using OpenAI's TTS API and stream to audio track
+
+        Args:
+            text: Text to convert to speech
+            voice_id: Optional voice override
+            **kwargs: Additional provider-specific arguments
+        """
+        try:
+            if isinstance(text, AsyncIterator):
+                full_text = ""
+                async for chunk in text:
+                    full_text += chunk
+            else:
+                full_text = text
+
+            if not self.audio_track or not self.loop:
+                self.emit("error", "Audio track or event loop not set")
+                return
+
+            async with self._client.audio.speech.with_streaming_response.create(
+                model=self.model,
+                voice=voice_id or self.voice,
+                input=full_text,
+                speed=self.speed,
+                response_format=self.response_format,
+                **({"instructions": self.instructions} if self.instructions else {})
+            ) as response:
+                async for chunk in response.iter_bytes():
+                    if chunk:
+                        self.loop.create_task(self.audio_track.add_new_bytes(chunk))
+
+        except openai.APIError as e:
+            self.emit("error", str(e))
+        except Exception as e:
+            self.emit("error", f"TTS synthesis failed: {str(e)}")
+
+    async def aclose(self) -> None:
+        """Cleanup resources"""
+        await self._client.close()
+        await super().aclose()
+
+    async def interrupt(self) -> None:
+        """Interrupt the TTS process"""
+        if self.audio_track:
+            self.audio_track.interrupt()
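`OpenAITTS.synthesize()` expects `audio_track` and `loop` to be set before it streams 24 kHz PCM chunks into the track. A hedged sketch with a stand-in track; in a real agent both would be supplied by the framework (e.g. a `CustomAudioStreamTrack`), and the key is read from `OPENAI_API_KEY`:

# Sketch of driving OpenAITTS directly; FakeTrack stands in for the
# framework-provided audio track, which is the only assumption beyond the diff.
import asyncio

from videosdk.plugins.openai import OpenAITTS

class FakeTrack:
    async def add_new_bytes(self, chunk: bytes) -> None:
        print(f"received {len(chunk)} bytes of 24 kHz PCM")

    def interrupt(self) -> None:
        print("playback interrupted")

async def main() -> None:
    tts = OpenAITTS(voice="ash", response_format="pcm")  # uses OPENAI_API_KEY
    tts.audio_track = FakeTrack()
    tts.loop = asyncio.get_running_loop()

    await tts.synthesize("The 0.0.6 release adds STT, TTS and LLM plugins.")
    await asyncio.sleep(2)  # give the queued add_new_bytes tasks time to run
    await tts.aclose()

asyncio.run(main())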
videosdk_plugins_openai-0.0.6/videosdk/plugins/openai/version.py (new file)
@@ -0,0 +1 @@
+__version__ = "0.0.6"

videosdk_plugins_openai-0.0.4/videosdk/plugins/openai/version.py (removed)
@@ -1 +0,0 @@
-__version__ = "0.0.4"

{videosdk_plugins_openai-0.0.4 → videosdk_plugins_openai-0.0.6}/README.md: file without changes