videosdk-plugins-openai 0.0.30__tar.gz → 0.0.32__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of videosdk-plugins-openai might be problematic.
- {videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/.gitignore +3 -2
- {videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/PKG-INFO +2 -2
- {videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/pyproject.toml +1 -1
- {videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/videosdk/plugins/openai/llm.py +83 -2
- {videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/videosdk/plugins/openai/stt.py +184 -7
- {videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/videosdk/plugins/openai/tts.py +85 -2
- videosdk_plugins_openai-0.0.32/videosdk/plugins/openai/version.py +1 -0
- videosdk_plugins_openai-0.0.30/videosdk/plugins/openai/version.py +0 -1
- {videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/README.md +0 -0
- {videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/videosdk/plugins/openai/__init__.py +0 -0
- {videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/videosdk/plugins/openai/realtime_api.py +2 -2
{videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/.gitignore
RENAMED

@@ -2,13 +2,12 @@ myenv/
 venv/
 env/
 __pycache__/
-
+.venv/
 .env
 .env.local
 test_env/
 dist/
 .DS_Store
-
 node_modules/
 credentials.json
 .Python
@@ -16,3 +15,5 @@ build/
 eggs/
 sdist/
 wheels/
+docs/
+agent-sdk-reference/
{videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/PKG-INFO
RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videosdk-plugins-openai
-Version: 0.0.30
+Version: 0.0.32
 Summary: VideoSDK Agent Framework plugin for OpenAI services
 Author: videosdk
 License-Expression: Apache-2.0
@@ -13,7 +13,7 @@ Classifier: Topic :: Multimedia :: Video
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.11
 Requires-Dist: openai[realtime]>=1.68.2
-Requires-Dist: videosdk-agents>=0.0.30
+Requires-Dist: videosdk-agents>=0.0.32
 Description-Content-Type: text/markdown
 
 # VideoSDK OpenAI Plugin
{videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/pyproject.toml
RENAMED

@@ -20,7 +20,7 @@ classifiers = [
     "Topic :: Multimedia :: Video",
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
 ]
-dependencies = ["videosdk-agents>=0.0.30", "openai[realtime]>=1.68.2"]
+dependencies = ["videosdk-agents>=0.0.32", "openai[realtime]>=1.68.2"]
 
 [tool.hatch.version]
 path = "videosdk/plugins/openai/version.py"
{videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/videosdk/plugins/openai/llm.py
RENAMED

@@ -26,13 +26,23 @@ class OpenAILLM(LLM):
     def __init__(
         self,
         *,
-        model: str = "gpt-4o",
         api_key: str | None = None,
+        model: str = "gpt-4o-mini",
         base_url: str | None = None,
         temperature: float = 0.7,
         tool_choice: ToolChoice = "auto",
         max_completion_tokens: int | None = None,
     ) -> None:
+        """Initialize the OpenAI LLM plugin.
+
+        Args:
+            api_key (Optional[str], optional): OpenAI API key. Defaults to None.
+            model (str): The model to use for the LLM plugin. Defaults to "gpt-4o-mini".
+            base_url (Optional[str], optional): The base URL for the OpenAI API. Defaults to None.
+            temperature (float): The temperature to use for the LLM plugin. Defaults to 0.7.
+            tool_choice (ToolChoice): The tool choice to use for the LLM plugin. Defaults to "auto".
+            max_completion_tokens (Optional[int], optional): The maximum number of completion tokens to generate. Defaults to None.
+        """
         super().__init__()
         self.api_key = api_key or os.getenv("OPENAI_API_KEY")
         if not self.api_key:
@@ -59,6 +69,77 @@ class OpenAILLM(LLM):
             ),
         )
 
+    @staticmethod
+    def azure(
+        *,
+        model: str = "gpt-4o-mini",
+        azure_endpoint: str | None = None,
+        azure_deployment: str | None = None,
+        api_version: str | None = None,
+        api_key: str | None = None,
+        azure_ad_token: str | None = None,
+        organization: str | None = None,
+        project: str | None = None,
+        base_url: str | None = None,
+        temperature: float = 0.7,
+        tool_choice: ToolChoice = "auto",
+        max_completion_tokens: int | None = None,
+        timeout: httpx.Timeout | None = None,
+    ) -> "OpenAILLM":
+        """
+        Create a new instance of Azure OpenAI LLM.
+
+        This automatically infers the following arguments from their corresponding environment variables if they are not provided:
+        - `api_key` from `AZURE_OPENAI_API_KEY`
+        - `organization` from `OPENAI_ORG_ID`
+        - `project` from `OPENAI_PROJECT_ID`
+        - `azure_ad_token` from `AZURE_OPENAI_AD_TOKEN`
+        - `api_version` from `OPENAI_API_VERSION`
+        - `azure_endpoint` from `AZURE_OPENAI_ENDPOINT`
+        - `azure_deployment` from `AZURE_OPENAI_DEPLOYMENT` (if not provided, uses `model` as the deployment name)
+        """
+
+        azure_endpoint = azure_endpoint or os.getenv("AZURE_OPENAI_ENDPOINT")
+        azure_deployment = azure_deployment or os.getenv("AZURE_OPENAI_DEPLOYMENT")
+        api_version = api_version or os.getenv("OPENAI_API_VERSION")
+        api_key = api_key or os.getenv("AZURE_OPENAI_API_KEY")
+        azure_ad_token = azure_ad_token or os.getenv("AZURE_OPENAI_AD_TOKEN")
+        organization = organization or os.getenv("OPENAI_ORG_ID")
+        project = project or os.getenv("OPENAI_PROJECT_ID")
+
+        if not azure_deployment:
+            azure_deployment = model
+
+        if not azure_endpoint:
+            raise ValueError("Azure endpoint must be provided either through the azure_endpoint parameter or the AZURE_OPENAI_ENDPOINT environment variable")
+
+        if not api_key and not azure_ad_token:
+            raise ValueError("Either API key or Azure AD token must be provided")
+
+        azure_client = openai.AsyncAzureOpenAI(
+            max_retries=0,
+            azure_endpoint=azure_endpoint,
+            azure_deployment=azure_deployment,
+            api_version=api_version,
+            api_key=api_key,
+            azure_ad_token=azure_ad_token,
+            organization=organization,
+            project=project,
+            base_url=base_url,
+            timeout=timeout
+            if timeout
+            else httpx.Timeout(connect=15.0, read=5.0, write=5.0, pool=5.0),
+        )
+
+        instance = OpenAILLM(
+            model=model,
+            temperature=temperature,
+            tool_choice=tool_choice,
+            max_completion_tokens=max_completion_tokens,
+        )
+        instance._client = azure_client
+        return instance
+
     async def chat(
         self,
         messages: ChatContext,
@@ -202,4 +283,4 @@ class OpenAILLM(LLM):
         """Cleanup resources by closing the HTTP client"""
         await self.cancel_current_generation()
         if self._client:
-            await self._client.close()
+            await self._client.close()
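The new `azure()` factory builds an `openai.AsyncAzureOpenAI` client and swaps it into an ordinary `OpenAILLM` instance. A minimal usage sketch, assuming the class is exported from `videosdk.plugins.openai`; the endpoint, API version, and key below are placeholders, not real values:

```python
from videosdk.plugins.openai import OpenAILLM  # assumes the package exports this name

# Each argument can instead come from its environment variable:
# AZURE_OPENAI_ENDPOINT, OPENAI_API_VERSION, AZURE_OPENAI_API_KEY.
llm = OpenAILLM.azure(
    model="gpt-4o-mini",  # also used as the Azure deployment name when none is given
    azure_endpoint="https://my-resource.openai.azure.com",  # placeholder
    api_version="2024-06-01",                               # placeholder
    api_key="<azure-openai-key>",                           # placeholder
    temperature=0.7,
)
```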
{videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/videosdk/plugins/openai/stt.py
RENAMED

@@ -3,8 +3,11 @@ from __future__ import annotations
 import asyncio
 import base64
 import os
+import time
 from typing import Any, Optional
 from urllib.parse import urlencode
+import io
+import wave
 from scipy import signal
 import aiohttp
 import httpx
@@ -17,12 +20,25 @@ class OpenAISTT(BaseSTT):
         self,
         *,
         api_key: str | None = None,
-        model: str = "whisper-1",
+        model: str = "gpt-4o-mini-transcribe",
         base_url: str | None = None,
         prompt: str | None = None,
         language: str = "en",
         turn_detection: dict | None = None,
+        enable_streaming: bool = True,
+        silence_threshold: float = 0.01,
+        silence_duration: float = 0.8,
     ) -> None:
+        """Initialize the OpenAI STT plugin.
+
+        Args:
+            api_key (Optional[str], optional): OpenAI API key. Defaults to None.
+            model (str): The model to use for the STT plugin. Defaults to "gpt-4o-mini-transcribe".
+            base_url (Optional[str], optional): The base URL for the OpenAI API. Defaults to None.
+            prompt (Optional[str], optional): The prompt for the STT plugin. Defaults to None.
+            language (str): The language to use for the STT plugin. Defaults to "en".
+            turn_detection (dict | None): Turn-detection settings for the STT plugin. Defaults to None.
+        """
         super().__init__()
 
         self.api_key = api_key or os.getenv("OPENAI_API_KEY")
@@ -38,6 +54,11 @@ class OpenAISTT(BaseSTT):
             "prefix_padding_ms": 300,
             "silence_duration_ms": 500,
         }
+        self.enable_streaming = enable_streaming
+
+        # Custom VAD parameters for non-streaming mode
+        self.silence_threshold_bytes = int(silence_threshold * 32767)
+        self.silence_duration_frames = int(silence_duration * 48000)  # input_sample_rate
 
         self.client = openai.AsyncClient(
             max_retries=0,
@@ -59,9 +80,88 @@ class OpenAISTT(BaseSTT):
         self._ws_task: Optional[asyncio.Task] = None
         self._current_text = ""
         self._last_interim_at = 0
-
         self.input_sample_rate = 48000
         self.target_sample_rate = 16000
+        self._audio_buffer = bytearray()
+
+        # Custom VAD state for non-streaming mode
+        self._is_speaking = False
+        self._silence_frames = 0
+
+    @staticmethod
+    def azure(
+        *,
+        model: str = "gpt-4o-mini-transcribe",
+        language: str = "en",
+        prompt: str | None = None,
+        turn_detection: dict | None = None,
+        azure_endpoint: str | None = None,
+        azure_deployment: str | None = None,
+        api_version: str | None = None,
+        api_key: str | None = None,
+        azure_ad_token: str | None = None,
+        organization: str | None = None,
+        project: str | None = None,
+        base_url: str | None = None,
+        enable_streaming: bool = False,
+        timeout: httpx.Timeout | None = None,
+    ) -> "OpenAISTT":
+        """
+        Create a new instance of Azure OpenAI STT.
+
+        This automatically infers the following arguments from their corresponding environment variables if they are not provided:
+        - `api_key` from `AZURE_OPENAI_API_KEY`
+        - `organization` from `OPENAI_ORG_ID`
+        - `project` from `OPENAI_PROJECT_ID`
+        - `azure_ad_token` from `AZURE_OPENAI_AD_TOKEN`
+        - `api_version` from `OPENAI_API_VERSION`
+        - `azure_endpoint` from `AZURE_OPENAI_ENDPOINT`
+        - `azure_deployment` from `AZURE_OPENAI_DEPLOYMENT` (if not provided, uses `model` as the deployment name)
+        """
+
+        # Get values from environment variables if not provided
+        azure_endpoint = azure_endpoint or os.getenv("AZURE_OPENAI_ENDPOINT")
+        azure_deployment = azure_deployment or os.getenv("AZURE_OPENAI_DEPLOYMENT")
+        api_version = api_version or os.getenv("OPENAI_API_VERSION")
+        api_key = api_key or os.getenv("AZURE_OPENAI_API_KEY")
+        azure_ad_token = azure_ad_token or os.getenv("AZURE_OPENAI_AD_TOKEN")
+        organization = organization or os.getenv("OPENAI_ORG_ID")
+        project = project or os.getenv("OPENAI_PROJECT_ID")
+
+        # If azure_deployment is not provided, use model as the deployment name
+        if not azure_deployment:
+            azure_deployment = model
+
+        if not azure_endpoint:
+            raise ValueError("Azure endpoint must be provided either through the azure_endpoint parameter or the AZURE_OPENAI_ENDPOINT environment variable")
+
+        if not api_key and not azure_ad_token:
+            raise ValueError("Either API key or Azure AD token must be provided")
+
+        azure_client = openai.AsyncAzureOpenAI(
+            max_retries=0,
+            azure_endpoint=azure_endpoint,
+            azure_deployment=azure_deployment,
+            api_version=api_version,
+            api_key=api_key,
+            azure_ad_token=azure_ad_token,
+            organization=organization,
+            project=project,
+            base_url=base_url,
+            timeout=timeout
+            if timeout
+            else httpx.Timeout(connect=15.0, read=5.0, write=5.0, pool=5.0),
+        )
+
+        instance = OpenAISTT(
+            model=model,
+            language=language,
+            prompt=prompt,
+            turn_detection=turn_detection,
+            enable_streaming=enable_streaming,
+        )
+        instance.client = azure_client
+        return instance
 
     async def process_audio(
         self,
@@ -69,7 +169,11 @@ class OpenAISTT(BaseSTT):
         language: Optional[str] = None,
         **kwargs: Any
    ) -> None:
-        """Process audio frames and send to OpenAI
+        """Process audio frames and send to OpenAI based on enabled mode"""
+
+        if not self.enable_streaming:
+            await self._transcribe_non_streaming(audio_frames)
+            return
 
         if not self._ws:
             await self._connect_ws()
@@ -95,6 +199,80 @@ class OpenAISTT(BaseSTT):
             self._ws_task.cancel()
             self._ws_task = None
 
+    async def _transcribe_non_streaming(self, audio_frames: bytes) -> None:
+        """HTTP-based transcription using OpenAI audio/transcriptions API with custom VAD"""
+        if not audio_frames:
+            return
+
+        self._audio_buffer.extend(audio_frames)
+
+        # Custom VAD logic similar to other STT implementations
+        is_silent_chunk = self._is_silent(audio_frames)
+
+        if not is_silent_chunk:
+            if not self._is_speaking:
+                self._is_speaking = True
+                global_event_emitter.emit("speech_started")
+            self._silence_frames = 0
+        else:
+            if self._is_speaking:
+                self._silence_frames += len(audio_frames) // 4  # Approximate frame count
+                if self._silence_frames > self.silence_duration_frames:
+                    global_event_emitter.emit("speech_stopped")
+                    await self._process_audio_buffer()
+                    self._is_speaking = False
+                    self._silence_frames = 0
+
+    def _is_silent(self, audio_chunk: bytes) -> bool:
+        """Simple VAD: check if the max amplitude is below a threshold."""
+        audio_data = np.frombuffer(audio_chunk, dtype=np.int16)
+        return np.max(np.abs(audio_data)) < self.silence_threshold_bytes
+
+
+
+    async def _process_audio_buffer(self) -> None:
+        """Process the accumulated audio buffer with OpenAI transcription"""
+        if not self._audio_buffer:
+            return
+
+        audio_data = bytes(self._audio_buffer)
+        self._audio_buffer.clear()
+
+        wav_bytes = self._audio_frames_to_wav_bytes(audio_data)
+
+        try:
+            resp = await self.client.audio.transcriptions.create(
+                file=("audio.wav", wav_bytes, "audio/wav"),
+                model=self.model,
+                language=self.language,
+                prompt=self.prompt or openai.NOT_GIVEN,
+            )
+            text = getattr(resp, "text", "")
+            if text and self._transcript_callback:
+                await self._transcript_callback(STTResponse(
+                    event_type=SpeechEventType.FINAL,
+                    data=SpeechData(text=text, language=self.language),
+                    metadata={"model": self.model}
+                ))
+        except Exception as e:
+            print(f"OpenAI transcription error: {str(e)}")
+            self.emit("error", str(e))
+
+    def _audio_frames_to_wav_bytes(self, audio_frames: bytes) -> bytes:
+        """Convert audio frames to WAV bytes"""
+        pcm = np.frombuffer(audio_frames, dtype=np.int16)
+        resampled = signal.resample(pcm, int(len(pcm) * self.target_sample_rate / self.input_sample_rate))
+        resampled = resampled.astype(np.int16)
+
+        buf = io.BytesIO()
+        with wave.open(buf, "wb") as wf:
+            wf.setnchannels(1)  # Mono
+            wf.setsampwidth(2)  # 16-bit PCM
+            wf.setframerate(self.target_sample_rate)
+            wf.writeframes(resampled.tobytes())
+
+        return buf.getvalue()
+
     async def _listen_for_responses(self) -> None:
         """Background task to listen for WebSocket responses"""
         if not self._ws:
@@ -233,6 +411,8 @@ class OpenAISTT(BaseSTT):
 
     async def aclose(self) -> None:
         """Cleanup resources"""
+        self._audio_buffer.clear()
+
         if self._ws_task:
             self._ws_task.cancel()
             try:
@@ -254,7 +434,4 @@ class OpenAISTT(BaseSTT):
     async def _ensure_ws_connection(self):
         """Ensure WebSocket is connected, reconnect if necessary"""
         if not self._ws or self._ws.closed:
-            await self._connect_ws()
-
-
-
+            await self._connect_ws()
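Two details of the new non-streaming path are worth spelling out: the VAD thresholds are precomputed from the constructor arguments (with the defaults, `int(0.01 * 32767) = 327` amplitude units on the int16 scale and `int(0.8 * 48000) = 38400` samples at the 48 kHz input rate), and the silence counter approximates the sample count as `len(chunk_bytes) // 4`. A hedged usage sketch, assuming the class is exported from `videosdk.plugins.openai`:

```python
from videosdk.plugins.openai import OpenAISTT  # assumes the package exports this name

# Non-streaming mode buffers audio with the custom VAD, then posts one WAV per
# utterance to the audio/transcriptions endpoint instead of holding a WebSocket.
stt = OpenAISTT(
    model="gpt-4o-mini-transcribe",
    enable_streaming=False,
    silence_threshold=0.01,  # -> int(0.01 * 32767) = 327 on the int16 amplitude scale
    silence_duration=0.8,    # -> int(0.8 * 48000) = 38400 samples of trailing silence
)
```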
{videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/videosdk/plugins/openai/tts.py
RENAMED

@@ -17,19 +17,29 @@ _RESPONSE_FORMATS = Union[Literal["mp3",
                                   "opus", "aac", "flac", "wav", "pcm"], str]
 
 
-
 class OpenAITTS(TTS):
     def __init__(
         self,
         *,
+        api_key: str | None = None,
         model: str = DEFAULT_MODEL,
         voice: str = DEFAULT_VOICE,
         speed: float = 1.0,
         instructions: str | None = None,
-        api_key: str | None = None,
         base_url: str | None = None,
         response_format: str = "pcm",
     ) -> None:
+        """Initialize the OpenAI TTS plugin.
+
+        Args:
+            api_key (Optional[str], optional): OpenAI API key. Defaults to None.
+            model (str): The model to use for the TTS plugin. Defaults to "gpt-4o-mini-tts".
+            voice (str): The voice to use for the TTS plugin. Defaults to "ash".
+            speed (float): The speed to use for the TTS plugin. Defaults to 1.0.
+            instructions (Optional[str], optional): Additional instructions for the TTS plugin. Defaults to None.
+            base_url (Optional[str], optional): Custom base URL for the OpenAI API. Defaults to None.
+            response_format (str): The response format to use for the TTS plugin. Defaults to "pcm".
+        """
         super().__init__(sample_rate=OPENAI_TTS_SAMPLE_RATE, num_channels=OPENAI_TTS_CHANNELS)
 
         self.model = model
@@ -64,6 +74,79 @@ class OpenAITTS(TTS):
             ),
         )
 
+    @staticmethod
+    def azure(
+        *,
+        model: str = DEFAULT_MODEL,
+        voice: str = DEFAULT_VOICE,
+        speed: float = 1.0,
+        instructions: str | None = None,
+        azure_endpoint: str | None = None,
+        azure_deployment: str | None = None,
+        api_version: str | None = None,
+        api_key: str | None = None,
+        azure_ad_token: str | None = None,
+        organization: str | None = None,
+        project: str | None = None,
+        base_url: str | None = None,
+        response_format: str = "pcm",
+        timeout: httpx.Timeout | None = None,
+    ) -> "OpenAITTS":
+        """
+        Create a new instance of Azure OpenAI TTS.
+
+        This automatically infers the following arguments from their corresponding environment variables if they are not provided:
+        - `api_key` from `AZURE_OPENAI_API_KEY`
+        - `organization` from `OPENAI_ORG_ID`
+        - `project` from `OPENAI_PROJECT_ID`
+        - `azure_ad_token` from `AZURE_OPENAI_AD_TOKEN`
+        - `api_version` from `OPENAI_API_VERSION`
+        - `azure_endpoint` from `AZURE_OPENAI_ENDPOINT`
+        - `azure_deployment` from `AZURE_OPENAI_DEPLOYMENT` (if not provided, uses `model` as the deployment name)
+        """
+
+        azure_endpoint = azure_endpoint or os.getenv("AZURE_OPENAI_ENDPOINT")
+        azure_deployment = azure_deployment or os.getenv("AZURE_OPENAI_DEPLOYMENT")
+        api_version = api_version or os.getenv("OPENAI_API_VERSION")
+        api_key = api_key or os.getenv("AZURE_OPENAI_API_KEY")
+        azure_ad_token = azure_ad_token or os.getenv("AZURE_OPENAI_AD_TOKEN")
+        organization = organization or os.getenv("OPENAI_ORG_ID")
+        project = project or os.getenv("OPENAI_PROJECT_ID")
+
+        if not azure_deployment:
+            azure_deployment = model
+
+        if not azure_endpoint:
+            raise ValueError("Azure endpoint must be provided either through the azure_endpoint parameter or the AZURE_OPENAI_ENDPOINT environment variable")
+
+        if not api_key and not azure_ad_token:
+            raise ValueError("Either API key or Azure AD token must be provided")
+
+        azure_client = openai.AsyncAzureOpenAI(
+            max_retries=0,
+            azure_endpoint=azure_endpoint,
+            azure_deployment=azure_deployment,
+            api_version=api_version,
+            api_key=api_key,
+            azure_ad_token=azure_ad_token,
+            organization=organization,
+            project=project,
+            base_url=base_url,
+            timeout=timeout
+            if timeout
+            else httpx.Timeout(connect=15.0, read=5.0, write=5.0, pool=5.0),
+        )
+
+        instance = OpenAITTS(
+            model=model,
+            voice=voice,
+            speed=speed,
+            instructions=instructions,
+            response_format=response_format,
+        )
+        instance._client = azure_client
+        return instance
+
     def reset_first_audio_tracking(self) -> None:
         """Reset the first audio tracking state for next TTS task"""
         self._first_chunk_sent = False
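The TTS factory mirrors the LLM and STT ones. A hedged usage sketch, again assuming the class is exported from `videosdk.plugins.openai`, with placeholder Azure values:

```python
from videosdk.plugins.openai import OpenAITTS  # assumes the package exports this name

# "pcm" keeps raw 16-bit audio flowing straight into the agent pipeline;
# endpoint and key are placeholders (or AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_API_KEY).
tts = OpenAITTS.azure(
    voice="ash",
    response_format="pcm",
    azure_endpoint="https://my-resource.openai.azure.com",  # placeholder
    api_key="<azure-openai-key>",                           # placeholder
)
```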
videosdk_plugins_openai-0.0.32/videosdk/plugins/openai/version.py
ADDED

@@ -0,0 +1 @@
+__version__ = "0.0.32"

videosdk_plugins_openai-0.0.30/videosdk/plugins/openai/version.py
DELETED

@@ -1 +0,0 @@
-__version__ = "0.0.30"

{videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/README.md
File without changes

{videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/videosdk/plugins/openai/__init__.py
File without changes
{videosdk_plugins_openai-0.0.30 → videosdk_plugins_openai-0.0.32}/videosdk/plugins/openai/realtime_api.py
RENAMED

@@ -100,15 +100,16 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
     def __init__(
         self,
         *,
+        api_key: str | None = None,
         model: str,
         config: OpenAIRealtimeConfig | None = None,
-        api_key: str | None = None,
         base_url: str | None = None,
     ) -> None:
         """
         Initialize OpenAI realtime model.
 
         Args:
+            api_key: OpenAI API key. If not provided, will attempt to read from OPENAI_API_KEY env var
             model: The OpenAI model identifier to use (e.g. 'gpt-4', 'gpt-3.5-turbo')
             config: Optional configuration object for customizing model behavior. Contains settings for:
                 - voice: Voice ID to use for audio output
@@ -117,7 +118,6 @@ class OpenAIRealtime(RealtimeBaseModel[OpenAIEventTypes]):
                 - input_audio_transcription: Settings for audio transcription
                 - tool_choice: How tools should be selected ('auto' or 'none')
                 - modalities: List of enabled modalities ('text', 'audio')
-            api_key: OpenAI API key. If not provided, will attempt to read from OPENAI_API_KEY env var
             base_url: Base URL for OpenAI API. Defaults to 'https://api.openai.com/v1'
 
         Raises: