intellema-vdk 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- intellema_vdk/__init__.py +67 -10
- intellema_vdk/config.py +14 -0
- intellema_vdk/providers/__init__.py +35 -0
- intellema_vdk/providers/livekit/__init__.py +19 -0
- intellema_vdk/providers/livekit/client.py +612 -0
- intellema_vdk/providers/livekit/exceptions.py +23 -0
- intellema_vdk/providers/protocols.py +33 -0
- intellema_vdk/providers/retell/__init__.py +17 -0
- intellema_vdk/providers/retell/client.py +468 -0
- intellema_vdk/providers/retell/exceptions.py +19 -0
- intellema_vdk/{retell_lib → providers/retell}/import_phone_number.py +1 -1
- intellema_vdk/stt/__init__.py +17 -0
- intellema_vdk/stt/client.py +482 -0
- intellema_vdk/stt/exceptions.py +19 -0
- intellema_vdk/tts/__init__.py +15 -0
- intellema_vdk/tts/__pycache__/__init__.cpython-312.pyc +0 -0
- intellema_vdk/tts/__pycache__/client.cpython-312.pyc +0 -0
- intellema_vdk/tts/__pycache__/exceptions.cpython-312.pyc +0 -0
- intellema_vdk/tts/__pycache__/providers.cpython-312.pyc +0 -0
- intellema_vdk/tts/client.py +541 -0
- intellema_vdk/tts/exceptions.py +15 -0
- intellema_vdk/tts/providers.py +293 -0
- intellema_vdk/utils/logger_config.py +41 -0
- intellema_vdk-0.2.2.dist-info/METADATA +311 -0
- intellema_vdk-0.2.2.dist-info/RECORD +29 -0
- {intellema_vdk-0.2.0.dist-info → intellema_vdk-0.2.2.dist-info}/WHEEL +1 -1
- intellema_vdk/livekit_lib/__init__.py +0 -3
- intellema_vdk/livekit_lib/client.py +0 -280
- intellema_vdk/retell_lib/retell_client.py +0 -248
- intellema_vdk/speech_lib/__init__.py +0 -2
- intellema_vdk/speech_lib/stt_client.py +0 -108
- intellema_vdk/speech_lib/tts_streamer.py +0 -188
- intellema_vdk-0.2.0.dist-info/METADATA +0 -221
- intellema_vdk-0.2.0.dist-info/RECORD +0 -14
- /intellema_vdk/{retell_lib/__init__.py → stt/providers.py} +0 -0
- {intellema_vdk-0.2.0.dist-info → intellema_vdk-0.2.2.dist-info}/licenses/LICENSE +0 -0
- {intellema_vdk-0.2.0.dist-info → intellema_vdk-0.2.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,482 @@
|
|
|
1
|
+
import os # Used for operating system dependent functionality, like checking file existence.
|
|
2
|
+
import json # Used for JSON serialization.
|
|
3
|
+
import logging # Used for logging events.
|
|
4
|
+
import subprocess # Used for installing packages
|
|
5
|
+
import sys # Used for sys operations
|
|
6
|
+
from pathlib import Path # Used for path operations.
|
|
7
|
+
from typing import Optional, Tuple, Any, List, Dict, Union, TYPE_CHECKING # Used for type hinting.
|
|
8
|
+
|
|
9
|
+
# httpx is a core dependency and should be available
|
|
10
|
+
import httpx # Modern asynchronous HTTP client.
|
|
11
|
+
|
|
12
|
+
# Lazy import OpenAI - only load when STTManager is instantiated
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from openai import APIError, AsyncOpenAI
|
|
15
|
+
else:
|
|
16
|
+
APIError = None
|
|
17
|
+
AsyncOpenAI = None
|
|
18
|
+
|
|
19
|
+
from ..config import get_env # Used to get environment variables.
|
|
20
|
+
from .exceptions import (
|
|
21
|
+
STTConfigurationError,
|
|
22
|
+
STTFileError,
|
|
23
|
+
STTTranscriptionError,
|
|
24
|
+
STTAgentError,
|
|
25
|
+
STTError
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
# Setup logger for this module.
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class STTManager:
|
|
33
|
+
"""Manages Speech-to-Text (STT) operations using OpenAI's Whisper model.
|
|
34
|
+
|
|
35
|
+
This class provides functionality to transcribe audio files and optionally
|
|
36
|
+
post the transcribed text to an external agent API.
|
|
37
|
+
|
|
38
|
+
Attributes:
|
|
39
|
+
_api_key (str): The OpenAI API key.
|
|
40
|
+
_agent_api_url (Optional[str]): The URL of the agent API to post
|
|
41
|
+
transcriptions to.
|
|
42
|
+
_agent_post_key (str): The key to use when posting to the agent API.
|
|
43
|
+
_openai_client (AsyncOpenAI): The asynchronous OpenAI API client.
|
|
44
|
+
_http_client (httpx.AsyncClient): The asynchronous HTTP client for
|
|
45
|
+
making requests to the agent API.
|
|
46
|
+
|
|
47
|
+
Example:
|
|
48
|
+
>>> async def main():
|
|
49
|
+
... stt_manager = STTManager(agent_post_key="transcript")
|
|
50
|
+
... try:
|
|
51
|
+
... result, response = await stt_manager.transcribe_and_post("your/audio/file.wav")
|
|
52
|
+
... print(f"Transcript: {result['text']}")
|
|
53
|
+
... print(f"Response from agent: {response}")
|
|
54
|
+
... except STTError as e:
|
|
55
|
+
... print(f"An STT error occurred: {e}")
|
|
56
|
+
... finally:
|
|
57
|
+
... await stt_manager.close()
|
|
58
|
+
...
|
|
59
|
+
>>> if __name__ == "__main__":
|
|
60
|
+
... import asyncio
|
|
61
|
+
... asyncio.run(main())
|
|
62
|
+
"""
|
|
63
|
+
|
|
64
|
+
# Default supported audio extensions
|
|
65
|
+
DEFAULT_BATCH_EXTENSIONS = [".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm"]
|
|
66
|
+
|
|
67
|
+
def __init__(self, agent_post_key: str = "message") -> None:
    """Initializes the STTManager.

    Loads the OpenAI SDK lazily (attempting a pip install if it is missing),
    reads configuration through ``get_env`` and creates the API clients.

    Args:
        agent_post_key (str): The key under which the transcript is placed
            in the JSON payload posted to the agent API. Defaults to
            "message".

    Raises:
        STTConfigurationError: If the OpenAI SDK can neither be imported nor
            installed, or if `OPENAI_API_KEY` is not set.
    """
    # Lazy import OpenAI - the SDK is only needed once a manager is created.
    global AsyncOpenAI, APIError
    if AsyncOpenAI is None:
        try:
            from openai import APIError as _APIError, AsyncOpenAI as _AsyncOpenAI
            APIError = _APIError
            AsyncOpenAI = _AsyncOpenAI
        except ImportError:
            # NOTE(review): installing packages at runtime is a surprising side
            # effect for a constructor; kept for backward compatibility.
            print("OpenAI SDK is not installed. Installing now...")
            print("Run: pip install intellema-vdk[stt]")
            try:
                subprocess.check_call([sys.executable, "-m", "pip", "install", "openai>=1.0.0"])
                # The import system caches directory listings; without this the
                # package installed a moment ago may not be found by the
                # import below.
                import importlib
                importlib.invalidate_caches()
                from openai import APIError as _APIError, AsyncOpenAI as _AsyncOpenAI
                APIError = _APIError
                AsyncOpenAI = _AsyncOpenAI
                print("✓ OpenAI SDK installed successfully!")
            except Exception as e:
                raise STTConfigurationError(
                    "Failed to install openai. Please install manually:\n"
                    " pip install intellema-vdk[stt]\n"
                    "or:\n"
                    " pip install openai>=1.0.0"
                ) from e

    self._api_key = get_env("OPENAI_API_KEY")
    if not self._api_key:
        raise STTConfigurationError(
            "OPENAI_API_KEY must be set in your .env file.")

    # The agent URL is optional: without it, transcripts are simply not posted.
    self._agent_api_url = get_env("AGENT_API_URL")
    if not self._agent_api_url:
        logger.warning(
            "AGENT_API_URL is not set in .env. Posting to agent will be disabled.")

    self._agent_post_key = agent_post_key
    self._openai_client = AsyncOpenAI(api_key=self._api_key)
    self._http_client = httpx.AsyncClient()
|
|
120
|
+
|
|
121
|
+
async def close(self) -> None:
    """Cleans up resources used by the STTManager.

    Call this when the manager is no longer needed. Both the HTTP client and
    the OpenAI client are closed; the OpenAI client is closed even if closing
    the HTTP client raises, so one failure cannot leak the other client.
    """
    try:
        await self._http_client.aclose()
    finally:
        await self._openai_client.close()
|
|
129
|
+
|
|
130
|
+
def _get_audio_files(self, folder_path: str, extensions: List[str], recursive: bool = False) -> List[str]:
|
|
131
|
+
"""Gets all audio files from a folder.
|
|
132
|
+
|
|
133
|
+
Args:
|
|
134
|
+
folder_path (str): The path to the folder.
|
|
135
|
+
extensions (List[str]): List of file extensions to include (e.g., [".wav", ".mp3"]).
|
|
136
|
+
recursive (bool): Whether to scan subfolders recursively.
|
|
137
|
+
|
|
138
|
+
Returns:
|
|
139
|
+
List[str]: List of file paths.
|
|
140
|
+
"""
|
|
141
|
+
folder = Path(folder_path)
|
|
142
|
+
audio_files = []
|
|
143
|
+
|
|
144
|
+
if recursive:
|
|
145
|
+
for ext in extensions:
|
|
146
|
+
audio_files.extend([str(f) for f in folder.rglob(f"*{ext}")])
|
|
147
|
+
else:
|
|
148
|
+
for ext in extensions:
|
|
149
|
+
audio_files.extend([str(f) for f in folder.glob(f"*{ext}")])
|
|
150
|
+
|
|
151
|
+
return sorted(audio_files)
|
|
152
|
+
|
|
153
|
+
async def _validate_audio(self, file_path: str, allowed_extensions: Optional[List[str]] = None) -> Tuple[bool, Optional[str]]:
|
|
154
|
+
"""Validates an audio file.
|
|
155
|
+
|
|
156
|
+
Args:
|
|
157
|
+
file_path (str): The path to the audio file.
|
|
158
|
+
allowed_extensions (Optional[List[str]]): List of allowed extensions. If None,
|
|
159
|
+
uses DEFAULT_BATCH_EXTENSIONS.
|
|
160
|
+
|
|
161
|
+
Returns:
|
|
162
|
+
Tuple[bool, Optional[str]]: A tuple of (is_valid, error_message).
|
|
163
|
+
If valid, returns (True, None). If invalid, returns (False, error_message).
|
|
164
|
+
"""
|
|
165
|
+
# Check if file exists
|
|
166
|
+
if not os.path.exists(file_path):
|
|
167
|
+
return False, f"File not found: {file_path}"
|
|
168
|
+
|
|
169
|
+
# Check if it's a file (not a directory)
|
|
170
|
+
if not os.path.isfile(file_path):
|
|
171
|
+
return False, f"Path is not a file: {file_path}"
|
|
172
|
+
|
|
173
|
+
# Check if file is readable
|
|
174
|
+
if not os.access(file_path, os.R_OK):
|
|
175
|
+
return False, f"File is not readable: {file_path}"
|
|
176
|
+
|
|
177
|
+
# Check file extension
|
|
178
|
+
file_ext = Path(file_path).suffix.lower()
|
|
179
|
+
valid_extensions = allowed_extensions if allowed_extensions else self.DEFAULT_BATCH_EXTENSIONS
|
|
180
|
+
if file_ext not in valid_extensions:
|
|
181
|
+
return False, f"Unsupported file format: {file_ext}. Allowed: {', '.join(valid_extensions)}"
|
|
182
|
+
|
|
183
|
+
# Check file size (must be > 0)
|
|
184
|
+
file_size = os.path.getsize(file_path)
|
|
185
|
+
if file_size == 0:
|
|
186
|
+
return False, f"File is empty: {file_path}"
|
|
187
|
+
|
|
188
|
+
return True, None
|
|
189
|
+
|
|
190
|
+
def _write_output_to_file(self, output_path: str, data: Union[Dict, List[Dict]]) -> None:
    """Serializes transcription results to a JSON file.

    Missing parent directories are created first. The JSON is written as
    UTF-8 with a two-space indent and without ASCII escaping.

    Args:
        output_path (str): Destination file path.
        data (Union[Dict, List[Dict]]): The result(s) to serialize.

    Raises:
        STTFileError: If anything goes wrong while preparing or writing the file.
    """
    try:
        destination_dir = Path(output_path).parent
        destination_dir.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w', encoding='utf-8') as sink:
            json.dump(data, sink, indent=2, ensure_ascii=False)

        logger.info(f"Successfully wrote output to: {output_path}")
    except Exception as exc:
        failure = f"Failed to write output to {output_path}: {exc}"
        logger.error(failure)
        raise STTFileError(failure) from exc
|
|
208
|
+
|
|
209
|
+
async def transcribe_audio(
    self,
    path: str,
    model: str = "whisper-1",
    language: Optional[str] = None,
    temperature: Optional[float] = None,
    batch_process: bool = False,
    batch_extensions: Optional[List[str]] = None,
    recursive: bool = False,
    validate_audio: bool = True,
    output_file: Optional[str] = None
) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
    """Transcribes one audio file, or every matching file in a folder.

    Args:
        path (str): A single file path, or a folder path when batch_process=True.
        model (str): Name of the transcription model passed to the OpenAI API.
        language (Optional[str]): Language code forwarded to the API when set
            (e.g., "en"). Omitted from the request when None.
        temperature (Optional[float]): Sampling temperature forwarded to the
            API when not None.
        batch_process (bool): False to treat `path` as one file; True to treat
            it as a folder and transcribe the files found inside.
        batch_extensions (Optional[List[str]]): Extensions to include in batch
            mode; defaults to DEFAULT_BATCH_EXTENSIONS. In single-file mode it
            is used as the validation filter.
        recursive (bool): In batch mode, whether subfolders are scanned.
            Ignored (with a warning) in single-file mode.
        validate_audio (bool): Whether files are validated before transcription.
        output_file (Optional[str]): If set, the result is also written there as JSON.

    Returns:
        Union[Dict[str, Any], List[Dict[str, Any]]]:
            - Single file: {"text": "...", "language": "...", "path": "..."}
            - Batch: [{"path": "...", "text": "...", "language": "...", "error": None}, ...]
              A file that fails keeps text=None and carries the error message.

    Raises:
        STTFileError: If batch mode is requested on a non-folder path, or
            single-file validation fails.
        STTTranscriptionError: If the OpenAI API fails in single-file mode.
    """
    # ---- Single file -----------------------------------------------------
    if not batch_process:
        # Batch-only options are tolerated here but the caller is told.
        if batch_extensions:
            logger.warning(
                f"batch_extensions={batch_extensions} specified but batch_process=False. "
                f"The extension filter will be used for validation."
            )
        if recursive:
            logger.warning(
                "recursive=True specified but batch_process=False. "
                "This parameter is ignored for single file processing."
            )

        single = await self._transcribe_single_file(
            path, model, language, temperature, validate_audio,
            allowed_extensions=batch_extensions
        )
        if output_file:
            self._write_output_to_file(output_file, single)
        return single

    # ---- Batch -----------------------------------------------------------
    if not os.path.isdir(path):
        raise STTFileError(f"Batch processing requires a valid folder path, got: {path}")

    wanted = batch_extensions or self.DEFAULT_BATCH_EXTENSIONS
    discovered = self._get_audio_files(path, wanted, recursive)

    if discovered:
        logger.info(f"Found {len(discovered)} audio files to process")
    else:
        logger.warning(f"No audio files found in {path} with extensions {wanted}")

    batch = []
    for audio_path in discovered:
        entry = {"path": audio_path, "text": None, "language": None, "error": None}

        # One bad file must not abort the batch: failures are recorded per entry.
        try:
            if validate_audio:
                passed, reason = await self._validate_audio(audio_path, wanted)
                if not passed:
                    entry["error"] = reason
                    logger.warning(f"Validation failed for {audio_path}: {reason}")
                    batch.append(entry)
                    continue

            # Validation already happened above, so it is skipped in the helper.
            outcome = await self._transcribe_single_file(
                audio_path, model, language, temperature, validate_audio=False
            )
            entry["text"] = outcome["text"]
            entry["language"] = outcome.get("language")
        except Exception as exc:
            entry["error"] = str(exc)
            logger.error(f"Failed to transcribe {audio_path}: {exc}")

        batch.append(entry)

    if output_file:
        self._write_output_to_file(output_file, batch)
    return batch
|
|
328
|
+
|
|
329
|
+
async def _transcribe_single_file(
    self,
    file_path: str,
    model: str,
    language: Optional[str],
    temperature: Optional[float],
    validate_audio: bool = True,
    allowed_extensions: Optional[List[str]] = None
) -> Dict[str, Any]:
    """Transcribes a single audio file.

    Args:
        file_path (str): The path to the audio file.
        model (str): The model name sent to the transcription API.
        language (Optional[str]): The language code; sent only when truthy.
        temperature (Optional[float]): The sampling temperature; sent only
            when not None.
        validate_audio (bool): Whether to validate the audio file first.
        allowed_extensions (Optional[List[str]]): Allowed file extensions for validation.

    Returns:
        Dict[str, Any]: {"text": "...", "language": "...", "path": "..."}.
            "language" echoes the `language` argument (None when it was not
            supplied); it is not a detected language.

    Raises:
        STTFileError: If validation fails or the file cannot be opened.
        STTTranscriptionError: If the OpenAI API call fails.
    """
    logger.info(f"Starting transcription for file: {file_path}")

    # Validate if needed
    if validate_audio:
        is_valid, error_msg = await self._validate_audio(file_path, allowed_extensions)
        if not is_valid:
            raise STTFileError(error_msg)

    # With validation disabled (or a file vanishing after validation) open()
    # can still fail; surface that as the documented STTFileError rather than
    # a bare OSError.
    try:
        audio_file = open(file_path, "rb")
    except OSError as e:
        raise STTFileError(f"Cannot open audio file {file_path}: {e}") from e

    with audio_file:
        transcription_params = {
            "model": model,
            "file": audio_file
        }
        # Optional parameters are only sent when provided so the API applies
        # its own defaults otherwise.
        if language:
            transcription_params["language"] = language
        if temperature is not None:
            transcription_params["temperature"] = temperature

        try:
            transcript = await self._openai_client.audio.transcriptions.create(**transcription_params)
        except APIError as e:
            raise STTTranscriptionError(f"OpenAI API error: {e}") from e

    logger.info(f"Successfully transcribed file: {file_path}")

    return {
        "text": transcript.text,
        "language": language,
        "path": file_path
    }
|
|
385
|
+
|
|
386
|
+
async def transcribe_and_post(
    self,
    path: str,
    model: str = "whisper-1",
    language: Optional[str] = None,
    temperature: Optional[float] = None,
    batch_process: bool = False,
    batch_extensions: Optional[List[str]] = None,
    recursive: bool = False,
    validate_audio: bool = True,
    output_file: Optional[str] = None
) -> Tuple[Union[Dict[str, Any], List[Dict[str, Any]]], Optional[Any]]:
    """Runs a transcription and forwards the text to the agent API.

    All arguments are passed straight through to `transcribe_audio`.

    Args:
        path (str): A single file path, or a folder path when batch_process=True.
        model (str): Name of the transcription model.
        language (Optional[str]): Language code of the audio.
        temperature (Optional[float]): Sampling temperature.
        batch_process (bool): False for a single file, True for a folder.
        batch_extensions (Optional[List[str]]): Extensions to include in batch mode.
        recursive (bool): Whether batch mode scans subfolders.
        validate_audio (bool): Whether files are validated before transcription.
        output_file (Optional[str]): Where to save the results as JSON, if set.

    Returns:
        A tuple of:
            - the transcription result (dict) or results (list of dicts);
            - the agent API's JSON response, or None when no agent URL is
              configured or a batch produced no successful transcript.

    Raises:
        STTError: If transcription or posting fails. Any other exception is
            logged and re-raised unchanged.
    """
    try:
        transcription = await self.transcribe_audio(
            path=path,
            model=model,
            language=language,
            temperature=temperature,
            batch_process=batch_process,
            batch_extensions=batch_extensions,
            recursive=recursive,
            validate_audio=validate_audio,
            output_file=output_file
        )

        agent_reply = None
        if not self._agent_api_url:
            logger.info("AGENT_API_URL not set, skipping post to agent.")
        elif batch_process:
            # Only entries that actually produced text are forwarded.
            successful = [item["text"] for item in transcription if item["text"] is not None]
            if successful:
                agent_reply = await self._post_to_agent(successful)
        else:
            agent_reply = await self._post_to_agent(transcription["text"])

        return transcription, agent_reply

    except STTError as e:
        logger.error(
            f"STT Error during processing of {path}: {e}", exc_info=True)
        raise
    except Exception as e:
        logger.error(
            f"An unexpected error occurred during processing of {path}: {e}", exc_info=True)
        raise
|
|
457
|
+
|
|
458
|
+
async def _post_to_agent(self, data: Union[str, List[str]]) -> Any:
    """Posts the transcribed text to the agent API.

    The text is sent in a JSON payload under the configured key.

    Args:
        data (Union[str, List[str]]): The transcribed text or list of texts to post.

    Returns:
        Any: The decoded JSON response from the agent API.

    Raises:
        STTAgentError: If the HTTP request fails, the agent answers with an
            error status, or the response body is not valid JSON.
    """
    payload = {self._agent_post_key: data}
    try:
        # NOTE(review): this logs the full transcript at INFO level; consider
        # whether that is acceptable for sensitive audio.
        logger.info(f"Posting to agent with payload: {payload}")
        response = await self._http_client.post(self._agent_api_url, json=payload)
        response.raise_for_status()  # Raises HTTPError for bad responses (4xx or 5xx)
        logger.info(
            f"Successfully posted to agent. Status: {response.status_code}")
    except httpx.HTTPError as e:
        logger.error(f"Failed to post to agent API: {e}", exc_info=True)
        raise STTAgentError(f"Failed to post to agent API: {e}") from e

    # A successful status does not guarantee a JSON body; decoding errors are
    # ValueError subclasses, not httpx.HTTPError, so handle them explicitly.
    try:
        return response.json()
    except ValueError as e:
        logger.error(f"Agent API returned a non-JSON response: {e}", exc_info=True)
        raise STTAgentError(f"Agent API returned a non-JSON response: {e}") from e
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
class STTError(Exception):
    """Root of the STT exception hierarchy; catch it to handle any STT failure."""


class STTConfigurationError(STTError):
    """Signals missing or invalid configuration, such as API keys or URLs."""


class STTFileError(STTError):
    """Signals that an audio file could not be found or accessed."""


class STTTranscriptionError(STTError):
    """Signals a failure reported by the transcription service (OpenAI)."""


class STTAgentError(STTError):
    """Signals that posting to the agent API failed."""
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from .client import TTSStreamer
|
|
2
|
+
from .exceptions import (
|
|
3
|
+
TTSError,
|
|
4
|
+
TTSConfigurationError,
|
|
5
|
+
TTSStreamError,
|
|
6
|
+
TTSAPIError,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"TTSStreamer",
|
|
11
|
+
"TTSError",
|
|
12
|
+
"TTSConfigurationError",
|
|
13
|
+
"TTSStreamError",
|
|
14
|
+
"TTSAPIError",
|
|
15
|
+
]
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|