intellema-vdk 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. intellema_vdk/__init__.py +67 -10
  2. intellema_vdk/config.py +14 -0
  3. intellema_vdk/providers/__init__.py +35 -0
  4. intellema_vdk/providers/livekit/__init__.py +19 -0
  5. intellema_vdk/providers/livekit/client.py +612 -0
  6. intellema_vdk/providers/livekit/exceptions.py +23 -0
  7. intellema_vdk/providers/protocols.py +33 -0
  8. intellema_vdk/providers/retell/__init__.py +17 -0
  9. intellema_vdk/providers/retell/client.py +468 -0
  10. intellema_vdk/providers/retell/exceptions.py +19 -0
  11. intellema_vdk/{retell_lib → providers/retell}/import_phone_number.py +1 -1
  12. intellema_vdk/stt/__init__.py +17 -0
  13. intellema_vdk/stt/client.py +482 -0
  14. intellema_vdk/stt/exceptions.py +19 -0
  15. intellema_vdk/tts/__init__.py +15 -0
  16. intellema_vdk/tts/__pycache__/__init__.cpython-312.pyc +0 -0
  17. intellema_vdk/tts/__pycache__/client.cpython-312.pyc +0 -0
  18. intellema_vdk/tts/__pycache__/exceptions.cpython-312.pyc +0 -0
  19. intellema_vdk/tts/__pycache__/providers.cpython-312.pyc +0 -0
  20. intellema_vdk/tts/client.py +541 -0
  21. intellema_vdk/tts/exceptions.py +15 -0
  22. intellema_vdk/tts/providers.py +293 -0
  23. intellema_vdk/utils/logger_config.py +41 -0
  24. intellema_vdk-0.2.2.dist-info/METADATA +311 -0
  25. intellema_vdk-0.2.2.dist-info/RECORD +29 -0
  26. {intellema_vdk-0.2.0.dist-info → intellema_vdk-0.2.2.dist-info}/WHEEL +1 -1
  27. intellema_vdk/livekit_lib/__init__.py +0 -3
  28. intellema_vdk/livekit_lib/client.py +0 -280
  29. intellema_vdk/retell_lib/retell_client.py +0 -248
  30. intellema_vdk/speech_lib/__init__.py +0 -2
  31. intellema_vdk/speech_lib/stt_client.py +0 -108
  32. intellema_vdk/speech_lib/tts_streamer.py +0 -188
  33. intellema_vdk-0.2.0.dist-info/METADATA +0 -221
  34. intellema_vdk-0.2.0.dist-info/RECORD +0 -14
  35. /intellema_vdk/{retell_lib/__init__.py → stt/providers.py} +0 -0
  36. {intellema_vdk-0.2.0.dist-info → intellema_vdk-0.2.2.dist-info}/licenses/LICENSE +0 -0
  37. {intellema_vdk-0.2.0.dist-info → intellema_vdk-0.2.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,482 @@
1
+ import os # Used for operating system dependent functionality, like checking file existence.
2
+ import json # Used for JSON serialization.
3
+ import logging # Used for logging events.
4
+ import subprocess # Used for installing packages
5
+ import sys # Used for sys operations
6
+ from pathlib import Path # Used for path operations.
7
+ from typing import Optional, Tuple, Any, List, Dict, Union, TYPE_CHECKING # Used for type hinting.
8
+
9
+ # httpx is a core dependency and should be available
10
+ import httpx # Modern asynchronous HTTP client.
11
+
12
+ # Lazy import OpenAI - only load when STTManager is instantiated
13
+ if TYPE_CHECKING:
14
+ from openai import APIError, AsyncOpenAI
15
+ else:
16
+ APIError = None
17
+ AsyncOpenAI = None
18
+
19
+ from ..config import get_env # Used to get environment variables.
20
+ from .exceptions import (
21
+ STTConfigurationError,
22
+ STTFileError,
23
+ STTTranscriptionError,
24
+ STTAgentError,
25
+ STTError
26
+ )
27
+
28
+ # Setup logger for this module.
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
class STTManager:
    """Manage Speech-to-Text (STT) operations using OpenAI's Whisper model.

    The manager transcribes a single audio file or every matching file in a
    folder, can persist the result as JSON, and can optionally POST the
    transcribed text to an external agent API.

    Attributes:
        _api_key (str): The OpenAI API key read via ``get_env``.
        _agent_api_url (Optional[str]): The URL of the agent API to post
            transcriptions to. Posting is skipped when this is not set.
        _agent_post_key (str): The JSON key under which the transcript is
            sent to the agent API.
        _openai_client (AsyncOpenAI): The asynchronous OpenAI API client.
        _http_client (httpx.AsyncClient): The asynchronous HTTP client used
            for requests to the agent API.

    Example:
        >>> async def main():
        ...     stt_manager = STTManager(agent_post_key="transcript")
        ...     try:
        ...         result, response = await stt_manager.transcribe_and_post("your/audio/file.wav")
        ...         print(f"Transcript: {result['text']}")
        ...         print(f"Response from agent: {response}")
        ...     except STTError as e:
        ...         print(f"An STT error occurred: {e}")
        ...     finally:
        ...         await stt_manager.close()
        ...
        >>> if __name__ == "__main__":
        ...     import asyncio
        ...     asyncio.run(main())
    """

    # Default supported audio extensions
    DEFAULT_BATCH_EXTENSIONS = [".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm"]

    def __init__(self, agent_post_key: str = "message") -> None:
        """Initialise the manager and its API clients.

        The OpenAI SDK is imported lazily on first instantiation. If the
        import fails, the constructor attempts to install ``openai>=1.0.0``
        with pip in the current interpreter and then imports it again.

        Args:
            agent_post_key (str): The key to use if posting transcriptions to
                an agent API. The transcript is sent under this key in the
                JSON payload. Defaults to "message".

        Raises:
            STTConfigurationError: If ``OPENAI_API_KEY`` is not configured, or
                if the OpenAI SDK is missing and could not be installed.
        """
        # The module keeps ``AsyncOpenAI``/``APIError`` as ``None`` placeholders
        # until the first manager is created; rebind them here.
        global AsyncOpenAI, APIError
        if AsyncOpenAI is None:
            try:
                from openai import APIError as _APIError, AsyncOpenAI as _AsyncOpenAI
                APIError = _APIError
                AsyncOpenAI = _AsyncOpenAI
            except ImportError:
                # NOTE(review): installing packages as a side effect of a
                # constructor is surprising for a library; kept for backward
                # compatibility, but consider failing fast instead.
                print("OpenAI SDK is not installed. Installing now...")
                print("Run: pip install intellema-vdk[stt]")
                try:
                    subprocess.check_call([sys.executable, "-m", "pip", "install", "openai>=1.0.0"])
                    from openai import APIError as _APIError, AsyncOpenAI as _AsyncOpenAI
                    APIError = _APIError
                    AsyncOpenAI = _AsyncOpenAI
                    print("✓ OpenAI SDK installed successfully!")
                except Exception as e:
                    raise STTConfigurationError(
                        "Failed to install openai. Please install manually:\n"
                        "  pip install intellema-vdk[stt]\n"
                        "or:\n"
                        "  pip install openai>=1.0.0"
                    ) from e

        self._api_key = get_env("OPENAI_API_KEY")
        if not self._api_key:
            raise STTConfigurationError(
                "OPENAI_API_KEY must be set in your .env file.")

        self._agent_api_url = get_env("AGENT_API_URL")
        if not self._agent_api_url:
            logger.warning(
                "AGENT_API_URL is not set in .env. Posting to agent will be disabled.")

        self._agent_post_key = agent_post_key
        self._openai_client = AsyncOpenAI(api_key=self._api_key)
        self._http_client = httpx.AsyncClient()

    async def close(self) -> None:
        """Release the HTTP and OpenAI clients.

        Call this when the manager is no longer needed. The OpenAI client is
        closed even if closing the HTTP client raises.
        """
        try:
            await self._http_client.aclose()
        finally:
            await self._openai_client.close()

    def _get_audio_files(self, folder_path: str, extensions: List[str], recursive: bool = False) -> List[str]:
        """Collect the audio files inside a folder.

        Matching is done on the file name ending and is case-insensitive, so
        ``clip.WAV`` is found for ``".wav"``. Each path is returned once even
        when ``extensions`` contains repeated or overlapping entries, and
        directories are never returned.

        Args:
            folder_path (str): The path to the folder.
            extensions (List[str]): File extensions to include
                (e.g., [".wav", ".mp3"]).
            recursive (bool): Whether to scan subfolders recursively.

        Returns:
            List[str]: Sorted list of matching file paths.
        """
        suffixes = tuple(ext.lower() for ext in extensions)
        if not suffixes:
            return []

        folder = Path(folder_path)
        candidates = folder.rglob("*") if recursive else folder.glob("*")
        # A set removes duplicates that overlapping extensions would produce.
        matches = {
            str(entry)
            for entry in candidates
            if entry.name.lower().endswith(suffixes) and entry.is_file()
        }
        return sorted(matches)

    async def _validate_audio(self, file_path: str, allowed_extensions: Optional[List[str]] = None) -> Tuple[bool, Optional[str]]:
        """Check that a path points at a usable audio file.

        Args:
            file_path (str): The path to the audio file.
            allowed_extensions (Optional[List[str]]): Allowed extensions,
                compared case-insensitively. If None or empty,
                DEFAULT_BATCH_EXTENSIONS is used.

        Returns:
            Tuple[bool, Optional[str]]: ``(True, None)`` when the file is
            valid, otherwise ``(False, error_message)``.
        """
        # Check if file exists
        if not os.path.exists(file_path):
            return False, f"File not found: {file_path}"

        # Check if it's a file (not a directory)
        if not os.path.isfile(file_path):
            return False, f"Path is not a file: {file_path}"

        # Check if file is readable
        if not os.access(file_path, os.R_OK):
            return False, f"File is not readable: {file_path}"

        # Check file extension. Both sides are lower-cased so that a caller
        # passing [".WAV"] does not reject every file.
        file_ext = Path(file_path).suffix.lower()
        valid_extensions = allowed_extensions if allowed_extensions else self.DEFAULT_BATCH_EXTENSIONS
        if file_ext not in {ext.lower() for ext in valid_extensions}:
            return False, f"Unsupported file format: {file_ext}. Allowed: {', '.join(valid_extensions)}"

        # Check file size (must be > 0)
        if os.path.getsize(file_path) == 0:
            return False, f"File is empty: {file_path}"

        return True, None

    def _write_output_to_file(self, output_path: str, data: Union[Dict, List[Dict]]) -> None:
        """Write transcription results to a JSON file.

        Missing parent directories are created first.

        Args:
            output_path (str): The path to the output file.
            data (Union[Dict, List[Dict]]): The data to write.

        Raises:
            STTFileError: If the directory cannot be created or the data
                cannot be written or serialised.
        """
        try:
            Path(output_path).parent.mkdir(parents=True, exist_ok=True)

            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)

            logger.info(f"Successfully wrote output to: {output_path}")
        except Exception as e:
            logger.error(f"Failed to write output to {output_path}: {e}")
            raise STTFileError(f"Failed to write output to {output_path}: {e}") from e

    async def transcribe_audio(
        self,
        path: str,
        model: str = "whisper-1",
        language: Optional[str] = None,
        temperature: Optional[float] = None,
        batch_process: bool = False,
        batch_extensions: Optional[List[str]] = None,
        recursive: bool = False,
        validate_audio: bool = True,
        output_file: Optional[str] = None
    ) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
        """Transcribe an audio file or a folder of audio files.

        Args:
            path (str): Either a single file path or a folder path when
                batch_process=True.
            model (str): The name of the Whisper model to use.
            language (Optional[str]): The language of the audio in ISO-639-1
                format (e.g., "en"). If None, it is not sent to the API.
            temperature (Optional[float]): The sampling temperature. If None,
                it is not sent to the API.
            batch_process (bool): If False, path must be a single audio file.
                If True, path is a folder and all matching files inside are
                transcribed.
            batch_extensions (Optional[List[str]]): In batch mode, which
                extensions to include (e.g., [".wav", ".mp3"]); defaults to
                DEFAULT_BATCH_EXTENSIONS. In single-file mode it is used as
                the allowed-extension list for validation.
            recursive (bool): In batch mode, whether to scan subfolders.
                Ignored (with a warning) in single-file mode.
            validate_audio (bool): Whether to validate audio files before
                transcription.
            output_file (Optional[str]): Path to save the results as JSON.

        Returns:
            Union[Dict[str, Any], List[Dict[str, Any]]]:
                - Single file: {"text": "...", "language": "...", "path": "..."}
                - Batch: [{"path": "...", "text": "...", "language": "...", "error": None}, ...]
              "language" echoes the ``language`` argument.

        Raises:
            STTFileError: If the file/folder is missing or invalid, or the
                output file cannot be written.
            STTTranscriptionError: If the OpenAI API reports an error
                (single-file mode; batch mode records it per file).
        """
        if batch_process:
            if not os.path.isdir(path):
                raise STTFileError(f"Batch processing requires a valid folder path, got: {path}")

            extensions = batch_extensions or self.DEFAULT_BATCH_EXTENSIONS
            result = await self._transcribe_batch(
                path, extensions, recursive, model, language, temperature, validate_audio
            )
        else:
            # Warn if batch_extensions is specified without batch_process
            if batch_extensions:
                logger.warning(
                    f"batch_extensions={batch_extensions} specified but batch_process=False. "
                    f"The extension filter will be used for validation."
                )

            # Warn if recursive is specified without batch_process
            if recursive:
                logger.warning(
                    "recursive=True specified but batch_process=False. "
                    "This parameter is ignored for single file processing."
                )

            result = await self._transcribe_single_file(
                path, model, language, temperature, validate_audio,
                allowed_extensions=batch_extensions
            )

        # Write to file if specified
        if output_file:
            self._write_output_to_file(output_file, result)

        return result

    async def _transcribe_batch(
        self,
        folder_path: str,
        extensions: List[str],
        recursive: bool,
        model: str,
        language: Optional[str],
        temperature: Optional[float],
        validate_audio: bool
    ) -> List[Dict[str, Any]]:
        """Transcribe every matching file in a folder, one after another.

        A failure on one file is recorded in that file's "error" field and
        does not stop the remaining files from being processed.

        Returns:
            List[Dict[str, Any]]: One {"path", "text", "language", "error"}
            entry per discovered file; empty when nothing matched.
        """
        audio_files = self._get_audio_files(folder_path, extensions, recursive)
        if not audio_files:
            logger.warning(f"No audio files found in {folder_path} with extensions {extensions}")
            return []

        logger.info(f"Found {len(audio_files)} audio files to process")
        results = []

        for file_path in audio_files:
            file_result = {
                "path": file_path,
                "text": None,
                "language": None,
                "error": None
            }

            try:
                error_msg = None
                if validate_audio:
                    _, error_msg = await self._validate_audio(file_path, extensions)

                if error_msg is not None:
                    file_result["error"] = error_msg
                    logger.warning(f"Validation failed for {file_path}: {error_msg}")
                else:
                    # Validation (if requested) already happened above.
                    single_result = await self._transcribe_single_file(
                        file_path, model, language, temperature, validate_audio=False
                    )
                    file_result["text"] = single_result["text"]
                    file_result["language"] = single_result.get("language")
            except Exception as e:
                # Deliberately broad: one bad file must not abort the batch.
                file_result["error"] = str(e)
                logger.error(f"Failed to transcribe {file_path}: {e}")

            results.append(file_result)

        return results

    async def _transcribe_single_file(
        self,
        file_path: str,
        model: str,
        language: Optional[str],
        temperature: Optional[float],
        validate_audio: bool = True,
        allowed_extensions: Optional[List[str]] = None
    ) -> Dict[str, Any]:
        """Transcribe a single audio file.

        Args:
            file_path (str): The path to the audio file.
            model (str): The Whisper model to use.
            language (Optional[str]): The language code; omitted from the
                request when falsy.
            temperature (Optional[float]): The sampling temperature; omitted
                from the request when None.
            validate_audio (bool): Whether to validate the audio file.
            allowed_extensions (Optional[List[str]]): Allowed file extensions
                for validation.

        Returns:
            Dict[str, Any]: {"text": "...", "language": "...", "path": "..."}
            where "language" is the ``language`` argument as passed in.

        Raises:
            STTFileError: If validation fails or the file cannot be opened.
            STTTranscriptionError: If the OpenAI API reports an error.
        """
        logger.info(f"Starting transcription for file: {file_path}")

        # Validate if needed
        if validate_audio:
            is_valid, error_msg = await self._validate_audio(file_path, allowed_extensions)
            if not is_valid:
                raise STTFileError(error_msg)

        # With validation disabled a missing/unreadable file surfaces here;
        # report it through the documented STTFileError.
        try:
            audio_file = open(file_path, "rb")
        except OSError as e:
            raise STTFileError(f"Could not open audio file {file_path}: {e}") from e

        with audio_file:
            transcription_params = {
                "model": model,
                "file": audio_file
            }
            if language:
                transcription_params["language"] = language
            if temperature is not None:
                transcription_params["temperature"] = temperature

            try:
                transcript = await self._openai_client.audio.transcriptions.create(**transcription_params)
            except APIError as e:
                raise STTTranscriptionError(f"OpenAI API error: {e}") from e

        logger.info(f"Successfully transcribed file: {file_path}")

        return {
            "text": transcript.text,
            "language": language,
            "path": file_path
        }

    async def transcribe_and_post(
        self,
        path: str,
        model: str = "whisper-1",
        language: Optional[str] = None,
        temperature: Optional[float] = None,
        batch_process: bool = False,
        batch_extensions: Optional[List[str]] = None,
        recursive: bool = False,
        validate_audio: bool = True,
        output_file: Optional[str] = None
    ) -> Tuple[Union[Dict[str, Any], List[Dict[str, Any]]], Optional[Any]]:
        """Transcribe an audio file or batch and post the text to the agent API.

        In batch mode the texts of all successfully transcribed files are
        posted together as a list; nothing is posted if none succeeded.

        Args:
            path (str): Either a single file path or a folder path when
                batch_process=True.
            model (str): The name of the Whisper model to use.
            language (Optional[str]): The language of the audio in ISO-639-1 format.
            temperature (Optional[float]): The sampling temperature.
            batch_process (bool): If False, path is a single file. If True,
                path is a folder.
            batch_extensions (Optional[List[str]]): File extensions to include
                in batch mode.
            recursive (bool): Whether to scan subfolders in batch mode.
            validate_audio (bool): Whether to validate audio files before
                transcription.
            output_file (Optional[str]): Path to save the results as JSON.

        Returns:
            A tuple containing:
                - Union[Dict[str, Any], List[Dict[str, Any]]]: Transcription result(s).
                - Optional[Any]: The JSON response from the agent API, or None
                  if posting is disabled or there was nothing to post.

        Raises:
            STTError: If transcription, file handling or posting fails. Any
                other exception is logged and re-raised unchanged.
        """
        try:
            # Transcribe the audio file(s)
            transcript_result = await self.transcribe_audio(
                path=path,
                model=model,
                language=language,
                temperature=temperature,
                batch_process=batch_process,
                batch_extensions=batch_extensions,
                recursive=recursive,
                validate_audio=validate_audio,
                output_file=output_file
            )

            response = None
            # Post the transcribed text to the agent API if the URL is configured
            if self._agent_api_url:
                if batch_process:
                    # For batch, post all successful transcriptions
                    texts = [r["text"] for r in transcript_result if r["text"] is not None]
                    if texts:
                        response = await self._post_to_agent(texts)
                else:
                    # For single file, post the text
                    response = await self._post_to_agent(transcript_result["text"])
            else:
                logger.info("AGENT_API_URL not set, skipping post to agent.")

            return transcript_result, response

        except STTError as e:
            logger.error(
                f"STT Error during processing of {path}: {e}", exc_info=True)
            raise
        except Exception as e:
            logger.error(
                f"An unexpected error occurred during processing of {path}: {e}", exc_info=True)
            raise

    async def _post_to_agent(self, data: Union[str, List[str]]) -> Any:
        """Post transcribed text to the agent API.

        The text is sent in a JSON payload under the configured key.

        Args:
            data (Union[str, List[str]]): The transcribed text or list of
                texts to post.

        Returns:
            Any: The decoded JSON response from the agent API.

        Raises:
            STTAgentError: If the HTTP request fails, the agent answers with
                a 4xx/5xx status, or the response body is not valid JSON.
        """
        payload = {self._agent_post_key: data}
        try:
            # NOTE(review): this logs the full transcript at INFO level;
            # confirm that is acceptable for the data being transcribed.
            logger.info(f"Posting to agent with payload: {payload}")
            response = await self._http_client.post(self._agent_api_url, json=payload)
            response.raise_for_status()  # Raises HTTPError for bad responses (4xx or 5xx)
            logger.info(
                f"Successfully posted to agent. Status: {response.status_code}")
            return response.json()
        except httpx.HTTPError as e:
            logger.error(f"Failed to post to agent API: {e}", exc_info=True)
            raise STTAgentError(f"Failed to post to agent API: {e}") from e
        except ValueError as e:
            # response.json() raises a ValueError subclass on a non-JSON body.
            logger.error(f"Agent API returned a non-JSON response: {e}", exc_info=True)
            raise STTAgentError(f"Agent API returned a non-JSON response: {e}") from e
@@ -0,0 +1,19 @@
1
class STTError(Exception):
    """Common base class for every error raised by the STT package.

    Catch this type to handle any STT failure with a single ``except``.
    """


class STTConfigurationError(STTError):
    """Signals missing or invalid configuration, such as API keys or URLs."""


class STTFileError(STTError):
    """Signals that an audio file could not be found or accessed."""


class STTTranscriptionError(STTError):
    """Signals a failure reported by the transcription service (OpenAI)."""


class STTAgentError(STTError):
    """Signals that posting to the agent API did not succeed."""
@@ -0,0 +1,15 @@
1
+ from .client import TTSStreamer
2
+ from .exceptions import (
3
+ TTSError,
4
+ TTSConfigurationError,
5
+ TTSStreamError,
6
+ TTSAPIError,
7
+ )
8
+
9
+ __all__ = [
10
+ "TTSStreamer",
11
+ "TTSError",
12
+ "TTSConfigurationError",
13
+ "TTSStreamError",
14
+ "TTSAPIError",
15
+ ]