intellema-vdk 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- intellema_vdk/__init__.py +67 -10
- intellema_vdk/config.py +14 -0
- intellema_vdk/providers/__init__.py +35 -0
- intellema_vdk/providers/livekit/__init__.py +19 -0
- intellema_vdk/providers/livekit/client.py +612 -0
- intellema_vdk/providers/livekit/exceptions.py +23 -0
- intellema_vdk/providers/protocols.py +33 -0
- intellema_vdk/providers/retell/__init__.py +17 -0
- intellema_vdk/providers/retell/client.py +468 -0
- intellema_vdk/providers/retell/exceptions.py +19 -0
- intellema_vdk/{retell_lib → providers/retell}/import_phone_number.py +1 -1
- intellema_vdk/stt/__init__.py +17 -0
- intellema_vdk/stt/client.py +482 -0
- intellema_vdk/stt/exceptions.py +19 -0
- intellema_vdk/tts/__init__.py +15 -0
- intellema_vdk/tts/__pycache__/__init__.cpython-312.pyc +0 -0
- intellema_vdk/tts/__pycache__/client.cpython-312.pyc +0 -0
- intellema_vdk/tts/__pycache__/exceptions.cpython-312.pyc +0 -0
- intellema_vdk/tts/__pycache__/providers.cpython-312.pyc +0 -0
- intellema_vdk/tts/client.py +541 -0
- intellema_vdk/tts/exceptions.py +15 -0
- intellema_vdk/tts/providers.py +293 -0
- intellema_vdk/utils/logger_config.py +41 -0
- intellema_vdk-0.2.2.dist-info/METADATA +311 -0
- intellema_vdk-0.2.2.dist-info/RECORD +29 -0
- {intellema_vdk-0.2.1.dist-info → intellema_vdk-0.2.2.dist-info}/WHEEL +1 -1
- intellema_vdk/__pycache__/__init__.cpython-312.pyc +0 -0
- intellema_vdk/livekit_lib/__init__.py +0 -3
- intellema_vdk/livekit_lib/__pycache__/__init__.cpython-312.pyc +0 -0
- intellema_vdk/livekit_lib/__pycache__/client.cpython-312.pyc +0 -0
- intellema_vdk/livekit_lib/client.py +0 -280
- intellema_vdk/retell_lib/__pycache__/__init__.cpython-312.pyc +0 -0
- intellema_vdk/retell_lib/__pycache__/retell_client.cpython-312.pyc +0 -0
- intellema_vdk/retell_lib/retell_client.py +0 -248
- intellema_vdk/speech_lib/__init__.py +0 -2
- intellema_vdk/speech_lib/__pycache__/__init__.cpython-312.pyc +0 -0
- intellema_vdk/speech_lib/__pycache__/stt_client.cpython-312.pyc +0 -0
- intellema_vdk/speech_lib/__pycache__/tts_streamer.cpython-312.pyc +0 -0
- intellema_vdk/speech_lib/stt_client.py +0 -110
- intellema_vdk/speech_lib/tts_streamer.py +0 -188
- intellema_vdk-0.2.1.dist-info/METADATA +0 -221
- intellema_vdk-0.2.1.dist-info/RECORD +0 -22
- /intellema_vdk/{retell_lib/__init__.py → stt/providers.py} +0 -0
- {intellema_vdk-0.2.1.dist-info → intellema_vdk-0.2.2.dist-info}/licenses/LICENSE +0 -0
- {intellema_vdk-0.2.1.dist-info → intellema_vdk-0.2.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
"""TTS provider implementations for Together AI and OpenAI."""
|
|
2
|
+
|
|
3
|
+
from typing import Iterator, Protocol, runtime_checkable, Literal, TypedDict
|
|
4
|
+
import logging
|
|
5
|
+
from ..config import (
|
|
6
|
+
WAV_HEADER_SIZE
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
# Type definitions for provider-specific configurations.
#
# The accepted model/voice identifiers are expressed as Literal types so a
# static type checker can flag unsupported values at the call site.

TogetherTTSModel = Literal["canopylabs/orpheus-3b-0.1-ft"]
TogetherTTSVoice = Literal["tara"]

OpenAITTSModel = Literal["tts-1", "tts-1-hd"]
OpenAITTSVoice = Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"]


class TogetherTTSConfig(TypedDict, total=False):
    """Optional keyword configuration accepted by the Together AI TTS provider."""
    model: TogetherTTSModel
    voice: TogetherTTSVoice


class OpenAITTSConfig(TypedDict, total=False):
    """Optional keyword configuration accepted by the OpenAI TTS provider."""
    model: OpenAITTSModel
    voice: OpenAITTSVoice
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@runtime_checkable
class TTSProvider(Protocol):
    """
    Structural interface every Text-to-Speech provider must satisfy.

    A provider is anything exposing a ``stream`` method that turns text
    into an iterator of raw audio bytes; ``runtime_checkable`` lets callers
    verify conformance with ``isinstance`` at runtime (method presence only).
    """

    def stream(self, text: str) -> Iterator[bytes]:
        """
        Convert *text* to speech, yielding audio incrementally.

        Args:
            text: The text to synthesize.

        Returns:
            An iterator over raw audio chunks (PCM bytes).

        Raises:
            Exception: Propagated from the underlying TTS API on failure.
        """
        ...
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class TogetherTTSProvider:
    """
    Together AI TTS provider implementation.

    Uses the Together API to generate speech from text with the Orpheus model.
    Supports streaming audio generation with low latency.

    Attributes:
        model: The TTS model identifier.
        voice: The voice identifier for speech generation.
        client: The Together API client instance.

    Example:
        >>> provider = TogetherTTSProvider(
        ...     api_key="your-api-key",
        ...     model="canopylabs/orpheus-3b-0.1-ft",
        ...     voice="tara"
        ... )
        >>> for audio_chunk in provider.stream("Hello world"):
        ...     # Process audio chunk
        ...     pass
    """

    def __init__(
        self,
        api_key: str,
        model: TogetherTTSModel = "canopylabs/orpheus-3b-0.1-ft",
        voice: TogetherTTSVoice = "tara"
    ) -> None:
        """
        Initialize the Together TTS provider.

        Args:
            api_key: Together API key for authentication.
            model: The TTS model to use. Currently supports:
                - "canopylabs/orpheus-3b-0.1-ft" (default)
            voice: The voice to use for speech generation. Currently supports:
                - "tara" (default)

        Raises:
            ImportError: If the together package is not installed.
        """
        try:
            from together import Together
        except ImportError as e:
            # Deliberately no automatic "pip install" here: running pip from
            # inside library code mutates the user's environment at runtime,
            # fails in read-only/managed environments, and is a supply-chain
            # hazard. Fail fast with clear instructions instead.
            raise ImportError(
                "The 'together' package is required for TogetherTTSProvider. "
                "Install it with:\n"
                "  pip install intellema-vdk[tts]\n"
                "or:\n"
                "  pip install together>=1.0.0"
            ) from e

        self.api_key = api_key
        self.model = model
        self.voice = voice
        self.client = Together(api_key=api_key)

    def stream(self, text: str) -> Iterator[bytes]:
        """
        Stream audio from the Together API.

        Args:
            text: The text to convert to speech.

        Yields:
            Raw audio bytes in PCM format (any WAV header stripped).

        Raises:
            Exception: Propagated after logging if the API call fails.
        """
        try:
            response = self.client.audio.speech.create(
                model=self.model,
                input=text,
                voice=self.voice,
                stream=True,
                response_format="raw",
                response_encoding="pcm_s16le",
            )
            # The Together API returns a generator; each chunk may be a
            # (metadata, data) tuple, raw bytes, or an object wrapper.
            for chunk in response:
                yield from self._extract_pcm(chunk)
        except Exception as e:
            logger.error("Together TTS stream error: %s", e, exc_info=True)
            raise

    def _extract_pcm(self, chunk: object) -> Iterator[bytes]:
        """Yield non-empty PCM payloads from one chunk of the API response.

        Handles the chunk shapes observed from the Together SDK:
        (metadata, data) tuples — where data is bytes or an iterable of
        bytes/response objects — plain bytes, and objects exposing a
        ``content`` attribute.
        """
        if isinstance(chunk, tuple):
            if len(chunk) > 1:
                payload = chunk[1]  # tuple format is (metadata, data)
                if isinstance(payload, (bytes, bytearray)):
                    yield from self._emit(payload)
                else:
                    try:
                        for sub_chunk in payload:
                            if isinstance(sub_chunk, (bytes, bytearray)):
                                yield from self._emit(sub_chunk)
                            elif getattr(sub_chunk, "data", None):
                                # TogetherResponse objects carry a data attribute
                                yield from self._emit(sub_chunk.data)
                            elif getattr(sub_chunk, "content", None):
                                yield from self._emit(sub_chunk.content)
                    except TypeError as te:
                        logger.warning(
                            "Non-iterable sub-iterator: %s, error: %s",
                            type(payload), te,
                        )
        elif isinstance(chunk, (bytes, bytearray)):
            # Direct bytes - this is what we expect
            yield from self._emit(chunk)
        elif hasattr(chunk, "content"):
            # Object with content attribute
            yield from self._emit(chunk.content)

    def _emit(self, audio_data: bytes) -> Iterator[bytes]:
        """Strip any WAV header and yield the payload if anything remains."""
        processed = self._strip_wav_header(audio_data)
        if len(processed) > 0:
            yield processed

    def _strip_wav_header(self, audio_data: bytes) -> bytes:
        """Remove WAV header if present, returning raw PCM data."""
        if len(audio_data) >= WAV_HEADER_SIZE and audio_data[:4] == b"RIFF":
            return audio_data[WAV_HEADER_SIZE:]
        return audio_data
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
class OpenAITTSProvider:
    """
    OpenAI TTS provider implementation.

    Uses the OpenAI API to generate high-quality speech from text.
    Supports multiple voices and quality levels.

    Attributes:
        model: The TTS model identifier.
        voice: The voice identifier for speech generation.
        client: The OpenAI API client instance.

    Example:
        >>> provider = OpenAITTSProvider(
        ...     api_key="your-api-key",
        ...     model="tts-1-hd",
        ...     voice="nova"
        ... )
        >>> for audio_chunk in provider.stream("Hello world"):
        ...     # Process audio chunk
        ...     pass
    """

    def __init__(
        self,
        api_key: str,
        model: OpenAITTSModel = "tts-1",
        voice: OpenAITTSVoice = "alloy"
    ) -> None:
        """
        Initialize the OpenAI TTS provider.

        Args:
            api_key: OpenAI API key for authentication.
            model: The TTS model to use:
                - "tts-1": Standard quality, lower latency (default)
                - "tts-1-hd": High definition quality, higher latency
            voice: The voice to use for speech generation:
                - "alloy": Neutral and balanced (default)
                - "echo": Male voice
                - "fable": British accent
                - "onyx": Deep and authoritative
                - "nova": Energetic and youthful
                - "shimmer": Warm and expressive

        Raises:
            ImportError: If the openai package is not installed.
        """
        try:
            from openai import OpenAI
        except ImportError as e:
            # Deliberately no automatic "pip install" here: running pip from
            # inside library code mutates the user's environment at runtime,
            # fails in read-only/managed environments, and is a supply-chain
            # hazard. Fail fast with clear instructions instead.
            raise ImportError(
                "The 'openai' package is required for OpenAITTSProvider. "
                "Install it with:\n"
                "  pip install intellema-vdk[tts]\n"
                "or:\n"
                "  pip install openai>=1.0.0"
            ) from e

        self.api_key = api_key
        self.model = model
        self.voice = voice
        self.client = OpenAI(api_key=api_key)

    def stream(self, text: str) -> Iterator[bytes]:
        """
        Stream audio from the OpenAI API.

        Args:
            text: The text to convert to speech.

        Yields:
            Raw audio bytes in PCM format.

        Raises:
            Exception: Propagated after logging if the API call fails.
        """
        try:
            # NOTE(review): audio.speech.create buffers the response before
            # iter_bytes is consumed; for true incremental streaming the SDK's
            # client.audio.speech.with_streaming_response.create(...) context
            # manager should be used — confirm against the installed openai
            # SDK version before switching.
            response = self.client.audio.speech.create(
                model=self.model,
                voice=self.voice,
                input=text,
                response_format="pcm"  # Request PCM directly, no decode needed
            )

            for chunk in response.iter_bytes(chunk_size=4096):
                if chunk:
                    yield chunk

        except Exception as e:
            logger.error("OpenAI TTS stream error: %s", e, exc_info=True)
            raise
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import logging # Standard library for logging events.
|
|
2
|
+
import sys # Provides access to system-specific parameters and functions.
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def setup_logging(
    log_level: int = logging.INFO,
    log_format: str = "%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    date_format: str = "%Y-%m-%d %H:%M:%S",
) -> None:
    """Configure console logging for the whole application.

    Intended to be invoked exactly once, early in program startup. It wires
    the root logger to emit to standard output with a uniform message and
    timestamp format; every logger created afterwards inherits the setup.
    Note that ``logging.basicConfig`` is a no-op if the root logger already
    has handlers attached.

    Args:
        log_level (int): Threshold for the root logger, e.g.
            ``logging.DEBUG``, ``logging.INFO``, ``logging.WARNING``,
            ``logging.ERROR``. Defaults to ``logging.INFO``.
        log_format (str): Format string applied to every record. The default
            includes timestamp, level, and logger name.
        date_format (str): ``strftime``-style format for the timestamp.
            Defaults to "%Y-%m-%d %H:%M:%S".

    Example:
        >>> import logging
        >>> # Set up logging with a DEBUG level
        >>> setup_logging(log_level=logging.DEBUG)
        >>>
        >>> # Now, any logger will inherit this configuration
        >>> logger = logging.getLogger("my_app")
        >>> logger.debug("This is a debug message.")
        >>> logger.info("This is an info message.")
    """
    logging.basicConfig(
        stream=sys.stdout,  # Send log output to stdout rather than stderr.
        level=log_level,
        format=log_format,
        datefmt=date_format,
    )
|
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: intellema-vdk
|
|
3
|
+
Version: 0.2.2
|
|
4
|
+
Summary: A Voice Development Kit for different Voice Agent Platforms
|
|
5
|
+
Author: Intellema
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 Intellema
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Classifier: Programming Language :: Python :: 3
|
|
29
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
30
|
+
Classifier: Operating System :: OS Independent
|
|
31
|
+
Requires-Python: >=3.8
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
License-File: LICENSE
|
|
34
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
35
|
+
Requires-Dist: requests>=2.31.0
|
|
36
|
+
Requires-Dist: httpx>=0.24.0
|
|
37
|
+
Provides-Extra: livekit
|
|
38
|
+
Requires-Dist: livekit-api>=1.1.0; extra == "livekit"
|
|
39
|
+
Requires-Dist: boto3>=1.28.0; extra == "livekit"
|
|
40
|
+
Provides-Extra: retell
|
|
41
|
+
Requires-Dist: retell-sdk>=2.0.0; extra == "retell"
|
|
42
|
+
Requires-Dist: twilio>=8.0.0; extra == "retell"
|
|
43
|
+
Requires-Dist: boto3>=1.28.0; extra == "retell"
|
|
44
|
+
Provides-Extra: stt
|
|
45
|
+
Requires-Dist: openai>=1.0.0; extra == "stt"
|
|
46
|
+
Provides-Extra: tts
|
|
47
|
+
Requires-Dist: together>=1.0.0; extra == "tts"
|
|
48
|
+
Requires-Dist: openai>=1.0.0; extra == "tts"
|
|
49
|
+
Provides-Extra: audio
|
|
50
|
+
Requires-Dist: pyaudio>=0.2.13; extra == "audio"
|
|
51
|
+
Provides-Extra: all
|
|
52
|
+
Requires-Dist: livekit-api>=1.1.0; extra == "all"
|
|
53
|
+
Requires-Dist: retell-sdk>=2.0.0; extra == "all"
|
|
54
|
+
Requires-Dist: twilio>=8.0.0; extra == "all"
|
|
55
|
+
Requires-Dist: boto3>=1.28.0; extra == "all"
|
|
56
|
+
Requires-Dist: openai>=1.0.0; extra == "all"
|
|
57
|
+
Requires-Dist: together>=1.0.0; extra == "all"
|
|
58
|
+
Requires-Dist: pyaudio>=0.2.13; extra == "all"
|
|
59
|
+
Dynamic: license-file
|
|
60
|
+
|
|
61
|
+
# Intellema VDK
|
|
62
|
+
|
|
63
|
+
Intellema VDK is a unified Voice Development Kit that simplifies integration with voice agent platforms like LiveKit and Retell AI. Build scalable voice applications with a consistent, provider-agnostic API.
|
|
64
|
+
|
|
65
|
+
## Features
|
|
66
|
+
|
|
67
|
+
- **Voice Providers**: LiveKit and Retell AI support with unified interface
|
|
68
|
+
- **Outbound Calling**: Initiate phone calls via SIP trunks
|
|
69
|
+
- **Speech-to-Text**: Transcribe audio with OpenAI Whisper
|
|
70
|
+
- **Text-to-Speech**: Low-latency streaming TTS via Together AI
|
|
71
|
+
- **Recording & Streaming**: Save to S3 or stream to RTMP
|
|
72
|
+
- **Participant Management**: Tokens, muting, kick controls
|
|
73
|
+
- **Real-time Messaging**: Send data packets during calls
|
|
74
|
+
|
|
75
|
+
## Quick Start
|
|
76
|
+
|
|
77
|
+
### Installation
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
# Minimal installation (core dependencies only)
|
|
81
|
+
pip install intellema-vdk
|
|
82
|
+
|
|
83
|
+
# Install with specific provider support
|
|
84
|
+
pip install intellema-vdk[livekit] # LiveKit voice provider
|
|
85
|
+
pip install intellema-vdk[retell] # Retell voice provider
|
|
86
|
+
pip install intellema-vdk[stt] # Speech-to-Text features
|
|
87
|
+
pip install intellema-vdk[tts] # Text-to-Speech features
|
|
88
|
+
pip install intellema-vdk[audio] # Audio playback (PyAudio)
|
|
89
|
+
|
|
90
|
+
# Install all features
|
|
91
|
+
pip install intellema-vdk[all]
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
**Requirements:** Python 3.8+
|
|
95
|
+
|
|
96
|
+
**Note on PyAudio:** The `audio` extra requires PortAudio to be installed on your system:
|
|
97
|
+
- **Windows**: Usually works with `pip install pyaudio`, or use `pipwin install pyaudio`
|
|
98
|
+
- **macOS**: `brew install portaudio && pip install pyaudio`
|
|
99
|
+
- **Linux**: `sudo apt-get install portaudio19-dev && pip install pyaudio`
|
|
100
|
+
|
|
101
|
+
The package will automatically install required dependencies when you first use a feature.
|
|
102
|
+
|
|
103
|
+
### Minimal Example
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
import asyncio
|
|
107
|
+
from intellema_vdk import VoiceClient
|
|
108
|
+
|
|
109
|
+
async def main() -> None:
|
|
110
|
+
client = VoiceClient("livekit") # or "retell"
|
|
111
|
+
|
|
112
|
+
call_id: str = await client.start_outbound_call(
|
|
113
|
+
phone_number="+15551234567",
|
|
114
|
+
prompt_content="Hello from Intellema VDK!"
|
|
115
|
+
)
|
|
116
|
+
print(f"Call started: {call_id}")
|
|
117
|
+
|
|
118
|
+
await client.close()
|
|
119
|
+
|
|
120
|
+
if __name__ == "__main__":
|
|
121
|
+
asyncio.run(main())
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Configuration
|
|
125
|
+
|
|
126
|
+
Create a `.env` file with your credentials:
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
# LiveKit (if using)
|
|
130
|
+
LIVEKIT_URL=wss://your-livekit-server.com
|
|
131
|
+
LIVEKIT_API_KEY=your_api_key
|
|
132
|
+
LIVEKIT_API_SECRET=your_api_secret
|
|
133
|
+
SIP_OUTBOUND_TRUNK_ID=your_trunk_id
|
|
134
|
+
|
|
135
|
+
# Retell + Twilio (if using)
|
|
136
|
+
TWILIO_ACCOUNT_SID=your_sid
|
|
137
|
+
TWILIO_AUTH_TOKEN=your_token
|
|
138
|
+
TWILIO_PHONE_NUMBER=+15551234567
|
|
139
|
+
RETELL_API_KEY=your_retell_key
|
|
140
|
+
RETELL_AGENT_ID=your_agent_id
|
|
141
|
+
|
|
142
|
+
# STT
|
|
143
|
+
OPENAI_API_KEY=sk-your-key
|
|
144
|
+
AGENT_API_URL=https://your-agent-api.com/process # Optional
|
|
145
|
+
|
|
146
|
+
# TTS (set appropriate API key according to provider)
|
|
147
|
+
TOGETHER_API_KEY=your_together_key
|
|
148
|
+
OPENAI_API_KEY=your_openai_key
|
|
149
|
+
|
|
150
|
+
# Optional: AWS for recordings
|
|
151
|
+
AWS_ACCESS_KEY_ID=your_key
|
|
152
|
+
AWS_SECRET_ACCESS_KEY=your_secret
|
|
153
|
+
AWS_REGION=us-east-1
|
|
154
|
+
AWS_S3_BUCKET=your-bucket
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
See [docs/guides/configuration.md](docs/guides/configuration.md) for detailed setup.
|
|
158
|
+
|
|
159
|
+
## Core Modules
|
|
160
|
+
|
|
161
|
+
### Voice Providers
|
|
162
|
+
|
|
163
|
+
Choose between LiveKit or Retell for voice calls.
|
|
164
|
+
|
|
165
|
+
```python
|
|
166
|
+
from intellema_vdk import VoiceClient
|
|
167
|
+
|
|
168
|
+
# LiveKit for advanced features
|
|
169
|
+
livekit = VoiceClient("livekit")
|
|
170
|
+
|
|
171
|
+
# Retell for quick setup
|
|
172
|
+
retell = VoiceClient("retell")
|
|
173
|
+
|
|
174
|
+
# Common interface
|
|
175
|
+
call_id: str = await livekit.start_outbound_call("+15551234567", "Hello!")
|
|
176
|
+
await livekit.start_recording(call_id)
|
|
177
|
+
await livekit.delete_room(call_id)
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
**Detailed Documentation:**
|
|
181
|
+
- [docs/api/providers.md](docs/api/providers.md) - Full API reference with examples
|
|
182
|
+
- [docs/guides/examples.md](docs/guides/examples.md) - Complete usage patterns
|
|
183
|
+
|
|
184
|
+
**Important for Retell:**
|
|
185
|
+
Before making calls, register your Twilio number:
|
|
186
|
+
```bash
|
|
187
|
+
python import_phone_number.py
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
### Speech-to-Text (STT)
|
|
191
|
+
|
|
192
|
+
Transcribe audio files with OpenAI Whisper - supports single files and batch processing:
|
|
193
|
+
|
|
194
|
+
```python
|
|
195
|
+
from intellema_vdk import STTManager
|
|
196
|
+
|
|
197
|
+
async def transcribe() -> None:
|
|
198
|
+
stt = STTManager()
|
|
199
|
+
try:
|
|
200
|
+
# Single file
|
|
201
|
+
result = await stt.transcribe_audio("recording.wav")
|
|
202
|
+
print(result["text"])
|
|
203
|
+
|
|
204
|
+
# Batch process folder
|
|
205
|
+
results = await stt.transcribe_audio(
|
|
206
|
+
"recordings/",
|
|
207
|
+
batch_process=True,
|
|
208
|
+
output_file="transcripts.json"
|
|
209
|
+
)
|
|
210
|
+
finally:
|
|
211
|
+
await stt.close()
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
**Detailed Documentation:** [docs/api/stt.md](docs/api/stt.md)
|
|
215
|
+
|
|
216
|
+
### Text-to-Speech (TTS)
|
|
217
|
+
|
|
218
|
+
Stream text to audio in real-time with support for multiple providers:
|
|
219
|
+
|
|
220
|
+
```python
|
|
221
|
+
from intellema_vdk import TTSStreamer
|
|
222
|
+
|
|
223
|
+
# Together AI (low latency)
|
|
224
|
+
tts = TTSStreamer(provider="together")
|
|
225
|
+
|
|
226
|
+
# OpenAI (high quality, 6 voices)
|
|
227
|
+
tts = TTSStreamer(
|
|
228
|
+
provider="openai",
|
|
229
|
+
voice="nova", # alloy, echo, fable, onyx, nova, shimmer
|
|
230
|
+
model="tts-1-hd" # tts-1 or tts-1-hd
|
|
231
|
+
)
|
|
232
|
+
|
|
233
|
+
# Feed text as it's generated
|
|
234
|
+
for chunk in llm_stream:
|
|
235
|
+
tts.feed(chunk)
|
|
236
|
+
|
|
237
|
+
tts.flush() # Wait for completion
|
|
238
|
+
tts.close()
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
**Detailed Documentation:** [docs/api/tts.md](docs/api/tts.md)
|
|
242
|
+
|
|
243
|
+
**Sample Implementation:** Run the included chatbot demo:
|
|
244
|
+
```bash
|
|
245
|
+
python sample_implementation.py
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
## Advanced Usage
|
|
249
|
+
|
|
250
|
+
### Logging
|
|
251
|
+
|
|
252
|
+
Configure logging to see VDK internals:
|
|
253
|
+
|
|
254
|
+
```python
|
|
255
|
+
from intellema_vdk import setup_logging
|
|
256
|
+
|
|
257
|
+
setup_logging() # INFO level by default
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
Custom configuration:
|
|
261
|
+
|
|
262
|
+
```python
|
|
263
|
+
import logging
|
|
264
|
+
setup_logging(
|
|
265
|
+
log_level=logging.DEBUG,
|
|
266
|
+
log_format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
|
267
|
+
)
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
### Recording Calls
|
|
271
|
+
|
|
272
|
+
```python
|
|
273
|
+
# LiveKit or Retell
|
|
274
|
+
recording_id: str = await client.start_recording(
|
|
275
|
+
call_id=call_id,
|
|
276
|
+
upload_to_s3=True,
|
|
277
|
+
wait_for_completion=False
|
|
278
|
+
)
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
### Streaming to RTMP
|
|
282
|
+
|
|
283
|
+
```python
|
|
284
|
+
await client.start_stream(
|
|
285
|
+
call_id=call_id,
|
|
286
|
+
rtmp_urls=["rtmp://your-server.com/live/key"]
|
|
287
|
+
)
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
## Documentation
|
|
291
|
+
|
|
292
|
+
- **[Getting Started Guide](docs/guides/getting_started.md)** - Setup and first steps
|
|
293
|
+
- **[Configuration Guide](docs/guides/configuration.md)** - Environment variables
|
|
294
|
+
- **[Examples](docs/guides/examples.md)** - Common usage patterns
|
|
295
|
+
- **API Reference:**
|
|
296
|
+
- [Voice Providers](docs/api/providers.md) - LiveKit & Retell
|
|
297
|
+
- [STT](docs/api/stt.md) - Speech-to-Text
|
|
298
|
+
- [TTS](docs/api/tts.md) - Text-to-Speech
|
|
299
|
+
|
|
300
|
+
## Important Notes
|
|
301
|
+
|
|
302
|
+
- **Retell `delete_room` Limitation**: Only works if the user speaks, triggering the agent to check the termination variable. For immediate hangup, use Twilio API directly.
|
|
303
|
+
- **Retell Recording**: Retell automatically records calls. The `start_recording` method retrieves the recording URL after the call ends (no need to explicitly start recording during the call). Ensure recording is enabled for your Retell agent in the dashboard.
|
|
304
|
+
- **Retell Audio Streaming**: Real-time audio streaming (`start_stream`) is **not supported** for Retell phone calls. Retell deprecated their Audio WebSocket API at the end of 2024. Use `start_recording()` to retrieve recordings after the call ends.
|
|
305
|
+
- **Type Safety**: All examples include type annotations for better IDE support.
|
|
306
|
+
- **Async Required**: All voice and STT operations are async; use `asyncio.run()`.
|
|
307
|
+
|
|
308
|
+
## License
|
|
309
|
+
|
|
310
|
+
See [LICENSE](LICENSE) file for details.
|
|
311
|
+
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
intellema_vdk/__init__.py,sha256=zP113PCAKvkBJytAp-BlaOWM0wHMhWWI2JTraGbQ3AA,2299
|
|
2
|
+
intellema_vdk/config.py,sha256=ziQA2AQunE_FIi7r5xACZPEtQZkViOkx_GdgCscJs7k,335
|
|
3
|
+
intellema_vdk/providers/__init__.py,sha256=rojH0sZvWP3cPxCxdKZeRxT8dXHIrJjZYg-E1zgr6P4,777
|
|
4
|
+
intellema_vdk/providers/protocols.py,sha256=GzLQUdWZYzvV4ftfj_p80pfX8etMoQ5OpyDa59Jpwhw,1390
|
|
5
|
+
intellema_vdk/providers/livekit/__init__.py,sha256=yICha10muvINORDuIO6pdgHs69ubgBJ9TJ90ju2r6as,415
|
|
6
|
+
intellema_vdk/providers/livekit/client.py,sha256=osVtkRWoSRo3VI2jDSd4f2eHKLPOZhgE7RKBbBlLtrI,24758
|
|
7
|
+
intellema_vdk/providers/livekit/exceptions.py,sha256=TaDgkzfPaDFk0gUqU6LuH62hk4jIfjsJd_JH58npg5c,724
|
|
8
|
+
intellema_vdk/providers/retell/__init__.py,sha256=PqWbcugL5Zx40YXldIyPd4w8slD6um6QuDWw3MlJzxA,357
|
|
9
|
+
intellema_vdk/providers/retell/client.py,sha256=MdIU3eC6Z9u56HHrGS-4Mk5OKhU-jrUaR1pcfE_swsM,20497
|
|
10
|
+
intellema_vdk/providers/retell/exceptions.py,sha256=91XHePf_zz6toAakYW7gxOwPrNKNulg16iw-Bgzu-OI,598
|
|
11
|
+
intellema_vdk/providers/retell/import_phone_number.py,sha256=y1E3J0PykCVDsEuJAyC5xE_xCvWLV5KsuxFpPwHrE80,3056
|
|
12
|
+
intellema_vdk/stt/__init__.py,sha256=FUQGQoA1i7mUcD33Oi4QY7JqaPImZk96Ebk4TATr4AA,327
|
|
13
|
+
intellema_vdk/stt/client.py,sha256=a4WFpYOeL01rVTNbfdA_B-kUgyLnTtpx3Uif1zEzPrE,21043
|
|
14
|
+
intellema_vdk/stt/exceptions.py,sha256=d3uqA8EOubfz7_uSHUi2e2nRsr61v60TMlp8BdSNqpM,546
|
|
15
|
+
intellema_vdk/stt/providers.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
|
+
intellema_vdk/tts/__init__.py,sha256=jv8bbauP2M6oJeq07VkGGP8eJ4PO7muj6icwC5JmWi8,273
|
|
17
|
+
intellema_vdk/tts/client.py,sha256=h6CwhFUXHcJJfCreRHubyG667q1Wn6QDH15mSOyXsqI,21590
|
|
18
|
+
intellema_vdk/tts/exceptions.py,sha256=7YZPeMXpEStWtZmp2S192Xmc6QmjzzgIRU_0mmq2rCQ,427
|
|
19
|
+
intellema_vdk/tts/providers.py,sha256=mUw5my-x4gF7mhL-pTr01PQa4u3IgPP3XrPctvhTRAE,10894
|
|
20
|
+
intellema_vdk/tts/__pycache__/__init__.cpython-312.pyc,sha256=_nxULtaXwbWZDOyroh9YiuhpUvsyzzHF8el8Fifmz3Y,377
|
|
21
|
+
intellema_vdk/tts/__pycache__/client.cpython-312.pyc,sha256=Vs2Ms9AyG7Qjt9HwKU82w0u7xzcu1p01UrRXzbAC9tg,22581
|
|
22
|
+
intellema_vdk/tts/__pycache__/exceptions.cpython-312.pyc,sha256=ySAYE4I73IQ-oIDwMcX15fBZQYkwMLta2ukK_A6XUKo,1041
|
|
23
|
+
intellema_vdk/tts/__pycache__/providers.cpython-312.pyc,sha256=ICStMr12B2QhxZ5-hRq8CnUUijt_as6EUAxuPIyOacQ,11681
|
|
24
|
+
intellema_vdk/utils/logger_config.py,sha256=fXiQbbYFq5o_XH9Q8Fgh_rdMTAa2QQcwjtWnlwSpDnM,1687
|
|
25
|
+
intellema_vdk-0.2.2.dist-info/licenses/LICENSE,sha256=41qw3yuvY1SpTkwLebZTVYOKk9OIe1Kr6I1S6Y5mp8Y,1087
|
|
26
|
+
intellema_vdk-0.2.2.dist-info/METADATA,sha256=E7xJ0uIbt3eEDZDzSfIXzmivrI7Mf1FsC4H0RLZedcY,9989
|
|
27
|
+
intellema_vdk-0.2.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
28
|
+
intellema_vdk-0.2.2.dist-info/top_level.txt,sha256=nQ_0rJRkEthHH0bJYoPAVVgQiO6Uw6c_mHnfeROG14U,14
|
|
29
|
+
intellema_vdk-0.2.2.dist-info/RECORD,,
|
|
Binary file
|
|
Binary file
|
|
Binary file
|