intellema-vdk 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- intellema_vdk/__init__.py +2 -5
- intellema_vdk/retell_lib/import_phone_number.py +73 -0
- intellema_vdk/retell_lib/retell_client.py +117 -59
- intellema_vdk/speech_lib/__init__.py +2 -0
- intellema_vdk/speech_lib/stt_client.py +108 -0
- intellema_vdk/speech_lib/tts_streamer.py +188 -0
- {intellema_vdk-0.1.0.dist-info → intellema_vdk-0.2.0.dist-info}/METADATA +102 -1
- intellema_vdk-0.2.0.dist-info/RECORD +14 -0
- {intellema_vdk-0.1.0.dist-info → intellema_vdk-0.2.0.dist-info}/WHEEL +1 -1
- intellema_vdk/livekit_lib/__pycache__/__init__.cpython-312.pyc +0 -0
- intellema_vdk/livekit_lib/__pycache__/client.cpython-312.pyc +0 -0
- intellema_vdk/retell_lib/__pycache__/__init__.cpython-312.pyc +0 -0
- intellema_vdk/retell_lib/__pycache__/retell_client.cpython-312.pyc +0 -0
- intellema_vdk-0.1.0.dist-info/RECORD +0 -14
- {intellema_vdk-0.1.0.dist-info → intellema_vdk-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {intellema_vdk-0.1.0.dist-info → intellema_vdk-0.2.0.dist-info}/top_level.txt +0 -0
intellema_vdk/__init__.py
CHANGED
|
@@ -1,12 +1,9 @@
|
|
|
1
1
|
from typing import Optional, List, Any
|
|
2
|
-
import os
|
|
3
|
-
from dotenv import load_dotenv
|
|
4
|
-
|
|
5
|
-
# Load environment variables
|
|
6
|
-
load_dotenv()
|
|
7
2
|
|
|
8
3
|
from .livekit_lib.client import LiveKitManager
|
|
9
4
|
from .retell_lib.retell_client import RetellManager
|
|
5
|
+
from .speech_lib.stt_client import STTManager
|
|
6
|
+
from .speech_lib.tts_streamer import TTSStreamer
|
|
10
7
|
|
|
11
8
|
def VoiceClient(provider: str, **kwargs) -> Any:
|
|
12
9
|
"""
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
|
|
4
|
+
# Add the project root to the python path so we can import intellema_vdk
|
|
5
|
+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
|
6
|
+
|
|
7
|
+
from intellema_vdk.retell_lib.retell_client import RetellManager
|
|
8
|
+
|
|
9
|
+
def import_twilio_number():
    """
    Interactively import the configured Twilio phone number into Retell.

    Prompts on stdin for optional SIP-trunk details (termination URI and
    credential-list username/password) and an optional nickname, then passes
    the collected values to ``RetellManager.import_phone_number``.

    Returns:
        The response returned by ``RetellManager.import_phone_number``.

    Raises:
        Exception: Any error raised while creating the manager, reading input
            or importing the number is re-raised after troubleshooting hints
            have been printed.
    """
    # Local import: keeps this change self-contained within the function.
    import getpass

    try:
        manager = RetellManager()

        print("=== Retell Phone Number Import ===\n")
        print(f"Phone Number to import: {manager.twilio_number}")
        print(f"Agent ID to bind: {manager.retell_agent_id}\n")

        # Ask if user has a Twilio SIP trunk
        print("Do you have a Twilio Elastic SIP Trunk configured?")
        print("If you're not sure, you can:")
        print("  1. Visit: https://console.twilio.com/us1/develop/voice/manage/trunks")
        print("  2. Or just press Enter to try without it (may not work for some setups)\n")

        has_trunk = input("Do you have a SIP trunk? (y/n, default: n): ").strip().lower()

        termination_uri = None
        sip_username = None
        sip_password = None

        if has_trunk == 'y':
            print("\nEnter your Twilio SIP Trunk Termination URI.")
            print("Format: yourtrunkname.pstn.twilio.com")
            print("You can find this in Twilio Console > Elastic SIP Trunking > Your Trunk > Termination")
            termination_uri = input("Termination URI: ").strip()

            print("\nDo you use Credential List authentication? (Recommended)")
            has_creds = input("Use credentials? (y/n, default: y): ").strip().lower() or 'y'

            if has_creds == 'y':
                print("Enter the username/password from your Twilio Credential List:")
                sip_username = input("Username: ").strip()
                # getpass avoids echoing the secret to the terminal/scrollback.
                sip_password = getpass.getpass("Password: ").strip()

        # Optional nickname
        nickname = input("\nOptional: Enter a nickname for this number (press Enter to skip): ").strip() or None

        print("\n=== Importing Phone Number ===")

        response = manager.import_phone_number(
            termination_uri=termination_uri,
            nickname=nickname,
            sip_trunk_auth_username=sip_username,
            sip_trunk_auth_password=sip_password,
        )

        print("\n=== Import Successful! ===")
        print("You can now use this number to make outbound calls via Retell.")

        return response

    except Exception as e:
        # Top-level boundary of an interactive script: explain, then re-raise
        # so the traceback and exit status are preserved.
        print(f"\n✗ Import failed: {e}")
        print("\nTroubleshooting:")
        print("  1. If you don't have a SIP trunk, you may need to purchase the number through Retell")
        print("  2. Visit Retell dashboard: https://app.retellai.com/")
        print("  3. Or create a Twilio Elastic SIP Trunk first")
        raise


if __name__ == "__main__":
    import_twilio_number()
|
|
@@ -26,64 +26,122 @@ class RetellManager:
|
|
|
26
26
|
self.twilio_client = Client(self.twilio_account_sid, self.twilio_auth_token)
|
|
27
27
|
self.retell_client = Retell(api_key=self.retell_api_key)
|
|
28
28
|
|
|
29
|
-
def
|
|
29
|
+
def import_phone_number(
    self,
    termination_uri: Optional[str] = None,
    outbound_agent_id: Optional[str] = None,
    inbound_agent_id: Optional[str] = None,
    nickname: Optional[str] = None,
    sip_trunk_auth_username: Optional[str] = None,
    sip_trunk_auth_password: Optional[str] = None,
):
    """
    Import/register the configured Twilio phone number (``self.twilio_number``) with Retell.

    Only arguments with a truthy value are forwarded to
    ``self.retell_client.phone_number.import_``.

    Args:
        termination_uri: Twilio SIP trunk termination URI
            (e.g., "yourtrunk.pstn.twilio.com"). Omitted from the request
            when not provided; no default is substituted.
        outbound_agent_id: Agent ID to use for outbound calls. Falls back to
            ``self.retell_agent_id`` when not provided.
        inbound_agent_id: Agent ID to use for inbound calls. Omitted when not
            provided.
        nickname: Optional nickname for the phone number.
        sip_trunk_auth_username: Username for SIP trunk authentication.
        sip_trunk_auth_password: Password for SIP trunk authentication.

    Returns:
        The phone number registration response from Retell.

    Raises:
        ValueError: If only one of ``sip_trunk_auth_username`` /
            ``sip_trunk_auth_password`` is supplied.
        Exception: Any error raised by the Retell client is re-raised after
            hints have been printed.
    """
    # A half-specified credential pair used to be dropped silently, producing
    # an unauthenticated import request; fail loudly instead.
    if bool(sip_trunk_auth_username) != bool(sip_trunk_auth_password):
        raise ValueError(
            "sip_trunk_auth_username and sip_trunk_auth_password must be provided together."
        )

    optional_fields = {
        "termination_uri": termination_uri,
        "sip_trunk_auth_username": sip_trunk_auth_username,
        "sip_trunk_auth_password": sip_trunk_auth_password,
        # Outbound agent: explicit argument wins, otherwise the manager's agent.
        "outbound_agent_id": outbound_agent_id or self.retell_agent_id,
        "inbound_agent_id": inbound_agent_id,
        "nickname": nickname,
    }
    import_kwargs = {"phone_number": self.twilio_number}
    import_kwargs.update({key: value for key, value in optional_fields.items() if value})

    try:
        response = self.retell_client.phone_number.import_(**import_kwargs)
    except Exception as e:
        print(f"✗ Error importing phone number: {e}")
        print("\nNote: If you're using Twilio, you may need to:")
        print("  1. Create an Elastic SIP Trunk in Twilio console")
        print("  2. Provide the termination_uri parameter (e.g., 'yourtrunk.pstn.twilio.com')")
        print("  3. Or purchase the number directly through Retell dashboard")
        raise

    print(f"✓ Phone number {self.twilio_number} successfully imported to Retell!")
    print(f"  Phone Number: {response.phone_number}")
    print(f"  Type: {response.phone_number_type}")
    # The agent fields may be absent on the response object, hence getattr.
    if getattr(response, "outbound_agent_id", None):
        print(f"  Outbound Agent: {response.outbound_agent_id}")
    if getattr(response, "inbound_agent_id", None):
        print(f"  Inbound Agent: {response.inbound_agent_id}")
    return response
|
|
48
91
|
|
|
49
|
-
# 2. Construct the audio WebSocket URL using the call_id
|
|
50
|
-
audio_websocket_url = f"wss://api.retellai.com/audio-websocket/{register_response.call_id}"
|
|
51
|
-
|
|
52
|
-
# 3. Construct TwiML to connect Twilio to Retell
|
|
53
|
-
# Note: We construct the XML string manually to avoid extra dependencies like twilio.twiml
|
|
54
|
-
twiml = f"""<Response>
|
|
55
|
-
<Connect>
|
|
56
|
-
<Stream url="{audio_websocket_url}" />
|
|
57
|
-
</Connect>
|
|
58
|
-
</Response>"""
|
|
59
|
-
|
|
60
|
-
# 3. Create the call with Twilio using the generated TwiML
|
|
61
|
-
call = self.twilio_client.calls.create(
|
|
62
|
-
to=phone_number,
|
|
63
|
-
from_=self.twilio_number,
|
|
64
|
-
twiml=twiml
|
|
65
|
-
)
|
|
66
|
-
return call.sid
|
|
67
92
|
|
|
68
|
-
def
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
93
|
+
def start_outbound_call(self, phone_number: str, prompt_content: Optional[str] = None, call_id: Optional[str] = None) -> str:
    """
    Create an outbound phone call through the Retell client.

    Args:
        phone_number: Destination number, sent as ``to_number``.
        prompt_content: Optional custom prompt. When given, it is extended
            with an instruction to watch the ``force_end`` dynamic variable
            and sent as ``retell_llm_dynamic_variables`` together with
            ``force_end="false"``.
        call_id: Optional caller-side identifier, stored under
            ``metadata["call_id"]`` of the request.

    Returns:
        The ``call_id`` of the response returned by
        ``self.retell_client.call.create_phone_call``.
    """
    call_kwargs = {
        "from_number": self.twilio_number,
        "to_number": phone_number,
        "override_agent_id": self.retell_agent_id,
    }

    if call_id:
        call_kwargs["metadata"] = {"call_id": call_id}

    # Only set up dynamic variables if we have a custom prompt
    if prompt_content:
        # The two trailing literals are plain (non-f) strings on purpose so the
        # `{{force_end}}` placeholders reach Retell with their double braces.
        enhanced_prompt = (
            f"{prompt_content}\n\n"
            "SYSTEM INSTRUCTION: Continually check the dynamic variable {{force_end}}. "
            "If {{force_end}} == 'true', you must IMMEDIATELY say goodbye and activate your end_call tool to terminate the call."
        )
        call_kwargs["retell_llm_dynamic_variables"] = {
            "prompt_content": enhanced_prompt,
            "force_end": "false",
        }

    call_response = self.retell_client.call.create_phone_call(**call_kwargs)

    print("Call created successfully!")
    print(f"Retell Call ID: {call_response.call_id}")
    print(f"Call Status: {call_response.call_status}")

    return call_response.call_id
|
|
122
|
+
|
|
123
|
+
def delete_room(self, call_id: str):
    """
    Ask Retell to end the call identified by ``call_id``.

    The call is looked up first; if its status is 'registered', 'ongoing' or
    'dialing', the ``force_end`` dynamic variable is overridden to "true" via
    ``self.retell_client.call.update``. Any other status is only reported.

    Args:
        call_id: Identifier passed to the Retell client's ``call.retrieve``
            and ``call.update``.

    Raises:
        Exception: Any error from the Retell client is printed and re-raised.
    """
    live_states = ('registered', 'ongoing', 'dialing')
    try:
        status = self.retell_client.call.retrieve(call_id).call_status
        print(f"Current call status: {status}")

        # Guard clause: nothing to do for calls that are no longer live.
        if status not in live_states:
            print(f"Call already ended: {status}")
            return

        print(f"Triggering end for Retell call {call_id}...")
        self.retell_client.call.update(
            call_id,
            override_dynamic_variables={"force_end": "true"},
        )
        print("✓ force_end override sent to Retell API")
    except Exception as err:
        print(f"Error ending call {call_id}: {err}")
        raise
|
|
85
143
|
|
|
86
|
-
def start_stream(self,
|
|
144
|
+
def start_stream(self, call_id: str, rtmp_urls: List[str]):
|
|
87
145
|
"""
|
|
88
146
|
Starts a Twilio Media Stream.
|
|
89
147
|
Note: Twilio streams are WebSocket-based. If rtmp_urls contains a WSS URL, it will work.
|
|
@@ -91,16 +149,16 @@ class RetellManager:
|
|
|
91
149
|
if not rtmp_urls:
|
|
92
150
|
raise ValueError("No stream URLs provided")
|
|
93
151
|
|
|
94
|
-
self.twilio_client.calls(
|
|
152
|
+
self.twilio_client.calls(call_id).streams.create(
|
|
95
153
|
url=rtmp_urls[0]
|
|
96
154
|
)
|
|
97
155
|
|
|
98
|
-
def start_recording(self,
|
|
156
|
+
def start_recording(self, call_id: str, output_filepath: Optional[str] = None, upload_to_s3: bool = True, wait_for_completion: bool = True):
|
|
99
157
|
"""
|
|
100
158
|
Triggers a recording on the active Twilio call.
|
|
101
159
|
|
|
102
160
|
Args:
|
|
103
|
-
|
|
161
|
+
call_id: The Twilio Call SID.
|
|
104
162
|
output_filepath: Optional filename for the recording.
|
|
105
163
|
upload_to_s3: If True, uploads to S3.
|
|
106
164
|
wait_for_completion: If True, waits for recording to finish and then uploads.
|
|
@@ -110,7 +168,7 @@ class RetellManager:
|
|
|
110
168
|
"""
|
|
111
169
|
|
|
112
170
|
# Start Twilio recording
|
|
113
|
-
recording = self.twilio_client.calls(
|
|
171
|
+
recording = self.twilio_client.calls(call_id).recordings.create()
|
|
114
172
|
print(f"Recording started: {recording.sid}")
|
|
115
173
|
|
|
116
174
|
if not wait_for_completion:
|
|
@@ -147,7 +205,7 @@ class RetellManager:
|
|
|
147
205
|
if not access_key or not secret_key or not bucket:
|
|
148
206
|
raise ValueError("AWS credentials (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_S3_BUCKET) are required for S3 upload.")
|
|
149
207
|
|
|
150
|
-
filename = output_filepath if output_filepath else f"{
|
|
208
|
+
filename = output_filepath if output_filepath else f"{call_id}-{uuid.uuid4().hex[:6]}.mp3"
|
|
151
209
|
|
|
152
210
|
s3 = boto3.client(
|
|
153
211
|
's3',
|
|
@@ -170,20 +228,20 @@ class RetellManager:
|
|
|
170
228
|
|
|
171
229
|
return recording.sid
|
|
172
230
|
|
|
173
|
-
def mute_participant(self,
|
|
231
|
+
def mute_participant(self, call_id: str, identity: str, track_sid: str, muted: bool):
    """
    Forward a mute flag to the Twilio call identified by ``call_id``.

    Args:
        call_id: Identifier passed to ``self.twilio_client.calls``.
        identity: Accepted but not used by this implementation.
        track_sid: Accepted but not used by this implementation.
        muted: Value passed as the ``muted`` keyword to the call's ``update``.
    """
    # NOTE(review): confirm that the Twilio call resource's update() accepts a
    # `muted` keyword — TODO verify against the Twilio helper library.
    call_handle = self.twilio_client.calls(call_id)
    call_handle.update(muted=muted)
|
|
179
237
|
|
|
180
|
-
def kick_participant(self,
|
|
238
|
+
def kick_participant(self, call_id: str, identity: str):
    """
    Remove a participant by delegating to ``delete_room`` for the call.

    Args:
        call_id: Identifier forwarded to ``delete_room``.
        identity: Accepted but not used by this implementation.
    """
    end_call = self.delete_room
    end_call(call_id)
|
|
185
243
|
|
|
186
|
-
def send_alert(self,
|
|
244
|
+
def send_alert(self, call_id: str, message: str, participant_identity: Optional[str] = None):
|
|
187
245
|
"""
|
|
188
246
|
Not fully supported in this hybrid model
|
|
189
247
|
"""
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import logging
|
|
3
|
+
import httpx
|
|
4
|
+
from dotenv import load_dotenv
|
|
5
|
+
from openai import AsyncOpenAI
|
|
6
|
+
|
|
7
|
+
load_dotenv()
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class STTManager:
    """
    Transcribe audio files with OpenAI and optionally post the text to an agent API.

    Instances own an ``AsyncOpenAI`` client and an ``httpx.AsyncClient``;
    release them with ``await manager.close()`` or by using the manager as an
    async context manager (``async with STTManager() as stt: ...``).
    """

    def __init__(self):
        """
        Initializes the STTManager.

        Note:
            The following are read from the environment (e.g. your .env file):
            - OPENAI_API_KEY
            - AGENT_API_URL (If not set, posting to agent will be disabled)

        Raises:
            ValueError: If OPENAI_API_KEY is not set.
        """
        self._api_key = os.getenv("OPENAI_API_KEY")
        if not self._api_key:
            raise ValueError("OPENAI_API_KEY must be set in your .env file.")

        self._agent_api_url = os.getenv("AGENT_API_URL")
        if not self._agent_api_url:
            logger.warning("AGENT_API_URL is not set in .env. Posting to agent will be disabled.")

        self._openai_client = AsyncOpenAI(api_key=self._api_key)
        self._http_client = httpx.AsyncClient()

    async def __aenter__(self):
        """Return the manager itself so it can be used with ``async with``."""
        return self

    async def __aexit__(self, exc_type, exc, tb):
        """Close both clients when the ``async with`` block exits."""
        await self.close()

    async def close(self):
        """
        Cleans up resources used by the STTManager.

        The OpenAI client is closed even if closing the HTTP client raises.
        """
        try:
            await self._http_client.aclose()
        finally:
            await self._openai_client.close()

    async def transcribe_audio(self, file_path: str, model: str = "whisper-1") -> str:
        """
        Transcribes an audio file using OpenAI's transcription endpoint.

        Args:
            file_path: The path to the audio file to transcribe.
            model: The name of the transcription model to use.

        Returns:
            The transcribed text as a string.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        logger.info("Starting transcription for file: %s", file_path)
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Audio file not found at: {file_path}")

        with open(file_path, "rb") as audio_file:
            transcript = await self._openai_client.audio.transcriptions.create(
                model=model,
                file=audio_file,
            )
        logger.info("Successfully transcribed file: %s", file_path)

        return transcript.text

    async def transcribe_and_post(self, file_path: str, model: str = "whisper-1") -> str:
        """
        Transcribes an audio file and posts the result to the agent API under a 'message' key.

        Posting is skipped when AGENT_API_URL was not configured.

        Args:
            file_path: The path to the audio file to process.
            model: The transcription model, forwarded to ``transcribe_audio``.

        Returns:
            The transcribed text as a string.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            Exception: Any transcription or posting error is logged and re-raised.
        """
        try:
            transcript_text = await self.transcribe_audio(file_path, model=model)

            if self._agent_api_url:
                await self._post_to_agent(transcript_text)
            else:
                logger.info("AGENT_API_URL not set, skipping post to agent.")

            return transcript_text

        except FileNotFoundError:
            logger.error("Audio file not found at: %s", file_path, exc_info=True)
            raise
        except Exception as e:
            logger.error("An error occurred during processing of %s: %s", file_path, e, exc_info=True)
            raise

    async def _post_to_agent(self, text: str):
        """
        Posts the transcribed text to the agent API under a 'message' key.

        Args:
            text: The transcribed text to post.

        Raises:
            httpx.HTTPError: If the request fails or the response status is an error.
        """
        payload = {"message": text}
        try:
            # The transcript itself is logged only at DEBUG to keep spoken
            # content out of routine INFO logs.
            logger.info("Posting to agent (message length: %d characters)", len(text))
            logger.debug("Agent payload: %s", payload)
            response = await self._http_client.post(self._agent_api_url, json=payload)
            response.raise_for_status()
            logger.info("Successfully posted to agent. Status: %s", response.status_code)
        except httpx.HTTPError as e:
            logger.error("Failed to post to agent API: %s", e, exc_info=True)
            raise
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import queue
|
|
3
|
+
import threading
|
|
4
|
+
import time
|
|
5
|
+
import pyaudio
|
|
6
|
+
from together import Together
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TTSStreamer:
    """
    Turn streamed text into speech and play it on the default audio output.

    ``__init__`` starts two daemon threads: ``_tts_fetcher`` sends queued
    sentences to the Together speech endpoint and queues the returned PCM
    bytes; ``_audio_player`` writes those bytes to a PyAudio output stream
    (16-bit, mono, 24000 Hz).

    Typical use: ``feed()`` text chunks, ``flush()`` to wait for playback,
    then ``close()``.
    """

    # Characters that ``feed`` treats as the end of a sentence.
    _SENTENCE_ENDINGS = (".", "!", "?", "\n")

    def __init__(self, api_key=None):
        """
        Create the Together client, open the audio stream and start the workers.

        Args:
            api_key: Together API key. Falls back to the TOGETHER_API_KEY
                environment variable.

        Raises:
            ValueError: If no API key is available.
        """
        self.api_key = api_key or os.environ.get("TOGETHER_API_KEY")
        if not self.api_key:
            raise ValueError(
                "Together API Key is missing. Set TOGETHER_API_KEY env var."
            )

        self.client = Together(api_key=self.api_key)

        # Audio Config
        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(
            format=pyaudio.paInt16, channels=1, rate=24000, output=True
        )

        # Queues: text_queue feeds the fetcher, audio_queue feeds the player.
        # ``None`` is the end-of-stream sentinel on both.
        self.text_queue = queue.Queue()
        self.audio_queue = queue.Queue()

        # State
        self.text_buffer = ""
        self.is_running = True
        self.playback_finished = threading.Event()

        # Start Threads
        self.fetcher_thread = threading.Thread(target=self._tts_fetcher, daemon=True)
        self.player_thread = threading.Thread(target=self._audio_player, daemon=True)

        self.fetcher_thread.start()
        self.player_thread.start()

    def feed(self, text_chunk):
        """
        Feed text tokens from LLM.

        Text is buffered; whenever the buffer contains a sentence ending,
        everything up to the last occurrence of that ending is queued for
        synthesis and the remainder stays buffered. Ignored once the streamer
        has been closed or when ``text_chunk`` is empty.
        """
        if not self.is_running or not text_chunk:
            return

        self.text_buffer += text_chunk

        for ending in self._SENTENCE_ENDINGS:
            if ending not in self.text_buffer:
                continue

            *complete, remainder = self.text_buffer.split(ending)

            # Send all complete sentences
            for sentence in complete:
                sentence = sentence.strip()
                if sentence:
                    self.text_queue.put(sentence + ending)

            # Keep the remainder
            self.text_buffer = remainder

    def flush(self, timeout=10.0):
        """
        Graceful finish: push remaining text, signal end, and wait for playback.

        Args:
            timeout: Maximum number of seconds to wait for the player to
                finish, measured from this call; ``None`` waits without limit.
                Raise it for long replies — whatever has not played when the
                wait expires is discarded by a subsequent ``close()``.

        Returns:
            True if the player signalled completion before the timeout,
            False otherwise.
        """
        # 1. Push remaining buffer (and clear it so a repeated flush cannot
        #    queue the same text twice).
        pending = self.text_buffer.strip()
        self.text_buffer = ""
        if pending:
            self.text_queue.put(pending)

        # 2. Signal Fetcher to stop expecting text
        self.text_queue.put(None)

        # 3. Wait for the player to signal it's done
        return self.playback_finished.wait(timeout=timeout)

    def close(self):
        """
        Immediate kill: Stop threads and close audio stream.

        Safe to call more than once; later calls are no-ops.
        """
        if not self.is_running:
            return

        self.is_running = False

        # Drop anything still queued so the workers have nothing left to do.
        with self.text_queue.mutex:
            self.text_queue.queue.clear()
        with self.audio_queue.mutex:
            self.audio_queue.queue.clear()

        # Give the player a moment to leave stream.write() before the stream
        # is closed underneath it.
        player = self.player_thread
        if player is not threading.current_thread() and player.is_alive():
            player.join(timeout=1.0)

        # Best-effort teardown: attempt every step even if an earlier one
        # fails, so a stop_stream() error cannot leak the PortAudio handle.
        for release in (self.stream.stop_stream, self.stream.close, self.p.terminate):
            try:
                release()
            except Exception:
                continue

    def stop(self):
        """Alias for close"""
        self.close()

    def _tts_fetcher(self):
        """Worker loop: synthesise each queued sentence until the sentinel or shutdown."""
        while self.is_running:
            try:
                text = self.text_queue.get(timeout=0.5)
            except queue.Empty:
                continue

            try:
                if text is None:
                    self.audio_queue.put(None)  # Signal player to finish
                    break
                self._synthesize(text)
            except Exception as e:
                # Keep the worker alive: one failed sentence must not stop
                # the rest of the reply from being spoken.
                print(f"TTS Error: {e}")
            finally:
                self.text_queue.task_done()

    def _synthesize(self, text):
        """Request streamed speech for ``text`` and queue every audio payload received."""
        response = self.client.audio.speech.create(
            model="canopylabs/orpheus-3b-0.1-ft",
            input=text,
            voice="tara",
            stream=True,
            response_format="raw",
            response_encoding="pcm_s16le",
        )

        for chunk in response:
            if not self.is_running:
                break
            for audio_data in self._iter_audio(chunk):
                self._process_audio_bytes(audio_data)

    @staticmethod
    def _iter_audio(chunk):
        """
        Yield the non-empty audio payloads carried by one streamed chunk.

        Handles the chunk shapes this class accepts: a tuple whose second item
        is bytes or an iterable of bytes / objects with ``content`` or
        ``data``; an object with ``content``; or raw bytes.
        """
        if isinstance(chunk, tuple):
            if len(chunk) < 2:
                return
            payload = chunk[1]
            if isinstance(payload, bytes):
                if payload:
                    yield payload
                return
            try:
                for item in payload:
                    if isinstance(item, bytes):
                        data = item
                    elif hasattr(item, "content"):
                        data = item.content
                    elif hasattr(item, "data"):
                        data = item.data
                    else:
                        continue
                    if data:
                        yield data
            except TypeError:
                # The second tuple item was not iterable: nothing to play.
                return
        elif hasattr(chunk, "content"):
            if chunk.content:
                yield chunk.content
        elif isinstance(chunk, bytes):
            if chunk:
                yield chunk

    def _process_audio_bytes(self, audio_data):
        """Helper to strip headers and push to queue"""
        # Strip WAV header if present (RIFF...WAVE)
        if len(audio_data) >= 44 and audio_data[:4] == b"RIFF":
            audio_data = audio_data[44:]
        self.audio_queue.put(audio_data)

    def _audio_player(self):
        """
        Worker loop: write queued audio to the output stream.

        Only whole 16-bit samples are written; a trailing odd byte is kept
        until more data arrives. ``playback_finished`` is set whenever the
        loop ends (sentinel, shutdown or stream error) so ``flush`` never
        waits on a player that has already stopped.
        """
        buffer = b""
        try:
            while self.is_running:
                try:
                    audio_data = self.audio_queue.get(timeout=0.5)
                except queue.Empty:
                    continue

                if audio_data is None:
                    break

                buffer += audio_data

                playable = len(buffer) - (len(buffer) % 2)
                if not playable:
                    continue
                play_chunk, buffer = buffer[:playable], buffer[playable:]

                try:
                    self.stream.write(play_chunk)
                except OSError:
                    break
        finally:
            self.playback_finished.set()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: intellema-vdk
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: A Voice Development Kit for different Voice Agent Platforms
|
|
5
5
|
Author: Intellema
|
|
6
6
|
License: MIT License
|
|
@@ -37,6 +37,12 @@ Requires-Dist: boto3>=1.28.0
|
|
|
37
37
|
Requires-Dist: twilio
|
|
38
38
|
Requires-Dist: retell-sdk
|
|
39
39
|
Requires-Dist: requests
|
|
40
|
+
Requires-Dist: openai
|
|
41
|
+
Requires-Dist: httpx
|
|
42
|
+
Requires-Dist: pyaudio
|
|
43
|
+
Requires-Dist: together
|
|
44
|
+
Requires-Dist: langchain-openai
|
|
45
|
+
Requires-Dist: langchain-core
|
|
40
46
|
Dynamic: license-file
|
|
41
47
|
|
|
42
48
|
# Intellema VDK
|
|
@@ -100,6 +106,73 @@ from intellema_vdk import start_outbound_call
|
|
|
100
106
|
await start_outbound_call("livekit", phone_number="+1...")
|
|
101
107
|
```
|
|
102
108
|
|
|
109
|
+
## Speech To Text (STT)
|
|
110
|
+
|
|
111
|
+
The `STTManager` class provides an interface for transcribing audio files using OpenAI's Whisper model and optionally posting the transcribed text to a specified agent API.
|
|
112
|
+
|
|
113
|
+
### Usage
|
|
114
|
+
|
|
115
|
+
Here's how to use the `STTManager` to transcribe an audio file and post the result:
|
|
116
|
+
Make sure `OPENAI_API_KEY` is set in your `.env` file. `AGENT_API_URL` is optional: when it is not set, the transcript is returned but not posted.
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
import asyncio
|
|
120
|
+
from intellema_vdk import STTManager
|
|
121
|
+
|
|
122
|
+
async def main():
|
|
123
|
+
# 1- Initialize the STTManager
|
|
124
|
+
stt_manager = STTManager()
|
|
125
|
+
|
|
126
|
+
try:
|
|
127
|
+
# 2- Transcribe an audio file and post the result to your agent API URL (if provided)
|
|
128
|
+
# Replace "path/to/your/audio.mp3" with the actual file path
|
|
129
|
+
transcript = await stt_manager.transcribe_and_post("path/to/your/audio.mp3")
|
|
130
|
+
print(f"Transcription: {transcript}")
|
|
131
|
+
|
|
132
|
+
except FileNotFoundError:
|
|
133
|
+
print("The audio file was not found.")
|
|
134
|
+
except Exception as e:
|
|
135
|
+
print(f"An error occurred: {e}")
|
|
136
|
+
finally:
|
|
137
|
+
# 3- Clean up
|
|
138
|
+
await stt_manager.close()
|
|
139
|
+
|
|
140
|
+
if __name__ == "__main__":
|
|
141
|
+
asyncio.run(main())
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## TTS Streaming
|
|
145
|
+
|
|
146
|
+
The `TTSStreamer` class provides low-latency text-to-speech streaming using Together AI's inference engine. It enables real-time voice synthesis from streaming LLM responses.
|
|
147
|
+
|
|
148
|
+
### Running the Sample implementation
|
|
149
|
+
|
|
150
|
+
We provide a ready-to-use sample that connects LangChain (OpenAI) with the TTS Streamer.
|
|
151
|
+
|
|
152
|
+
1. **Configure Keys**: Ensure `OPENAI_API_KEY` and `TOGETHER_API_KEY` are set in your `.env`.
|
|
153
|
+
2. **Run the script**:
|
|
154
|
+
```bash
|
|
155
|
+
python sample_implementation.py
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
### Library Usage
|
|
159
|
+
|
|
160
|
+
You can integrate the streamer into your own loops:
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
from intellema_vdk import TTSStreamer
|
|
164
|
+
|
|
165
|
+
# 1. Initialize per turn
|
|
166
|
+
tts = TTSStreamer()
|
|
167
|
+
|
|
168
|
+
# 2. Feed text chunks as they are generated
|
|
169
|
+
for chunk in llm_response_stream:
|
|
170
|
+
tts.feed(chunk)
|
|
171
|
+
|
|
172
|
+
# 3. Flush and clean up
|
|
173
|
+
tts.flush()
|
|
174
|
+
tts.close()
|
|
175
|
+
```
|
|
103
176
|
|
|
104
177
|
## Configuration
|
|
105
178
|
|
|
@@ -115,6 +188,34 @@ TWILIO_AUTH_TOKEN=your-token
|
|
|
115
188
|
TWILIO_PHONE_NUMBER=your-number
|
|
116
189
|
RETELL_API_KEY=your-retell-key
|
|
117
190
|
RETELL_AGENT_ID=your-agent-id
|
|
191
|
+
TOGETHER_API_KEY=your-together-key
|
|
192
|
+
OPENAI_API_KEY=your-openai-key
|
|
193
|
+
AGENT_API_URL=https://your-agent-api.com/endpoint
|
|
118
194
|
```
|
|
119
195
|
|
|
196
|
+
## Retell Setup
|
|
197
|
+
|
|
198
|
+
**Important:** Before initiating calls with Retell, you must register your Twilio phone number with Retell. This binds your agent to the number and allows Retell to handle the call flow.
|
|
199
|
+
|
|
200
|
+
You can register your number in two ways:
|
|
201
|
+
|
|
202
|
+
1. **Using the Helper Script:**
|
|
203
|
+
We provide an interactive script to guide you through the process:
|
|
204
|
+
```bash
|
|
205
|
+
python import_phone_number.py
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
2. **Programmatically:**
|
|
209
|
+
```python
|
|
210
|
+
from intellema_vdk.retell_lib.retell_client import RetellManager
|
|
211
|
+
|
|
212
|
+
manager = RetellManager()
|
|
213
|
+
# Optional: Pass termination_uri if you have a SIP trunk
|
|
214
|
+
manager.import_phone_number(nickname="My Twilio Number")
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
## Notes
|
|
218
|
+
|
|
219
|
+
- **Retell `delete_room` Limitation**: The `delete_room` method for Retell relies on updating dynamic variables during the conversation loop. As a result, it **only takes effect once the user says something**, because that is what triggers the agent to check the variable and terminate the call.
|
|
220
|
+
|
|
120
221
|
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
intellema_vdk/__init__.py,sha256=64pm2TLqhGG225JLddco1kSOpLaD3eGByWvMpaHUUX0,1231
|
|
2
|
+
intellema_vdk/livekit_lib/__init__.py,sha256=9JsOBswDivM8tRw9EF1ql0wwFnHvwjcPWT-umqad98o,68
|
|
3
|
+
intellema_vdk/livekit_lib/client.py,sha256=UxOuT9I-YPtHopx4dXoGKRAJvLXKFgUdtrAcHdR4a-Q,10687
|
|
4
|
+
intellema_vdk/retell_lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
+
intellema_vdk/retell_lib/import_phone_number.py,sha256=pOt1k6De6-bt2xOPzMMR0nI4Ha6BzLjm19qenyy9RN8,3081
|
|
6
|
+
intellema_vdk/retell_lib/retell_client.py,sha256=qT00NJWi2rJyLWswWTx5fGl5mwPdy6QurQt1Enac0rU,10793
|
|
7
|
+
intellema_vdk/speech_lib/__init__.py,sha256=TXdyAAS6AfQfln_QlIvx_uXU-ksugXzC2N9hrjW1_MQ,73
|
|
8
|
+
intellema_vdk/speech_lib/stt_client.py,sha256=YB8-mJUtQKhqEC4zhipJUb6Y8LqJx0Vv_c4iIxuUjJM,4054
|
|
9
|
+
intellema_vdk/speech_lib/tts_streamer.py,sha256=qs2mzP0vKqv2eKvGJSCTee3mzeJGS9nji0Yy3Y-sOTc,6453
|
|
10
|
+
intellema_vdk-0.2.0.dist-info/licenses/LICENSE,sha256=41qw3yuvY1SpTkwLebZTVYOKk9OIe1Kr6I1S6Y5mp8Y,1087
|
|
11
|
+
intellema_vdk-0.2.0.dist-info/METADATA,sha256=j53oDXr8Xcq7nkP4v6bWeK62z0yR-Pa2yivGaCw-abc,7363
|
|
12
|
+
intellema_vdk-0.2.0.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
|
|
13
|
+
intellema_vdk-0.2.0.dist-info/top_level.txt,sha256=nQ_0rJRkEthHH0bJYoPAVVgQiO6Uw6c_mHnfeROG14U,14
|
|
14
|
+
intellema_vdk-0.2.0.dist-info/RECORD,,
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
intellema_vdk/__init__.py,sha256=L3hrqcxGVpd4xjXMdEXEO8-Rsg4MfJUR_iN3X0hbido,1224
|
|
2
|
-
intellema_vdk/livekit_lib/__init__.py,sha256=9JsOBswDivM8tRw9EF1ql0wwFnHvwjcPWT-umqad98o,68
|
|
3
|
-
intellema_vdk/livekit_lib/client.py,sha256=UxOuT9I-YPtHopx4dXoGKRAJvLXKFgUdtrAcHdR4a-Q,10687
|
|
4
|
-
intellema_vdk/livekit_lib/__pycache__/__init__.cpython-312.pyc,sha256=-LfSHUwq29ExyfKYStSqyiEzVE--LhEkogP2TLI1xPA,224
|
|
5
|
-
intellema_vdk/livekit_lib/__pycache__/client.cpython-312.pyc,sha256=6_zxzDpWZHmBNIuJeKOCWQlYe9XxAhW0npUlcX0Z9sc,14331
|
|
6
|
-
intellema_vdk/retell_lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
-
intellema_vdk/retell_lib/retell_client.py,sha256=sc6EnDc7J4SEF515-YIre56Tg-KLXDRRzli7Jwf-bSo,8037
|
|
8
|
-
intellema_vdk/retell_lib/__pycache__/__init__.cpython-312.pyc,sha256=raQGnf_MXBHnMWj5H8iXd3B7U53AfOEyXX2aPx9s1cA,147
|
|
9
|
-
intellema_vdk/retell_lib/__pycache__/retell_client.cpython-312.pyc,sha256=eE6z7VLSDglUI5feZnK-qKlB-pOTmoAyDWsgEuraP-s,10060
|
|
10
|
-
intellema_vdk-0.1.0.dist-info/licenses/LICENSE,sha256=41qw3yuvY1SpTkwLebZTVYOKk9OIe1Kr6I1S6Y5mp8Y,1087
|
|
11
|
-
intellema_vdk-0.1.0.dist-info/METADATA,sha256=RqzJZRQZn35QgwEkQu9LZbCIDW7D6b6dSBFfFiLIcTk,4120
|
|
12
|
-
intellema_vdk-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
13
|
-
intellema_vdk-0.1.0.dist-info/top_level.txt,sha256=nQ_0rJRkEthHH0bJYoPAVVgQiO6Uw6c_mHnfeROG14U,14
|
|
14
|
-
intellema_vdk-0.1.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|