intellema-vdk 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
intellema_vdk/__init__.py CHANGED
@@ -1,12 +1,9 @@
1
1
  from typing import Optional, List, Any
2
- import os
3
- from dotenv import load_dotenv
4
-
5
- # Load environment variables
6
- load_dotenv()
7
2
 
8
3
  from .livekit_lib.client import LiveKitManager
9
4
  from .retell_lib.retell_client import RetellManager
5
+ from .speech_lib.stt_client import STTManager
6
+ from .speech_lib.tts_streamer import TTSStreamer
10
7
 
11
8
  def VoiceClient(provider: str, **kwargs) -> Any:
12
9
  """
@@ -0,0 +1,73 @@
1
+ import os
2
+ import sys
3
+
4
+ # Add the project root to the python path so we can import intellema_vdk
5
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
6
+
7
+ from intellema_vdk.retell_lib.retell_client import RetellManager
8
+
9
def import_twilio_number():
    """
    Interactively import the configured Twilio phone number into Retell.

    Builds a ``RetellManager``, asks on the terminal for optional SIP trunk
    details (termination URI, credential-list username/password) and an
    optional nickname, then forwards the answers to
    ``RetellManager.import_phone_number``.

    Returns:
        The value returned by ``RetellManager.import_phone_number``.

    Raises:
        Exception: Any failure (manager construction, prompting, or the import
            itself) is re-raised after troubleshooting hints are printed.
    """
    # Local import: only this interactive helper needs it. getpass keeps the
    # SIP password from being echoed to the terminal (input() would show it).
    from getpass import getpass

    try:
        manager = RetellManager()

        print("=== Retell Phone Number Import ===\n")
        print(f"Phone Number to import: {manager.twilio_number}")
        print(f"Agent ID to bind: {manager.retell_agent_id}\n")

        # Ask if user has a Twilio SIP trunk
        print("Do you have a Twilio Elastic SIP Trunk configured?")
        print("If you're not sure, you can:")
        print(" 1. Visit: https://console.twilio.com/us1/develop/voice/manage/trunks")
        print(" 2. Or just press Enter to try without it (may not work for some setups)\n")

        has_trunk = input("Do you have a SIP trunk? (y/n, default: n): ").strip().lower()

        termination_uri = None
        sip_username = None
        sip_password = None

        if has_trunk == 'y':
            print("\nEnter your Twilio SIP Trunk Termination URI.")
            print("Format: yourtrunkname.pstn.twilio.com")
            print("You can find this in Twilio Console > Elastic SIP Trunking > Your Trunk > Termination")
            termination_uri = input("Termination URI: ").strip()

            print("\nDo you use Credential List authentication? (Recommended)")
            # An empty answer falls back to the advertised default of "y".
            has_creds = input("Use credentials? (y/n, default: y): ").strip().lower() or 'y'

            if has_creds == 'y':
                print("Enter the username/password from your Twilio Credential List:")
                sip_username = input("Username: ").strip()
                sip_password = getpass("Password: ").strip()

        # Optional nickname (an empty answer becomes None)
        nickname = input("\nOptional: Enter a nickname for this number (press Enter to skip): ").strip() or None

        print("\n=== Importing Phone Number ===")

        response = manager.import_phone_number(
            termination_uri=termination_uri,
            nickname=nickname,
            sip_trunk_auth_username=sip_username,
            sip_trunk_auth_password=sip_password,
        )

        print("\n=== Import Successful! ===")
        print("You can now use this number to make outbound calls via Retell.")

        return response

    except Exception as e:
        # Script-level boundary: print guidance, then let the error propagate.
        print(f"\n✗ Import failed: {e}")
        print("\nTroubleshooting:")
        print(" 1. If you don't have a SIP trunk, you may need to purchase the number through Retell")
        print(" 2. Visit Retell dashboard: https://app.retellai.com/")
        print(" 3. Or create a Twilio Elastic SIP Trunk first")
        raise
71
+
72
+ if __name__ == "__main__":
73
+ import_twilio_number()
@@ -26,64 +26,122 @@ class RetellManager:
26
26
  self.twilio_client = Client(self.twilio_account_sid, self.twilio_auth_token)
27
27
  self.retell_client = Retell(api_key=self.retell_api_key)
28
28
 
29
- def start_outbound_call(self, phone_number: str, prompt_content: str = None, call_id: str = None) -> str:
29
def import_phone_number(
    self,
    termination_uri: Optional[str] = None,
    outbound_agent_id: Optional[str] = None,
    inbound_agent_id: Optional[str] = None,
    nickname: Optional[str] = None,
    sip_trunk_auth_username: Optional[str] = None,
    sip_trunk_auth_password: Optional[str] = None,
):
    """
    Import/register the configured Twilio phone number (``self.twilio_number``) with Retell.

    Only arguments that are truthy are forwarded to
    ``self.retell_client.phone_number.import_``.

    Args:
        termination_uri: Twilio SIP trunk termination URI (e.g., "yourtrunk.pstn.twilio.com").
            When omitted, no termination URI is sent.
        outbound_agent_id: Agent ID to use for outbound calls. Falls back to
            ``self.retell_agent_id`` when not given.
        inbound_agent_id: Agent ID to use for inbound calls. Not sent when omitted.
        nickname: Optional nickname for the phone number.
        sip_trunk_auth_username: Username for SIP trunk authentication.
        sip_trunk_auth_password: Password for SIP trunk authentication.
            Credentials are sent only when BOTH username and password are
            provided; a lone username or password is ignored.

    Returns:
        The phone number registration response from Retell.

    Raises:
        Exception: Any error from the Retell client is re-raised after hints are printed.
    """
    import_kwargs = {"phone_number": self.twilio_number}

    if termination_uri:
        import_kwargs["termination_uri"] = termination_uri

    # SIP credentials are all-or-nothing.
    if sip_trunk_auth_username and sip_trunk_auth_password:
        import_kwargs["sip_trunk_auth_username"] = sip_trunk_auth_username
        import_kwargs["sip_trunk_auth_password"] = sip_trunk_auth_password

    # Explicit outbound agent wins; otherwise use the manager's configured agent.
    effective_outbound_agent = outbound_agent_id or self.retell_agent_id
    if effective_outbound_agent:
        import_kwargs["outbound_agent_id"] = effective_outbound_agent

    if inbound_agent_id:
        import_kwargs["inbound_agent_id"] = inbound_agent_id

    if nickname:
        import_kwargs["nickname"] = nickname

    try:
        response = self.retell_client.phone_number.import_(**import_kwargs)
        print(f"✓ Phone number {self.twilio_number} successfully imported to Retell!")
        print(f" Phone Number: {response.phone_number}")
        print(f" Type: {response.phone_number_type}")
        # The agent attributes may be absent or empty on the response.
        bound_outbound = getattr(response, 'outbound_agent_id', None)
        if bound_outbound:
            print(f" Outbound Agent: {bound_outbound}")
        bound_inbound = getattr(response, 'inbound_agent_id', None)
        if bound_inbound:
            print(f" Inbound Agent: {bound_inbound}")
        return response
    except Exception as e:
        print(f"✗ Error importing phone number: {e}")
        print("\nNote: If you're using Twilio, you may need to:")
        print(" 1. Create an Elastic SIP Trunk in Twilio console")
        print(" 2. Provide the termination_uri parameter (e.g., 'yourtrunk.pstn.twilio.com')")
        print(" 3. Or purchase the number directly through Retell dashboard")
        raise
48
91
 
49
- # 2. Construct the audio WebSocket URL using the call_id
50
- audio_websocket_url = f"wss://api.retellai.com/audio-websocket/{register_response.call_id}"
51
-
52
- # 3. Construct TwiML to connect Twilio to Retell
53
- # Note: We construct the XML string manually to avoid extra dependencies like twilio.twiml
54
- twiml = f"""<Response>
55
- <Connect>
56
- <Stream url="{audio_websocket_url}" />
57
- </Connect>
58
- </Response>"""
59
-
60
- # 3. Create the call with Twilio using the generated TwiML
61
- call = self.twilio_client.calls.create(
62
- to=phone_number,
63
- from_=self.twilio_number,
64
- twiml=twiml
65
- )
66
- return call.sid
67
92
 
68
- def delete_room(self, room_name: str):
69
- """
70
- Ends the call. 'room_name' is interpreted as the Twilio Call SID.
71
- Ends both the Retell agent and the Twilio call.
72
- """
93
def start_outbound_call(self, phone_number: str, prompt_content: Optional[str] = None, call_id: Optional[str] = None) -> str:
    """
    Start an outbound phone call through Retell.

    The call is placed from ``self.twilio_number`` with ``self.retell_agent_id``
    passed as ``override_agent_id``.

    Args:
        phone_number: The number to call.
        prompt_content: Optional custom prompt. When given, it is sent as the
            ``prompt_content`` dynamic variable with an appended system
            instruction telling the agent to end the call once the
            ``force_end`` dynamic variable becomes 'true'. ``force_end`` is
            initialised to "false". Without ``prompt_content`` no dynamic
            variables are sent.
        call_id: Optional custom ID, stored in the call metadata as ``call_id``.

    Returns:
        The Retell call ID of the created call.
    """
    call_kwargs = {
        "from_number": self.twilio_number,
        "to_number": phone_number,
        "override_agent_id": self.retell_agent_id,
    }

    if call_id:
        call_kwargs["metadata"] = {"call_id": call_id}

    # Only set up dynamic variables if we have a custom prompt
    if prompt_content:
        # Only the first fragment is an f-string; the other two are plain
        # literals so that "{{force_end}}" reaches Retell with its double braces.
        enhanced_prompt = (
            f"{prompt_content}\n\n"
            "SYSTEM INSTRUCTION: Continually check the dynamic variable {{force_end}}. "
            "If {{force_end}} == 'true', you must IMMEDIATELY say goodbye and activate your end_call tool to terminate the call."
        )
        call_kwargs["retell_llm_dynamic_variables"] = {
            "prompt_content": enhanced_prompt,
            "force_end": "false",
        }

    call_response = self.retell_client.call.create_phone_call(**call_kwargs)

    print("Call created successfully!")
    print(f"Retell Call ID: {call_response.call_id}")
    print(f"Call Status: {call_response.call_status}")

    return call_response.call_id
122
+
123
def delete_room(self, call_id: str):
    """
    Ask Retell to end the call identified by ``call_id``.

    The call is looked up first. If its status is 'registered', 'ongoing' or
    'dialing', the ``force_end`` dynamic variable is overridden to "true" via
    ``call.update``; any other status is reported as already ended and no
    update is sent.

    NOTE(review): this only sets a variable; it presumably relies on the agent
    prompt watching ``force_end`` (start_outbound_call adds that instruction
    only when a custom prompt is supplied) — confirm.

    Args:
        call_id: The Retell call ID.

    Raises:
        Exception: Any error from the Retell client is printed and re-raised.
    """
    try:
        call_api = self.retell_client.call
        status = call_api.retrieve(call_id).call_status
        print(f"Current call status: {status}")

        # Guard clause: nothing to do for calls that are no longer active.
        if status not in ('registered', 'ongoing', 'dialing'):
            print(f"Call already ended: {status}")
            return

        print(f"Triggering end for Retell call {call_id}...")
        call_api.update(
            call_id,
            override_dynamic_variables={"force_end": "true"},
        )
        print("✓ force_end override sent to Retell API")

    except Exception as exc:
        print(f"Error ending call {call_id}: {exc}")
        raise
85
143
 
86
- def start_stream(self, room_name: str, rtmp_urls: List[str]):
144
+ def start_stream(self, call_id: str, rtmp_urls: List[str]):
87
145
  """
88
146
  Starts a Twilio Media Stream.
89
147
  Note: Twilio streams are WebSocket-based. If rtmp_urls contains a WSS URL, it will work.
@@ -91,16 +149,16 @@ class RetellManager:
91
149
  if not rtmp_urls:
92
150
  raise ValueError("No stream URLs provided")
93
151
 
94
- self.twilio_client.calls(room_name).streams.create(
152
+ self.twilio_client.calls(call_id).streams.create(
95
153
  url=rtmp_urls[0]
96
154
  )
97
155
 
98
- def start_recording(self, room_name: str, output_filepath: Optional[str] = None, upload_to_s3: bool = True, wait_for_completion: bool = True):
156
+ def start_recording(self, call_id: str, output_filepath: Optional[str] = None, upload_to_s3: bool = True, wait_for_completion: bool = True):
99
157
  """
100
158
  Triggers a recording on the active Twilio call.
101
159
 
102
160
  Args:
103
- room_name: The Twilio Call SID.
161
+ call_id: The Twilio Call SID.
104
162
  output_filepath: Optional filename for the recording.
105
163
  upload_to_s3: If True, uploads to S3.
106
164
  wait_for_completion: If True, waits for recording to finish and then uploads.
@@ -110,7 +168,7 @@ class RetellManager:
110
168
  """
111
169
 
112
170
  # Start Twilio recording
113
- recording = self.twilio_client.calls(room_name).recordings.create()
171
+ recording = self.twilio_client.calls(call_id).recordings.create()
114
172
  print(f"Recording started: {recording.sid}")
115
173
 
116
174
  if not wait_for_completion:
@@ -147,7 +205,7 @@ class RetellManager:
147
205
  if not access_key or not secret_key or not bucket:
148
206
  raise ValueError("AWS credentials (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_S3_BUCKET) are required for S3 upload.")
149
207
 
150
- filename = output_filepath if output_filepath else f"{room_name}-{uuid.uuid4().hex[:6]}.mp3"
208
+ filename = output_filepath if output_filepath else f"{call_id}-{uuid.uuid4().hex[:6]}.mp3"
151
209
 
152
210
  s3 = boto3.client(
153
211
  's3',
@@ -170,20 +228,20 @@ class RetellManager:
170
228
 
171
229
  return recording.sid
172
230
 
173
- def mute_participant(self, room_name: str, identity: str, track_sid: str, muted: bool):
231
+ def mute_participant(self, call_id: str, identity: str, track_sid: str, muted: bool):
174
232
  """
175
233
  Mutes the participant on the Twilio call.
176
234
  This prevents audio from reaching the Retell AI.
177
235
  """
178
- self.twilio_client.calls(room_name).update(muted=muted)
236
+ self.twilio_client.calls(call_id).update(muted=muted)
179
237
 
180
- def kick_participant(self, room_name: str, identity: str):
238
+ def kick_participant(self, call_id: str, identity: str):
181
239
  """
182
240
  Alias for delete_room (hangup).
183
241
  """
184
- self.delete_room(room_name)
242
+ self.delete_room(call_id)
185
243
 
186
- def send_alert(self, room_name: str, message: str, participant_identity: Optional[str] = None):
244
+ def send_alert(self, call_id: str, message: str, participant_identity: Optional[str] = None):
187
245
  """
188
246
  Not fully supported in this hybrid model
189
247
  """
@@ -0,0 +1,2 @@
1
+ from .stt_client import STTManager
2
+ from .tts_streamer import TTSStreamer
@@ -0,0 +1,108 @@
1
+ import os
2
+ import logging
3
+ import httpx
4
+ from dotenv import load_dotenv
5
+ from openai import AsyncOpenAI
6
+
7
+ load_dotenv()
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
class STTManager:
    """
    Transcribe audio files with OpenAI and optionally post the text to an agent API.

    Can be used as an async context manager (``async with STTManager() as stt:``),
    which calls ``close()`` on exit.
    """

    def __init__(self):
        """
        Initializes the STTManager.

        Note:
            The following must be set in your .env file:
            - OPENAI_API_KEY
            - AGENT_API_URL (If not set, posting to agent will be disabled)

        Raises:
            ValueError: If OPENAI_API_KEY is not set.
        """
        self._api_key = os.getenv("OPENAI_API_KEY")
        if not self._api_key:
            raise ValueError("OPENAI_API_KEY must be set in your .env file.")

        self._agent_api_url = os.getenv("AGENT_API_URL")
        if not self._agent_api_url:
            logger.warning("AGENT_API_URL is not set in .env. Posting to agent will be disabled.")

        self._openai_client = AsyncOpenAI(api_key=self._api_key)
        self._http_client = httpx.AsyncClient()

    async def __aenter__(self):
        """Return the manager itself so it can be used with ``async with``."""
        return self

    async def __aexit__(self, exc_type, exc, tb):
        """Release both clients when leaving the ``async with`` block."""
        await self.close()

    async def close(self):
        """
        Cleans up resources used by the STTManager.

        The OpenAI client is closed even if closing the HTTP client raises.
        """
        try:
            await self._http_client.aclose()
        finally:
            await self._openai_client.close()

    async def transcribe_audio(self, file_path: str, model: str = "whisper-1") -> str:
        """
        Transcribes an audio file using OpenAI's whisper model.

        Args:
            file_path: The path to the audio file to transcribe.
                Supported formats: mp3, mp4, mpeg, mpga, m4a, wav, and webm.
            model: The name of the whisper model to use.
                Note: The OpenAI API currently only supports "whisper-1".

        Returns:
            The transcribed text as a string.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        logger.info("Starting transcription for file: %s", file_path)
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Audio file not found at: {file_path}")

        # The file must stay open for the duration of the upload.
        with open(file_path, "rb") as audio_file:
            transcript = await self._openai_client.audio.transcriptions.create(
                model=model,
                file=audio_file,
            )
        logger.info("Successfully transcribed file: %s", file_path)

        return transcript.text

    async def transcribe_and_post(self, file_path: str, model: str = "whisper-1"):
        """
        Processes an audio file by transcribing it and posting the result to the agent API under a 'message' key.

        Args:
            file_path: The path to the audio file to process.
                Supported formats: mp3, mp4, mpeg, mpga, m4a, wav, and webm.
            model: The transcription model name, forwarded to ``transcribe_audio``.

        Returns:
            The transcribed text as a string.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            Exception: Any transcription or posting error is re-raised.
        """
        try:
            transcript_text = await self.transcribe_audio(file_path, model=model)
        except FileNotFoundError:
            logger.error("Audio file not found at: %s", file_path, exc_info=True)
            raise
        except Exception as e:
            logger.error("An error occurred during processing of %s: %s", file_path, e, exc_info=True)
            raise

        # Posting sits outside the try block: _post_to_agent logs its own HTTP
        # failures, so wrapping it here would report the same error twice.
        if self._agent_api_url:
            await self._post_to_agent(transcript_text)
        else:
            logger.info("AGENT_API_URL not set, skipping post to agent.")

        return transcript_text

    async def _post_to_agent(self, text: str):
        """
        Posts the transcribed text to the agent API under a 'message' key.

        Args:
            text: The transcribed text to post.

        Raises:
            httpx.HTTPError: If the request fails or the response status is an error.
        """
        payload = {"message": text}
        try:
            # Log only the size: the transcript is user speech and may be sensitive.
            logger.info("Posting transcript to agent (%d characters)", len(text))
            response = await self._http_client.post(self._agent_api_url, json=payload)
            response.raise_for_status()
            logger.info("Successfully posted to agent. Status: %s", response.status_code)
        except httpx.HTTPError as e:
            logger.error("Failed to post to agent API: %s", e, exc_info=True)
            raise
@@ -0,0 +1,188 @@
1
+ import os
2
+ import queue
3
+ import threading
4
+ import time
5
+ import pyaudio
6
+ from together import Together
7
+
8
+
9
class TTSStreamer:
    """
    Stream LLM text to speech: text is split into sentences, synthesized through
    the Together client on one background thread, and played through a PyAudio
    output stream on another.
    """

    # Characters that terminate a sentence; checked in this order by feed().
    _SENTENCE_ENDINGS = (".", "!", "?", "\n")

    def __init__(
        self,
        api_key=None,
        *,
        model="canopylabs/orpheus-3b-0.1-ft",
        voice="tara",
        sample_rate=24000,
    ):
        """
        Create the client, open the audio output and start the worker threads.

        Args:
            api_key: Together API key; falls back to the TOGETHER_API_KEY env var.
            model: Speech model name passed to the Together client.
            voice: Voice name passed to the Together client.
            sample_rate: Output sample rate in Hz for the 16-bit mono stream.

        Raises:
            ValueError: If no API key is supplied or found in the environment.
        """
        self.api_key = api_key or os.environ.get("TOGETHER_API_KEY")
        if not self.api_key:
            raise ValueError(
                "Together API Key is missing. Set TOGETHER_API_KEY env var."
            )

        self.client = Together(api_key=self.api_key)
        self.model = model
        self.voice = voice
        self.sample_rate = sample_rate

        # Audio Config
        self.p = pyaudio.PyAudio()
        try:
            self.stream = self.p.open(
                format=pyaudio.paInt16, channels=1, rate=sample_rate, output=True
            )
        except Exception:
            # Do not leak the PyAudio instance when the device cannot be opened.
            self.p.terminate()
            raise

        # Queues
        self.text_queue = queue.Queue()
        self.audio_queue = queue.Queue()

        # State
        self.text_buffer = ""
        self.is_running = True
        self.playback_finished = threading.Event()

        # Start Threads
        self.fetcher_thread = threading.Thread(target=self._tts_fetcher, daemon=True)
        self.player_thread = threading.Thread(target=self._audio_player, daemon=True)

        self.fetcher_thread.start()
        self.player_thread.start()

    def feed(self, text_chunk):
        """
        Feed text tokens from LLM.

        Complete sentences (text up to a sentence-ending character) are queued
        for synthesis; the trailing incomplete part stays in ``text_buffer``.
        """
        if not self.is_running or not text_chunk:
            return

        self.text_buffer += text_chunk

        for ending in self._SENTENCE_ENDINGS:
            if ending not in self.text_buffer:
                continue

            # Everything before the last occurrence is complete; the tail is kept.
            *complete, remainder = self.text_buffer.split(ending)
            for sentence in complete:
                sentence = sentence.strip()
                if sentence:
                    self.text_queue.put(sentence + ending)

            self.text_buffer = remainder

    def flush(self, timeout=10.0):
        """
        Graceful finish: Push remaining text, signal end, and wait for audio to finish playing.

        Args:
            timeout: Maximum number of seconds to wait for playback to finish.

        Returns:
            True if playback finished within ``timeout``, otherwise False.
            After ``close()`` nothing is queued and the current state is returned.
        """
        if not self.is_running:
            # Workers are gone; queueing more text would only block the caller.
            return self.playback_finished.is_set()

        # 1. Push remaining buffer and reset it so it cannot be sent twice
        remaining, self.text_buffer = self.text_buffer.strip(), ""
        if remaining:
            self.text_queue.put(remaining)

        # 2. Signal Fetcher to stop expecting text
        self.text_queue.put(None)

        # 3. Wait for the player to signal it's done (bounded by timeout)
        return self.playback_finished.wait(timeout=timeout)

    def close(self):
        """
        Immediate kill: Stop threads and close audio stream.
        """
        if not self.is_running:
            return

        self.is_running = False

        # Drop pending work; the workers notice is_running on their next poll.
        with self.text_queue.mutex:
            self.text_queue.queue.clear()
        with self.audio_queue.mutex:
            self.audio_queue.queue.clear()

        # Best-effort teardown: release each resource independently so one
        # failure does not prevent the remaining ones from being released.
        for release in (self.stream.stop_stream, self.stream.close, self.p.terminate):
            try:
                release()
            except Exception:
                pass

    def stop(self):
        """Alias for close"""
        self.close()

    def _tts_fetcher(self):
        """Worker: turn queued sentences into audio chunks on ``audio_queue``."""
        while self.is_running:
            try:
                text = self.text_queue.get(timeout=0.5)
            except queue.Empty:
                continue

            try:
                if text is None:
                    self.audio_queue.put(None)  # Signal player to finish
                    break

                response = self.client.audio.speech.create(
                    model=self.model,
                    input=text,
                    voice=self.voice,
                    stream=True,
                    response_format="raw",
                    response_encoding="pcm_s16le",
                )

                for chunk in response:
                    if not self.is_running:
                        break
                    self._enqueue_chunk(chunk)

            except Exception as e:
                # One failed sentence must not kill the worker.
                print(f"TTS Error: {e}")
            finally:
                # Runs for the sentinel too, so text_queue.join() can return.
                self.text_queue.task_done()

    def _enqueue_chunk(self, chunk):
        """Extract audio bytes from one streamed response item, whatever its shape."""
        if isinstance(chunk, tuple):
            if len(chunk) > 1:
                sub_iterator = chunk[1]
                # bytes is iterable (yielding ints), so handle it before iterating
                if isinstance(sub_iterator, bytes):
                    self._process_audio_bytes(sub_iterator)
                else:
                    try:
                        for sub_chunk in sub_iterator:
                            if isinstance(sub_chunk, bytes):
                                self._process_audio_bytes(sub_chunk)
                            elif hasattr(sub_chunk, "content"):
                                self._process_audio_bytes(sub_chunk.content)
                            elif hasattr(sub_chunk, "data"):
                                self._process_audio_bytes(sub_chunk.data)
                    except TypeError:
                        # Second tuple element is not iterable: nothing to play.
                        pass

        elif hasattr(chunk, "content"):
            audio_data = chunk.content
            if audio_data:
                self._process_audio_bytes(audio_data)

        elif isinstance(chunk, bytes):
            self._process_audio_bytes(chunk)

    def _process_audio_bytes(self, audio_data):
        """Helper to strip headers and push to queue"""
        # Strip WAV header if present (data starts with RIFF; 44 bytes dropped)
        if len(audio_data) >= 44 and audio_data[:4] == b"RIFF":
            audio_data = audio_data[44:]
        self.audio_queue.put(audio_data)

    def _audio_player(self):
        """Worker: write queued audio to the output stream in whole 16-bit samples."""
        buffer = b""
        try:
            while self.is_running:
                try:
                    audio_data = self.audio_queue.get(timeout=0.5)
                except queue.Empty:
                    continue

                if audio_data is None:
                    break

                buffer += audio_data

                if len(buffer) >= 2:
                    # Play only whole 2-byte samples; carry an odd byte over.
                    usable = len(buffer) - (len(buffer) % 2)
                    play_chunk, buffer = buffer[:usable], buffer[usable:]

                    try:
                        self.stream.write(play_chunk)
                    except OSError:
                        break
        finally:
            # Always release flush() waiters, including on device errors or
            # shutdown, so they do not sit out the whole timeout.
            self.playback_finished.set()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: intellema-vdk
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: A Voice Development Kit for different Voice Agent Platforms
5
5
  Author: Intellema
6
6
  License: MIT License
@@ -37,6 +37,12 @@ Requires-Dist: boto3>=1.28.0
37
37
  Requires-Dist: twilio
38
38
  Requires-Dist: retell-sdk
39
39
  Requires-Dist: requests
40
+ Requires-Dist: openai
41
+ Requires-Dist: httpx
42
+ Requires-Dist: pyaudio
43
+ Requires-Dist: together
44
+ Requires-Dist: langchain-openai
45
+ Requires-Dist: langchain-core
40
46
  Dynamic: license-file
41
47
 
42
48
  # Intellema VDK
@@ -100,6 +106,73 @@ from intellema_vdk import start_outbound_call
100
106
  await start_outbound_call("livekit", phone_number="+1...")
101
107
  ```
102
108
 
109
+ ## Speech To Text (STT)
110
+
111
+ The `STTManager` class provides an interface for transcribing audio files using OpenAI's Whisper model and optionally posting the transcribed text to a specified agent API.
112
+
113
+ ### Usage
114
+
115
+ Here's how to use the `STTManager` to transcribe an audio file and post the result:
116
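+ Make sure `OPENAI_API_KEY` is set in your `.env` file. `AGENT_API_URL` is optional: when it is not set, the audio is still transcribed but the transcript is not posted.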
117
+
118
+ ```python
119
+ import asyncio
120
+ from intellema_vdk import STTManager
121
+
122
+ async def main():
123
+ # 1- Initialize the STTManager
124
+ stt_manager = STTManager()
125
+
126
+ try:
127
+ # 2- Transcribe an audio file and post the result to your agent API URL (if provided)
128
+ # Replace "path/to/your/audio.mp3" with the actual file path
129
+ transcript = await stt_manager.transcribe_and_post("path/to/your/audio.mp3")
130
+ print(f"Transcription: {transcript}")
131
+
132
+ except FileNotFoundError:
133
+ print("The audio file was not found.")
134
+ except Exception as e:
135
+ print(f"An error occurred: {e}")
136
+ finally:
137
+ # 3- Clean up
138
+ await stt_manager.close()
139
+
140
+ if __name__ == "__main__":
141
+ asyncio.run(main())
142
+ ```
143
+
144
+ ## TTS Streaming
145
+
146
+ The `TTSStreamer` class provides low-latency text-to-speech streaming using Together AI's inference engine. It enables real-time voice synthesis from streaming LLM responses.
147
+
148
+ ### Running the Sample implementation
149
+
150
+ We provide a ready-to-use sample that connects LangChain (OpenAI) with the TTS Streamer.
151
+
152
+ 1. **Configure Keys**: Ensure `OPENAI_API_KEY` and `TOGETHER_API_KEY` are set in your `.env`.
153
+ 2. **Run the script**:
154
+ ```bash
155
+ python sample_implementation.py
156
+ ```
157
+
158
+ ### Library Usage
159
+
160
+ You can integrate the streamer into your own loops:
161
+
162
+ ```python
163
+ from intellema_vdk import TTSStreamer
164
+
165
+ # 1. Initialize per turn
166
+ tts = TTSStreamer()
167
+
168
+ # 2. Feed text chunks as they are generated
169
+ for chunk in llm_response_stream:
170
+ tts.feed(chunk)
171
+
172
+ # 3. Flush and clean up
173
+ tts.flush()
174
+ tts.close()
175
+ ```
103
176
 
104
177
  ## Configuration
105
178
 
@@ -115,6 +188,34 @@ TWILIO_AUTH_TOKEN=your-token
115
188
  TWILIO_PHONE_NUMBER=your-number
116
189
  RETELL_API_KEY=your-retell-key
117
190
  RETELL_AGENT_ID=your-agent-id
191
+ TOGETHER_API_KEY=your-together-key
192
+ OPENAI_API_KEY=your-openai-key
193
+ AGENT_API_URL=https://your-agent-api.com/endpoint
118
194
  ```
119
195
 
196
+ ## Retell Setup
197
+
198
+ **Important:** Before initiating calls with Retell, you must register your Twilio phone number with Retell. This binds your agent to the number and allows Retell to handle the call flow.
199
+
200
+ You can register your number in two ways:
201
+
202
+ 1. **Using the Helper Script:**
203
+ We provide an interactive script to guide you through the process:
204
+ ```bash
205
+ python import_phone_number.py
206
+ ```
207
+
208
+ 2. **Programmatically:**
209
+ ```python
210
+ from intellema_vdk.retell_lib.retell_client import RetellManager
211
+
212
+ manager = RetellManager()
213
+ # Optional: Pass termination_uri if you have a SIP trunk
214
+ manager.import_phone_number(nickname="My Twilio Number")
215
+ ```
216
+
217
+ ## Notes
218
+
219
+ - **Retell `delete_room` Limitation**: The `delete_room` method for Retell relies on updating dynamic variables during the conversation loop. As a result, it **only takes effect after the user says something**, which prompts the agent to check the variable and terminate the call.
220
+
120
221
 
@@ -0,0 +1,14 @@
1
+ intellema_vdk/__init__.py,sha256=64pm2TLqhGG225JLddco1kSOpLaD3eGByWvMpaHUUX0,1231
2
+ intellema_vdk/livekit_lib/__init__.py,sha256=9JsOBswDivM8tRw9EF1ql0wwFnHvwjcPWT-umqad98o,68
3
+ intellema_vdk/livekit_lib/client.py,sha256=UxOuT9I-YPtHopx4dXoGKRAJvLXKFgUdtrAcHdR4a-Q,10687
4
+ intellema_vdk/retell_lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ intellema_vdk/retell_lib/import_phone_number.py,sha256=pOt1k6De6-bt2xOPzMMR0nI4Ha6BzLjm19qenyy9RN8,3081
6
+ intellema_vdk/retell_lib/retell_client.py,sha256=qT00NJWi2rJyLWswWTx5fGl5mwPdy6QurQt1Enac0rU,10793
7
+ intellema_vdk/speech_lib/__init__.py,sha256=TXdyAAS6AfQfln_QlIvx_uXU-ksugXzC2N9hrjW1_MQ,73
8
+ intellema_vdk/speech_lib/stt_client.py,sha256=YB8-mJUtQKhqEC4zhipJUb6Y8LqJx0Vv_c4iIxuUjJM,4054
9
+ intellema_vdk/speech_lib/tts_streamer.py,sha256=qs2mzP0vKqv2eKvGJSCTee3mzeJGS9nji0Yy3Y-sOTc,6453
10
+ intellema_vdk-0.2.0.dist-info/licenses/LICENSE,sha256=41qw3yuvY1SpTkwLebZTVYOKk9OIe1Kr6I1S6Y5mp8Y,1087
11
+ intellema_vdk-0.2.0.dist-info/METADATA,sha256=j53oDXr8Xcq7nkP4v6bWeK62z0yR-Pa2yivGaCw-abc,7363
12
+ intellema_vdk-0.2.0.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
13
+ intellema_vdk-0.2.0.dist-info/top_level.txt,sha256=nQ_0rJRkEthHH0bJYoPAVVgQiO6Uw6c_mHnfeROG14U,14
14
+ intellema_vdk-0.2.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: setuptools (80.10.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,14 +0,0 @@
1
- intellema_vdk/__init__.py,sha256=L3hrqcxGVpd4xjXMdEXEO8-Rsg4MfJUR_iN3X0hbido,1224
2
- intellema_vdk/livekit_lib/__init__.py,sha256=9JsOBswDivM8tRw9EF1ql0wwFnHvwjcPWT-umqad98o,68
3
- intellema_vdk/livekit_lib/client.py,sha256=UxOuT9I-YPtHopx4dXoGKRAJvLXKFgUdtrAcHdR4a-Q,10687
4
- intellema_vdk/livekit_lib/__pycache__/__init__.cpython-312.pyc,sha256=-LfSHUwq29ExyfKYStSqyiEzVE--LhEkogP2TLI1xPA,224
5
- intellema_vdk/livekit_lib/__pycache__/client.cpython-312.pyc,sha256=6_zxzDpWZHmBNIuJeKOCWQlYe9XxAhW0npUlcX0Z9sc,14331
6
- intellema_vdk/retell_lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- intellema_vdk/retell_lib/retell_client.py,sha256=sc6EnDc7J4SEF515-YIre56Tg-KLXDRRzli7Jwf-bSo,8037
8
- intellema_vdk/retell_lib/__pycache__/__init__.cpython-312.pyc,sha256=raQGnf_MXBHnMWj5H8iXd3B7U53AfOEyXX2aPx9s1cA,147
9
- intellema_vdk/retell_lib/__pycache__/retell_client.cpython-312.pyc,sha256=eE6z7VLSDglUI5feZnK-qKlB-pOTmoAyDWsgEuraP-s,10060
10
- intellema_vdk-0.1.0.dist-info/licenses/LICENSE,sha256=41qw3yuvY1SpTkwLebZTVYOKk9OIe1Kr6I1S6Y5mp8Y,1087
11
- intellema_vdk-0.1.0.dist-info/METADATA,sha256=RqzJZRQZn35QgwEkQu9LZbCIDW7D6b6dSBFfFiLIcTk,4120
12
- intellema_vdk-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
13
- intellema_vdk-0.1.0.dist-info/top_level.txt,sha256=nQ_0rJRkEthHH0bJYoPAVVgQiO6Uw6c_mHnfeROG14U,14
14
- intellema_vdk-0.1.0.dist-info/RECORD,,