intellema-vdk 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. intellema_vdk/__init__.py +67 -10
  2. intellema_vdk/config.py +14 -0
  3. intellema_vdk/providers/__init__.py +35 -0
  4. intellema_vdk/providers/livekit/__init__.py +19 -0
  5. intellema_vdk/providers/livekit/client.py +612 -0
  6. intellema_vdk/providers/livekit/exceptions.py +23 -0
  7. intellema_vdk/providers/protocols.py +33 -0
  8. intellema_vdk/providers/retell/__init__.py +17 -0
  9. intellema_vdk/providers/retell/client.py +468 -0
  10. intellema_vdk/providers/retell/exceptions.py +19 -0
  11. intellema_vdk/{retell_lib → providers/retell}/import_phone_number.py +1 -1
  12. intellema_vdk/stt/__init__.py +17 -0
  13. intellema_vdk/stt/client.py +482 -0
  14. intellema_vdk/stt/exceptions.py +19 -0
  15. intellema_vdk/tts/__init__.py +15 -0
  16. intellema_vdk/tts/__pycache__/__init__.cpython-312.pyc +0 -0
  17. intellema_vdk/tts/__pycache__/client.cpython-312.pyc +0 -0
  18. intellema_vdk/tts/__pycache__/exceptions.cpython-312.pyc +0 -0
  19. intellema_vdk/tts/__pycache__/providers.cpython-312.pyc +0 -0
  20. intellema_vdk/tts/client.py +541 -0
  21. intellema_vdk/tts/exceptions.py +15 -0
  22. intellema_vdk/tts/providers.py +293 -0
  23. intellema_vdk/utils/logger_config.py +41 -0
  24. intellema_vdk-0.2.2.dist-info/METADATA +311 -0
  25. intellema_vdk-0.2.2.dist-info/RECORD +29 -0
  26. {intellema_vdk-0.2.0.dist-info → intellema_vdk-0.2.2.dist-info}/WHEEL +1 -1
  27. intellema_vdk/livekit_lib/__init__.py +0 -3
  28. intellema_vdk/livekit_lib/client.py +0 -280
  29. intellema_vdk/retell_lib/retell_client.py +0 -248
  30. intellema_vdk/speech_lib/__init__.py +0 -2
  31. intellema_vdk/speech_lib/stt_client.py +0 -108
  32. intellema_vdk/speech_lib/tts_streamer.py +0 -188
  33. intellema_vdk-0.2.0.dist-info/METADATA +0 -221
  34. intellema_vdk-0.2.0.dist-info/RECORD +0 -14
  35. /intellema_vdk/{retell_lib/__init__.py → stt/providers.py} +0 -0
  36. {intellema_vdk-0.2.0.dist-info → intellema_vdk-0.2.2.dist-info}/licenses/LICENSE +0 -0
  37. {intellema_vdk-0.2.0.dist-info → intellema_vdk-0.2.2.dist-info}/top_level.txt +0 -0
@@ -1,280 +0,0 @@
1
- import os
2
- import json
3
- import uuid
4
- import asyncio
5
- import time
6
- import boto3
7
- from typing import List, Optional
8
- from dotenv import load_dotenv
9
- from livekit import api
10
-
11
- # Load environment variables
12
- load_dotenv(dotenv_path=".env.local")
13
- load_dotenv()
14
-
15
class LiveKitManager:
    """Async helper around the LiveKit server API for outbound voice calls.

    Configuration is read from the environment when the manager is created:
    ``LIVEKIT_URL``, ``LIVEKIT_API_KEY`` and ``LIVEKIT_API_SECRET`` are
    required, ``SIP_OUTBOUND_TRUNK_ID`` is only needed to place calls.
    """

    def __init__(self):
        """Read the LiveKit settings and create the API client.

        Raises:
            ValueError: If the URL, API key or API secret is missing.
        """
        self.url = os.getenv("LIVEKIT_URL")
        self.api_key = os.getenv("LIVEKIT_API_KEY")
        self.api_secret = os.getenv("LIVEKIT_API_SECRET")
        self.sip_trunk_id = os.getenv("SIP_OUTBOUND_TRUNK_ID")

        if not self.url or not self.api_key or not self.api_secret:
            raise ValueError("LIVEKIT_URL, LIVEKIT_API_KEY, and LIVEKIT_API_SECRET must be set.")

        self.lk_api = api.LiveKitAPI(
            url=self.url,
            api_key=self.api_key,
            api_secret=self.api_secret,
        )

    async def close(self):
        """Close the underlying LiveKit API client."""
        await self.lk_api.aclose()

    @staticmethod
    def _is_busy_error(error: Exception) -> bool:
        """Return True when *error* looks like a SIP 486 "Busy Here" reply."""
        import re

        text = str(error)
        # Match 486 only as a standalone number so that a dialled number that
        # happens to contain "486" is not mistaken for a busy signal.
        return "Busy Here" in text or re.search(r"(?<!\d)486(?!\d)", text) is not None

    async def _discard_room(self, call_id: str) -> None:
        """Best-effort removal of a room after a failed call setup."""
        try:
            await self.delete_room(call_id)
        except Exception as cleanup_error:
            # Never let a cleanup failure hide the error that caused it.
            print(f"Failed to clean up room {call_id}: {cleanup_error}")

    async def start_outbound_call(self, phone_number: str, prompt_content: str, call_id: Optional[str] = None, timeout: int = 600):
        """Create a room, dispatch the agent and dial *phone_number* over SIP.

        Args:
            phone_number: Number to dial; also stored in the room metadata.
            prompt_content: Prompt passed to the agent through the metadata.
            call_id: Room name to use. A random one is generated when omitted.
            timeout: Passed to LiveKit as the room's ``empty_timeout``.

        Returns:
            The room object returned by ``create_room``.

        Raises:
            ValueError: If no SIP trunk is configured, or the callee is busy.
        """
        # Validate before creating anything, so a missing trunk cannot leave
        # behind a room with a dispatched agent and nobody to call.
        if not self.sip_trunk_id:
            raise ValueError("SIP_OUTBOUND_TRUNK_ID is not configured in environment.")

        if not call_id:
            call_id = f"outbound_call_{uuid.uuid4().hex[:12]}"

        metadata = json.dumps({
            "phone_number": phone_number,
            "prompt_content": prompt_content,
        })

        # 1. Create room with metadata
        room = await self.lk_api.room.create_room(
            api.CreateRoomRequest(
                name=call_id,
                empty_timeout=timeout,
                metadata=metadata,
            )
        )

        # 2. Dispatch agent
        try:
            await self.lk_api.agent_dispatch.create_dispatch(
                api.CreateAgentDispatchRequest(
                    room=call_id,
                    agent_name="outbound-caller",
                    metadata=metadata,
                )
            )
        except Exception:
            await self._discard_room(call_id)
            raise

        # 3. Initiate Outbound Call (SIP/PSTN)
        sip_participant_identity = f"phone-{phone_number}"
        try:
            await self.lk_api.sip.create_sip_participant(
                api.CreateSIPParticipantRequest(
                    room_name=call_id,
                    sip_trunk_id=self.sip_trunk_id,
                    sip_call_to=phone_number,
                    participant_identity=sip_participant_identity,
                    wait_until_answered=True,
                )
            )
        except Exception as e:
            busy = self._is_busy_error(e)
            if busy:
                print(f"Call failed: User is busy ({phone_number})")
            # The call never connected, so the room is useless either way.
            await self._discard_room(call_id)
            if busy:
                raise ValueError("User is busy") from e
            raise

        return room

    async def create_token(self, call_id: str, participant_name: str) -> str:
        """Return a JWT that lets *participant_name* join room *call_id*."""
        token = api.AccessToken(self.api_key, self.api_secret)
        token.with_identity(participant_name)
        token.with_name(participant_name)
        token.with_grants(api.VideoGrants(
            room_join=True,
            room=call_id,
        ))
        return token.to_jwt()

    async def delete_room(self, call_id: str):
        """Delete the room named *call_id*."""
        await self.lk_api.room.delete_room(api.DeleteRoomRequest(room=call_id))

    async def start_stream(self, call_id: str, rtmp_urls: List[str]):
        """Start a room-composite egress streaming the room to *rtmp_urls*."""
        await self.lk_api.egress.start_room_composite_egress(
            api.RoomCompositeEgressRequest(
                room_name=call_id,
                layout="speaker",
                stream_outputs=[
                    api.StreamOutput(
                        protocol=api.StreamProtocol.RTMP,
                        urls=rtmp_urls,
                    )
                ],
            )
        )

    async def start_recording(self, call_id: str, output_filepath: Optional[str] = None, upload_to_s3: bool = True, wait_for_completion: bool = True):
        """
        Start recording a room.

        Args:
            call_id: Name of the room/call to record.
            output_filepath: Optional path/filename for the recording.
            upload_to_s3: If True, uploads to S3 (requires env vars). If False, saves locally on Egress server.
            wait_for_completion: If True, waits for the recording to finish and downloads it locally (if upload_to_s3 is True).

        Raises:
            ValueError: If S3 upload is requested but the AWS settings are missing.
            RuntimeError: If the egress fails, is aborted, or hits a limit.
        """
        filename = output_filepath if output_filepath else f"{call_id}-{uuid.uuid4().hex[:6]}.mp4"

        if upload_to_s3:
            access_key = os.getenv("AWS_ACCESS_KEY_ID")
            secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
            bucket = os.getenv("AWS_S3_BUCKET")
            region = os.getenv("AWS_REGION")

            if not access_key or not secret_key or not bucket:
                raise ValueError("AWS credentials (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_S3_BUCKET) are required for S3 upload.")

            file_output = api.EncodedFileOutput(
                file_type=api.EncodedFileType.MP4,
                filepath=filename,
                s3=api.S3Upload(
                    access_key=access_key,
                    secret=secret_key,
                    bucket=bucket,
                    region=region,
                ),
            )
            print(f"Starting recording. File will be saved to S3: s3://{bucket}/{filename}")
        else:
            file_output = api.EncodedFileOutput(
                file_type=api.EncodedFileType.MP4,
                filepath=filename,
            )
            print(f"Starting recording. File will be saved locally: {filename}")

        egress_info = await self.lk_api.egress.start_room_composite_egress(
            api.RoomCompositeEgressRequest(
                room_name=call_id,
                layout="grid",
                preset=api.EncodingOptionsPreset.H264_720P_30,
                file_outputs=[file_output],
            )
        )

        if not (wait_for_completion and upload_to_s3):
            return

        egress_id = egress_info.egress_id
        print(f"Waiting for egress {egress_id} to complete...")

        while True:
            try:
                egress_list = await self.lk_api.egress.list_egress(api.ListEgressRequest(egress_id=egress_id))
            except Exception as e:
                # Transient API errors should not abort a long recording.
                print(f"Error checking egress status: {e}")
                await asyncio.sleep(5)
                continue

            if not egress_list.items:
                # Nothing left to poll; fall through and attempt the download.
                print("Egress info not found during polling.")
                break

            info = egress_list.items[0]
            if info.status == api.EgressStatus.EGRESS_COMPLETE:
                print("Egress completed successfully.")
                break
            if info.status == api.EgressStatus.EGRESS_FAILED:
                raise RuntimeError(f"Egress failed: {info.error}")
            if info.status == api.EgressStatus.EGRESS_ABORTED:
                # Aborted is terminal too; without this branch the loop never ends.
                raise RuntimeError(f"Egress aborted: {info.error}")
            if info.status == api.EgressStatus.EGRESS_LIMIT_REACHED:
                raise RuntimeError(f"Egress limit reached: {info.error}")

            await asyncio.sleep(5)

        # Download from S3
        print(f"Downloading {filename} from S3 bucket {bucket}...")
        s3 = boto3.client(
            's3',
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
            region_name=region,
        )

        local_dir = "recordings"
        local_path = os.path.join(local_dir, filename)
        # The filename may carry sub-directories (e.g. "calls/a.mp4").
        os.makedirs(os.path.dirname(local_path) or local_dir, exist_ok=True)

        try:
            # boto3 is blocking; run it off the event loop.
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(None, s3.download_file, bucket, filename, local_path)
            print(f"Recording downloaded to: {local_path}")
        except Exception as e:
            print(f"Failed to download recording: {e}")
            raise

    async def kick_participant(self, call_id: str, identity: str):
        """Remove participant *identity* from room *call_id*."""
        await self.lk_api.room.remove_participant(
            api.RoomParticipantIdentity(
                room=call_id,
                identity=identity,
            )
        )

    async def mute_participant(self, call_id: str, identity: str, track_sid: str, muted: bool):
        """Mute or unmute one published track of a participant."""
        await self.lk_api.room.mute_published_track(
            api.MuteRoomTrackRequest(
                room=call_id,
                identity=identity,
                track_sid=track_sid,
                muted=muted,
            )
        )

    async def send_alert(self, call_id: str, message: str, participant_identity: Optional[str] = None):
        """Send a reliable ``{"type": "alert"}`` data packet to the room.

        Args:
            call_id: Room to send the alert to.
            message: Alert text.
            participant_identity: Single recipient; an empty destination list
                is sent when omitted.
        """
        destination_identities = [participant_identity] if participant_identity else []
        data_packet = json.dumps({"type": "alert", "message": message}).encode('utf-8')

        await self.lk_api.room.send_data(
            api.SendDataRequest(
                room=call_id,
                data=data_packet,
                # In the LiveKit protocol RELIABLE = 0 and LOSSY = 1; use the
                # named constant rather than a magic number.
                kind=api.DataPacket.Kind.RELIABLE,
                destination_identities=destination_identities,
            )
        )

    async def get_participant_identities(self, call_id: str) -> List[dict]:
        """
        Get a list of all participants in a room with their identities and tracks.

        Returns:
            List of dicts with participant info:
            [
                {
                    "identity": str,
                    "name": str,
                    "tracks": [
                        {"sid": str, "type": str, "muted": bool, "source": str},
                        ...
                    ]
                },
                ...
            ]
            "type" is "audio", "video" or "unknown".
        """
        response = await self.lk_api.room.list_participants(
            api.ListParticipantsRequest(room=call_id)
        )
        participants = []
        for p in response.participants:
            tracks = []
            for track in p.tracks:
                # Compare against the protocol enum (AUDIO = 0, VIDEO = 1);
                # hard-coded 1/2 mislabelled every track.
                if track.type == api.TrackType.AUDIO:
                    track_kind = "audio"
                elif track.type == api.TrackType.VIDEO:
                    track_kind = "video"
                else:
                    track_kind = "unknown"
                tracks.append({
                    "sid": track.sid,
                    "type": track_kind,
                    "muted": track.muted,
                    # NOTE(review): if source is a plain protobuf int this
                    # yields its number, not its name - confirm intent.
                    "source": track.source.name if hasattr(track.source, 'name') else str(track.source),
                })
            participants.append({
                "identity": p.identity,
                "name": p.name,
                "tracks": tracks,
            })
        return participants
279
-
280
-
@@ -1,248 +0,0 @@
1
- import os
2
- from typing import List, Optional
3
- from dotenv import load_dotenv
4
- from twilio.rest import Client
5
- from retell import Retell
6
- import time
7
- import uuid
8
- import requests
9
- import boto3
10
-
11
- # Load environment variables
12
- load_dotenv(dotenv_path=".env.local")
13
- load_dotenv()
14
-
15
class RetellManager:
    """Places and controls Retell AI phone calls backed by a Twilio number.

    All settings are read from the environment when the manager is created:
    ``TWILIO_ACCOUNT_SID``, ``TWILIO_AUTH_TOKEN``, ``TWILIO_PHONE_NUMBER``,
    ``RETELL_API_KEY`` and ``RETELL_AGENT_ID``.
    """

    def __init__(self):
        """Read the settings and create the Twilio and Retell clients.

        Raises:
            ValueError: If any of the five settings is missing.
        """
        self.twilio_account_sid = os.getenv("TWILIO_ACCOUNT_SID")
        self.twilio_auth_token = os.getenv("TWILIO_AUTH_TOKEN")
        self.twilio_number = os.getenv("TWILIO_PHONE_NUMBER")
        self.retell_api_key = os.getenv("RETELL_API_KEY")
        self.retell_agent_id = os.getenv("RETELL_AGENT_ID")

        if not all([self.twilio_account_sid, self.twilio_auth_token, self.twilio_number, self.retell_api_key, self.retell_agent_id]):
            raise ValueError("Missing necessary environment variables for RetellManager")

        self.twilio_client = Client(self.twilio_account_sid, self.twilio_auth_token)
        self.retell_client = Retell(api_key=self.retell_api_key)

    def import_phone_number(self, termination_uri: Optional[str] = None, outbound_agent_id: Optional[str] = None, inbound_agent_id: Optional[str] = None, nickname: Optional[str] = None, sip_trunk_auth_username: Optional[str] = None, sip_trunk_auth_password: Optional[str] = None):
        """
        Import/register your Twilio phone number with Retell.
        This is required before you can make outbound calls using the phone number.

        Args:
            termination_uri: Twilio SIP trunk termination URI (e.g., "yourtrunk.pstn.twilio.com").
                Omitted from the request when not provided.
            outbound_agent_id: Agent ID to use for outbound calls. Defaults to self.retell_agent_id.
            inbound_agent_id: Agent ID to use for inbound calls. Defaults to None (no inbound).
            nickname: Optional nickname for the phone number.
            sip_trunk_auth_username: Username for SIP trunk authentication (if using credential list).
            sip_trunk_auth_password: Password for SIP trunk authentication (if using credential list).

        Returns:
            The phone number registration response from Retell.
        """
        import_kwargs = {
            "phone_number": self.twilio_number,
        }

        if termination_uri:
            import_kwargs["termination_uri"] = termination_uri

        # Credentials are only sent as a pair.
        if sip_trunk_auth_username and sip_trunk_auth_password:
            import_kwargs["sip_trunk_auth_username"] = sip_trunk_auth_username
            import_kwargs["sip_trunk_auth_password"] = sip_trunk_auth_password

        # Outbound calls need an agent; fall back to the configured one.
        effective_outbound_agent = outbound_agent_id or self.retell_agent_id
        if effective_outbound_agent:
            import_kwargs["outbound_agent_id"] = effective_outbound_agent

        if inbound_agent_id:
            import_kwargs["inbound_agent_id"] = inbound_agent_id

        if nickname:
            import_kwargs["nickname"] = nickname

        try:
            response = self.retell_client.phone_number.import_(**import_kwargs)
            print(f"✓ Phone number {self.twilio_number} successfully imported to Retell!")
            print(f" Phone Number: {response.phone_number}")
            print(f" Type: {response.phone_number_type}")
            if hasattr(response, 'outbound_agent_id') and response.outbound_agent_id:
                print(f" Outbound Agent: {response.outbound_agent_id}")
            if hasattr(response, 'inbound_agent_id') and response.inbound_agent_id:
                print(f" Inbound Agent: {response.inbound_agent_id}")
            return response
        except Exception as e:
            print(f"✗ Error importing phone number: {e}")
            print("\nNote: If you're using Twilio, you may need to:")
            print(" 1. Create an Elastic SIP Trunk in Twilio console")
            print(" 2. Provide the termination_uri parameter (e.g., 'yourtrunk.pstn.twilio.com')")
            print(" 3. Or purchase the number directly through Retell dashboard")
            raise

    def start_outbound_call(self, phone_number: str, prompt_content: Optional[str] = None, call_id: Optional[str] = None) -> str:
        """Start an outbound Retell call from the configured Twilio number.

        Args:
            phone_number: Number to dial.
            prompt_content: Optional custom prompt. When given it is passed as
                a dynamic variable together with a ``force_end`` flag that
                :meth:`delete_room` later flips to end the call.
            call_id: Optional caller-side ID stored in the call metadata.

        Returns:
            The call ID assigned by Retell.
        """
        call_kwargs = {
            "from_number": self.twilio_number,
            "to_number": phone_number,
            "override_agent_id": self.retell_agent_id,
        }

        if call_id:
            call_kwargs["metadata"] = {"call_id": call_id}

        # Only set up dynamic variables if we have a custom prompt
        if prompt_content:
            # Only the first piece is an f-string, so {{force_end}} stays a
            # literal double-brace placeholder in the prompt text.
            enhanced_prompt = (
                f"{prompt_content}\n\n"
                "SYSTEM INSTRUCTION: Continually check the dynamic variable {{force_end}}. "
                "If {{force_end}} == 'true', you must IMMEDIATELY say goodbye and activate your end_call tool to terminate the call."
            )
            call_kwargs["retell_llm_dynamic_variables"] = {
                "prompt_content": enhanced_prompt,
                "force_end": "false",
            }

        call_response = self.retell_client.call.create_phone_call(**call_kwargs)

        print("Call created successfully!")
        print(f"Retell Call ID: {call_response.call_id}")
        print(f"Call Status: {call_response.call_status}")

        return call_response.call_id

    def delete_room(self, call_id: str):
        """Ask the agent to end an active call by setting ``force_end``.

        Calls that are not registered, ongoing or dialing are left untouched.
        """
        try:
            call_data = self.retell_client.call.retrieve(call_id)
            print(f"Current call status: {call_data.call_status}")

            if call_data.call_status in ['registered', 'ongoing', 'dialing']:
                print(f"Triggering end for Retell call {call_id}...")

                self.retell_client.call.update(
                    call_id,
                    override_dynamic_variables={"force_end": "true"},
                )

                print("✓ force_end override sent to Retell API")
            else:
                print(f"Call already ended: {call_data.call_status}")

        except Exception as e:
            print(f"Error ending call {call_id}: {e}")
            raise

    def start_stream(self, call_id: str, rtmp_urls: List[str]):
        """
        Starts a Twilio Media Stream.
        Note: Twilio streams are WebSocket-based. If rtmp_urls contains a WSS URL, it will work.
        Only the first URL is used.

        Raises:
            ValueError: If rtmp_urls is empty.
        """
        if not rtmp_urls:
            raise ValueError("No stream URLs provided")

        self.twilio_client.calls(call_id).streams.create(
            url=rtmp_urls[0]
        )

    def start_recording(self, call_id: str, output_filepath: Optional[str] = None, upload_to_s3: bool = True, wait_for_completion: bool = True):
        """
        Triggers a recording on the active Twilio call.

        Args:
            call_id: The Twilio Call SID.
            output_filepath: Optional filename for the recording.
            upload_to_s3: If True, uploads to S3.
            wait_for_completion: If True, waits for recording to finish and then uploads.

        Returns:
            The Twilio Recording SID.

        Raises:
            ValueError: If an upload is requested but the AWS settings are missing.
            RuntimeError: If the recording fails or cannot be downloaded.
        """
        # Check the AWS settings up front: finding out they are missing only
        # after the whole call has been recorded would waste the recording.
        if wait_for_completion and upload_to_s3:
            access_key = os.getenv("AWS_ACCESS_KEY_ID")
            secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
            bucket = os.getenv("AWS_S3_BUCKET")
            region = os.getenv("AWS_REGION")

            if not access_key or not secret_key or not bucket:
                raise ValueError("AWS credentials (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_S3_BUCKET) are required for S3 upload.")

        # Start Twilio recording
        recording = self.twilio_client.calls(call_id).recordings.create()
        print(f"Recording started: {recording.sid}")

        if not wait_for_completion:
            return recording.sid

        # Poll for recording completion
        print("Waiting for recording to complete...")
        while True:
            rec_status = self.twilio_client.recordings(recording.sid).fetch()
            if rec_status.status == 'completed':
                print("Recording completed.")
                break
            if rec_status.status in ['failed', 'absent']:
                raise RuntimeError(f"Recording failed with status: {rec_status.status}")
            time.sleep(5)

        if not upload_to_s3:
            return recording.sid

        # Download recording from Twilio
        media_url = f"https://api.twilio.com/2010-04-01/Accounts/{self.twilio_account_sid}/Recordings/{recording.sid}.mp3"
        print(f"Downloading recording from: {media_url}")

        # A timeout keeps a stalled connection from hanging the caller forever.
        response = requests.get(
            media_url,
            auth=(self.twilio_account_sid, self.twilio_auth_token),
            timeout=60,
        )
        if response.status_code != 200:
            raise RuntimeError(f"Failed to download recording: {response.status_code} {response.text}")

        # Upload to S3
        filename = output_filepath if output_filepath else f"{call_id}-{uuid.uuid4().hex[:6]}.mp3"

        s3 = boto3.client(
            's3',
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
            region_name=region,
        )

        print(f"Uploading to S3: s3://{bucket}/{filename}")
        s3.put_object(Bucket=bucket, Key=filename, Body=response.content)
        print(f"Upload complete: s3://{bucket}/{filename}")

        # Also save locally
        local_dir = "recordings"
        local_path = os.path.join(local_dir, filename)
        # The filename may carry sub-directories (e.g. "calls/a.mp3").
        os.makedirs(os.path.dirname(local_path) or local_dir, exist_ok=True)
        with open(local_path, 'wb') as f:
            f.write(response.content)
        print(f"Recording saved locally: {local_path}")

        return recording.sid

    def mute_participant(self, call_id: str, identity: str, track_sid: str, muted: bool):
        """
        Mutes the participant on the Twilio call.
        This prevents audio from reaching the Retell AI.

        ``identity`` and ``track_sid`` are accepted but not used.
        """
        # NOTE(review): confirm that Twilio's call update accepts ``muted``;
        # it may only be supported on conference participants.
        self.twilio_client.calls(call_id).update(muted=muted)

    def kick_participant(self, call_id: str, identity: str):
        """
        Alias for delete_room (hangup). ``identity`` is not used.
        """
        self.delete_room(call_id)

    def send_alert(self, call_id: str, message: str, participant_identity: Optional[str] = None):
        """
        Not fully supported in this hybrid model

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("send_alert is not currently supported in RetellManager")
@@ -1,2 +0,0 @@
1
- from .stt_client import STTManager
2
- from .tts_streamer import TTSStreamer
@@ -1,108 +0,0 @@
1
- import os
2
- import logging
3
- import httpx
4
- from dotenv import load_dotenv
5
- from openai import AsyncOpenAI
6
-
7
- load_dotenv()
8
- logger = logging.getLogger(__name__)
9
-
10
-
11
class STTManager:
    """Transcribes audio files with OpenAI and optionally forwards the text.

    The transcript is posted to ``AGENT_API_URL`` as ``{"message": text}``
    when that variable is set.
    """

    def __init__(self):
        """
        Initializes the STTManager.

        Note:
            The following must be set in your .env file:
            - OPENAI_API_KEY
            - AGENT_API_URL (If not set, posting to agent will be disabled)

        Raises:
            ValueError: If OPENAI_API_KEY is not set.
        """
        self._api_key = os.getenv("OPENAI_API_KEY")
        if not self._api_key:
            raise ValueError("OPENAI_API_KEY must be set in your .env file.")

        self._agent_api_url = os.getenv("AGENT_API_URL")
        if not self._agent_api_url:
            logger.warning("AGENT_API_URL is not set in .env. Posting to agent will be disabled.")

        self._openai_client = AsyncOpenAI(api_key=self._api_key)
        self._http_client = httpx.AsyncClient()

    async def close(self):
        """
        Cleans up resources used by the STTManager.
        """
        # Close the OpenAI client even if closing the HTTP client raises,
        # otherwise its connections would leak.
        try:
            await self._http_client.aclose()
        finally:
            await self._openai_client.close()

    async def transcribe_audio(self, file_path: str, model: str = "whisper-1") -> str:
        """
        Transcribes an audio file using OpenAI's whisper model.

        Args:
            file_path: The path to the audio file to transcribe.
                Supported formats: mp3, mp4, mpeg, mpga, m4a, wav, and webm.
            model: The name of the whisper model to use.
                Note: The OpenAI API currently only supports "whisper-1".
        Returns:
            The transcribed text as a string.
        Raises:
            FileNotFoundError: If file_path does not exist.
        """
        logger.info("Starting transcription for file: %s", file_path)
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Audio file not found at: {file_path}")

        with open(file_path, "rb") as audio_file:
            transcript = await self._openai_client.audio.transcriptions.create(
                model=model,
                file=audio_file,
            )
        logger.info("Successfully transcribed file: %s", file_path)

        return transcript.text

    async def transcribe_and_post(self, file_path: str, model: str = "whisper-1"):
        """
        Processes an audio file by transcribing it and posting the result to the agent API under a 'message' key.

        Args:
            file_path: The path to the audio file to process.
                Supported formats: mp3, mp4, mpeg, mpga, m4a, wav, and webm.
            model: The transcription model, forwarded to transcribe_audio.
        Returns:
            The transcribed text as a string.
        """
        try:
            # Transcribe the audio file
            transcript_text = await self.transcribe_audio(file_path, model=model)

            # Post the transcribed text to the agent API
            if self._agent_api_url:
                await self._post_to_agent(transcript_text)
            else:
                logger.info("AGENT_API_URL not set, skipping post to agent.")

            return transcript_text

        except FileNotFoundError:
            logger.error("Audio file not found at: %s", file_path, exc_info=True)
            raise
        except httpx.HTTPError:
            # _post_to_agent has already logged this failure with a traceback.
            raise
        except Exception as e:
            logger.error("An error occurred during processing of %s: %s", file_path, e, exc_info=True)
            raise

    async def _post_to_agent(self, text: str):
        """
        Posts the transcribed text to the agent API under a 'message' key.

        Args:
            text: The transcribed text to post.
        Raises:
            httpx.HTTPError: If the request fails or returns an error status.
        """
        payload = {"message": text}
        try:
            logger.info("Posting to agent with payload: %s", payload)
            response = await self._http_client.post(self._agent_api_url, json=payload)
            response.raise_for_status()
            logger.info("Successfully posted to agent. Status: %s", response.status_code)
        except httpx.HTTPError as e:
            logger.error("Failed to post to agent API: %s", e, exc_info=True)
            raise