vision-agents-plugins-deepgram 0.1.3__tar.gz → 0.1.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of vision-agents-plugins-deepgram might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: vision-agents-plugins-deepgram
3
- Version: 0.1.3
3
+ Version: 0.1.6
4
4
  Summary: Deepgram STT integration for Vision Agents
5
5
  Project-URL: Documentation, https://visionagents.ai/
6
6
  Project-URL: Website, https://visionagents.ai/
@@ -8,31 +8,32 @@ Project-URL: Source, https://github.com/GetStream/Vision-Agents
8
8
  License-Expression: MIT
9
9
  Keywords: AI,STT,agents,deepgram,speech-to-text,transcription,voice agents
10
10
  Requires-Python: >=3.10
11
- Requires-Dist: deepgram-sdk==4.8.1
11
+ Requires-Dist: deepgram-sdk<5.1,>=5.0.0
12
12
  Requires-Dist: numpy<2.3,>=2.2.6
13
13
  Requires-Dist: vision-agents
14
14
  Description-Content-Type: text/markdown
15
15
 
16
16
  # Deepgram Speech-to-Text Plugin
17
17
 
18
- A high-quality Speech-to-Text (STT) plugin for GetStream that uses the Deepgram API.
18
+ A high-quality Speech-to-Text (STT) plugin for Vision agents that uses the Deepgram API.
19
19
 
20
20
  ## Installation
21
21
 
22
22
  ```bash
23
- pip install getstream-plugins-deepgram
23
+ uv add vision-agents-plugins-deepgram
24
24
  ```
25
25
 
26
26
  ## Usage
27
27
 
28
28
  ```python
29
- from getstream.plugins.deepgram import DeepgramSTT
29
+ from vision_agents.plugins import deepgram
30
+ from getstream.video.rtc.track_util import PcmData
30
31
 
31
32
  # Initialize with API key from environment variable
32
- stt = DeepgramSTT()
33
+ stt = deepgram.STT()
33
34
 
34
35
  # Or specify API key directly
35
- stt = DeepgramSTT(api_key="your_deepgram_api_key")
36
+ stt = deepgram.STT(api_key="your_deepgram_api_key")
36
37
 
37
38
  # Register event handlers
38
39
  @stt.on("transcript")
@@ -44,6 +45,7 @@ def on_partial(text, user, metadata):
44
45
  print(f"Partial transcript from {user}: {text}")
45
46
 
46
47
  # Process audio
48
+ pcm_data = PcmData(samples=b"\x00\x00" * 1000, sample_rate=48000, format="s16")
47
49
  await stt.process_audio(pcm_data)
48
50
 
49
51
  # When done
@@ -52,14 +54,16 @@ await stt.close()
52
54
 
53
55
  ## Configuration Options
54
56
 
55
- - `api_key`: Deepgram API key (default: reads from DEEPGRAM_API_KEY environment variable)
56
- - `options`: Deepgram LiveOptions for configuring the transcription
57
+ - `api_key`: Deepgram API key (default: reads from `DEEPGRAM_API_KEY` environment variable)
58
+ - `options`: Deepgram options for configuring the transcription.
59
+ See the [Deepgram Listen V1 Connect API documentation](https://github.com/deepgram/deepgram-python-sdk/blob/main/websockets-reference.md#%EF%B8%8F-parameters) for more details.
57
60
  - `sample_rate`: Sample rate of the audio in Hz (default: 48000)
58
61
  - `language`: Language code for transcription (default: "en-US")
59
- - `keep_alive_interval`: Interval in seconds to send keep-alive messages (default: 5.0)
62
+ - `keep_alive_interval`: Interval in seconds to send keep-alive messages (default: 1.0s)
63
+ - `connection_timeout`: Time in seconds to wait for the Deepgram connection to be established before skipping an audio packet (default: 15.0s)
60
64
 
61
65
  ## Requirements
62
66
 
63
67
  - Python 3.10+
64
- - deepgram-sdk>=4.5.0
68
+ - deepgram-sdk>=5.0.0,<5.1
65
69
  - numpy>=2.2.6,<2.3
@@ -0,0 +1,54 @@
1
+ # Deepgram Speech-to-Text Plugin
2
+
3
+ A high-quality Speech-to-Text (STT) plugin for Vision agents that uses the Deepgram API.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ uv add vision-agents-plugins-deepgram
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ```python
14
+ from vision_agents.plugins import deepgram
15
+ from getstream.video.rtc.track_util import PcmData
16
+
17
+ # Initialize with API key from environment variable
18
+ stt = deepgram.STT()
19
+
20
+ # Or specify API key directly
21
+ stt = deepgram.STT(api_key="your_deepgram_api_key")
22
+
23
+ # Register event handlers
24
+ @stt.on("transcript")
25
+ def on_transcript(text, user, metadata):
26
+ print(f"Final transcript from {user}: {text}")
27
+
28
+ @stt.on("partial_transcript")
29
+ def on_partial(text, user, metadata):
30
+ print(f"Partial transcript from {user}: {text}")
31
+
32
+ # Process audio
33
+ pcm_data = PcmData(samples=b"\x00\x00" * 1000, sample_rate=48000, format="s16")
34
+ await stt.process_audio(pcm_data)
35
+
36
+ # When done
37
+ await stt.close()
38
+ ```
39
+
40
+ ## Configuration Options
41
+
42
+ - `api_key`: Deepgram API key (default: reads from `DEEPGRAM_API_KEY` environment variable)
43
+ - `options`: Deepgram options for configuring the transcription.
44
+ See the [Deepgram Listen V1 Connect API documentation](https://github.com/deepgram/deepgram-python-sdk/blob/main/websockets-reference.md#%EF%B8%8F-parameters) for more details.
45
+ - `sample_rate`: Sample rate of the audio in Hz (default: 48000)
46
+ - `language`: Language code for transcription (default: "en-US")
47
+ - `keep_alive_interval`: Interval in seconds to send keep-alive messages (default: 1.0s)
48
+ - `connection_timeout`: Time in seconds to wait for the Deepgram connection to be established before skipping an audio packet (default: 15.0s)
49
+
50
+ ## Requirements
51
+
52
+ - Python 3.10+
53
+ - deepgram-sdk>=5.0.0,<5.1
54
+ - numpy>=2.2.6,<2.3
@@ -12,7 +12,7 @@ requires-python = ">=3.10"
12
12
  license = "MIT"
13
13
  dependencies = [
14
14
  "vision-agents",
15
- "deepgram-sdk==4.8.1",
15
+ "deepgram-sdk>=5.0.0,<5.1",
16
16
  "numpy>=2.2.6,<2.3",
17
17
  ]
18
18
 
@@ -0,0 +1,349 @@
1
+ import asyncio
2
+ import contextlib
3
+ import logging
4
+ import os
5
+ import time
6
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
7
+
8
+ import numpy as np
9
+ import websockets
10
+ from deepgram import AsyncDeepgramClient
11
+ from deepgram.core.events import EventType
12
+ from deepgram.extensions.types.sockets import (
13
+ ListenV1ControlMessage,
14
+ ListenV1MetadataEvent,
15
+ ListenV1ResultsEvent,
16
+ ListenV1SpeechStartedEvent,
17
+ ListenV1UtteranceEndEvent,
18
+ )
19
+ from deepgram.listen.v1.socket_client import AsyncV1SocketClient
20
+ from getstream.video.rtc.track_util import PcmData
21
+
22
+ from vision_agents.core import stt
23
+
24
+ from .utils import generate_silence
25
+
26
+ if TYPE_CHECKING:
27
+ from vision_agents.core.edge.types import Participant
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
+ class STT(stt.STT):
33
+ """
34
+ Deepgram-based Speech-to-Text implementation.
35
+
36
+ This implementation operates in asynchronous mode - it receives streaming transcripts
37
+ from Deepgram's WebSocket connection and emits events immediately as they arrive,
38
+ providing real-time responsiveness for live transcription scenarios.
39
+
40
+ Events:
41
+ - transcript: Emitted when a complete transcript is available.
42
+ Args: text (str), user_metadata (dict), metadata (dict)
43
+ - partial_transcript: Emitted when a partial transcript is available.
44
+ Args: text (str), user_metadata (dict), metadata (dict)
45
+ - error: Emitted when an error occurs during transcription.
46
+ Args: error (Exception)
47
+ """
48
+
49
+ def __init__(
50
+ self,
51
+ api_key: Optional[str] = None,
52
+ options: Optional[dict] = None,
53
+ sample_rate: int = 48000,
54
+ language: str = "en-US",
55
+ interim_results: bool = True,
56
+ client: Optional[AsyncDeepgramClient] = None,
57
+ keep_alive_interval: float = 1.0,
58
+ connection_timeout: float = 15.0,
59
+ ):
60
+ """
61
+ Initialize the Deepgram STT service.
62
+
63
+ Args:
64
+ api_key: Deepgram API key. If not provided, the DEEPGRAM_API_KEY
65
+ environment variable will be used automatically.
66
+ options: Deepgram live transcription options
67
+ sample_rate: Sample rate of the audio in Hz (default: 48000)
68
+ language: Language code for transcription
69
+ interim_results: Whether to emit interim results (partial transcripts with the partial_transcript event).
70
+ connection_timeout: Time to wait for the Deepgram connection to be established.
71
+
72
+ """
73
+ super().__init__(sample_rate=sample_rate)
74
+
75
+ # If no API key was provided, check for DEEPGRAM_API_KEY in environment
76
+ if api_key is None:
77
+ api_key = os.environ.get("DEEPGRAM_API_KEY")
78
+ if not api_key:
79
+ logger.warning(
80
+ "No API key provided and DEEPGRAM_API_KEY environment variable not found."
81
+ )
82
+
83
+ # Initialize DeepgramClient with the API key
84
+ logger.info("Initializing Deepgram client")
85
+ self.deepgram = (
86
+ client if client is not None else AsyncDeepgramClient(api_key=api_key)
87
+ )
88
+ self.dg_connection: Optional[AsyncV1SocketClient] = None
89
+
90
+ self.options = options or {
91
+ "model": "nova-2",
92
+ "language": language,
93
+ "encoding": "linear16",
94
+ "sample_rate": sample_rate,
95
+ "channels": 1,
96
+ "interim_results": interim_results,
97
+ }
98
+
99
+ # Track current user context for associating transcripts with users
100
+ self._current_user: Optional[Dict[str, Any]] = None
101
+
102
+ # Generate a silence audio to use as keep-alive message
103
+ self._keep_alive_data = generate_silence(
104
+ sample_rate=sample_rate, duration_ms=10
105
+ )
106
+ self._keep_alive_interval = keep_alive_interval
107
+
108
+ self._stack = contextlib.AsyncExitStack()
109
+ # An event to detect that the connection was established once.
110
+ self._connected_once = asyncio.Event()
111
+ # Time to wait for connection to be established before sending the event
112
+ self._connection_timeout = connection_timeout
113
+ self._last_sent_at = float("-inf")
114
+ # Lock to prevent concurrent connection opening
115
+ self._connect_lock = asyncio.Lock()
116
+
117
+ # Start the listener loop in the background
118
+ asyncio.create_task(self.start())
119
+
120
+ async def start(self):
121
+ """
122
+ Start the main task establishing the Deepgram connection and processing the events.
123
+ """
124
+ if self._is_closed:
125
+ logger.warning("Cannot setup connection - Deepgram instance is closed")
126
+ return None
127
+
128
+ # Establish a Deepgram connection.
129
+ # Use a lock to make sure it's established only once
130
+ async with self._connect_lock:
131
+ if self.dg_connection is not None:
132
+ logger.debug("Connection already set up, skipping initialization")
133
+ return None
134
+
135
+ try:
136
+ logger.info("Creating a Deepgram connection with options %s", self.options)
137
+ dg_connection = await self._stack.enter_async_context(
138
+ self.deepgram.listen.v1.connect(**self.options)
139
+ )
140
+ except Exception as e:
141
+ # Log the error and set connection to None
142
+ logger.exception("Error setting up Deepgram connection")
143
+ self.dg_connection = None
144
+ # Emit error immediately
145
+ self._emit_error_event(e, "Deepgram connection setup")
146
+ raise
147
+ finally:
148
+ self._connected_once.set()
149
+
150
+ self.dg_connection = dg_connection
151
+ # Start the keep-alive loop to keep the connection open
152
+ asyncio.create_task(self._keepalive_loop())
153
+
154
+ # Register event handlers
155
+ self.dg_connection.on(
156
+ EventType.OPEN,
157
+ lambda msg: logger.debug(f"Deepgram connection opened. message={msg}"),
158
+ )
159
+ self.dg_connection.on(EventType.CLOSE, self._on_connection_close)
160
+ self.dg_connection.on(EventType.ERROR, self._on_connection_error)
161
+ self.dg_connection.on(EventType.MESSAGE, self._on_message)
162
+
163
+ # Start processing the events from Deepgram.
164
+ # This is a blocking call.
165
+ logger.debug("Listening to the events from a Deepgram connection")
166
+ await self.dg_connection.start_listening()
167
+ return None
168
+
169
+ async def started(self):
170
+ """
171
+ Wait until the Deepgram connection is established.
172
+ """
173
+ if self._connected_once.is_set():
174
+ return
175
+
176
+ await asyncio.wait_for(
177
+ self._connected_once.wait(), timeout=self._connection_timeout
178
+ )
179
+
180
+ async def close(self):
181
+ """Close the Deepgram connection and clean up resources."""
182
+ if self._is_closed:
183
+ logger.debug("Deepgram STT service already closed")
184
+ return
185
+
186
+ logger.info("Closing Deepgram STT service")
187
+ self._is_closed = True
188
+
189
+ # Close the Deepgram connection if it exists
190
+ if self.dg_connection:
191
+ logger.debug("Closing Deepgram connection")
192
+ try:
193
+ await self.dg_connection.send_control(
194
+ ListenV1ControlMessage(type="CloseStream")
195
+ )
196
+ await self._stack.aclose()
197
+ self.dg_connection = None
198
+ except Exception:
199
+ logger.exception("Error closing Deepgram connection")
200
+
201
+ async def _on_message(
202
+ self,
203
+ message: ListenV1ResultsEvent
204
+ | ListenV1MetadataEvent
205
+ | ListenV1UtteranceEndEvent
206
+ | ListenV1SpeechStartedEvent,
207
+ ):
208
+ if message.type != "Results":
209
+ logger.debug(
210
+ "Received non-transcript message, skip processing. message=%s", message
211
+ )
212
+ return
213
+
214
+ transcript = message.dict()
215
+
216
+ # Get the transcript text from the response
217
+ alternatives = transcript.get("channel", {}).get("alternatives", [])
218
+ if not alternatives:
219
+ return
220
+
221
+ transcript_text = alternatives[0].get("transcript", "")
222
+ if not transcript_text:
223
+ return
224
+
225
+ # Check if this is a final result
226
+ is_final = transcript.get("is_final", False)
227
+
228
+ # Create metadata with useful information
229
+ metadata = {
230
+ "confidence": alternatives[0].get("confidence", 0),
231
+ "words": alternatives[0].get("words", []),
232
+ "is_final": is_final,
233
+ "channel_index": transcript.get("channel_index", 0),
234
+ }
235
+
236
+ # Emit immediately for real-time responsiveness
237
+ if is_final:
238
+ self._emit_transcript_event(transcript_text, self._current_user, metadata)
239
+ else:
240
+ self._emit_partial_transcript_event(
241
+ transcript_text, self._current_user, metadata
242
+ )
243
+
244
+ logger.debug(
245
+ "Received transcript",
246
+ extra={
247
+ "is_final": is_final,
248
+ "text_length": len(transcript_text),
249
+ "confidence": metadata["confidence"],
250
+ },
251
+ )
252
+
253
+ async def _on_connection_error(self, error: websockets.WebSocketException):
254
+ error_text = str(error) if error is not None else "Unknown error"
255
+ logger.error("Deepgram error received: %s", error_text)
256
+ # Emit error immediately
257
+ error_obj = Exception(f"Deepgram error: {error_text}")
258
+ self._emit_error_event(error_obj, "Deepgram connection")
259
+
260
+ async def _on_connection_close(self, message: Any):
261
+ logger.warning(f"Deepgram connection closed. message={message}")
262
+ await self.close()
263
+
264
+ async def _process_audio_impl(
265
+ self,
266
+ pcm_data: PcmData,
267
+ user_metadata: Optional[Union[Dict[str, Any], "Participant"]] = None,
268
+ ) -> Optional[List[Tuple[bool, str, Dict[str, Any]]]]:
269
+ """
270
+ Process audio data through Deepgram for transcription.
271
+
272
+ Args:
273
+ pcm_data: The PCM audio data to process.
274
+ user_metadata: Additional metadata about the user or session.
275
+
276
+ Returns:
277
+ None - Deepgram operates in asynchronous mode and emits events directly
278
+ when transcripts arrive from the streaming service.
279
+ """
280
+ if self._is_closed:
281
+ logger.warning("Deepgram connection is closed, ignoring audio")
282
+ return None
283
+
284
+ # Store the current user context for transcript events
285
+ self._current_user = user_metadata # type: ignore[assignment]
286
+
287
+ # Check if the input sample rate matches the expected sample rate
288
+ if pcm_data.sample_rate != self.sample_rate:
289
+ logger.warning(
290
+ "Input audio sample rate (%s Hz) does not match the expected sample rate (%s Hz). "
291
+ "This may result in incorrect transcriptions. Consider resampling the audio.",
292
+ pcm_data.sample_rate,
293
+ self.sample_rate,
294
+ )
295
+
296
+ # Convert PCM data to bytes if needed
297
+ audio_data = pcm_data.samples
298
+ if not isinstance(audio_data, bytes):
299
+ # Convert numpy array to bytes
300
+ audio_data = audio_data.astype(np.int16).tobytes()
301
+
302
+ # Wait for the attempt to establish the connection
303
+ try:
304
+ await self.started()
305
+ except asyncio.TimeoutError:
306
+ logger.error(
307
+ f"Deepgram connection is not established within {self._connection_timeout} seconds. "
308
+ f"Skipping the audio package."
309
+ )
310
+ return None
311
+
312
+ # Send the audio data to Deepgram
313
+ logger.debug(
314
+ "Sending audio data to Deepgram",
315
+ extra={"audio_bytes": len(audio_data)},
316
+ )
317
+ await self._send_audio(audio_data)
318
+ return None
319
+
320
+ async def _send_audio(self, data: bytes):
321
+ if self.dg_connection is None:
322
+ logger.warning("Deepgram connection is not established")
323
+ return
324
+
325
+ try:
326
+ await self.dg_connection.send_media(data)
327
+ self._last_sent_at = time.time()
328
+ except Exception as e:
329
+ # Raise exception to be handled by base class
330
+ raise Exception(f"Deepgram audio transmission error: {e}") from e
331
+
332
+ async def _keepalive_loop(self):
333
+ """
334
+ Send the silence audio every `interval` seconds
335
+ to prevent Deepgram from closing the connection.
336
+ """
337
+ while not self._is_closed and self.dg_connection is not None:
338
+ if self._last_sent_at + self._keep_alive_interval <= time.time():
339
+ logger.debug("Sending keepalive packet to Deepgram...")
340
+ # Send audio silence to keep the connection open
341
+ await self._send_audio(self._keep_alive_data)
342
+ # Send keep-alive message as well
343
+ await self.dg_connection.send_control(
344
+ ListenV1ControlMessage(type="KeepAlive")
345
+ )
346
+
347
+ # Sleep max for 1s to avoid missing the keep-alive schedule
348
+ timeout = min(self._keep_alive_interval, 1.0)
349
+ await asyncio.sleep(timeout)
@@ -0,0 +1,18 @@
1
+ import numpy as np
2
+
3
+
4
def generate_silence(sample_rate: int, duration_ms: int) -> bytes:
    """
    Return raw PCM silence for the given sample rate and duration.

    The output is 16-bit signed little-endian mono PCM, i.e. two zero bytes
    per sample.

    Args:
        sample_rate: Audio sample rate in Hz.
        duration_ms: Duration of the silence in milliseconds.

    Returns:
        Zeroed s16 mono PCM bytes.
    """
    mono = 1  # single channel
    # samples = rate * seconds
    sample_count = int(sample_rate * (duration_ms / 1000.0))
    silent_frames = np.zeros((sample_count, mono), dtype=np.int16)
    return silent_frames.tobytes()
@@ -1,50 +0,0 @@
1
- # Deepgram Speech-to-Text Plugin
2
-
3
- A high-quality Speech-to-Text (STT) plugin for GetStream that uses the Deepgram API.
4
-
5
- ## Installation
6
-
7
- ```bash
8
- pip install getstream-plugins-deepgram
9
- ```
10
-
11
- ## Usage
12
-
13
- ```python
14
- from getstream.plugins.deepgram import DeepgramSTT
15
-
16
- # Initialize with API key from environment variable
17
- stt = DeepgramSTT()
18
-
19
- # Or specify API key directly
20
- stt = DeepgramSTT(api_key="your_deepgram_api_key")
21
-
22
- # Register event handlers
23
- @stt.on("transcript")
24
- def on_transcript(text, user, metadata):
25
- print(f"Final transcript from {user}: {text}")
26
-
27
- @stt.on("partial_transcript")
28
- def on_partial(text, user, metadata):
29
- print(f"Partial transcript from {user}: {text}")
30
-
31
- # Process audio
32
- await stt.process_audio(pcm_data)
33
-
34
- # When done
35
- await stt.close()
36
- ```
37
-
38
- ## Configuration Options
39
-
40
- - `api_key`: Deepgram API key (default: reads from DEEPGRAM_API_KEY environment variable)
41
- - `options`: Deepgram LiveOptions for configuring the transcription
42
- - `sample_rate`: Sample rate of the audio in Hz (default: 16000)
43
- - `language`: Language code for transcription (default: "en-US")
44
- - `keep_alive_interval`: Interval in seconds to send keep-alive messages (default: 5.0)
45
-
46
- ## Requirements
47
-
48
- - Python 3.10+
49
- - deepgram-sdk>=4.5.0
50
- - numpy>=2.2.6,<2.3
@@ -1,276 +0,0 @@
1
- import json
2
- import logging
3
- from typing import Dict, Any, Optional, Tuple, List, Union, TYPE_CHECKING
4
-
5
- if TYPE_CHECKING:
6
- from vision_agents.core.edge.types import Participant
7
- import numpy as np
8
- import os
9
- import time
10
-
11
- from deepgram import DeepgramClient, LiveTranscriptionEvents, LiveOptions, DeepgramClientOptions
12
- from vision_agents.core import stt
13
- from getstream.video.rtc.track_util import PcmData
14
-
15
- logger = logging.getLogger(__name__)
16
-
17
-
18
- class STT(stt.STT):
19
- """
20
- Deepgram-based Speech-to-Text implementation.
21
-
22
- This implementation operates in asynchronous mode - it receives streaming transcripts
23
- from Deepgram's WebSocket connection and emits events immediately as they arrive,
24
- providing real-time responsiveness for live transcription scenarios.
25
-
26
- Events:
27
- - transcript: Emitted when a complete transcript is available.
28
- Args: text (str), user_metadata (dict), metadata (dict)
29
- - partial_transcript: Emitted when a partial transcript is available.
30
- Args: text (str), user_metadata (dict), metadata (dict)
31
- - error: Emitted when an error occurs during transcription.
32
- Args: error (Exception)
33
- """
34
-
35
- def __init__(
36
- self,
37
- api_key: Optional[str] = None,
38
- options: Optional[LiveOptions] = None, # type: ignore
39
- sample_rate: int = 48000,
40
- language: str = "en-US",
41
- interim_results: bool = True,
42
- client: Optional[DeepgramClient] = None,
43
- ):
44
- """
45
- Initialize the Deepgram STT service.
46
-
47
- Args:
48
- api_key: Deepgram API key. If not provided, the DEEPGRAM_API_KEY
49
- environment variable will be used automatically.
50
- options: Deepgram live transcription options
51
- sample_rate: Sample rate of the audio in Hz (default: 48000)
52
- language: Language code for transcription
53
- interim_results: Whether to emit interim results (partial transcripts with the partial_transcript event).
54
- """
55
- super().__init__(sample_rate=sample_rate)
56
-
57
- # If no API key was provided, check for DEEPGRAM_API_KEY in environment
58
- if api_key is None:
59
- api_key = os.environ.get("DEEPGRAM_API_KEY")
60
- if not api_key:
61
- logger.warning(
62
- "No API key provided and DEEPGRAM_API_KEY environment variable not found."
63
- )
64
-
65
- # Initialize DeepgramClient with the API key
66
- logger.info("Initializing Deepgram client")
67
- config = DeepgramClientOptions(
68
- options={"keepalive": "true"} # Comment this out to see the effect of not using keepalive
69
- )
70
- self.deepgram = client if client is not None else DeepgramClient(api_key, config)
71
- self.dg_connection: Optional[Any] = None
72
- self.options = options or LiveOptions(
73
- model="nova-2",
74
- language=language,
75
- encoding="linear16",
76
- sample_rate=sample_rate,
77
- channels=1,
78
- interim_results=interim_results,
79
- )
80
-
81
- # Track current user context for associating transcripts with users
82
- self._current_user: Optional[Dict[str, Any]] = None
83
-
84
- self._setup_connection()
85
-
86
- def _handle_transcript_result(
87
- self, is_final: bool, text: str, metadata: Dict[str, Any]
88
- ):
89
- """
90
- Handle a transcript result by emitting it immediately.
91
- """
92
- # Emit immediately for real-time responsiveness
93
- if is_final:
94
- self._emit_transcript_event(text, self._current_user, metadata)
95
- else:
96
- self._emit_partial_transcript_event(text, self._current_user, metadata)
97
-
98
- logger.debug(
99
- "Handled transcript result",
100
- extra={
101
- "is_final": is_final,
102
- "text_length": len(text),
103
- },
104
- )
105
-
106
- def _setup_connection(self):
107
- """Set up the Deepgram connection with event handlers."""
108
- if self._is_closed:
109
- logger.warning("Cannot setup connection - Deepgram instance is closed")
110
- return
111
-
112
- if self.dg_connection is not None:
113
- logger.debug("Connection already set up, skipping initialization")
114
- return
115
-
116
- try:
117
- # Use the newer websocket interface instead of deprecated live
118
- logger.debug("Setting up Deepgram WebSocket connection")
119
- self.dg_connection = self.deepgram.listen.websocket.v("1")
120
- assert self.dg_connection is not None
121
-
122
- # Handler for transcript results
123
- def handle_transcript(conn, result=None):
124
- try:
125
- # Update the last activity time
126
- self.last_activity_time = time.time()
127
-
128
- # Check if result is already a dict (from LiveResultResponse or test mocks)
129
- if isinstance(result, dict):
130
- transcript = result
131
- elif hasattr(result, "to_dict"):
132
- transcript = result.to_dict()
133
- elif hasattr(result, "to_json"):
134
- transcript = json.loads(result.to_json())
135
- elif isinstance(result, (str, bytes, bytearray)):
136
- transcript = json.loads(result)
137
- else:
138
- logger.warning(
139
- "Unrecognized transcript format: %s", type(result)
140
- )
141
- return
142
-
143
- # Get the transcript text from the response
144
- alternatives = transcript.get("channel", {}).get("alternatives", [])
145
- if not alternatives:
146
- return
147
-
148
- transcript_text = alternatives[0].get("transcript", "")
149
- if not transcript_text:
150
- return
151
-
152
- # Check if this is a final result
153
- is_final = transcript.get("is_final", False)
154
-
155
- # Create metadata with useful information
156
- metadata = {
157
- "confidence": alternatives[0].get("confidence", 0),
158
- "words": alternatives[0].get("words", []),
159
- "is_final": is_final,
160
- "channel_index": transcript.get("channel_index", 0),
161
- }
162
-
163
- # Handle the result (both collect and emit)
164
- self._handle_transcript_result(is_final, transcript_text, metadata)
165
-
166
- logger.debug(
167
- "Received transcript",
168
- extra={
169
- "is_final": is_final,
170
- "text_length": len(transcript_text),
171
- "confidence": metadata["confidence"],
172
- },
173
- )
174
- except Exception as e:
175
- logger.error("Error processing transcript", exc_info=e)
176
- # Emit error immediately
177
- self._emit_error_event(e, "Deepgram transcript processing")
178
-
179
- # Handler for errors
180
- def handle_error(conn, error=None):
181
- # Update the last activity time
182
- self.last_activity_time = time.time()
183
-
184
- error_text = str(error) if error is not None else "Unknown error"
185
- logger.error("Deepgram error received: %s", error_text)
186
-
187
- # Emit error immediately
188
- error_obj = Exception(f"Deepgram error: {error_text}")
189
- self._emit_error_event(error_obj, "Deepgram connection")
190
-
191
- # Register event handlers directly
192
- self.dg_connection.on(LiveTranscriptionEvents.Transcript, handle_transcript)
193
- self.dg_connection.on(LiveTranscriptionEvents.Error, handle_error)
194
-
195
- # Start the connection
196
- logger.info("Starting Deepgram connection with options %s", self.options)
197
- self.dg_connection.start(self.options)
198
-
199
- except Exception as e:
200
- # Log the error and set connection to None
201
- logger.error("Error setting up Deepgram connection", exc_info=e)
202
- self.dg_connection = None
203
- # Emit error immediately
204
- self._emit_error_event(e, "Deepgram connection setup")
205
-
206
- async def _process_audio_impl(
207
- self, pcm_data: PcmData, user_metadata: Optional[Union[Dict[str, Any], "Participant"]] = None
208
- ) -> Optional[List[Tuple[bool, str, Dict[str, Any]]]]:
209
- """
210
- Process audio data through Deepgram for transcription.
211
-
212
- Args:
213
- pcm_data: The PCM audio data to process.
214
- user_metadata: Additional metadata about the user or session.
215
-
216
- Returns:
217
- None - Deepgram operates in asynchronous mode and emits events directly
218
- when transcripts arrive from the streaming service.
219
- """
220
- if self._is_closed:
221
- logger.warning("Deepgram connection is closed, ignoring audio")
222
- return None
223
-
224
- # Store the current user context for transcript events
225
- self._current_user = user_metadata # type: ignore[assignment]
226
-
227
- # Check if the input sample rate matches the expected sample rate
228
- if pcm_data.sample_rate != self.sample_rate:
229
- logger.warning(
230
- "Input audio sample rate (%s Hz) does not match the expected sample rate (%s Hz). "
231
- "This may result in incorrect transcriptions. Consider resampling the audio.",
232
- pcm_data.sample_rate,
233
- self.sample_rate,
234
- )
235
-
236
- # Update the last activity time
237
- self.last_activity_time = time.time()
238
-
239
- # Convert PCM data to bytes if needed
240
- audio_data = pcm_data.samples
241
- if not isinstance(audio_data, bytes):
242
- # Convert numpy array to bytes
243
- audio_data = audio_data.astype(np.int16).tobytes()
244
-
245
- # Send the audio data to Deepgram
246
- try:
247
- logger.debug(
248
- "Sending audio data to Deepgram",
249
- extra={"audio_bytes": len(audio_data)},
250
- )
251
- assert self.dg_connection is not None
252
- self.dg_connection.send(audio_data)
253
- except Exception as e:
254
- # Raise exception to be handled by base class
255
- raise Exception(f"Deepgram audio transmission error: {e}")
256
-
257
- # Return None for asynchronous mode - events are emitted when they arrive
258
- return None
259
-
260
- async def close(self):
261
- """Close the Deepgram connection and clean up resources."""
262
- if self._is_closed:
263
- logger.debug("Deepgram STT service already closed")
264
- return
265
-
266
- logger.info("Closing Deepgram STT service")
267
- self._is_closed = True
268
-
269
- # Close the Deepgram connection if it exists
270
- if self.dg_connection:
271
- logger.debug("Closing Deepgram connection")
272
- try:
273
- self.dg_connection.finish()
274
- self.dg_connection = None
275
- except Exception as e:
276
- logger.error("Error closing Deepgram connection", exc_info=e)