vision-agents-plugins-deepgram 0.0.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of vision-agents-plugins-deepgram might be problematic. Click here for more details.

@@ -0,0 +1,32 @@
1
+ */__pycache__
2
+ */chat/__pycache__
3
+ */video/__pycache__
4
+ */chat/sync/__pycache__
5
+ */chat/async_/__pycache__
6
+ */sync/__pycache__
7
+ */async_/__pycache__
8
+ */video/sync/__pycache__
9
+ */model/__pycache__/
10
+ */cli/__pycache__
11
+ */cli/__pycache__
12
+ .env
13
+ .venv
14
+ .vscode/settings.json
15
+ *.pyc
16
+ dist/*
17
+ dist/*
18
+ *.log
19
+ .python-version
20
+ pyvenv.cfg
21
+ .idea*
22
+ bin/*
23
+ lib/*
24
+ shell.nix
25
+ pyrightconfig.json
26
+ .DS_Store
27
+
28
+ *.egg-info/
29
+ *.egg
30
+ *.pt
31
+ *.kef
32
+ .env.bak
@@ -0,0 +1,65 @@
1
+ Metadata-Version: 2.4
2
+ Name: vision-agents-plugins-deepgram
3
+ Version: 0.0.12
4
+ Summary: Deepgram STT integration for Vision Agents
5
+ Project-URL: Documentation, https://visionagents.ai/
6
+ Project-URL: Website, https://visionagents.ai/
7
+ Project-URL: Source, https://github.com/GetStream/Vision-Agents
8
+ License-Expression: MIT
9
+ Keywords: AI,STT,agents,deepgram,speech-to-text,transcription,voice agents
10
+ Requires-Python: >=3.10
11
+ Requires-Dist: deepgram-sdk==4.8.1
12
+ Requires-Dist: numpy<2.3,>=2.2.6
13
+ Requires-Dist: vision-agents
14
+ Description-Content-Type: text/markdown
15
+
16
+ # Deepgram Speech-to-Text Plugin
17
+
18
+ A high-quality Speech-to-Text (STT) plugin for GetStream that uses the Deepgram API.
19
+
20
+ ## Installation
21
+
22
+ ```bash
23
+ pip install vision-agents-plugins-deepgram
24
+ ```
25
+
26
+ ## Usage
27
+
28
+ ```python
29
+ from getstream.plugins.deepgram import DeepgramSTT
30
+
31
+ # Initialize with API key from environment variable
32
+ stt = DeepgramSTT()
33
+
34
+ # Or specify API key directly
35
+ stt = DeepgramSTT(api_key="your_deepgram_api_key")
36
+
37
+ # Register event handlers
38
+ @stt.on("transcript")
39
+ def on_transcript(text, user, metadata):
40
+ print(f"Final transcript from {user}: {text}")
41
+
42
+ @stt.on("partial_transcript")
43
+ def on_partial(text, user, metadata):
44
+ print(f"Partial transcript from {user}: {text}")
45
+
46
+ # Process audio
47
+ await stt.process_audio(pcm_data)
48
+
49
+ # When done
50
+ await stt.close()
51
+ ```
52
+
53
+ ## Configuration Options
54
+
55
+ - `api_key`: Deepgram API key (default: reads from DEEPGRAM_API_KEY environment variable)
56
+ - `options`: Deepgram LiveOptions for configuring the transcription
57
+ - `sample_rate`: Sample rate of the audio in Hz (default: 48000)
58
+ - `language`: Language code for transcription (default: "en-US")
59
+ - `keep_alive_interval`: Interval in seconds to send keep-alive messages (default: 5.0)
60
+
61
+ ## Requirements
62
+
63
+ - Python 3.10+
64
+ - deepgram-sdk==4.8.1
65
+ - numpy>=2.2.6,<2.3
@@ -0,0 +1,50 @@
1
+ # Deepgram Speech-to-Text Plugin
2
+
3
+ A high-quality Speech-to-Text (STT) plugin for GetStream that uses the Deepgram API.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install vision-agents-plugins-deepgram
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ```python
14
+ from getstream.plugins.deepgram import DeepgramSTT
15
+
16
+ # Initialize with API key from environment variable
17
+ stt = DeepgramSTT()
18
+
19
+ # Or specify API key directly
20
+ stt = DeepgramSTT(api_key="your_deepgram_api_key")
21
+
22
+ # Register event handlers
23
+ @stt.on("transcript")
24
+ def on_transcript(text, user, metadata):
25
+ print(f"Final transcript from {user}: {text}")
26
+
27
+ @stt.on("partial_transcript")
28
+ def on_partial(text, user, metadata):
29
+ print(f"Partial transcript from {user}: {text}")
30
+
31
+ # Process audio
32
+ await stt.process_audio(pcm_data)
33
+
34
+ # When done
35
+ await stt.close()
36
+ ```
37
+
38
+ ## Configuration Options
39
+
40
+ - `api_key`: Deepgram API key (default: reads from DEEPGRAM_API_KEY environment variable)
41
+ - `options`: Deepgram LiveOptions for configuring the transcription
42
+ - `sample_rate`: Sample rate of the audio in Hz (default: 48000)
43
+ - `language`: Language code for transcription (default: "en-US")
44
+ - `keep_alive_interval`: Interval in seconds to send keep-alive messages (default: 5.0)
45
+
46
+ ## Requirements
47
+
48
+ - Python 3.10+
49
+ - deepgram-sdk==4.8.1
50
+ - numpy>=2.2.6,<2.3
@@ -0,0 +1,44 @@
1
+ [build-system]
2
+ requires = ["hatchling", "hatch-vcs"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "vision-agents-plugins-deepgram"
7
+ dynamic = ["version"]
8
+ description = "Deepgram STT integration for Vision Agents"
9
+ readme = "README.md"
10
+ keywords = ["deepgram", "STT", "speech-to-text", "transcription", "AI", "voice agents", "agents"]
11
+ requires-python = ">=3.10"
12
+ license = "MIT"
13
+ dependencies = [
14
+ "vision-agents",
15
+ "deepgram-sdk==4.8.1",
16
+ "numpy>=2.2.6,<2.3",
17
+ ]
18
+
19
+ [project.urls]
20
+ Documentation = "https://visionagents.ai/"
21
+ Website = "https://visionagents.ai/"
22
+ Source = "https://github.com/GetStream/Vision-Agents"
23
+
24
+ [tool.hatch.version]
25
+ source = "vcs"
26
+ raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }
27
+
28
+ [tool.hatch.build.targets.wheel]
29
+ packages = [".", "vision_agents"]
30
+
31
+ [tool.hatch.build.targets.sdist]
32
+ include = ["/vision_agents"]
33
+
34
+ [tool.uv.sources]
35
+ vision-agents = { workspace = true }
36
+
37
+ [dependency-groups]
38
+ dev = [
39
+ "pytest>=8.4.1",
40
+ "pytest-asyncio>=1.0.0",
41
+ "soundfile>=0.13.1",
42
+ "torchaudio>=2.7.1",
43
+ "scipy>=1.15.3,<1.16",
44
+ ]
"""Deepgram STT plugin package: re-exports the STT implementation."""

import pkgutil

from .stt import STT

# Re-export under the new namespace for convenience: extend this package's
# __path__ so the package can span multiple distributions
# (pkgutil-style namespace package). Using a proper `import pkgutil`
# instead of an inline __import__ call for readability.
__path__ = pkgutil.extend_path(__path__, __name__)

__all__ = ["STT"]
@@ -0,0 +1,276 @@
1
+ import json
2
+ import logging
3
+ from typing import Dict, Any, Optional, Tuple, List, Union, TYPE_CHECKING
4
+
5
+ if TYPE_CHECKING:
6
+ from vision_agents.core.edge.types import Participant
7
+ import numpy as np
8
+ import os
9
+ import time
10
+
11
+ from deepgram import DeepgramClient, LiveTranscriptionEvents, LiveOptions, DeepgramClientOptions
12
+ from vision_agents.core import stt
13
+ from getstream.video.rtc.track_util import PcmData
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class STT(stt.STT):
    """
    Deepgram-based Speech-to-Text implementation.

    This implementation operates in asynchronous mode - it receives streaming transcripts
    from Deepgram's WebSocket connection and emits events immediately as they arrive,
    providing real-time responsiveness for live transcription scenarios.

    Events:
        - transcript: Emitted when a complete transcript is available.
          Args: text (str), user_metadata (dict), metadata (dict)
        - partial_transcript: Emitted when a partial transcript is available.
          Args: text (str), user_metadata (dict), metadata (dict)
        - error: Emitted when an error occurs during transcription.
          Args: error (Exception)
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        options: Optional[LiveOptions] = None,  # type: ignore
        sample_rate: int = 48000,
        language: str = "en-US",
        interim_results: bool = True,
        client: Optional[DeepgramClient] = None,
    ):
        """
        Initialize the Deepgram STT service.

        Args:
            api_key: Deepgram API key. If not provided, the DEEPGRAM_API_KEY
                environment variable will be used automatically.
            options: Deepgram live transcription options. When omitted, a
                default LiveOptions (nova-2, linear16) is built from the
                other arguments.
            sample_rate: Sample rate of the audio in Hz (default: 48000).
            language: Language code for transcription.
            interim_results: Whether to emit interim results (partial
                transcripts with the partial_transcript event).
            client: Pre-configured DeepgramClient (e.g. for testing); when
                provided, a new client is not constructed from api_key.
        """
        super().__init__(sample_rate=sample_rate)

        # If no API key was provided, check for DEEPGRAM_API_KEY in environment
        if api_key is None:
            api_key = os.environ.get("DEEPGRAM_API_KEY")
            if not api_key:
                logger.warning(
                    "No API key provided and DEEPGRAM_API_KEY environment variable not found."
                )

        # Initialize DeepgramClient with the API key
        logger.info("Initializing Deepgram client")
        config = DeepgramClientOptions(
            # keepalive prevents the websocket from being closed during
            # short gaps in the audio stream.
            options={"keepalive": "true"}
        )
        self.deepgram = client if client is not None else DeepgramClient(api_key, config)
        self.dg_connection: Optional[Any] = None
        self.options = options or LiveOptions(
            model="nova-2",
            language=language,
            encoding="linear16",
            sample_rate=sample_rate,
            channels=1,
            interim_results=interim_results,
        )

        # Track current user context for associating transcripts with users
        self._current_user: Optional[Dict[str, Any]] = None

        # Initialize up front so reads never hit an unset attribute even if
        # no audio or websocket event has arrived yet (previously this was
        # only assigned inside the event handlers / process_audio).
        self.last_activity_time = time.time()

        self._setup_connection()

    def _handle_transcript_result(
        self, is_final: bool, text: str, metadata: Dict[str, Any]
    ):
        """
        Handle a transcript result by emitting it immediately.

        Args:
            is_final: Whether Deepgram marked this result as final.
            text: The transcript text.
            metadata: Per-result metadata (confidence, words, is_final,
                channel_index).
        """
        # Emit immediately for real-time responsiveness
        if is_final:
            self._emit_transcript_event(text, self._current_user, metadata)
        else:
            self._emit_partial_transcript_event(text, self._current_user, metadata)

        logger.debug(
            "Handled transcript result",
            extra={
                "is_final": is_final,
                "text_length": len(text),
            },
        )

    def _setup_connection(self):
        """Set up the Deepgram websocket connection with event handlers.

        Idempotent: returns early if the service is closed or a connection
        already exists. On failure, dg_connection is reset to None and an
        error event is emitted instead of raising.
        """
        if self._is_closed:
            logger.warning("Cannot setup connection - Deepgram instance is closed")
            return

        if self.dg_connection is not None:
            logger.debug("Connection already set up, skipping initialization")
            return

        try:
            # Use the newer websocket interface instead of deprecated live
            logger.debug("Setting up Deepgram WebSocket connection")
            self.dg_connection = self.deepgram.listen.websocket.v("1")
            assert self.dg_connection is not None

            # Handler for transcript results
            def handle_transcript(conn, result=None):
                try:
                    # Update the last activity time
                    self.last_activity_time = time.time()

                    # Normalize the payload: the SDK may deliver a dict,
                    # a response object (to_dict / to_json), or raw JSON.
                    if isinstance(result, dict):
                        transcript = result
                    elif hasattr(result, "to_dict"):
                        transcript = result.to_dict()
                    elif hasattr(result, "to_json"):
                        transcript = json.loads(result.to_json())
                    elif isinstance(result, (str, bytes, bytearray)):
                        transcript = json.loads(result)
                    else:
                        logger.warning(
                            "Unrecognized transcript format: %s", type(result)
                        )
                        return

                    # Get the transcript text from the response
                    alternatives = transcript.get("channel", {}).get("alternatives", [])
                    if not alternatives:
                        return

                    transcript_text = alternatives[0].get("transcript", "")
                    if not transcript_text:
                        return

                    # Check if this is a final result
                    is_final = transcript.get("is_final", False)

                    # Create metadata with useful information
                    metadata = {
                        "confidence": alternatives[0].get("confidence", 0),
                        "words": alternatives[0].get("words", []),
                        "is_final": is_final,
                        "channel_index": transcript.get("channel_index", 0),
                    }

                    # Handle the result (both collect and emit)
                    self._handle_transcript_result(is_final, transcript_text, metadata)

                    logger.debug(
                        "Received transcript",
                        extra={
                            "is_final": is_final,
                            "text_length": len(transcript_text),
                            "confidence": metadata["confidence"],
                        },
                    )
                except Exception as e:
                    logger.error("Error processing transcript", exc_info=e)
                    # Emit error immediately
                    self._emit_error_event(e, "Deepgram transcript processing")

            # Handler for errors
            def handle_error(conn, error=None):
                # Update the last activity time
                self.last_activity_time = time.time()

                error_text = str(error) if error is not None else "Unknown error"
                logger.error("Deepgram error received: %s", error_text)

                # Emit error immediately
                error_obj = Exception(f"Deepgram error: {error_text}")
                self._emit_error_event(error_obj, "Deepgram connection")

            # Register event handlers directly
            self.dg_connection.on(LiveTranscriptionEvents.Transcript, handle_transcript)
            self.dg_connection.on(LiveTranscriptionEvents.Error, handle_error)

            # Start the connection
            logger.info("Starting Deepgram connection with options %s", self.options)
            self.dg_connection.start(self.options)

        except Exception as e:
            # Log the error and set connection to None
            logger.error("Error setting up Deepgram connection", exc_info=e)
            self.dg_connection = None
            # Emit error immediately
            self._emit_error_event(e, "Deepgram connection setup")

    async def _process_audio_impl(
        self, pcm_data: PcmData, user_metadata: Optional[Union[Dict[str, Any], "Participant"]] = None
    ) -> Optional[List[Tuple[bool, str, Dict[str, Any]]]]:
        """
        Process audio data through Deepgram for transcription.

        Args:
            pcm_data: The PCM audio data to process.
            user_metadata: Additional metadata about the user or session.

        Returns:
            None - Deepgram operates in asynchronous mode and emits events directly
            when transcripts arrive from the streaming service.

        Raises:
            Exception: If the connection is unavailable or audio transmission
                fails; the base class is expected to handle it.
        """
        if self._is_closed:
            logger.warning("Deepgram connection is closed, ignoring audio")
            return None

        # Store the current user context for transcript events
        self._current_user = user_metadata  # type: ignore[assignment]

        # Check if the input sample rate matches the expected sample rate
        if pcm_data.sample_rate != self.sample_rate:
            logger.warning(
                "Input audio sample rate (%s Hz) does not match the expected sample rate (%s Hz). "
                "This may result in incorrect transcriptions. Consider resampling the audio.",
                pcm_data.sample_rate,
                self.sample_rate,
            )

        # Update the last activity time
        self.last_activity_time = time.time()

        # Convert PCM data to bytes if needed
        audio_data = pcm_data.samples
        if not isinstance(audio_data, bytes):
            # Convert numpy array to bytes
            audio_data = audio_data.astype(np.int16).tobytes()

        # Send the audio data to Deepgram
        try:
            logger.debug(
                "Sending audio data to Deepgram",
                extra={"audio_bytes": len(audio_data)},
            )
            # Explicit check instead of `assert`: asserts are stripped under
            # `python -O`, and a failed _setup_connection leaves this as None.
            if self.dg_connection is None:
                raise Exception("Deepgram connection is not established")
            self.dg_connection.send(audio_data)
        except Exception as e:
            # Re-raise for the base class to handle; chain the cause so the
            # original traceback is preserved.
            raise Exception(f"Deepgram audio transmission error: {e}") from e

        # Return None for asynchronous mode - events are emitted when they arrive
        return None

    async def close(self):
        """Close the Deepgram connection and clean up resources.

        Idempotent: subsequent calls are no-ops once the service is closed.
        """
        if self._is_closed:
            logger.debug("Deepgram STT service already closed")
            return

        logger.info("Closing Deepgram STT service")
        self._is_closed = True

        # Close the Deepgram connection if it exists
        if self.dg_connection:
            logger.debug("Closing Deepgram connection")
            try:
                self.dg_connection.finish()
            except Exception as e:
                logger.error("Error closing Deepgram connection", exc_info=e)
            finally:
                # Clear the reference even if finish() raised, so the stale
                # connection object is never reused after close.
                self.dg_connection = None