vision-agents-plugins-deepgram 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
.gitignore ADDED
@@ -0,0 +1,90 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .cursor/*
+ # Distribution / packaging
+ .Python
+ build/
+ dist/
+ downloads/
+ develop-eggs/
+ eggs/
+ .eggs/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ pip-wheel-metadata/
+ MANIFEST
+ *.egg-info/
+ *.egg
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ coverage.xml
+ nosetests.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Type checker / lint caches
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+ .pytype/
+ .pyre/
+ .ruff_cache/
+
+ # Environments
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+ .env
+ .env.local
+ .env.*.local
+ .env.bak
+ pyvenv.cfg
+ .python-version
+
+ # Editors / IDEs
+ .vscode/
+ .idea/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints/
+
+ # OS / Misc
+ .DS_Store
+ *.log
+
+ # Tooling & repo-specific
+ pyrightconfig.json
+ shell.nix
+ bin/*
+ lib/*
+ stream-py/
+
+ # Artifacts / assets
+ *.pt
+ *.kef
+ *.onnx
+ profile.html
+
+ /opencode.json
PKG-INFO ADDED
@@ -0,0 +1,75 @@
+ Metadata-Version: 2.4
+ Name: vision-agents-plugins-deepgram
+ Version: 0.2.7
+ Summary: Deepgram STT and TTS integration for Vision Agents
+ Project-URL: Documentation, https://visionagents.ai/
+ Project-URL: Website, https://visionagents.ai/
+ Project-URL: Source, https://github.com/GetStream/Vision-Agents
+ License-Expression: MIT
+ Keywords: AI,STT,TTS,agents,deepgram,speech-to-text,text-to-speech,transcription,voice agents
+ Requires-Python: >=3.10
+ Requires-Dist: deepgram-sdk>=5.3.0
+ Requires-Dist: numpy<2.3,>=2.2.6
+ Requires-Dist: vision-agents
+ Description-Content-Type: text/markdown
+
+ # Deepgram Plugin
+
+ Speech-to-Text (STT) and Text-to-Speech (TTS) plugins for Vision Agents using the Deepgram API.
+
+ ## Installation
+
+ ```bash
+ uv add vision-agents-plugins-deepgram
+ ```
+
+ ## Speech-to-Text (STT)
+
+ High-quality speech recognition using Deepgram's Flux model with built-in turn detection.
+
+ ```python
+ from vision_agents.plugins import deepgram
+
+ stt = deepgram.STT(
+     model="flux-general-en",    # Default model
+     eager_turn_detection=True,  # Enable eager end-of-turn detection
+ )
+ ```
+
+ ### STT Docs
+
+ - https://developers.deepgram.com/docs/flux/quickstart
+ - https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/listen/v2/connect/async.py
+
+ ## Text-to-Speech (TTS)
+
+ Low-latency text-to-speech using Deepgram's Aura model via WebSocket streaming.
+
+ ```python
+ from vision_agents.plugins import deepgram
+
+ tts = deepgram.TTS(
+     model="aura-2-thalia-en",  # Default voice
+     sample_rate=16000,         # Audio sample rate
+ )
+ ```
+
+ ### Available Voices
+
+ Deepgram offers various Aura voice models:
+ - `aura-2-thalia-en` - Default female voice
+ - `aura-2-orion-en` - Male voice
+ - See [TTS Models](https://developers.deepgram.com/docs/tts-models) for all options
+
+ ### TTS Docs
+
+ - https://developers.deepgram.com/docs/tts-websocket
+ - https://developers.deepgram.com/docs/streaming-text-to-speech
+
+ ## Environment Variables
+
+ Set `DEEPGRAM_API_KEY` in your environment or pass `api_key` to the constructor.
+
+ ## Example
+
+ See the [example](./example/) directory for a complete working example using both STT and TTS.
README.md ADDED
@@ -0,0 +1,60 @@
+ # Deepgram Plugin
+
+ Speech-to-Text (STT) and Text-to-Speech (TTS) plugins for Vision Agents using the Deepgram API.
+
+ ## Installation
+
+ ```bash
+ uv add vision-agents-plugins-deepgram
+ ```
+
+ ## Speech-to-Text (STT)
+
+ High-quality speech recognition using Deepgram's Flux model with built-in turn detection.
+
+ ```python
+ from vision_agents.plugins import deepgram
+
+ stt = deepgram.STT(
+     model="flux-general-en",    # Default model
+     eager_turn_detection=True,  # Enable eager end-of-turn detection
+ )
+ ```
+
+ ### STT Docs
+
+ - https://developers.deepgram.com/docs/flux/quickstart
+ - https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/listen/v2/connect/async.py
+
+ ## Text-to-Speech (TTS)
+
+ Low-latency text-to-speech using Deepgram's Aura model via WebSocket streaming.
+
+ ```python
+ from vision_agents.plugins import deepgram
+
+ tts = deepgram.TTS(
+     model="aura-2-thalia-en",  # Default voice
+     sample_rate=16000,         # Audio sample rate
+ )
+ ```
+
+ ### Available Voices
+
+ Deepgram offers various Aura voice models:
+ - `aura-2-thalia-en` - Default female voice
+ - `aura-2-orion-en` - Male voice
+ - See [TTS Models](https://developers.deepgram.com/docs/tts-models) for all options
+
+ ### TTS Docs
+
+ - https://developers.deepgram.com/docs/tts-websocket
+ - https://developers.deepgram.com/docs/streaming-text-to-speech
+
+ ## Environment Variables
+
+ Set `DEEPGRAM_API_KEY` in your environment or pass `api_key` to the constructor.
+
+ ## Example
+
+ See the [example](./example/) directory for a complete working example using both STT and TTS.
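
A note on the README above: it defers the end-to-end wiring to the example directory. As a minimal sketch of just the plugin construction, using the constructors shown above (assuming `DEEPGRAM_API_KEY` is exported; everything else relies on documented defaults):

```python
# Minimal sketch: construct both plugins from the environment.
# Assumes DEEPGRAM_API_KEY is set in the environment.
import os

from vision_agents.plugins import deepgram

stt = deepgram.STT(model="flux-general-en", eager_turn_detection=True)
tts = deepgram.TTS(model="aura-2-thalia-en", sample_rate=16000)

# Alternatively, pass the key explicitly instead of relying on the env var:
# stt = deepgram.STT(api_key=os.environ["DEEPGRAM_API_KEY"])
```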
pyproject.toml ADDED
@@ -0,0 +1,44 @@
+ [build-system]
+ requires = ["hatchling", "hatch-vcs"]
+ build-backend = "hatchling.build"
+
+ [project]
+ name = "vision-agents-plugins-deepgram"
+ dynamic = ["version"]
+ description = "Deepgram STT and TTS integration for Vision Agents"
+ readme = "README.md"
+ keywords = ["deepgram", "STT", "TTS", "speech-to-text", "text-to-speech", "transcription", "AI", "voice agents", "agents"]
+ requires-python = ">=3.10"
+ license = "MIT"
+ dependencies = [
+     "vision-agents",
+     "deepgram-sdk>=5.3.0",
+     "numpy>=2.2.6,<2.3",
+ ]
+
+ [project.urls]
+ Documentation = "https://visionagents.ai/"
+ Website = "https://visionagents.ai/"
+ Source = "https://github.com/GetStream/Vision-Agents"
+
+ [tool.hatch.version]
+ source = "vcs"
+ raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }
+
+ [tool.hatch.build.targets.wheel]
+ packages = [".", "vision_agents"]
+
+ [tool.hatch.build.targets.sdist]
+ include = ["/vision_agents"]
+
+ [tool.uv.sources]
+ vision-agents = { workspace = true }
+
+ [dependency-groups]
+ dev = [
+     "pytest>=8.4.1",
+     "pytest-asyncio>=1.0.0",
+     "soundfile>=0.13.1",
+     "torchvision>=0.20.0",
+     "scipy>=1.15.3,<1.16",
+ ]
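
Since the version is dynamic (derived from git tags via hatch-vcs, with a `0.0.0` fallback), the installed distribution is the authoritative place to read it at runtime. A small sketch using only the standard library:

```python
# Read the resolved version of the installed wheel; requires the package
# to be installed (e.g. via `uv add vision-agents-plugins-deepgram`).
from importlib.metadata import version

print(version("vision-agents-plugins-deepgram"))  # e.g. "0.2.7"
```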
vision_agents/plugins/deepgram/__init__.py ADDED
@@ -0,0 +1,7 @@
+ from .deepgram_stt import STT
+ from .tts import TTS
+
+ # Re-export under the new namespace for convenience
+ __path__ = __import__("pkgutil").extend_path(__path__, __name__)
+
+ __all__ = ["STT", "TTS"]
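
The `pkgutil.extend_path` line above is what lets several plugin distributions contribute modules to the same `vision_agents.plugins` namespace. A self-contained sketch of that mechanism (the `ns`/`alpha`/`beta` names are illustrative, not part of this package):

```python
# Demonstrates pkgutil-style namespace packages: two sys.path entries each
# provide part of a package "ns"; extend_path in ns/__init__.py merges them.
import importlib
import os
import sys
import tempfile

root = tempfile.mkdtemp()
for entry, mod in [("a", "alpha"), ("b", "beta")]:
    pkg = os.path.join(root, entry, "ns")
    os.makedirs(pkg)
    with open(os.path.join(pkg, "__init__.py"), "w") as f:
        f.write('__path__ = __import__("pkgutil").extend_path(__path__, __name__)\n')
    with open(os.path.join(pkg, mod + ".py"), "w") as f:
        f.write(f'NAME = "{mod}"\n')
    sys.path.insert(0, os.path.join(root, entry))

# Both halves import under the single "ns" name.
print(importlib.import_module("ns.alpha").NAME)  # alpha
print(importlib.import_module("ns.beta").NAME)   # beta
```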
vision_agents/plugins/deepgram/deepgram_stt.py ADDED
@@ -0,0 +1,288 @@
+ import asyncio
+ import logging
+ import os
+ from typing import Optional, Any
+
+ from deepgram import AsyncDeepgramClient
+ from deepgram.core import EventType
+ from deepgram.extensions.types.sockets import ListenV2ControlMessage
+ from deepgram.listen.v2.socket_client import AsyncV2SocketClient
+ from getstream.video.rtc.track_util import PcmData
+
+ from vision_agents.core import stt
+ from vision_agents.core.stt import TranscriptResponse
+ from vision_agents.core.edge.types import Participant
+
+ logger = logging.getLogger(__name__)
+
+
+ class STT(stt.STT):
+     """
+     Deepgram Speech-to-Text implementation using the Flux model.
+
+     - https://developers.deepgram.com/docs/flux/quickstart
+     - https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/listen/v2/connect/async.py
+     - https://github.com/deepgram/deepgram-python-sdk/tree/main
+     - https://github.com/deepgram-devs/deepgram-demos-flux-streaming-transcription/blob/main/main.py
+
+     Deepgram Flux runs turn detection internally, so a separate turn-detection
+     stage in front of this plugin is optional and usually unnecessary.
+
+     - eot_threshold controls end-of-turn sensitivity
+     - eager_eot_threshold controls eager turn ending (so you can start preparing the LLM response early)
+     """
+
+     turn_detection: bool = True  # we support turn detection with deepgram
+
+     def __init__(
+         self,
+         api_key: Optional[str] = None,
+         model: str = "flux-general-en",
+         language: Optional[str] = None,
+         eager_turn_detection: bool = False,
+         eot_threshold: Optional[float] = None,
+         eager_eot_threshold: Optional[float] = None,
+         client: Optional[AsyncDeepgramClient] = None,
+     ):
+         """
+         Initialize Deepgram STT.
+
+         Args:
+             api_key: Deepgram API key. If not provided, the DEEPGRAM_API_KEY env var is used.
+             model: Model to use for transcription. Defaults to "flux-general-en".
+             language: Language code (e.g., "en", "es"). If not provided, auto-detection is used.
+             eager_turn_detection: Enable eager end-of-turn detection. Defaults to False.
+             eot_threshold: End-of-turn threshold for determining when a turn is complete.
+             eager_eot_threshold: Eager end-of-turn threshold for faster turn detection.
+             client: Optional pre-configured AsyncDeepgramClient instance.
+         """
+         super().__init__(provider_name="deepgram")
+
+         if not api_key:
+             api_key = os.environ.get("DEEPGRAM_API_KEY")
+
+         if client is not None:
+             self.client = client
+         else:
+             # Initialize AsyncDeepgramClient with api_key as a named parameter
+             if api_key:
+                 self.client = AsyncDeepgramClient(api_key=api_key)
+             else:
+                 self.client = AsyncDeepgramClient()
+
+         self.model = model
+         self.language = language
+         self.eot_threshold = eot_threshold
+         self.eager_turn_detection = eager_turn_detection
+         if self.eager_turn_detection and eager_eot_threshold is None:
+             eager_eot_threshold = 0.5
+         self.eager_eot_threshold = eager_eot_threshold
+         self._current_participant: Optional[Participant] = None
+         self.connection: Optional[AsyncV2SocketClient] = None
+         self._connection_ready = asyncio.Event()
+         self._connection_context: Optional[Any] = None
+         self._listen_task: Optional[asyncio.Task[Any]] = None
+
+     async def process_audio(
+         self,
+         pcm_data: PcmData,
+         participant: Optional[Participant] = None,
+     ):
+         """
+         Process audio data through Deepgram for transcription.
+
+         This method sends audio to the existing WebSocket connection. The connection
+         is started automatically on first use. Audio is automatically resampled to 16kHz.
+
+         Args:
+             pcm_data: The PCM audio data to process.
+             participant: Optional participant metadata (currently not used in streaming mode).
+         """
+         if self.closed:
+             logger.warning("Deepgram STT is closed, ignoring audio")
+             return
+
+         # Wait for the connection to be ready
+         await self._connection_ready.wait()
+
+         # Double-check the connection is still ready (it could have closed while waiting)
+         if not self._connection_ready.is_set():
+             logger.warning("Deepgram connection closed while processing audio")
+             return
+
+         # Resample to 16kHz mono (recommended by Deepgram)
+         resampled_pcm = pcm_data.resample(16_000, 1)
+
+         # Convert int16 samples to bytes
+         audio_bytes = resampled_pcm.samples.tobytes()
+
+         self._current_participant = participant
+
+         if self.connection is not None:
+             await self.connection.send_media(audio_bytes)
+
+     async def start(self):
+         """
+         Start the Deepgram WebSocket connection and begin listening for transcripts.
+         """
+         if self.connection is not None:
+             logger.warning("Deepgram connection already started")
+             return
+
+         # Build connection parameters
+         connect_params = {
+             "model": self.model,
+             "encoding": "linear16",
+             "sample_rate": "16000",
+         }
+
+         # Add optional parameters if specified
+         if self.eot_threshold is not None:
+             connect_params["eot_threshold"] = str(self.eot_threshold)
+         if self.eager_eot_threshold is not None:
+             connect_params["eager_eot_threshold"] = str(self.eager_eot_threshold)
+
+         # Connect to the Deepgram v2 listen WebSocket with a timeout
+         self._connection_context = self.client.listen.v2.connect(**connect_params)
+
+         # Add a timeout for connection establishment
+         self.connection = await asyncio.wait_for(
+             self._connection_context.__aenter__(), timeout=10.0
+         )
+
+         # Register event handlers
+         if self.connection is not None:
+             self.connection.on(EventType.OPEN, self._on_open)
+             self.connection.on(EventType.MESSAGE, self._on_message)
+             self.connection.on(EventType.ERROR, self._on_error)
+             self.connection.on(EventType.CLOSE, self._on_close)
+
+             # Start listening for events
+             self._listen_task = asyncio.create_task(self.connection.start_listening())
+
+             # Mark the connection as ready
+             self._connection_ready.set()
+
+     def _on_message(self, message):
+         """
+         Event handler for messages from Deepgram.
+
+         Args:
+             message: The message object from Deepgram
+
+         TODO: errors raised in this handler are swallowed silently; the cause is unclear.
+         """
+         # Extract message data
+         if not hasattr(message, "type"):
+             logger.warning(f"Received message without 'type' attribute: {message}")
+             return
+
+         # Handle TurnInfo messages (v2 API)
+         if message.type == "TurnInfo":
+             # Extract transcript text
+             transcript_text = getattr(message, "transcript", "").strip()
+
+             if not transcript_text:
+                 return
+
+             # Get the event type to determine whether the transcript is final or partial:
+             # "StartOfTurn" and "Update" = partial, "EndOfTurn" = final
+             event = getattr(message, "event", "")
+
+             is_final = event == "EndOfTurn"
+             eager_end_of_turn = event == "EagerEndOfTurn"
+             start_of_turn = event == "StartOfTurn"
+
+             # Get the end-of-turn confidence
+             end_of_turn_confidence = getattr(message, "end_of_turn_confidence", 0.0)
+
+             # Calculate the average confidence from words
+             words = getattr(message, "words", [])
+             if words:
+                 confidences = [w.confidence for w in words if hasattr(w, "confidence")]
+                 avg_confidence = (
+                     sum(confidences) / len(confidences) if confidences else 0.0
+                 )
+             else:
+                 avg_confidence = 0.0
+
+             # Get the audio duration
+             audio_window_end = getattr(message, "audio_window_end", 0.0)
+             duration_ms = int(audio_window_end * 1000)
+
+             # Build response metadata
+             response_metadata = TranscriptResponse(
+                 confidence=avg_confidence,
+                 language=self.language or "auto",
+                 audio_duration_ms=duration_ms,
+                 model_name=self.model,
+             )
+
+             # Use the participant from the most recent process_audio call
+             participant = self._current_participant
+
+             if participant is None:
+                 logger.warning("Received transcript but no participant set")
+                 return
+
+             # Broadcast the STT event first
+             if is_final:
+                 self._emit_transcript_event(
+                     transcript_text, participant, response_metadata
+                 )
+             else:
+                 self._emit_partial_transcript_event(
+                     transcript_text, participant, response_metadata
+                 )
+
+             # Broadcast the turn event
+             if is_final or eager_end_of_turn:
+                 self._emit_turn_ended_event(
+                     participant=participant,
+                     eager_end_of_turn=eager_end_of_turn,
+                     confidence=end_of_turn_confidence,
+                 )
+
+             if start_of_turn:
+                 self._emit_turn_started_event(
+                     participant=participant, confidence=end_of_turn_confidence
+                 )
+
+     def _on_open(self, message):
+         pass
+
+     def _on_error(self, error):
+         """
+         Event handler for errors from Deepgram.
+
+         Args:
+             error: The error from Deepgram
+         """
+         logger.error(f"Deepgram WebSocket error: {error}")
+         raise Exception(f"Deepgram WebSocket error {error}")
+
+     def _on_close(self, error):
+         """
+         Event handler for connection close.
+         """
+         logger.debug(f"Deepgram WebSocket connection closed: {error}")
+         self._connection_ready.clear()
+
+     async def close(self):
+         """
+         Close the Deepgram connection and clean up resources.
+         """
+         # Mark as closed first
+         await super().close()
+
+         # Cancel the listen task
+         if self._listen_task and not self._listen_task.done():
+             self._listen_task.cancel()
+             await asyncio.gather(self._listen_task, return_exceptions=True)
+
+         # Close the connection
+         if self.connection and self._connection_context:
+             close_msg = ListenV2ControlMessage(type="CloseStream")
+             await self.connection.send_control(close_msg)
+             await self._connection_context.__aexit__(None, None, None)
+             self.connection = None
+             self._connection_context = None
+             self._connection_ready.clear()
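
For orientation, the connection flow in `STT.start()` and `_on_message()` above can be reduced to a standalone script. This is a hedged sketch built only from the SDK calls used in that file (`listen.v2.connect`, `on`, `start_listening`, `send_media`); the input file name is hypothetical and is assumed to hold raw 16 kHz mono linear16 PCM:

```python
# Hedged sketch of the Flux flow used by the plugin above.
# Assumes deepgram-sdk >= 5.3 and DEEPGRAM_API_KEY in the environment.
import asyncio

from deepgram import AsyncDeepgramClient
from deepgram.core import EventType


def on_message(message) -> None:
    # Flux emits TurnInfo messages; event == "EndOfTurn" marks a final transcript.
    if getattr(message, "type", None) == "TurnInfo":
        print(getattr(message, "event", ""), getattr(message, "transcript", ""))


async def main() -> None:
    client = AsyncDeepgramClient()  # reads DEEPGRAM_API_KEY
    async with client.listen.v2.connect(
        model="flux-general-en", encoding="linear16", sample_rate="16000"
    ) as connection:
        connection.on(EventType.MESSAGE, on_message)
        listen_task = asyncio.create_task(connection.start_listening())
        with open("speech_16k_mono.raw", "rb") as f:  # hypothetical PCM file
            await connection.send_media(f.read())
        await asyncio.sleep(5.0)  # crude wait for the final TurnInfo events
        listen_task.cancel()


asyncio.run(main())
```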
vision_agents/plugins/deepgram/tts.py ADDED
@@ -0,0 +1,90 @@
+ import logging
+ import os
+ from typing import AsyncIterator, Optional
+
+ from deepgram import AsyncDeepgramClient
+ from getstream.video.rtc.track_util import PcmData, AudioFormat
+
+ from vision_agents.core import tts
+
+ logger = logging.getLogger(__name__)
+
+
+ class TTS(tts.TTS):
+     """
+     Deepgram Text-to-Speech implementation using the Aura model.
+
+     Uses the Deepgram Speak API with a streaming response.
+
+     References:
+         - https://developers.deepgram.com/docs/text-to-speech
+         - https://developers.deepgram.com/docs/tts-models
+     """
+
+     def __init__(
+         self,
+         api_key: Optional[str] = None,
+         model: str = "aura-2-thalia-en",
+         sample_rate: int = 16000,
+         client: Optional[AsyncDeepgramClient] = None,
+     ):
+         """
+         Initialize Deepgram TTS.
+
+         Args:
+             api_key: Deepgram API key. If not provided, the DEEPGRAM_API_KEY env var is used.
+             model: Voice model to use. Defaults to "aura-2-thalia-en".
+                 See https://developers.deepgram.com/docs/tts-models for available voices.
+             sample_rate: Audio sample rate in Hz. Defaults to 16000.
+             client: Optional pre-configured AsyncDeepgramClient instance.
+         """
+         super().__init__(provider_name="deepgram")
+
+         if not api_key:
+             api_key = os.environ.get("DEEPGRAM_API_KEY")
+
+         if client is not None:
+             self.client = client
+         else:
+             if api_key:
+                 self.client = AsyncDeepgramClient(api_key=api_key)
+             else:
+                 self.client = AsyncDeepgramClient()
+
+         self.model = model
+         self.sample_rate = sample_rate
+
+     async def stream_audio(self, text: str, *_, **__) -> AsyncIterator[PcmData]:
+         """
+         Convert text to speech using Deepgram's Speak API.
+
+         Args:
+             text: The text to convert to speech.
+
+         Returns:
+             An async iterator of PcmData audio chunks.
+         """
+         # Use the Deepgram speak API with a streaming response
+         response = self.client.speak.v1.audio.generate(
+             text=text,
+             model=self.model,
+             encoding="linear16",
+             sample_rate=self.sample_rate,
+             container="none",  # Raw PCM, no container
+         )
+
+         return PcmData.from_response(
+             response,
+             sample_rate=self.sample_rate,
+             channels=1,
+             format=AudioFormat.S16,
+         )
+
+     async def stop_audio(self) -> None:
+         """
+         Stop audio playback.
+
+         This is a no-op for Deepgram TTS, as each stream_audio call
+         creates its own request.
+         """
+         pass
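
A hedged usage sketch for the class above. The `async for` iteration and the `samples`/`sample_rate` attributes on the yielded chunks are assumptions about `PcmData` (from `getstream`), not something this plugin's code spells out:

```python
# Synthesize a sentence and inspect the PCM chunks as they stream in.
import asyncio

from vision_agents.plugins import deepgram


async def main() -> None:
    tts = deepgram.TTS(model="aura-2-thalia-en", sample_rate=16000)
    pcm_stream = await tts.stream_audio("Hello from Deepgram Aura.")
    async for chunk in pcm_stream:  # assumed PcmData async iteration
        print(f"chunk: {len(chunk.samples)} samples @ {chunk.sample_rate} Hz")


asyncio.run(main())
```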
vision_agents_plugins_deepgram-0.2.7.dist-info/METADATA ADDED
@@ -0,0 +1,75 @@
+ Metadata-Version: 2.4
+ Name: vision-agents-plugins-deepgram
+ Version: 0.2.7
+ Summary: Deepgram STT and TTS integration for Vision Agents
+ Project-URL: Documentation, https://visionagents.ai/
+ Project-URL: Website, https://visionagents.ai/
+ Project-URL: Source, https://github.com/GetStream/Vision-Agents
+ License-Expression: MIT
+ Keywords: AI,STT,TTS,agents,deepgram,speech-to-text,text-to-speech,transcription,voice agents
+ Requires-Python: >=3.10
+ Requires-Dist: deepgram-sdk>=5.3.0
+ Requires-Dist: numpy<2.3,>=2.2.6
+ Requires-Dist: vision-agents
+ Description-Content-Type: text/markdown
+
+ # Deepgram Plugin
+
+ Speech-to-Text (STT) and Text-to-Speech (TTS) plugins for Vision Agents using the Deepgram API.
+
+ ## Installation
+
+ ```bash
+ uv add vision-agents-plugins-deepgram
+ ```
+
+ ## Speech-to-Text (STT)
+
+ High-quality speech recognition using Deepgram's Flux model with built-in turn detection.
+
+ ```python
+ from vision_agents.plugins import deepgram
+
+ stt = deepgram.STT(
+     model="flux-general-en",    # Default model
+     eager_turn_detection=True,  # Enable eager end-of-turn detection
+ )
+ ```
+
+ ### STT Docs
+
+ - https://developers.deepgram.com/docs/flux/quickstart
+ - https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/listen/v2/connect/async.py
+
+ ## Text-to-Speech (TTS)
+
+ Low-latency text-to-speech using Deepgram's Aura model via WebSocket streaming.
+
+ ```python
+ from vision_agents.plugins import deepgram
+
+ tts = deepgram.TTS(
+     model="aura-2-thalia-en",  # Default voice
+     sample_rate=16000,         # Audio sample rate
+ )
+ ```
+
+ ### Available Voices
+
+ Deepgram offers various Aura voice models:
+ - `aura-2-thalia-en` - Default female voice
+ - `aura-2-orion-en` - Male voice
+ - See [TTS Models](https://developers.deepgram.com/docs/tts-models) for all options
+
+ ### TTS Docs
+
+ - https://developers.deepgram.com/docs/tts-websocket
+ - https://developers.deepgram.com/docs/streaming-text-to-speech
+
+ ## Environment Variables
+
+ Set `DEEPGRAM_API_KEY` in your environment or pass `api_key` to the constructor.
+
+ ## Example
+
+ See the [example](./example/) directory for a complete working example using both STT and TTS.
vision_agents_plugins_deepgram-0.2.7.dist-info/RECORD ADDED
@@ -0,0 +1,13 @@
+ ./.gitignore,sha256=zrSq4X-Qh8j7QY0ukXt-RXj6StdhdbJdR3e8HoHbTTg,961
+ ./PKG-INFO,sha256=kH6bebh7sm4YoT5YJEwcLFtznB0IXs0jsnbve1d2OEA,2092
+ ./README.md,sha256=4YA-4sqnbnM4uRHV-QlpS7u2zMxK40tlMhj-K5dgaMw,1514
+ ./pyproject.toml,sha256=AukzbgN5ZSP0r3VdI1KNxYSXfSAV7-yZWHE2mLovucc,1132
+ ./vision_agents/plugins/deepgram/__init__.py,sha256=ceZaZdjlOqBn_aV1-MDZnJL9GEWXCPOIioC3ktp26o0,195
+ ./vision_agents/plugins/deepgram/deepgram_stt.py,sha256=5NbgHmdPBdML5F-pl1v5qu87yz9j5tOK3RYdLdzeetw,10671
+ ./vision_agents/plugins/deepgram/tts.py,sha256=ih5k_pV9rUMBswEJ1na_9lIwjzT49NAlDZbZItijerU,2645
+ vision_agents/plugins/deepgram/__init__.py,sha256=ceZaZdjlOqBn_aV1-MDZnJL9GEWXCPOIioC3ktp26o0,195
+ vision_agents/plugins/deepgram/deepgram_stt.py,sha256=5NbgHmdPBdML5F-pl1v5qu87yz9j5tOK3RYdLdzeetw,10671
+ vision_agents/plugins/deepgram/tts.py,sha256=ih5k_pV9rUMBswEJ1na_9lIwjzT49NAlDZbZItijerU,2645
+ vision_agents_plugins_deepgram-0.2.7.dist-info/METADATA,sha256=kH6bebh7sm4YoT5YJEwcLFtznB0IXs0jsnbve1d2OEA,2092
+ vision_agents_plugins_deepgram-0.2.7.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ vision_agents_plugins_deepgram-0.2.7.dist-info/RECORD,,
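
The `sha256=...` values in RECORD are the urlsafe-base64 encoding (with padding stripped) of each file's SHA-256 digest, per the wheel RECORD format. A small sketch of how such an entry is computed (the input bytes are illustrative):

```python
# Compute a RECORD-style hash for a file's contents.
import base64
import hashlib


def record_hash(data: bytes) -> str:
    digest = hashlib.sha256(data).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")


print(record_hash(b"example file contents"))  # illustrative input
```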
vision_agents_plugins_deepgram-0.2.7.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.28.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any