vision-agents-plugins-deepgram 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- .gitignore +90 -0
- PKG-INFO +75 -0
- README.md +60 -0
- pyproject.toml +44 -0
- vision_agents/plugins/deepgram/__init__.py +7 -0
- vision_agents/plugins/deepgram/deepgram_stt.py +288 -0
- vision_agents/plugins/deepgram/tts.py +90 -0
- vision_agents_plugins_deepgram-0.2.7.dist-info/METADATA +75 -0
- vision_agents_plugins_deepgram-0.2.7.dist-info/RECORD +13 -0
- vision_agents_plugins_deepgram-0.2.7.dist-info/WHEEL +4 -0
.gitignore
ADDED
@@ -0,0 +1,90 @@
```
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
*.so
.cursor/*
# Distribution / packaging
.Python
build/
dist/
downloads/
develop-eggs/
eggs/
.eggs/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
pip-wheel-metadata/
MANIFEST
*.egg-info/
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
coverage.xml
nosetests.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Type checker / lint caches
.mypy_cache/
.dmypy.json
dmypy.json
.pytype/
.pyre/
.ruff_cache/

# Environments
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
.env
.env.local
.env.*.local
.env.bak
pyvenv.cfg
.python-version

# Editors / IDEs
.vscode/
.idea/

# Jupyter Notebook
.ipynb_checkpoints/

# OS / Misc
.DS_Store
*.log

# Tooling & repo-specific
pyrightconfig.json
shell.nix
bin/*
lib/*
stream-py/

# Artifacts / assets
*.pt
*.kef
*.onnx
profile.html

/opencode.json
```
PKG-INFO
ADDED
@@ -0,0 +1,75 @@
````
Metadata-Version: 2.4
Name: vision-agents-plugins-deepgram
Version: 0.2.7
Summary: Deepgram STT and TTS integration for Vision Agents
Project-URL: Documentation, https://visionagents.ai/
Project-URL: Website, https://visionagents.ai/
Project-URL: Source, https://github.com/GetStream/Vision-Agents
License-Expression: MIT
Keywords: AI,STT,TTS,agents,deepgram,speech-to-text,text-to-speech,transcription,voice agents
Requires-Python: >=3.10
Requires-Dist: deepgram-sdk>=5.3.0
Requires-Dist: numpy<2.3,>=2.2.6
Requires-Dist: vision-agents
Description-Content-Type: text/markdown

# Deepgram Plugin

Speech-to-Text (STT) and Text-to-Speech (TTS) plugins for Vision Agents using the Deepgram API.

## Installation

```bash
uv add vision-agents-plugins-deepgram
```

## Speech-to-Text (STT)

High-quality speech recognition using Deepgram's Flux model with built-in turn detection.

```python
from vision_agents.plugins import deepgram

stt = deepgram.STT(
    model="flux-general-en",    # Default model
    eager_turn_detection=True,  # Enable eager end-of-turn detection
)
```

### STT Docs

- https://developers.deepgram.com/docs/flux/quickstart
- https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/listen/v2/connect/async.py

## Text-to-Speech (TTS)

Low-latency text-to-speech using Deepgram's Aura model via WebSocket streaming.

```python
from vision_agents.plugins import deepgram

tts = deepgram.TTS(
    model="aura-2-thalia-en",  # Default voice
    sample_rate=16000,         # Audio sample rate
)
```

### Available Voices

Deepgram offers various Aura voice models:
- `aura-2-thalia-en` - Default female voice
- `aura-2-orion-en` - Male voice
- See [TTS Models](https://developers.deepgram.com/docs/tts-models) for all options

### TTS Docs

- https://developers.deepgram.com/docs/tts-websocket
- https://developers.deepgram.com/docs/streaming-text-to-speech

## Environment Variables

Set `DEEPGRAM_API_KEY` in your environment or pass `api_key` to the constructor.

## Example

See the [example](./example/) directory for a complete working example using both STT and TTS.
````
README.md
ADDED
@@ -0,0 +1,60 @@
````
# Deepgram Plugin

Speech-to-Text (STT) and Text-to-Speech (TTS) plugins for Vision Agents using the Deepgram API.

## Installation

```bash
uv add vision-agents-plugins-deepgram
```

## Speech-to-Text (STT)

High-quality speech recognition using Deepgram's Flux model with built-in turn detection.

```python
from vision_agents.plugins import deepgram

stt = deepgram.STT(
    model="flux-general-en",    # Default model
    eager_turn_detection=True,  # Enable eager end-of-turn detection
)
```

### STT Docs

- https://developers.deepgram.com/docs/flux/quickstart
- https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/listen/v2/connect/async.py

## Text-to-Speech (TTS)

Low-latency text-to-speech using Deepgram's Aura model via WebSocket streaming.

```python
from vision_agents.plugins import deepgram

tts = deepgram.TTS(
    model="aura-2-thalia-en",  # Default voice
    sample_rate=16000,         # Audio sample rate
)
```

### Available Voices

Deepgram offers various Aura voice models:
- `aura-2-thalia-en` - Default female voice
- `aura-2-orion-en` - Male voice
- See [TTS Models](https://developers.deepgram.com/docs/tts-models) for all options

### TTS Docs

- https://developers.deepgram.com/docs/tts-websocket
- https://developers.deepgram.com/docs/streaming-text-to-speech

## Environment Variables

Set `DEEPGRAM_API_KEY` in your environment or pass `api_key` to the constructor.

## Example

See the [example](./example/) directory for a complete working example using both STT and TTS.
````
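The README constructs the two plugins separately; a minimal combined sketch, grounded in the constructors shown in the source files later in this diff (only the placeholder key value is an assumption):

```python
import os

from vision_agents.plugins import deepgram

# Both constructors fall back to the DEEPGRAM_API_KEY environment variable
# when api_key is not passed explicitly; the value here is a placeholder.
os.environ.setdefault("DEEPGRAM_API_KEY", "<your-deepgram-key>")

stt = deepgram.STT()                         # defaults to flux-general-en
tts = deepgram.TTS(model="aura-2-orion-en")  # any Aura voice from the docs
```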
pyproject.toml
ADDED
@@ -0,0 +1,44 @@
```toml
[build-system]
requires = ["hatchling", "hatch-vcs"]
build-backend = "hatchling.build"

[project]
name = "vision-agents-plugins-deepgram"
dynamic = ["version"]
description = "Deepgram STT and TTS integration for Vision Agents"
readme = "README.md"
keywords = ["deepgram", "STT", "TTS", "speech-to-text", "text-to-speech", "transcription", "AI", "voice agents", "agents"]
requires-python = ">=3.10"
license = "MIT"
dependencies = [
    "vision-agents",
    "deepgram-sdk>=5.3.0",
    "numpy>=2.2.6,<2.3",
]

[project.urls]
Documentation = "https://visionagents.ai/"
Website = "https://visionagents.ai/"
Source = "https://github.com/GetStream/Vision-Agents"

[tool.hatch.version]
source = "vcs"
raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }

[tool.hatch.build.targets.wheel]
packages = [".", "vision_agents"]

[tool.hatch.build.targets.sdist]
include = ["/vision_agents"]

[tool.uv.sources]
vision-agents = { workspace = true }

[dependency-groups]
dev = [
    "pytest>=8.4.1",
    "pytest-asyncio>=1.0.0",
    "soundfile>=0.13.1",
    "torchvision>=0.20.0",
    "scipy>=1.15.3,<1.16",
]
```
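Because the version is dynamic (hatch-vcs resolves it from the repository's git tags at build time, falling back to 0.0.0), the installed distribution is the authoritative place to read it. A quick check with the standard library:

```python
from importlib.metadata import version

# Prints the version baked into the wheel at build time, e.g. "0.2.7".
print(version("vision-agents-plugins-deepgram"))
```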
vision_agents/plugins/deepgram/deepgram_stt.py
ADDED
@@ -0,0 +1,288 @@
```python
import asyncio
import logging
import os
from typing import Optional, Any

from deepgram import AsyncDeepgramClient
from deepgram.core import EventType
from deepgram.extensions.types.sockets import ListenV2ControlMessage
from deepgram.listen.v2.socket_client import AsyncV2SocketClient
from getstream.video.rtc.track_util import PcmData

from vision_agents.core import stt
from vision_agents.core.stt import TranscriptResponse
from vision_agents.core.edge.types import Participant

logger = logging.getLogger(__name__)


class STT(stt.STT):
    """
    Deepgram Speech-to-Text implementation using the Flux model.

    - https://developers.deepgram.com/docs/flux/quickstart
    - https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/listen/v2/connect/async.py
    - https://github.com/deepgram/deepgram-python-sdk/tree/main
    - https://github.com/deepgram-devs/deepgram-demos-flux-streaming-transcription/blob/main/main.py

    Deepgram Flux runs turn detection internally, so running a separate
    turn-detection step in front of this class is optional and generally unnecessary.

    - eot_threshold controls end-of-turn sensitivity
    - eager_eot_threshold controls eager turn ending (so the LLM response can be prepared early)
    """

    turn_detection: bool = True  # we support turn detection with Deepgram

    def __init__(
        self,
        api_key: Optional[str] = None,
        model: str = "flux-general-en",
        language: Optional[str] = None,
        eager_turn_detection: bool = False,
        eot_threshold: Optional[float] = None,
        eager_eot_threshold: Optional[float] = None,
        client: Optional[AsyncDeepgramClient] = None,
    ):
        """
        Initialize Deepgram STT.

        Args:
            api_key: Deepgram API key. If not provided, the DEEPGRAM_API_KEY env var is used.
            model: Model to use for transcription. Defaults to "flux-general-en".
            language: Language code (e.g., "en", "es"). If not provided, auto-detection is used.
            eager_turn_detection: Enable eager end-of-turn detection. When True and
                eager_eot_threshold is unset, the threshold defaults to 0.5.
            eot_threshold: End-of-turn threshold for determining when a turn is complete.
            eager_eot_threshold: Eager end-of-turn threshold for faster turn detection.
            client: Optional pre-configured AsyncDeepgramClient instance.
        """
        super().__init__(provider_name="deepgram")

        if not api_key:
            api_key = os.environ.get("DEEPGRAM_API_KEY")

        if client is not None:
            self.client = client
        else:
            # Initialize AsyncDeepgramClient with api_key as a named parameter
            if api_key:
                self.client = AsyncDeepgramClient(api_key=api_key)
            else:
                self.client = AsyncDeepgramClient()

        self.model = model
        self.language = language
        self.eot_threshold = eot_threshold
        self.eager_turn_detection = eager_turn_detection
        if self.eager_turn_detection and eager_eot_threshold is None:
            eager_eot_threshold = 0.5
        self.eager_eot_threshold = eager_eot_threshold
        self._current_participant: Optional[Participant] = None
        self.connection: Optional[AsyncV2SocketClient] = None
        self._connection_ready = asyncio.Event()
        self._connection_context: Optional[Any] = None
        self._listen_task: Optional[asyncio.Task[Any]] = None

    async def process_audio(
        self,
        pcm_data: PcmData,
        participant: Optional[Participant] = None,
    ):
        """
        Process audio data through Deepgram for transcription.

        This method sends audio to the existing WebSocket connection. The connection
        is started automatically on first use. Audio is automatically resampled to 16kHz.

        Args:
            pcm_data: The PCM audio data to process.
            participant: Optional participant metadata (currently not used in streaming mode).
        """
        if self.closed:
            logger.warning("Deepgram STT is closed, ignoring audio")
            return

        # Wait for the connection to be ready
        await self._connection_ready.wait()

        # Double-check the connection is still ready (it could have closed while waiting)
        if not self._connection_ready.is_set():
            logger.warning("Deepgram connection closed while processing audio")
            return

        # Resample to 16kHz mono (recommended by Deepgram)
        resampled_pcm = pcm_data.resample(16_000, 1)

        # Convert int16 samples to bytes
        audio_bytes = resampled_pcm.samples.tobytes()

        self._current_participant = participant

        if self.connection is not None:
            await self.connection.send_media(audio_bytes)

    async def start(self):
        """
        Start the Deepgram WebSocket connection and begin listening for transcripts.
        """
        if self.connection is not None:
            logger.warning("Deepgram connection already started")
            return

        # Build connection parameters
        connect_params = {
            "model": self.model,
            "encoding": "linear16",
            "sample_rate": "16000",
        }

        # Add optional parameters if specified
        if self.eot_threshold is not None:
            connect_params["eot_threshold"] = str(self.eot_threshold)
        if self.eager_eot_threshold is not None:
            connect_params["eager_eot_threshold"] = str(self.eager_eot_threshold)

        # Connect to the Deepgram v2 listen WebSocket with a timeout
        self._connection_context = self.client.listen.v2.connect(**connect_params)

        # Add a timeout for connection establishment
        self.connection = await asyncio.wait_for(
            self._connection_context.__aenter__(), timeout=10.0
        )

        # Register event handlers
        if self.connection is not None:
            self.connection.on(EventType.OPEN, self._on_open)
            self.connection.on(EventType.MESSAGE, self._on_message)
            self.connection.on(EventType.ERROR, self._on_error)
            self.connection.on(EventType.CLOSE, self._on_close)

            # Start listening for events
            self._listen_task = asyncio.create_task(self.connection.start_listening())

        # Mark the connection as ready
        self._connection_ready.set()

    def _on_message(self, message):
        """
        Event handler for messages from Deepgram.

        Args:
            message: The message object from Deepgram

        TODO: errors in this function are silently swallowed; not sure why this happens.
        """
        # Extract message data
        if not hasattr(message, "type"):
            logger.warning(f"Received message without 'type' attribute: {message}")
            return

        # Handle TurnInfo messages (v2 API)
        if message.type == "TurnInfo":
            # Extract transcript text
            transcript_text = getattr(message, "transcript", "").strip()

            if not transcript_text:
                return

            # Get the event type to determine whether the transcript is final or partial:
            # "StartOfTurn" and "Update" = partial, "EndOfTurn" = final
            event = getattr(message, "event", "")

            is_final = event == "EndOfTurn"
            eager_end_of_turn = event == "EagerEndOfTurn"
            start_of_turn = event == "StartOfTurn"

            # Get the end-of-turn confidence
            end_of_turn_confidence = getattr(message, "end_of_turn_confidence", 0.0)

            # Calculate average confidence from words
            words = getattr(message, "words", [])
            if words:
                confidences = [w.confidence for w in words if hasattr(w, "confidence")]
                avg_confidence = (
                    sum(confidences) / len(confidences) if confidences else 0.0
                )
            else:
                avg_confidence = 0.0

            # Get the audio duration
            audio_window_end = getattr(message, "audio_window_end", 0.0)
            duration_ms = int(audio_window_end * 1000)

            # Build response metadata
            response_metadata = TranscriptResponse(
                confidence=avg_confidence,
                language=self.language or "auto",
                audio_duration_ms=duration_ms,
                model_name=self.model,
            )

            # Use the participant from the most recent process_audio call
            participant = self._current_participant

            if participant is None:
                logger.warning("Received transcript but no participant set")
                return

            # Broadcast the STT event first
            if is_final:
                self._emit_transcript_event(
                    transcript_text, participant, response_metadata
                )
            else:
                self._emit_partial_transcript_event(
                    transcript_text, participant, response_metadata
                )

            # Broadcast the turn event
            if is_final or eager_end_of_turn:
                self._emit_turn_ended_event(
                    participant=participant,
                    eager_end_of_turn=eager_end_of_turn,
                    confidence=end_of_turn_confidence,
                )

            if start_of_turn:
                self._emit_turn_started_event(
                    participant=participant, confidence=end_of_turn_confidence
                )

    def _on_open(self, message):
        pass

    def _on_error(self, error):
        """
        Event handler for errors from Deepgram.

        Args:
            error: The error from Deepgram
        """
        logger.error(f"Deepgram WebSocket error: {error}")
        raise Exception(f"Deepgram WebSocket error {error}")

    def _on_close(self, error):
        """
        Event handler for connection close.
        """
        logger.debug(f"Deepgram WebSocket connection closed: {error}")
        self._connection_ready.clear()

    async def close(self):
        """
        Close the Deepgram connection and clean up resources.
        """
        # Mark as closed first
        await super().close()

        # Cancel the listen task
        if self._listen_task and not self._listen_task.done():
            self._listen_task.cancel()
            await asyncio.gather(self._listen_task, return_exceptions=True)

        # Close the connection
        if self.connection and self._connection_context:
            close_msg = ListenV2ControlMessage(type="CloseStream")
            await self.connection.send_control(close_msg)
            await self._connection_context.__aexit__(None, None, None)
            self.connection = None
            self._connection_context = None
            self._connection_ready.clear()
```
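To make the lifecycle above concrete, here is a minimal driving sketch under stated assumptions: `pcm_chunks` and `participant` are hypothetical stand-ins for a real audio source and caller, and transcripts surface through the `_emit_*` event hooks provided by the `vision_agents.core.stt.STT` base class.

```python
import asyncio

from vision_agents.plugins import deepgram


async def transcribe(pcm_chunks, participant) -> None:
    # pcm_chunks: any iterable of PcmData; participant: the speaking user.
    # Both are placeholders for this sketch.
    stt = deepgram.STT(eager_turn_detection=True)
    await stt.start()  # opens the Flux WebSocket and spawns the listen task
    try:
        for chunk in pcm_chunks:
            # process_audio resamples to 16 kHz mono and streams the raw bytes
            await stt.process_audio(chunk, participant)
        # Final TurnInfo events arrive asynchronously on the listen task,
        # so give the socket a moment to flush before closing.
        await asyncio.sleep(1.0)
    finally:
        await stt.close()  # sends CloseStream and tears the connection down
```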
vision_agents/plugins/deepgram/tts.py
ADDED
@@ -0,0 +1,90 @@
```python
import logging
import os
from typing import AsyncIterator, Optional

from deepgram import AsyncDeepgramClient
from getstream.video.rtc.track_util import PcmData, AudioFormat

from vision_agents.core import tts

logger = logging.getLogger(__name__)


class TTS(tts.TTS):
    """
    Deepgram Text-to-Speech implementation using the Aura model.

    Uses the Deepgram Speak API with a streaming response.

    References:
    - https://developers.deepgram.com/docs/text-to-speech
    - https://developers.deepgram.com/docs/tts-models
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        model: str = "aura-2-thalia-en",
        sample_rate: int = 16000,
        client: Optional[AsyncDeepgramClient] = None,
    ):
        """
        Initialize Deepgram TTS.

        Args:
            api_key: Deepgram API key. If not provided, the DEEPGRAM_API_KEY env var is used.
            model: Voice model to use. Defaults to "aura-2-thalia-en".
                See https://developers.deepgram.com/docs/tts-models for available voices.
            sample_rate: Audio sample rate in Hz. Defaults to 16000.
            client: Optional pre-configured AsyncDeepgramClient instance.
        """
        super().__init__(provider_name="deepgram")

        if not api_key:
            api_key = os.environ.get("DEEPGRAM_API_KEY")

        if client is not None:
            self.client = client
        else:
            if api_key:
                self.client = AsyncDeepgramClient(api_key=api_key)
            else:
                self.client = AsyncDeepgramClient()

        self.model = model
        self.sample_rate = sample_rate

    async def stream_audio(self, text: str, *_, **__) -> AsyncIterator[PcmData]:
        """
        Convert text to speech using Deepgram's Speak API.

        Args:
            text: The text to convert to speech.

        Returns:
            An async iterator of PcmData audio chunks.
        """
        # Use the Deepgram speak API with a streaming response
        response = self.client.speak.v1.audio.generate(
            text=text,
            model=self.model,
            encoding="linear16",
            sample_rate=self.sample_rate,
            container="none",  # Raw PCM, no container
        )

        return PcmData.from_response(
            response,
            sample_rate=self.sample_rate,
            channels=1,
            format=AudioFormat.S16,
        )

    async def stop_audio(self) -> None:
        """
        Stop audio playback.

        This is a no-op for Deepgram TTS, as each stream_audio call
        creates its own request.
        """
        pass
```
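Because `stream_audio` is annotated to return `AsyncIterator[PcmData]`, a consumer can drain chunks as they arrive. A minimal sketch, assuming `PcmData.from_response` yields async-iterable chunks whose `samples` expose `tobytes()` (the same accessor the STT module relies on):

```python
from vision_agents.plugins import deepgram


async def synthesize_to_bytes(text: str) -> bytes:
    tts = deepgram.TTS(sample_rate=16000)
    audio = await tts.stream_audio(text)
    buf = bytearray()
    async for chunk in audio:
        # Each chunk is raw 16-bit mono PCM at the configured sample rate.
        buf += chunk.samples.tobytes()
    return bytes(buf)
```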
vision_agents_plugins_deepgram-0.2.7.dist-info/METADATA
ADDED
@@ -0,0 +1,75 @@
````
Metadata-Version: 2.4
Name: vision-agents-plugins-deepgram
Version: 0.2.7
Summary: Deepgram STT and TTS integration for Vision Agents
Project-URL: Documentation, https://visionagents.ai/
Project-URL: Website, https://visionagents.ai/
Project-URL: Source, https://github.com/GetStream/Vision-Agents
License-Expression: MIT
Keywords: AI,STT,TTS,agents,deepgram,speech-to-text,text-to-speech,transcription,voice agents
Requires-Python: >=3.10
Requires-Dist: deepgram-sdk>=5.3.0
Requires-Dist: numpy<2.3,>=2.2.6
Requires-Dist: vision-agents
Description-Content-Type: text/markdown

# Deepgram Plugin

Speech-to-Text (STT) and Text-to-Speech (TTS) plugins for Vision Agents using the Deepgram API.

## Installation

```bash
uv add vision-agents-plugins-deepgram
```

## Speech-to-Text (STT)

High-quality speech recognition using Deepgram's Flux model with built-in turn detection.

```python
from vision_agents.plugins import deepgram

stt = deepgram.STT(
    model="flux-general-en",    # Default model
    eager_turn_detection=True,  # Enable eager end-of-turn detection
)
```

### STT Docs

- https://developers.deepgram.com/docs/flux/quickstart
- https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/listen/v2/connect/async.py

## Text-to-Speech (TTS)

Low-latency text-to-speech using Deepgram's Aura model via WebSocket streaming.

```python
from vision_agents.plugins import deepgram

tts = deepgram.TTS(
    model="aura-2-thalia-en",  # Default voice
    sample_rate=16000,         # Audio sample rate
)
```

### Available Voices

Deepgram offers various Aura voice models:
- `aura-2-thalia-en` - Default female voice
- `aura-2-orion-en` - Male voice
- See [TTS Models](https://developers.deepgram.com/docs/tts-models) for all options

### TTS Docs

- https://developers.deepgram.com/docs/tts-websocket
- https://developers.deepgram.com/docs/streaming-text-to-speech

## Environment Variables

Set `DEEPGRAM_API_KEY` in your environment or pass `api_key` to the constructor.

## Example

See the [example](./example/) directory for a complete working example using both STT and TTS.
````
vision_agents_plugins_deepgram-0.2.7.dist-info/RECORD
ADDED
@@ -0,0 +1,13 @@
```
./.gitignore,sha256=zrSq4X-Qh8j7QY0ukXt-RXj6StdhdbJdR3e8HoHbTTg,961
./PKG-INFO,sha256=kH6bebh7sm4YoT5YJEwcLFtznB0IXs0jsnbve1d2OEA,2092
./README.md,sha256=4YA-4sqnbnM4uRHV-QlpS7u2zMxK40tlMhj-K5dgaMw,1514
./pyproject.toml,sha256=AukzbgN5ZSP0r3VdI1KNxYSXfSAV7-yZWHE2mLovucc,1132
./vision_agents/plugins/deepgram/__init__.py,sha256=ceZaZdjlOqBn_aV1-MDZnJL9GEWXCPOIioC3ktp26o0,195
./vision_agents/plugins/deepgram/deepgram_stt.py,sha256=5NbgHmdPBdML5F-pl1v5qu87yz9j5tOK3RYdLdzeetw,10671
./vision_agents/plugins/deepgram/tts.py,sha256=ih5k_pV9rUMBswEJ1na_9lIwjzT49NAlDZbZItijerU,2645
vision_agents/plugins/deepgram/__init__.py,sha256=ceZaZdjlOqBn_aV1-MDZnJL9GEWXCPOIioC3ktp26o0,195
vision_agents/plugins/deepgram/deepgram_stt.py,sha256=5NbgHmdPBdML5F-pl1v5qu87yz9j5tOK3RYdLdzeetw,10671
vision_agents/plugins/deepgram/tts.py,sha256=ih5k_pV9rUMBswEJ1na_9lIwjzT49NAlDZbZItijerU,2645
vision_agents_plugins_deepgram-0.2.7.dist-info/METADATA,sha256=kH6bebh7sm4YoT5YJEwcLFtznB0IXs0jsnbve1d2OEA,2092
vision_agents_plugins_deepgram-0.2.7.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
vision_agents_plugins_deepgram-0.2.7.dist-info/RECORD,,
```
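Each RECORD row is `path,sha256=<digest>,size`, where the digest is the urlsafe-base64 sha256 of the file with the `=` padding stripped (per the wheel spec). A sketch of how one row could be checked against an unpacked wheel; `verify_record_entry` is an illustrative helper, not part of this package:

```python
import base64
import hashlib
from pathlib import Path


def verify_record_entry(root: Path, line: str) -> bool:
    # Split from the right: the last two comma-separated fields are hash and size.
    path, hash_field, _size = line.rsplit(",", 2)
    if not hash_field:  # the RECORD file itself is listed without a hash
        return True
    algo, _, expected = hash_field.partition("=")
    digest = hashlib.new(algo, (root / path).read_bytes()).digest()
    actual = base64.urlsafe_b64encode(digest).rstrip(b"=").decode()
    return actual == expected
```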