vision-agents-plugins-mistral 0.3.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,91 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .cursor/*
7
+ # Distribution / packaging
8
+ .Python
9
+ build/
10
+ dist/
11
+ downloads/
12
+ develop-eggs/
13
+ eggs/
14
+ .eggs/
15
+ lib64/
16
+ parts/
17
+ sdist/
18
+ var/
19
+ wheels/
20
+ share/python-wheels/
21
+ pip-wheel-metadata/
22
+ MANIFEST
23
+ *.egg-info/
24
+ *.egg
25
+
26
+ # Installer logs
27
+ pip-log.txt
28
+ pip-delete-this-directory.txt
29
+
30
+ # Unit test / coverage reports
31
+ htmlcov/
32
+ .tox/
33
+ .nox/
34
+ .coverage
35
+ .coverage.*
36
+ .cache
37
+ coverage.xml
38
+ nosetests.xml
39
+ *.cover
40
+ *.py,cover
41
+ .hypothesis/
42
+ .pytest_cache/
43
+
44
+ # Type checker / lint caches
45
+ .mypy_cache/
46
+ .dmypy.json
47
+ dmypy.json
48
+ .pytype/
49
+ .pyre/
50
+ .ruff_cache/
51
+
52
+ # Environments
53
+ .venv
54
+ env/
55
+ venv/
56
+ ENV/
57
+ env.bak/
58
+ venv.bak/
59
+ .env
60
+ .env.local
61
+ .env.*.local
62
+ .env.bak
63
+ pyvenv.cfg
64
+ .python-version
65
+
66
+ # Editors / IDEs
67
+ .vscode/
68
+ .idea/
69
+
70
+ # Jupyter Notebook
71
+ .ipynb_checkpoints/
72
+
73
+ # OS / Misc
74
+ .DS_Store
75
+ *.log
76
+
77
+ # Tooling & repo-specific
78
+ pyrightconfig.json
79
+ shell.nix
80
+ bin/*
81
+ lib/*
82
+ stream-py/
83
+
84
+ # Artifacts / assets
85
+ *.pt
86
+ *.kef
87
+ *.onnx
88
+ profile.html
89
+
90
+ /opencode.json
91
+ .ralph-tui/
@@ -0,0 +1,93 @@
1
+ Metadata-Version: 2.4
2
+ Name: vision-agents-plugins-mistral
3
+ Version: 0.3.3
4
+ Summary: Mistral Voxtral STT integration for Vision Agents
5
+ Project-URL: Documentation, https://visionagents.ai/
6
+ Project-URL: Website, https://visionagents.ai/
7
+ Project-URL: Source, https://github.com/GetStream/Vision-Agents
8
+ License-Expression: MIT
9
+ Keywords: AI,STT,agents,mistral,real-time,speech-to-text,transcription,voice agents,voxtral
10
+ Requires-Python: >=3.10
11
+ Requires-Dist: mistralai[realtime]>=1.12.0
12
+ Requires-Dist: vision-agents
13
+ Description-Content-Type: text/markdown
14
+
15
+ # Mistral Voxtral STT Plugin
16
+
17
+ Mistral Voxtral realtime speech-to-text integration for Vision Agents.
18
+
19
+ ## Features
20
+
21
+ - Real-time speech recognition via WebSocket streaming
22
+ - Low-latency transcription using Voxtral models
23
+ - Automatic language detection
24
+ - Partial transcript streaming for responsive UX
25
+ - Sentence-level final transcripts (triggered by `.`, `?`, `!`)
26
+
27
+ ## Installation
28
+
29
+ ```bash
30
+ uv add "vision-agents[mistral]"
31
+ ```
32
+
33
+ ## Usage
34
+
35
+ ```python
36
+ from vision_agents.core import Agent, Runner, User
37
+ from vision_agents.core.agents import AgentLauncher
38
+ from vision_agents.plugins import deepgram, gemini, getstream, mistral
39
+
40
+
41
+ async def create_agent(**kwargs) -> Agent:
42
+ return Agent(
43
+ edge=getstream.Edge(),
44
+ agent_user=User(name="Assistant", id="agent"),
45
+ instructions="You're a helpful voice AI assistant. Keep replies short and conversational.",
46
+ stt=mistral.STT(),
47
+ tts=deepgram.TTS(),
48
+ llm=gemini.LLM("gemini-2.0-flash"),
49
+ )
50
+
51
+
52
+ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
53
+ await agent.create_user()
54
+ call = await agent.create_call(call_type, call_id)
55
+
56
+ async with agent.join(call):
57
+ await agent.run()
58
+
59
+
60
+ if __name__ == "__main__":
61
+ Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli()
62
+ ```
63
+
64
+ Run with:
65
+
66
+ ```bash
67
+ uv run plugins/mistral/example/mistral_stt_example.py run
68
+ ```
69
+
70
+ ## Turn Detection
71
+
72
+ Mistral Voxtral STT does not include built-in turn detection (`turn_detection=False`). You'll need to pair it with an external turn detection plugin.
73
+
74
+ ## Configuration
75
+
76
+ | Parameter | Description | Default |
77
+ |-----------|-------------|---------|
78
+ | `api_key` | Mistral API key | `MISTRAL_API_KEY` env var |
79
+ | `model` | Model identifier | `voxtral-mini-transcribe-realtime-2602` |
80
+ | `sample_rate` | Audio sample rate (Hz): 8000, 16000, 22050, 44100, 48000 | `16000` |
81
+ | `client` | Pre-configured Mistral client | `None` |
82
+
83
+ ## Events
84
+
85
+ The plugin emits standard STT events:
86
+
87
+ - `STTTranscriptEvent`: Final transcript (emitted at sentence boundaries or stream end)
88
+ - `STTPartialTranscriptEvent`: Partial transcript carrying only the newly received word/delta (not the accumulated text), emitted as the transcription streams in
89
+
90
+ ## Dependencies
91
+
92
+ - `mistralai[realtime]>=1.12.0`
93
+ - `vision-agents`
@@ -0,0 +1,79 @@
1
+ # Mistral Voxtral STT Plugin
2
+
3
+ Mistral Voxtral realtime speech-to-text integration for Vision Agents.
4
+
5
+ ## Features
6
+
7
+ - Real-time speech recognition via WebSocket streaming
8
+ - Low-latency transcription using Voxtral models
9
+ - Automatic language detection
10
+ - Partial transcript streaming for responsive UX
11
+ - Sentence-level final transcripts (triggered by `.`, `?`, `!`)
12
+
13
+ ## Installation
14
+
15
+ ```bash
16
+ uv add "vision-agents[mistral]"
17
+ ```
18
+
19
+ ## Usage
20
+
21
+ ```python
22
+ from vision_agents.core import Agent, Runner, User
23
+ from vision_agents.core.agents import AgentLauncher
24
+ from vision_agents.plugins import deepgram, gemini, getstream, mistral
25
+
26
+
27
+ async def create_agent(**kwargs) -> Agent:
28
+ return Agent(
29
+ edge=getstream.Edge(),
30
+ agent_user=User(name="Assistant", id="agent"),
31
+ instructions="You're a helpful voice AI assistant. Keep replies short and conversational.",
32
+ stt=mistral.STT(),
33
+ tts=deepgram.TTS(),
34
+ llm=gemini.LLM("gemini-2.0-flash"),
35
+ )
36
+
37
+
38
+ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
39
+ await agent.create_user()
40
+ call = await agent.create_call(call_type, call_id)
41
+
42
+ async with agent.join(call):
43
+ await agent.run()
44
+
45
+
46
+ if __name__ == "__main__":
47
+ Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli()
48
+ ```
49
+
50
+ Run with:
51
+
52
+ ```bash
53
+ uv run plugins/mistral/example/mistral_stt_example.py run
54
+ ```
55
+
56
+ ## Turn Detection
57
+
58
+ Mistral Voxtral STT does not include built-in turn detection (`turn_detection=False`). You'll need to pair it with an external turn detection plugin.
59
+
60
+ ## Configuration
61
+
62
+ | Parameter | Description | Default |
63
+ |-----------|-------------|---------|
64
+ | `api_key` | Mistral API key | `MISTRAL_API_KEY` env var |
65
+ | `model` | Model identifier | `voxtral-mini-transcribe-realtime-2602` |
66
+ | `sample_rate` | Audio sample rate (Hz): 8000, 16000, 22050, 44100, 48000 | `16000` |
67
+ | `client` | Pre-configured Mistral client | `None` |
68
+
69
+ ## Events
70
+
71
+ The plugin emits standard STT events:
72
+
73
+ - `STTTranscriptEvent`: Final transcript (emitted at sentence boundaries or stream end)
74
+ - `STTPartialTranscriptEvent`: Partial transcript carrying only the newly received word/delta (not the accumulated text), emitted as the transcription streams in
75
+
76
+ ## Dependencies
77
+
78
+ - `mistralai[realtime]>=1.12.0`
79
+ - `vision-agents`
@@ -0,0 +1,40 @@
1
+ [build-system]
2
+ requires = ["hatchling", "hatch-vcs"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "vision-agents-plugins-mistral"
7
+ dynamic = ["version"]
8
+ description = "Mistral Voxtral STT integration for Vision Agents"
9
+ readme = "README.md"
10
+ keywords = ["mistral", "voxtral", "real-time", "STT", "speech-to-text", "transcription", "AI", "voice agents", "agents"]
11
+ requires-python = ">=3.10"
12
+ license = "MIT"
13
+ dependencies = [
14
+ "vision-agents",
15
+ "mistralai[realtime]>=1.12.0",
16
+ ]
17
+
18
+ [project.urls]
19
+ Documentation = "https://visionagents.ai/"
20
+ Website = "https://visionagents.ai/"
21
+ Source = "https://github.com/GetStream/Vision-Agents"
22
+
23
+ [tool.hatch.version]
24
+ source = "vcs"
25
+ raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }
26
+
27
+ [tool.hatch.build.targets.wheel]
28
+ packages = [".", "vision_agents"]
29
+
30
+ [tool.hatch.build.targets.sdist]
31
+ include = ["/vision_agents"]
32
+
33
+ [tool.uv.sources]
34
+ vision-agents = { workspace = true }
35
+
36
+ [dependency-groups]
37
+ dev = [
38
+ "pytest>=8.4.1",
39
+ "pytest-asyncio>=1.0.0",
40
+ ]
@@ -0,0 +1,3 @@
1
+ from .stt import STT
2
+
3
+ __all__ = ["STT"]
@@ -0,0 +1,252 @@
1
+ import asyncio
2
+ import logging
3
+ import os
4
+ import time
5
+ from typing import Any, Optional
6
+
7
+ from getstream.video.rtc.track_util import PcmData
8
+ from mistralai import Mistral
9
+ from mistralai.extra.realtime import AudioFormat, RealtimeConnection
10
+ from mistralai.models import (
11
+ RealtimeTranscriptionError,
12
+ TranscriptionStreamDone,
13
+ TranscriptionStreamTextDelta,
14
+ )
15
+ from vision_agents.core import stt
16
+ from vision_agents.core.edge.types import Participant
17
+ from vision_agents.core.stt import TranscriptResponse
18
+ from vision_agents.core.utils.utils import cancel_and_wait
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class STT(stt.STT):
    """
    Mistral Voxtral Realtime Speech-to-Text implementation.

    Uses WebSocket streaming for low-latency transcription. Audio is pushed
    with ``process_audio`` while a background task consumes server events and
    turns them into partial transcripts (one per text delta) and final
    transcripts (at sentence-ending punctuation and at end of stream).

    Docs:
    - https://docs.mistral.ai/capabilities/audio_transcription#realtime-transcription
    """

    # This plugin does not perform turn detection itself.
    turn_detection: bool = False

    # Seconds to wait for the realtime connection to be established.
    _CONNECT_TIMEOUT_S: float = 10.0
    # Seconds to wait for the end-of-stream event while closing.
    _DONE_TIMEOUT_S: float = 5.0
    # A text delta ending in one of these flushes the accumulated sentence.
    _SENTENCE_ENDINGS = (".", "?", "!")

    def __init__(
        self,
        api_key: Optional[str] = None,
        model: str = "voxtral-mini-transcribe-realtime-2602",
        sample_rate: int = 16000,
        client: Optional[Mistral] = None,
    ):
        """
        Initialize Mistral Voxtral STT.

        Args:
            api_key: Mistral API key. If not provided, MISTRAL_API_KEY env var is used.
                Ignored when ``client`` is given.
            model: Model to use for transcription.
            sample_rate: Audio sample rate in Hz. Supports 8000, 16000, 22050, 44100, 48000.
            client: Optional pre-configured Mistral client instance.
        """
        super().__init__(provider_name="mistral")

        if client is not None:
            self._client = client
        else:
            # Fall back to the environment when no (or an empty) key is passed.
            self._client = Mistral(
                api_key=api_key or os.environ.get("MISTRAL_API_KEY")
            )

        self.model = model
        self.sample_rate = sample_rate
        self._connection: Optional[RealtimeConnection] = None
        self._receive_task: Optional[asyncio.Task[Any]] = None
        # Participant attached to the most recent audio chunk; transcripts
        # are attributed to whoever this is when the event arrives.
        self._current_participant: Optional[Participant] = None
        # Set once start() has connected; process_audio() waits on it.
        self._connection_ready = asyncio.Event()
        # perf_counter() timestamp of the first audio chunk of the current
        # utterance, used to report processing time on transcripts.
        self._audio_start_time: Optional[float] = None
        # Text deltas collected since the last final transcript.
        self._accumulated_text: str = ""
        # Set when the server's end-of-stream event has been received.
        self._done_received = asyncio.Event()

    async def start(self) -> None:
        """
        Start the Mistral WebSocket connection.

        Opens the realtime connection (bounded by a connect timeout), spawns
        the background receive loop and marks the connection as ready. Calling
        it again while a connection exists only logs a warning.

        Raises:
            asyncio.TimeoutError: If the connection is not established in time.
        """
        await super().start()

        if self._connection is not None:
            logger.warning("Mistral connection already started")
            return

        audio_format = AudioFormat(encoding="pcm_s16le", sample_rate=self.sample_rate)

        self._connection = await asyncio.wait_for(
            self._client.audio.realtime.connect(
                model=self.model,
                audio_format=audio_format,
            ),
            timeout=self._CONNECT_TIMEOUT_S,
        )

        self._receive_task = asyncio.create_task(self._receive_loop())
        self._connection_ready.set()

        logger.info("Mistral WebSocket connection established")

    async def _receive_loop(self) -> None:
        """
        Background task to receive and process events from Mistral.

        Runs until the stream reports completion or an error, the connection
        iterator is exhausted, or the task is cancelled.
        """
        if self._connection is None:
            return

        try:
            async for event in self._connection:
                logger.debug(f"Mistral event: {type(event).__name__}")

                if isinstance(event, TranscriptionStreamTextDelta):
                    self._handle_text_delta(event)
                elif isinstance(event, TranscriptionStreamDone):
                    self._handle_done(event)
                    break  # Exit loop after done
                elif isinstance(event, RealtimeTranscriptionError):
                    self._handle_error(event)
                    break  # Exit loop on error

        except asyncio.CancelledError:
            logger.debug("Mistral receive loop cancelled")
            raise
        except Exception as e:
            logger.error(f"Error in Mistral receive loop: {e}")
            # Failures while shutting down are expected; do not surface them.
            if not self.closed:
                self._emit_error_event(e, context="receive_loop")

    def _handle_text_delta(self, event: TranscriptionStreamTextDelta) -> None:
        """
        Handle text delta - emit word-by-word partials, full text on complete.

        Each non-blank delta is emitted as a partial transcript on its own.
        Deltas are also accumulated; when a delta ends with sentence-ending
        punctuation the accumulated text is emitted as a final transcript and
        the accumulator and timing are reset.
        """
        text = event.text
        if not text:
            return

        participant = self._current_participant
        if participant is None:
            logger.warning("Received transcript but no participant set")
            return

        # Accumulate text for complete events
        self._accumulated_text += text

        processing_time_ms: Optional[float] = None
        if self._audio_start_time is not None:
            processing_time_ms = (time.perf_counter() - self._audio_start_time) * 1000

        response = TranscriptResponse(
            model_name=self.model,
            processing_time_ms=processing_time_ms,
        )

        # Emit partial with just the new word/delta (not accumulated)
        text_stripped = text.strip()
        if text_stripped:
            self._emit_partial_transcript_event(text_stripped, participant, response)

        # Check for sentence-ending punctuation - emit complete transcript
        if text.rstrip().endswith(self._SENTENCE_ENDINGS):
            accumulated_stripped = self._accumulated_text.strip()
            if accumulated_stripped:
                self._emit_transcript_event(accumulated_stripped, participant, response)
            self._accumulated_text = ""
            self._audio_start_time = None

    def _handle_done(self, event: TranscriptionStreamDone) -> None:
        """
        Handle end-of-stream event with full transcript.

        Emits the event's text as a final transcript when it is non-empty and
        a participant is known. The stream is over in every case, so the
        per-utterance state is reset and ``_done_received`` is always set;
        otherwise ``close()`` would wait out its full timeout for an event
        that has already arrived.
        """
        try:
            text = (event.text or "").strip()
            if not text:
                return

            participant = self._current_participant
            if participant is None:
                logger.warning("Received done event but no participant set")
                return

            response = TranscriptResponse(
                language=event.language,
                model_name=event.model,
            )

            # NOTE(review): sentences already emitted as finals by
            # _handle_text_delta appear to be included in this text again -
            # confirm consumers tolerate the repetition.
            self._emit_transcript_event(text, participant, response)
        finally:
            self._accumulated_text = ""
            self._audio_start_time = None
            self._done_received.set()

    def _handle_error(self, event: RealtimeTranscriptionError) -> None:
        """Handle error event by logging it and emitting an STT error event."""
        error_msg = str(event.error) if event.error else "Unknown Mistral error"
        logger.error(f"Mistral transcription error: {error_msg}")

        error = Exception(error_msg)
        self._emit_error_event(
            error, context="transcription", participant=self._current_participant
        )
        self._audio_start_time = None

    async def process_audio(
        self,
        pcm_data: PcmData,
        participant: Optional[Participant] = None,
    ) -> None:
        """
        Process audio data through Mistral for transcription.

        Waits until ``start()`` has established the connection, then resamples
        the chunk to the configured sample rate and sends it. Audio is dropped
        with a warning when the STT is closed or the connection is unavailable.

        Args:
            pcm_data: The PCM audio data to process.
            participant: Optional participant metadata.
        """
        if self.closed:
            logger.warning("Mistral STT is closed, ignoring audio")
            return

        # Blocks until start() has connected.
        await self._connection_ready.wait()

        if self._connection is None or self._connection.is_closed:
            logger.warning("Mistral connection not available")
            return

        # Second argument is presumably the target channel count (mono) -
        # TODO confirm against PcmData.resample. The connection is opened as
        # pcm_s16le, so samples are assumed to be 16-bit integers.
        resampled = pcm_data.resample(self.sample_rate, 1)
        audio_bytes = resampled.samples.tobytes()

        self._current_participant = participant

        # Start the latency clock on the first chunk of a new utterance.
        if self._audio_start_time is None:
            self._audio_start_time = time.perf_counter()

        await self._connection.send_audio(audio_bytes)

    async def close(self) -> None:
        """
        Close the Mistral connection and clean up resources.

        Signals end of audio so the server can send its final Done event,
        waits a bounded time for it, then cancels the receive task, closes the
        connection and resets internal state.
        """
        await super().close()

        # Signal end of audio to trigger Done event with full transcript
        end_signalled = False
        if self._connection and not self._connection.is_closed:
            try:
                await self._connection.end_audio()
                end_signalled = True
            except Exception as e:
                logger.warning(f"Error signaling end of audio: {e}")

        # Wait for Done event with timeout. Waiting only makes sense when the
        # end-of-audio signal went out and the receive loop is still running;
        # if the loop exits early, stop waiting immediately.
        receive_task = self._receive_task
        if (
            end_signalled
            and receive_task is not None
            and not receive_task.done()
            and not self._done_received.is_set()
        ):
            done_waiter = asyncio.create_task(self._done_received.wait())
            try:
                finished, _ = await asyncio.wait(
                    {done_waiter, receive_task},
                    timeout=self._DONE_TIMEOUT_S,
                    return_when=asyncio.FIRST_COMPLETED,
                )
                if not finished:
                    logger.debug("Timeout waiting for done event")
            finally:
                done_waiter.cancel()

        if self._receive_task:
            await cancel_and_wait(self._receive_task)
            self._receive_task = None

        if self._connection:
            try:
                await self._connection.close()
            except Exception as e:
                logger.warning(f"Error closing Mistral connection: {e}")
            finally:
                self._connection = None
                self._connection_ready.clear()
                self._done_received.clear()

        self._audio_start_time = None
        self._accumulated_text = ""