vision-agents-plugins-mistral 0.3.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agents_plugins_mistral-0.3.3/.gitignore +91 -0
- vision_agents_plugins_mistral-0.3.3/PKG-INFO +93 -0
- vision_agents_plugins_mistral-0.3.3/README.md +79 -0
- vision_agents_plugins_mistral-0.3.3/pyproject.toml +40 -0
- vision_agents_plugins_mistral-0.3.3/vision_agents/plugins/mistral/__init__.py +3 -0
- vision_agents_plugins_mistral-0.3.3/vision_agents/plugins/mistral/stt.py +252 -0
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.cursor/*
|
|
7
|
+
# Distribution / packaging
|
|
8
|
+
.Python
|
|
9
|
+
build/
|
|
10
|
+
dist/
|
|
11
|
+
downloads/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
eggs/
|
|
14
|
+
.eggs/
|
|
15
|
+
lib64/
|
|
16
|
+
parts/
|
|
17
|
+
sdist/
|
|
18
|
+
var/
|
|
19
|
+
wheels/
|
|
20
|
+
share/python-wheels/
|
|
21
|
+
pip-wheel-metadata/
|
|
22
|
+
MANIFEST
|
|
23
|
+
*.egg-info/
|
|
24
|
+
*.egg
|
|
25
|
+
|
|
26
|
+
# Installer logs
|
|
27
|
+
pip-log.txt
|
|
28
|
+
pip-delete-this-directory.txt
|
|
29
|
+
|
|
30
|
+
# Unit test / coverage reports
|
|
31
|
+
htmlcov/
|
|
32
|
+
.tox/
|
|
33
|
+
.nox/
|
|
34
|
+
.coverage
|
|
35
|
+
.coverage.*
|
|
36
|
+
.cache
|
|
37
|
+
coverage.xml
|
|
38
|
+
nosetests.xml
|
|
39
|
+
*.cover
|
|
40
|
+
*.py,cover
|
|
41
|
+
.hypothesis/
|
|
42
|
+
.pytest_cache/
|
|
43
|
+
|
|
44
|
+
# Type checker / lint caches
|
|
45
|
+
.mypy_cache/
|
|
46
|
+
.dmypy.json
|
|
47
|
+
dmypy.json
|
|
48
|
+
.pytype/
|
|
49
|
+
.pyre/
|
|
50
|
+
.ruff_cache/
|
|
51
|
+
|
|
52
|
+
# Environments
|
|
53
|
+
.venv
|
|
54
|
+
env/
|
|
55
|
+
venv/
|
|
56
|
+
ENV/
|
|
57
|
+
env.bak/
|
|
58
|
+
venv.bak/
|
|
59
|
+
.env
|
|
60
|
+
.env.local
|
|
61
|
+
.env.*.local
|
|
62
|
+
.env.bak
|
|
63
|
+
pyvenv.cfg
|
|
64
|
+
.python-version
|
|
65
|
+
|
|
66
|
+
# Editors / IDEs
|
|
67
|
+
.vscode/
|
|
68
|
+
.idea/
|
|
69
|
+
|
|
70
|
+
# Jupyter Notebook
|
|
71
|
+
.ipynb_checkpoints/
|
|
72
|
+
|
|
73
|
+
# OS / Misc
|
|
74
|
+
.DS_Store
|
|
75
|
+
*.log
|
|
76
|
+
|
|
77
|
+
# Tooling & repo-specific
|
|
78
|
+
pyrightconfig.json
|
|
79
|
+
shell.nix
|
|
80
|
+
bin/*
|
|
81
|
+
lib/*
|
|
82
|
+
stream-py/
|
|
83
|
+
|
|
84
|
+
# Artifacts / assets
|
|
85
|
+
*.pt
|
|
86
|
+
*.kef
|
|
87
|
+
*.onnx
|
|
88
|
+
profile.html
|
|
89
|
+
|
|
90
|
+
/opencode.json
|
|
91
|
+
.ralph-tui/
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vision-agents-plugins-mistral
|
|
3
|
+
Version: 0.3.3
|
|
4
|
+
Summary: Mistral Voxtral STT integration for Vision Agents
|
|
5
|
+
Project-URL: Documentation, https://visionagents.ai/
|
|
6
|
+
Project-URL: Website, https://visionagents.ai/
|
|
7
|
+
Project-URL: Source, https://github.com/GetStream/Vision-Agents
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
Keywords: AI,STT,agents,mistral,real-time,speech-to-text,transcription,voice agents,voxtral
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Requires-Dist: mistralai[realtime]>=1.12.0
|
|
12
|
+
Requires-Dist: vision-agents
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
# Mistral Voxtral STT Plugin
|
|
16
|
+
|
|
17
|
+
Mistral Voxtral realtime speech-to-text integration for Vision Agents.
|
|
18
|
+
|
|
19
|
+
## Features
|
|
20
|
+
|
|
21
|
+
- Real-time speech recognition via WebSocket streaming
|
|
22
|
+
- Low-latency transcription using Voxtral models
|
|
23
|
+
- Automatic language detection
|
|
24
|
+
- Partial transcript streaming for responsive UX
|
|
25
|
+
- Sentence-level final transcripts (triggered by `.`, `?`, `!`)
|
|
26
|
+
|
|
27
|
+
## Installation
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
uv add "vision-agents[mistral]"
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Usage
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
from vision_agents.core import Agent, Runner, User
|
|
37
|
+
from vision_agents.core.agents import AgentLauncher
|
|
38
|
+
from vision_agents.plugins import deepgram, gemini, getstream, mistral
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
async def create_agent(**kwargs) -> Agent:
|
|
42
|
+
return Agent(
|
|
43
|
+
edge=getstream.Edge(),
|
|
44
|
+
agent_user=User(name="Assistant", id="agent"),
|
|
45
|
+
instructions="You're a helpful voice AI assistant. Keep replies short and conversational.",
|
|
46
|
+
stt=mistral.STT(),
|
|
47
|
+
tts=deepgram.TTS(),
|
|
48
|
+
llm=gemini.LLM("gemini-2.0-flash"),
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
|
|
53
|
+
await agent.create_user()
|
|
54
|
+
call = await agent.create_call(call_type, call_id)
|
|
55
|
+
|
|
56
|
+
async with agent.join(call):
|
|
57
|
+
await agent.run()
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
if __name__ == "__main__":
|
|
61
|
+
Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli()
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Run with:
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
uv run plugins/mistral/example/mistral_stt_example.py run
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Turn Detection
|
|
71
|
+
|
|
72
|
+
Mistral Voxtral STT does not include built-in turn detection (`turn_detection=False`). You'll need to pair it with an external turn detection plugin.
|
|
73
|
+
|
|
74
|
+
## Configuration
|
|
75
|
+
|
|
76
|
+
| Parameter | Description | Default |
|
|
77
|
+
|-----------|-------------|---------|
|
|
78
|
+
| `api_key` | Mistral API key | `MISTRAL_API_KEY` env var |
|
|
79
|
+
| `model` | Model identifier | `voxtral-mini-transcribe-realtime-2602` |
|
|
80
|
+
| `sample_rate` | Audio sample rate (Hz): 8000, 16000, 22050, 44100, 48000 | `16000` |
|
|
81
|
+
| `client` | Pre-configured Mistral client | `None` |
|
|
82
|
+
|
|
83
|
+
## Events
|
|
84
|
+
|
|
85
|
+
The plugin emits standard STT events:
|
|
86
|
+
|
|
87
|
+
- `STTTranscriptEvent`: Final transcript (emitted at sentence boundaries or stream end)
|
|
88
|
+
- `STTPartialTranscriptEvent`: Emitted for each incoming text delta; carries only the newly transcribed text, not the accumulated transcript
|
|
89
|
+
|
|
90
|
+
## Dependencies
|
|
91
|
+
|
|
92
|
+
- `mistralai[realtime]>=1.12.0`
|
|
93
|
+
- `vision-agents`
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# Mistral Voxtral STT Plugin
|
|
2
|
+
|
|
3
|
+
Mistral Voxtral realtime speech-to-text integration for Vision Agents.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- Real-time speech recognition via WebSocket streaming
|
|
8
|
+
- Low-latency transcription using Voxtral models
|
|
9
|
+
- Automatic language detection
|
|
10
|
+
- Partial transcript streaming for responsive UX
|
|
11
|
+
- Sentence-level final transcripts (triggered by `.`, `?`, `!`)
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
uv add "vision-agents[mistral]"
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Usage
|
|
20
|
+
|
|
21
|
+
```python
|
|
22
|
+
from vision_agents.core import Agent, Runner, User
|
|
23
|
+
from vision_agents.core.agents import AgentLauncher
|
|
24
|
+
from vision_agents.plugins import deepgram, gemini, getstream, mistral
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
async def create_agent(**kwargs) -> Agent:
|
|
28
|
+
return Agent(
|
|
29
|
+
edge=getstream.Edge(),
|
|
30
|
+
agent_user=User(name="Assistant", id="agent"),
|
|
31
|
+
instructions="You're a helpful voice AI assistant. Keep replies short and conversational.",
|
|
32
|
+
stt=mistral.STT(),
|
|
33
|
+
tts=deepgram.TTS(),
|
|
34
|
+
llm=gemini.LLM("gemini-2.0-flash"),
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
|
|
39
|
+
await agent.create_user()
|
|
40
|
+
call = await agent.create_call(call_type, call_id)
|
|
41
|
+
|
|
42
|
+
async with agent.join(call):
|
|
43
|
+
await agent.run()
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
if __name__ == "__main__":
|
|
47
|
+
Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli()
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Run with:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
uv run plugins/mistral/example/mistral_stt_example.py run
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Turn Detection
|
|
57
|
+
|
|
58
|
+
Mistral Voxtral STT does not include built-in turn detection (`turn_detection=False`). You'll need to pair it with an external turn detection plugin.
|
|
59
|
+
|
|
60
|
+
## Configuration
|
|
61
|
+
|
|
62
|
+
| Parameter | Description | Default |
|
|
63
|
+
|-----------|-------------|---------|
|
|
64
|
+
| `api_key` | Mistral API key | `MISTRAL_API_KEY` env var |
|
|
65
|
+
| `model` | Model identifier | `voxtral-mini-transcribe-realtime-2602` |
|
|
66
|
+
| `sample_rate` | Audio sample rate (Hz): 8000, 16000, 22050, 44100, 48000 | `16000` |
|
|
67
|
+
| `client` | Pre-configured Mistral client | `None` |
|
|
68
|
+
|
|
69
|
+
## Events
|
|
70
|
+
|
|
71
|
+
The plugin emits standard STT events:
|
|
72
|
+
|
|
73
|
+
- `STTTranscriptEvent`: Final transcript (emitted at sentence boundaries or stream end)
|
|
74
|
+
- `STTPartialTranscriptEvent`: Emitted for each incoming text delta; carries only the newly transcribed text, not the accumulated transcript
|
|
75
|
+
|
|
76
|
+
## Dependencies
|
|
77
|
+
|
|
78
|
+
- `mistralai[realtime]>=1.12.0`
|
|
79
|
+
- `vision-agents`
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling", "hatch-vcs"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "vision-agents-plugins-mistral"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "Mistral Voxtral STT integration for Vision Agents"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
keywords = ["mistral", "voxtral", "real-time", "STT", "speech-to-text", "transcription", "AI", "voice agents", "agents"]
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
license = "MIT"
|
|
13
|
+
dependencies = [
|
|
14
|
+
"vision-agents",
|
|
15
|
+
"mistralai[realtime]>=1.12.0",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[project.urls]
|
|
19
|
+
Documentation = "https://visionagents.ai/"
|
|
20
|
+
Website = "https://visionagents.ai/"
|
|
21
|
+
Source = "https://github.com/GetStream/Vision-Agents"
|
|
22
|
+
|
|
23
|
+
[tool.hatch.version]
|
|
24
|
+
source = "vcs"
|
|
25
|
+
raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }
|
|
26
|
+
|
|
27
|
+
[tool.hatch.build.targets.wheel]
|
|
28
|
+
packages = [".", "vision_agents"]
|
|
29
|
+
|
|
30
|
+
[tool.hatch.build.targets.sdist]
|
|
31
|
+
include = ["/vision_agents"]
|
|
32
|
+
|
|
33
|
+
[tool.uv.sources]
|
|
34
|
+
vision-agents = { workspace = true }
|
|
35
|
+
|
|
36
|
+
[dependency-groups]
|
|
37
|
+
dev = [
|
|
38
|
+
"pytest>=8.4.1",
|
|
39
|
+
"pytest-asyncio>=1.0.0",
|
|
40
|
+
]
|
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
import time
|
|
5
|
+
from typing import Any, Optional
|
|
6
|
+
|
|
7
|
+
from getstream.video.rtc.track_util import PcmData
|
|
8
|
+
from mistralai import Mistral
|
|
9
|
+
from mistralai.extra.realtime import AudioFormat, RealtimeConnection
|
|
10
|
+
from mistralai.models import (
|
|
11
|
+
RealtimeTranscriptionError,
|
|
12
|
+
TranscriptionStreamDone,
|
|
13
|
+
TranscriptionStreamTextDelta,
|
|
14
|
+
)
|
|
15
|
+
from vision_agents.core import stt
|
|
16
|
+
from vision_agents.core.edge.types import Participant
|
|
17
|
+
from vision_agents.core.stt import TranscriptResponse
|
|
18
|
+
from vision_agents.core.utils.utils import cancel_and_wait
|
|
19
|
+
|
|
20
|
+
# Module-level logger, named after this module so log output can be filtered per plugin.
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class STT(stt.STT):
    """
    Mistral Voxtral Realtime Speech-to-Text implementation.

    Uses WebSocket streaming for low-latency transcription. Audio handed to
    ``process_audio`` is resampled to mono at ``sample_rate`` and streamed to
    Mistral, while a background task consumes the resulting events and emits:

    - a partial transcript for every non-empty text delta (the delta only,
      not the accumulated text), and
    - a final transcript whenever a delta ends with ``.``, ``?`` or ``!``,
      plus one more when the stream's done event arrives.

    Docs:
    - https://docs.mistral.ai/capabilities/audio_transcription#realtime-transcription
    """

    # This plugin does not perform turn detection itself.
    turn_detection: bool = False

    # Upper bound (seconds) for opening the realtime connection, and for how
    # long ``process_audio`` waits for that connection to become ready.
    _CONNECT_TIMEOUT_S: float = 10.0
    # Upper bound (seconds) ``close`` waits for the receive loop to finish
    # after the end of audio has been signalled.
    _DRAIN_TIMEOUT_S: float = 5.0
    # A delta ending with one of these characters finalizes the sentence.
    _SENTENCE_ENDINGS = (".", "?", "!")

    def __init__(
        self,
        api_key: Optional[str] = None,
        model: str = "voxtral-mini-transcribe-realtime-2602",
        sample_rate: int = 16000,
        client: Optional[Mistral] = None,
    ):
        """
        Initialize Mistral Voxtral STT.

        Args:
            api_key: Mistral API key. If not provided, MISTRAL_API_KEY env var is used.
                Ignored when ``client`` is given.
            model: Model to use for transcription.
            sample_rate: Audio sample rate in Hz. Supports 8000, 16000, 22050, 44100, 48000.
            client: Optional pre-configured Mistral client instance.
        """
        super().__init__(provider_name="mistral")

        if client is not None:
            self._client = client
        else:
            self._client = Mistral(
                api_key=api_key or os.environ.get("MISTRAL_API_KEY")
            )

        self.model = model
        self.sample_rate = sample_rate
        # Realtime connection; None until ``start`` succeeds and after ``close``.
        self._connection: Optional[RealtimeConnection] = None
        # Background task running ``_receive_loop``.
        self._receive_task: Optional[asyncio.Task[Any]] = None
        # Participant of the most recent ``process_audio`` call; transcripts
        # are attributed to it.
        self._current_participant: Optional[Participant] = None
        # Set once the connection is open and the receive loop is running.
        self._connection_ready = asyncio.Event()
        # perf_counter() timestamp of the first audio chunk of the current
        # utterance; used to report processing time on transcripts.
        self._audio_start_time: Optional[float] = None
        # Text deltas collected since the last final transcript.
        self._accumulated_text: str = ""
        # Set when the stream's done event has been handled.
        self._done_received = asyncio.Event()

    async def start(self) -> None:
        """
        Start the Mistral WebSocket connection.

        Opens the realtime connection (bounded by a connect timeout), spawns
        the background receive loop and marks the connection as ready. Calling
        it again while a connection exists only logs a warning.

        Raises:
            asyncio.TimeoutError: If the connection is not established in time.
        """
        await super().start()

        if self._connection is not None:
            logger.warning("Mistral connection already started")
            return

        audio_format = AudioFormat(encoding="pcm_s16le", sample_rate=self.sample_rate)

        self._connection = await asyncio.wait_for(
            self._client.audio.realtime.connect(
                model=self.model,
                audio_format=audio_format,
            ),
            timeout=self._CONNECT_TIMEOUT_S,
        )

        self._receive_task = asyncio.create_task(self._receive_loop())
        self._connection_ready.set()

        logger.info("Mistral WebSocket connection established")

    async def _receive_loop(self) -> None:
        """
        Background task to receive and process events from Mistral.

        Dispatches each event to its handler and stops after the first done
        or error event. Cancellation is re-raised; any other exception is
        logged and, unless the STT is already closed, emitted as an error
        event.
        """
        if self._connection is None:
            return

        try:
            async for event in self._connection:
                logger.debug("Mistral event: %s", type(event).__name__)

                if isinstance(event, TranscriptionStreamTextDelta):
                    self._handle_text_delta(event)
                elif isinstance(event, TranscriptionStreamDone):
                    self._handle_done(event)
                    break  # Exit loop after done
                elif isinstance(event, RealtimeTranscriptionError):
                    self._handle_error(event)
                    break  # Exit loop on error

        except asyncio.CancelledError:
            logger.debug("Mistral receive loop cancelled")
            raise
        except Exception as e:
            logger.error("Error in Mistral receive loop: %s", e)
            if not self.closed:
                self._emit_error_event(e, context="receive_loop")

    def _handle_text_delta(self, event: TranscriptionStreamTextDelta) -> None:
        """
        Handle text delta - emit word-by-word partials, full text on complete.

        The delta is appended to the accumulated text. A partial transcript
        carrying only the (stripped) delta is emitted, and when the delta ends
        with sentence-ending punctuation the accumulated text is emitted as a
        final transcript and the per-utterance state is reset.
        """
        text = event.text
        if not text:
            return

        participant = self._current_participant
        if participant is None:
            logger.warning("Received transcript but no participant set")
            return

        # Accumulate text for complete events
        self._accumulated_text += text

        processing_time_ms: Optional[float] = None
        if self._audio_start_time is not None:
            processing_time_ms = (time.perf_counter() - self._audio_start_time) * 1000

        response = TranscriptResponse(
            model_name=self.model,
            processing_time_ms=processing_time_ms,
        )

        # Emit partial with just the new word/delta (not accumulated)
        text_stripped = text.strip()
        if text_stripped:
            self._emit_partial_transcript_event(text_stripped, participant, response)

        # Check for sentence-ending punctuation - emit complete transcript
        if text.rstrip().endswith(self._SENTENCE_ENDINGS):
            accumulated_stripped = self._accumulated_text.strip()
            if accumulated_stripped:
                self._emit_transcript_event(accumulated_stripped, participant, response)
                self._accumulated_text = ""
                self._audio_start_time = None

    def _handle_done(self, event: TranscriptionStreamDone) -> None:
        """
        Handle end-of-stream event with full transcript.

        Emits ``event.text`` as a final transcript when it is non-empty and a
        participant is known. The per-utterance state is reset and
        ``_done_received`` is set on every path, so a done event with no text
        or no participant still marks the stream as finished.
        """
        try:
            text = (event.text or "").strip()
            if not text:
                return

            participant = self._current_participant
            if participant is None:
                logger.warning("Received done event but no participant set")
                return

            response = TranscriptResponse(
                language=event.language,
                model_name=event.model,
            )

            # NOTE(review): if event.text is the transcript of the whole
            # stream, sentences already finalized in _handle_text_delta are
            # emitted a second time here - confirm against the Mistral API.
            self._emit_transcript_event(text, participant, response)
        finally:
            # Always reset state and signal completion, including on the
            # early returns above.
            self._accumulated_text = ""
            self._audio_start_time = None
            self._done_received.set()

    def _handle_error(self, event: RealtimeTranscriptionError) -> None:
        """Handle error event: log it and emit it as an STT error event."""
        error_msg = str(event.error) if event.error else "Unknown Mistral error"
        logger.error("Mistral transcription error: %s", error_msg)

        error = Exception(error_msg)
        self._emit_error_event(
            error, context="transcription", participant=self._current_participant
        )
        self._audio_start_time = None

    async def process_audio(
        self,
        pcm_data: PcmData,
        participant: Optional[Participant] = None,
    ) -> None:
        """
        Process audio data through Mistral for transcription.

        The audio is resampled to mono at ``sample_rate`` and sent over the
        realtime connection. The chunk is dropped (with a warning) when the
        STT is closed, when the connection does not become ready within the
        connect timeout, or when the connection is unavailable.

        Args:
            pcm_data: The PCM audio data to process.
            participant: Optional participant metadata.
        """
        if self.closed:
            logger.warning("Mistral STT is closed, ignoring audio")
            return

        if not self._connection_ready.is_set():
            # Bound the wait: if start() was never called or failed, an
            # unbounded wait would park this coroutine forever.
            try:
                await asyncio.wait_for(
                    self._connection_ready.wait(),
                    timeout=self._CONNECT_TIMEOUT_S,
                )
            except asyncio.TimeoutError:
                logger.warning("Mistral connection not available")
                return

        if self._connection is None or self._connection.is_closed:
            logger.warning("Mistral connection not available")
            return

        resampled = pcm_data.resample(self.sample_rate, 1)
        audio_bytes = resampled.samples.tobytes()

        self._current_participant = participant

        # Start the latency clock on the first chunk of a new utterance.
        if self._audio_start_time is None:
            self._audio_start_time = time.perf_counter()

        await self._connection.send_audio(audio_bytes)

    async def close(self) -> None:
        """
        Close the Mistral connection and clean up resources.

        Signals end of audio, gives the receive loop a bounded amount of time
        to finish processing, then cancels the receive task, closes the
        connection and resets all state.
        """
        await super().close()

        # Signal end of audio to trigger Done event with full transcript
        if self._connection and not self._connection.is_closed:
            try:
                await self._connection.end_audio()
            except Exception as e:
                logger.warning("Error signaling end of audio: %s", e)

        receive_task = self._receive_task
        if receive_task is not None:
            # Wait for the receive loop itself rather than only the done
            # event: the loop also ends on error events and exceptions, and
            # those paths never set _done_received.
            if not receive_task.done():
                _, pending = await asyncio.wait(
                    {receive_task}, timeout=self._DRAIN_TIMEOUT_S
                )
                if pending:
                    logger.debug("Timeout waiting for done event")

            await cancel_and_wait(receive_task)
            self._receive_task = None

        if self._connection:
            try:
                await self._connection.close()
            except Exception as e:
                logger.warning("Error closing Mistral connection: %s", e)
            finally:
                self._connection = None
                self._connection_ready.clear()
                self._done_received.clear()

        self._audio_start_time = None
        self._accumulated_text = ""
|