vision-agents-plugins-fast-whisper 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agents_plugins_fast_whisper-0.2.0/.gitignore +90 -0
- vision_agents_plugins_fast_whisper-0.2.0/PKG-INFO +31 -0
- vision_agents_plugins_fast_whisper-0.2.0/README.md +17 -0
- vision_agents_plugins_fast_whisper-0.2.0/pyproject.toml +41 -0
- vision_agents_plugins_fast_whisper-0.2.0/vision_agents/plugins/fast_whisper/__init__.py +4 -0
- vision_agents_plugins_fast_whisper-0.2.0/vision_agents/plugins/fast_whisper/stt.py +239 -0
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.cursor/*
|
|
7
|
+
# Distribution / packaging
|
|
8
|
+
.Python
|
|
9
|
+
build/
|
|
10
|
+
dist/
|
|
11
|
+
downloads/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
eggs/
|
|
14
|
+
.eggs/
|
|
15
|
+
lib64/
|
|
16
|
+
parts/
|
|
17
|
+
sdist/
|
|
18
|
+
var/
|
|
19
|
+
wheels/
|
|
20
|
+
share/python-wheels/
|
|
21
|
+
pip-wheel-metadata/
|
|
22
|
+
MANIFEST
|
|
23
|
+
*.egg-info/
|
|
24
|
+
*.egg
|
|
25
|
+
|
|
26
|
+
# Installer logs
|
|
27
|
+
pip-log.txt
|
|
28
|
+
pip-delete-this-directory.txt
|
|
29
|
+
|
|
30
|
+
# Unit test / coverage reports
|
|
31
|
+
htmlcov/
|
|
32
|
+
.tox/
|
|
33
|
+
.nox/
|
|
34
|
+
.coverage
|
|
35
|
+
.coverage.*
|
|
36
|
+
.cache
|
|
37
|
+
coverage.xml
|
|
38
|
+
nosetests.xml
|
|
39
|
+
*.cover
|
|
40
|
+
*.py,cover
|
|
41
|
+
.hypothesis/
|
|
42
|
+
.pytest_cache/
|
|
43
|
+
|
|
44
|
+
# Type checker / lint caches
|
|
45
|
+
.mypy_cache/
|
|
46
|
+
.dmypy.json
|
|
47
|
+
dmypy.json
|
|
48
|
+
.pytype/
|
|
49
|
+
.pyre/
|
|
50
|
+
.ruff_cache/
|
|
51
|
+
|
|
52
|
+
# Environments
|
|
53
|
+
.venv
|
|
54
|
+
env/
|
|
55
|
+
venv/
|
|
56
|
+
ENV/
|
|
57
|
+
env.bak/
|
|
58
|
+
venv.bak/
|
|
59
|
+
.env
|
|
60
|
+
.env.local
|
|
61
|
+
.env.*.local
|
|
62
|
+
.env.bak
|
|
63
|
+
pyvenv.cfg
|
|
64
|
+
.python-version
|
|
65
|
+
|
|
66
|
+
# Editors / IDEs
|
|
67
|
+
.vscode/
|
|
68
|
+
.idea/
|
|
69
|
+
|
|
70
|
+
# Jupyter Notebook
|
|
71
|
+
.ipynb_checkpoints/
|
|
72
|
+
|
|
73
|
+
# OS / Misc
|
|
74
|
+
.DS_Store
|
|
75
|
+
*.log
|
|
76
|
+
|
|
77
|
+
# Tooling & repo-specific
|
|
78
|
+
pyrightconfig.json
|
|
79
|
+
shell.nix
|
|
80
|
+
bin/*
|
|
81
|
+
lib/*
|
|
82
|
+
stream-py/
|
|
83
|
+
|
|
84
|
+
# Artifacts / assets
|
|
85
|
+
*.pt
|
|
86
|
+
*.kef
|
|
87
|
+
*.onnx
|
|
88
|
+
profile.html
|
|
89
|
+
|
|
90
|
+
/opencode.json
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vision-agents-plugins-fast-whisper
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Fast Whisper STT integration for Vision Agents
|
|
5
|
+
Project-URL: Documentation, https://visionagents.ai/
|
|
6
|
+
Project-URL: Website, https://visionagents.ai/
|
|
7
|
+
Project-URL: Source, https://github.com/GetStream/Vision-Agents
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
Keywords: AI,STT,agents,faster-whisper,speech-to-text,voice agents,whisper
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Requires-Dist: faster-whisper>=1.0.0
|
|
12
|
+
Requires-Dist: vision-agents
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
# Fast Whisper STT Plugin
|
|
16
|
+
|
|
17
|
+
Fast Whisper STT plugin for Vision Agents, providing near-real-time audio transcription using [faster-whisper](https://github.com/SYSTRAN/faster-whisper).
|
|
18
|
+
|
|
19
|
+
## Features
|
|
20
|
+
|
|
21
|
+
- Fast inference using CTranslate2-based Whisper implementation
|
|
22
|
+
- Support for multiple model sizes (tiny, base, small, medium, large, large-v2, large-v3)
|
|
23
|
+
- Automatic language detection or manual language specification
|
|
24
|
+
- CPU and GPU support
|
|
25
|
+
- Quantization support (int8, float16, float32)
|
|
26
|
+
|
|
27
|
+
## Installation
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
uv add "vision-agents[fast-whisper]"
|
|
31
|
+
```
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# Fast Whisper STT Plugin
|
|
2
|
+
|
|
3
|
+
Fast Whisper STT plugin for Vision Agents, providing near-real-time audio transcription using [faster-whisper](https://github.com/SYSTRAN/faster-whisper).
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- Fast inference using CTranslate2-based Whisper implementation
|
|
8
|
+
- Support for multiple model sizes (tiny, base, small, medium, large, large-v2, large-v3)
|
|
9
|
+
- Automatic language detection or manual language specification
|
|
10
|
+
- CPU and GPU support
|
|
11
|
+
- Quantization support (int8, float16, float32)
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
uv add "vision-agents[fast-whisper]"
|
|
17
|
+
```
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling", "hatch-vcs"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "vision-agents-plugins-fast-whisper"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "Fast Whisper STT integration for Vision Agents"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
keywords = ["faster-whisper", "STT", "speech-to-text", "AI", "voice agents", "agents", "whisper"]
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
license = "MIT"
|
|
13
|
+
dependencies = [
|
|
14
|
+
"vision-agents",
|
|
15
|
+
"faster-whisper>=1.0.0",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[project.urls]
|
|
19
|
+
Documentation = "https://visionagents.ai/"
|
|
20
|
+
Website = "https://visionagents.ai/"
|
|
21
|
+
Source = "https://github.com/GetStream/Vision-Agents"
|
|
22
|
+
|
|
23
|
+
[tool.hatch.version]
|
|
24
|
+
source = "vcs"
|
|
25
|
+
raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }
|
|
26
|
+
|
|
27
|
+
[tool.hatch.build.targets.wheel]
|
|
28
|
+
packages = [".", "vision_agents"]
|
|
29
|
+
|
|
30
|
+
[tool.hatch.build.targets.sdist]
|
|
31
|
+
include = ["/vision_agents"]
|
|
32
|
+
|
|
33
|
+
[tool.uv.sources]
|
|
34
|
+
vision-agents = { workspace = true }
|
|
35
|
+
|
|
36
|
+
[dependency-groups]
|
|
37
|
+
dev = [
|
|
38
|
+
"pytest>=8.4.1",
|
|
39
|
+
"pytest-asyncio>=1.0.0",
|
|
40
|
+
]
|
|
41
|
+
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
import time
|
|
4
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
5
|
+
from typing import Literal, Optional
|
|
6
|
+
|
|
7
|
+
from faster_whisper import WhisperModel
|
|
8
|
+
from faster_whisper.transcribe import Segment, TranscriptionInfo
|
|
9
|
+
from getstream.video.rtc.track_util import AudioFormat, PcmData
|
|
10
|
+
from numpy.typing import NDArray
|
|
11
|
+
from vision_agents.core import stt
|
|
12
|
+
from vision_agents.core.edge.types import Participant
|
|
13
|
+
from vision_agents.core.stt.events import TranscriptResponse
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)

# Buffering parameters used by STT.process_audio.
# Incoming audio is resampled to this rate (in Hz) before it is buffered.
RATE = 16_000
# The buffer is never transcribed while it holds less than this much audio.
MIN_BUFFER_DURATION_MS = 1_000
# Once the buffer holds this much audio it is transcribed even if the
# processing interval below has not elapsed yet.
MAX_BUFFER_DURATION_MS = 8_000
# Minimum time between two transcription runs for a buffer below the maximum.
PROCESS_INTERVAL_MS = 2_000
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class STT(stt.STT):
    """
    Faster-Whisper Speech-to-Text implementation.

    faster-whisper transcribes complete audio arrays rather than a live
    stream, so this class approximates streaming:

    1. Incoming audio chunks are resampled to 16 kHz float32 and buffered.
    2. The buffer is transcribed once it holds at least
       ``MIN_BUFFER_DURATION_MS`` of audio and either ``PROCESS_INTERVAL_MS``
       has elapsed since the last run or the buffer has reached
       ``MAX_BUFFER_DURATION_MS``.
    3. A partial transcript is emitted for every non-empty segment.
    4. A final transcript with the joined segment text is emitted per buffer.
    """

    def __init__(
        self,
        model_size: Literal[
            "tiny", "base", "small", "medium", "large", "large-v2", "large-v3"
        ] = "tiny",
        language: Optional[str] = "en",
        device: Literal["cpu", "cuda"] = "cpu",
        client: Optional[WhisperModel] = None,
        compute_type: str = "int8",
    ):
        """
        Initialize Faster-Whisper STT.

        Args:
            model_size: Whisper model size passed to ``WhisperModel``.
            language: Language code (e.g., "en", "es", "fr") or None for auto-detect
            device: Device to run on ("cpu" or "cuda")
            client: Optional pre-initialized WhisperModel instance. When given,
                ``warmup()`` does not load another model.
            compute_type: Quantization passed to ``WhisperModel`` when the model
                is loaded by ``warmup()`` (e.g. "int8", "float16", "float32").
        """
        super().__init__(provider_name="faster_whisper")

        self.model_size = model_size
        self.language = language
        self.device = device
        self.compute_type = compute_type

        self.whisper = client

        self._audio_buffer = PcmData(
            sample_rate=RATE, channels=1, format=AudioFormat.F32
        )
        # Monotonic clock: only elapsed time matters here, and wall-clock
        # adjustments must not trigger or delay processing.
        self._last_process_time = time.monotonic()
        # A single worker serializes model loading and transcription calls.
        self._executor = ThreadPoolExecutor(max_workers=1)

    async def warmup(self) -> None:
        """Load the Whisper model if one was not already provided."""
        if self.whisper is not None:
            return

        logger.info("Loading faster-whisper model: %s", self.model_size)
        # Load whisper in the thread pool to avoid blocking the event loop
        loop = asyncio.get_running_loop()
        self.whisper = await loop.run_in_executor(
            self._executor,
            lambda: WhisperModel(
                self.model_size, device=self.device, compute_type=self.compute_type
            ),
        )
        logger.info("Faster-whisper model loaded")

    async def process_audio(
        self,
        pcm_data: PcmData,
        participant: Optional[Participant] = None,
    ):
        """
        Buffer audio and transcribe the buffer when it is due.

        Args:
            pcm_data: The PCM audio data to process
            participant: Optional participant metadata

        Raises:
            ValueError: If the Whisper model has not been loaded yet.
        """
        if self.closed:
            logger.warning("Faster-Whisper STT is closed, ignoring audio")
            return

        if self.whisper is None:
            raise ValueError("Whisper model not loaded, call warmup() first")

        # Nothing to buffer for empty audio
        if pcm_data.samples.size == 0:
            return

        try:
            # Ensure audio is in the right format: 16kHz, float32
            # NOTE(review): the buffer is declared mono; assumes resample()
            # yields single-channel audio -- confirm for multi-channel input.
            audio_data = pcm_data.resample(RATE).to_float32()
            self._audio_buffer = self._audio_buffer.append(audio_data)

            buffer_duration_ms = self._audio_buffer.duration_ms
            buffer_size = self._audio_buffer.samples.size
            elapsed_ms = (time.monotonic() - self._last_process_time) * 1000

            has_enough_audio = (
                buffer_duration_ms >= MIN_BUFFER_DURATION_MS and buffer_size > 0
            )
            is_due = (
                elapsed_ms >= PROCESS_INTERVAL_MS
                or buffer_duration_ms >= MAX_BUFFER_DURATION_MS
            )

            if has_enough_audio and is_due:
                await self._process_buffer(participant)

        except Exception as e:
            logger.exception("Error buffering audio for faster-whisper")
            self._emit_error_event(
                e, context="buffering_audio", participant=participant
            )

    async def _process_buffer(self, participant: Optional[Participant] = None):
        """
        Transcribe the current audio buffer and emit transcript events.

        The buffer is swapped for an empty one before transcription starts, so
        a failed transcription drops that audio instead of retrying it.

        Args:
            participant: Optional participant metadata
        """
        # Take ownership of the buffered audio and start a fresh buffer
        buffer_to_process = self._audio_buffer
        self._audio_buffer = PcmData(
            sample_rate=RATE, channels=1, format=AudioFormat.F32
        )
        self._last_process_time = time.monotonic()

        # Ensure it's 16kHz and f32 format
        pcm = buffer_to_process.resample(RATE).to_float32()
        audio_array = pcm.samples

        if audio_array.size == 0:
            return

        start_time = time.perf_counter()

        try:
            segments, info = await self._transcribe(audio_array=audio_array)
        except Exception as e:
            logger.error(
                "Error processing audio buffer with faster-whisper", exc_info=e
            )
            self._emit_error_event(e, context="transcription", participant=participant)
            return

        processing_time_ms = (time.perf_counter() - start_time) * 1000

        # Create default participant if none provided
        if participant is None:
            participant = Participant(original=None, user_id="unknown")

        audio_duration_ms = buffer_to_process.duration_ms

        text_parts = []
        for segment in segments:
            text = segment.text.strip()
            if not text:
                continue
            text_parts.append(text)

            # Emit partial transcript for each non-empty segment
            response = self._build_response(
                info,
                confidence=self._segment_confidence(segment),
                processing_time_ms=processing_time_ms,
                audio_duration_ms=audio_duration_ms,
            )
            self._emit_partial_transcript_event(text, participant, response)

        # Emit final transcript for the complete buffer
        if text_parts:
            full_text = " ".join(text_parts).strip()
            response = self._build_response(
                info,
                confidence=None,  # faster-whisper doesn't provide overall confidence
                processing_time_ms=processing_time_ms,
                audio_duration_ms=audio_duration_ms,
            )
            self._emit_transcript_event(full_text, participant, response)

    @staticmethod
    def _segment_confidence(segment: Segment) -> Optional[float]:
        """Return a confidence value for a segment, or None if unavailable.

        ``no_speech_prob`` is the probability that the segment contains no
        speech, so it is inverted: a high no-speech probability must map to a
        low confidence, not a high one.
        """
        no_speech_prob = getattr(segment, "no_speech_prob", None)
        if no_speech_prob is None:
            return None
        return 1.0 - no_speech_prob

    def _build_response(
        self,
        info: TranscriptionInfo,
        confidence: Optional[float],
        processing_time_ms: float,
        audio_duration_ms: float,
    ) -> TranscriptResponse:
        """Build the response metadata shared by partial and final transcripts."""
        return TranscriptResponse(
            confidence=confidence,
            # Prefer the language reported by the model, else the configured one
            language=getattr(info, "language", self.language),
            processing_time_ms=processing_time_ms,
            audio_duration_ms=audio_duration_ms,
            model_name=f"faster-whisper-{self.model_size}",
        )

    async def close(self):
        """Close the STT and shut down the worker thread pool.

        Audio still sitting in the buffer is not transcribed.
        """
        await super().close()
        self._executor.shutdown(wait=False)

    async def _transcribe(
        self, audio_array: NDArray
    ) -> tuple[list[Segment], TranscriptionInfo]:
        """
        Run faster-whisper on ``audio_array`` in the worker thread.

        Args:
            audio_array: Audio samples to transcribe.

        Returns:
            The fully evaluated list of segments and the transcription info.

        Raises:
            ValueError: If the Whisper model has not been loaded yet.
        """
        if self.whisper is None:
            raise ValueError("Whisper model not loaded, call warmup() first")

        whisper = self.whisper  # Type narrowing for closure

        def _worker():
            segments, info = whisper.transcribe(
                audio_array,
                language=self.language,
                beam_size=1,
                # faster-whisper's built-in VAD filtering is disabled, so all
                # buffered audio is passed to the model.
                vad_filter=False,
            )
            # Evaluate the generator in the background thread
            return list(segments), info

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(self._executor, _worker)
|