vision-agents-plugins-fast-whisper 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,90 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .cursor/*
7
+ # Distribution / packaging
8
+ .Python
9
+ build/
10
+ dist/
11
+ downloads/
12
+ develop-eggs/
13
+ eggs/
14
+ .eggs/
15
+ lib64/
16
+ parts/
17
+ sdist/
18
+ var/
19
+ wheels/
20
+ share/python-wheels/
21
+ pip-wheel-metadata/
22
+ MANIFEST
23
+ *.egg-info/
24
+ *.egg
25
+
26
+ # Installer logs
27
+ pip-log.txt
28
+ pip-delete-this-directory.txt
29
+
30
+ # Unit test / coverage reports
31
+ htmlcov/
32
+ .tox/
33
+ .nox/
34
+ .coverage
35
+ .coverage.*
36
+ .cache
37
+ coverage.xml
38
+ nosetests.xml
39
+ *.cover
40
+ *.py,cover
41
+ .hypothesis/
42
+ .pytest_cache/
43
+
44
+ # Type checker / lint caches
45
+ .mypy_cache/
46
+ .dmypy.json
47
+ dmypy.json
48
+ .pytype/
49
+ .pyre/
50
+ .ruff_cache/
51
+
52
+ # Environments
53
+ .venv
54
+ env/
55
+ venv/
56
+ ENV/
57
+ env.bak/
58
+ venv.bak/
59
+ .env
60
+ .env.local
61
+ .env.*.local
62
+ .env.bak
63
+ pyvenv.cfg
64
+ .python-version
65
+
66
+ # Editors / IDEs
67
+ .vscode/
68
+ .idea/
69
+
70
+ # Jupyter Notebook
71
+ .ipynb_checkpoints/
72
+
73
+ # OS / Misc
74
+ .DS_Store
75
+ *.log
76
+
77
+ # Tooling & repo-specific
78
+ pyrightconfig.json
79
+ shell.nix
80
+ bin/*
81
+ lib/*
82
+ stream-py/
83
+
84
+ # Artifacts / assets
85
+ *.pt
86
+ *.kef
87
+ *.onnx
88
+ profile.html
89
+
90
+ /opencode.json
@@ -0,0 +1,31 @@
1
+ Metadata-Version: 2.4
2
+ Name: vision-agents-plugins-fast-whisper
3
+ Version: 0.2.4
4
+ Summary: Fast Whisper STT integration for Vision Agents
5
+ Project-URL: Documentation, https://visionagents.ai/
6
+ Project-URL: Website, https://visionagents.ai/
7
+ Project-URL: Source, https://github.com/GetStream/Vision-Agents
8
+ License-Expression: MIT
9
+ Keywords: AI,STT,agents,faster-whisper,speech-to-text,voice agents,whisper
10
+ Requires-Python: >=3.10
11
+ Requires-Dist: faster-whisper>=1.0.0
12
+ Requires-Dist: vision-agents
13
+ Description-Content-Type: text/markdown
14
+
15
+ # Fast Whisper STT Plugin
16
+
17
+ Fast Whisper STT plugin for Vision Agents, providing near-real-time audio transcription using [faster-whisper](https://github.com/SYSTRAN/faster-whisper).
18
+
19
+ ## Features
20
+
21
+ - Fast inference using CTranslate2-based Whisper implementation
22
+ - Support for multiple model sizes (tiny, base, small, medium, large, large-v2, large-v3)
23
+ - Automatic language detection or manual language specification
24
+ - CPU and GPU support
25
+ - Quantization support (int8, float16, float32)
26
+
27
+ ## Installation
28
+
29
+ ```bash
30
+ uv add vision-agents[fast-whisper]
31
+ ```
@@ -0,0 +1,17 @@
1
+ # Fast Whisper STT Plugin
2
+
3
+ Fast Whisper STT plugin for Vision Agents, providing near-real-time audio transcription using [faster-whisper](https://github.com/SYSTRAN/faster-whisper).
4
+
5
+ ## Features
6
+
7
+ - Fast inference using CTranslate2-based Whisper implementation
8
+ - Support for multiple model sizes (tiny, base, small, medium, large, large-v2, large-v3)
9
+ - Automatic language detection or manual language specification
10
+ - CPU and GPU support
11
+ - Quantization support (int8, float16, float32)
12
+
13
+ ## Installation
14
+
15
+ ```bash
16
+ uv add vision-agents[fast-whisper]
17
+ ```
@@ -0,0 +1,41 @@
1
+ [build-system]
2
+ requires = ["hatchling", "hatch-vcs"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "vision-agents-plugins-fast-whisper"
7
+ dynamic = ["version"]
8
+ description = "Fast Whisper STT integration for Vision Agents"
9
+ readme = "README.md"
10
+ keywords = ["faster-whisper", "STT", "speech-to-text", "AI", "voice agents", "agents", "whisper"]
11
+ requires-python = ">=3.10"
12
+ license = "MIT"
13
+ dependencies = [
14
+ "vision-agents",
15
+ "faster-whisper>=1.0.0",
16
+ ]
17
+
18
+ [project.urls]
19
+ Documentation = "https://visionagents.ai/"
20
+ Website = "https://visionagents.ai/"
21
+ Source = "https://github.com/GetStream/Vision-Agents"
22
+
23
+ [tool.hatch.version]
24
+ source = "vcs"
25
+ raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }
26
+
27
+ [tool.hatch.build.targets.wheel]
28
+ packages = [".", "vision_agents"]
29
+
30
+ [tool.hatch.build.targets.sdist]
31
+ include = ["/vision_agents"]
32
+
33
+ [tool.uv.sources]
34
+ vision-agents = { workspace = true }
35
+
36
+ [dependency-groups]
37
+ dev = [
38
+ "pytest>=8.4.1",
39
+ "pytest-asyncio>=1.0.0",
40
+ ]
41
+
@@ -0,0 +1,3 @@
1
+ from .stt import STT
2
+
3
+ __all__ = ["STT"]
@@ -0,0 +1,239 @@
1
+ import asyncio
2
+ import logging
3
+ import time
4
+ from concurrent.futures import ThreadPoolExecutor
5
+ from typing import Literal, Optional
6
+
7
+ from faster_whisper import WhisperModel
8
+ from faster_whisper.transcribe import Segment, TranscriptionInfo
9
+ from getstream.video.rtc.track_util import AudioFormat, PcmData
10
+ from numpy.typing import NDArray
11
+ from vision_agents.core import stt
12
+ from vision_agents.core.edge.types import Participant
13
+ from vision_agents.core.stt.events import TranscriptResponse
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
# Buffering parameters for the faster-whisper STT (durations in milliseconds).
RATE = 16_000  # Hz; incoming audio is resampled to this rate before buffering
MIN_BUFFER_DURATION_MS = 1_000  # never transcribe less than 1 s of audio
MAX_BUFFER_DURATION_MS = 8_000  # transcribe right away once 8 s are buffered
PROCESS_INTERVAL_MS = 2_000  # otherwise transcribe at most once every 2 s
24
+
25
+
26
class STT(stt.STT):
    """
    Faster-Whisper Speech-to-Text implementation.

    faster-whisper transcribes whole audio arrays rather than a live stream,
    so this class approximates streaming by:

    1. Buffering incoming audio chunks (resampled to 16 kHz float32).
    2. Transcribing the buffer once it holds at least
       ``MIN_BUFFER_DURATION_MS`` of audio and either ``PROCESS_INTERVAL_MS``
       has elapsed since the last run or the buffer has reached
       ``MAX_BUFFER_DURATION_MS``.
    3. Emitting a partial transcript for every non-empty segment.
    4. Emitting one final transcript with the joined text of the buffer.
    """

    def __init__(
        self,
        model_size: Literal[
            "tiny", "base", "small", "medium", "large", "large-v2", "large-v3"
        ] = "tiny",
        language: Optional[str] = "en",
        device: Literal["cpu", "cuda"] = "cpu",
        client: Optional[WhisperModel] = None,
        compute_type: str = "int8",
    ):
        """
        Initialize Faster-Whisper STT.

        The model itself is not loaded here; call ``warmup()`` before
        ``process_audio()`` unless a ready ``client`` is passed in.

        Args:
            model_size: Whisper model size (tiny, base, small, medium, large,
                large-v2, large-v3)
            language: Language code (e.g., "en", "es", "fr") or None for auto-detect
            device: Device to run on ("cpu" or "cuda")
            client: Optional pre-initialized WhisperModel instance
            compute_type: Quantization passed to WhisperModel when the model is
                loaded by ``warmup()`` (e.g. "int8", "float16", "float32").
                Ignored when ``client`` is provided.
        """
        super().__init__(provider_name="faster_whisper")

        self.model_size = model_size
        self.language = language
        self.device = device
        self.compute_type = compute_type

        self.whisper = client

        self._audio_buffer = PcmData(
            sample_rate=RATE, channels=1, format=AudioFormat.F32
        )
        self._last_process_time = time.time()
        # Single worker: model loading and transcriptions run one at a time,
        # off the event loop.
        self._executor = ThreadPoolExecutor(max_workers=1)

    async def warmup(self) -> None:
        """Load the Whisper model unless one was provided or is already loaded."""
        if self.whisper is not None:
            return

        logger.info("Loading faster-whisper model: %s", self.model_size)
        # Load whisper in the thread pool to avoid blocking the event loop
        loop = asyncio.get_running_loop()
        self.whisper = await loop.run_in_executor(
            self._executor,
            lambda: WhisperModel(
                self.model_size, device=self.device, compute_type=self.compute_type
            ),
        )
        logger.info("Faster-whisper model loaded")

    async def process_audio(
        self,
        pcm_data: PcmData,
        participant: Optional[Participant] = None,
    ):
        """
        Buffer audio and transcribe the buffer when it is due.

        Audio is resampled to 16 kHz float32 and appended to the internal
        buffer. The buffer is transcribed once it holds at least
        MIN_BUFFER_DURATION_MS of audio and either PROCESS_INTERVAL_MS has
        passed since the last run or it has reached MAX_BUFFER_DURATION_MS.

        Args:
            pcm_data: The PCM audio data to process
            participant: Optional participant metadata

        Raises:
            ValueError: If the Whisper model has not been loaded yet.
        """
        if self.closed:
            logger.warning("Faster-Whisper STT is closed, ignoring audio")
            return

        if self.whisper is None:
            raise ValueError("Whisper model not loaded, call warmup() first")

        # Nothing to buffer
        if pcm_data.samples.size == 0:
            return

        try:
            # Normalize to the buffer's format: 16kHz, float32
            audio_data = pcm_data.resample(RATE).to_float32()
            self._audio_buffer = self._audio_buffer.append(audio_data)

            buffer_duration_ms = self._audio_buffer.duration_ms
            buffer_size = self._audio_buffer.samples.size
            elapsed_ms = (time.time() - self._last_process_time) * 1000

            has_enough_audio = (
                buffer_duration_ms >= MIN_BUFFER_DURATION_MS and buffer_size > 0
            )
            is_due = (
                elapsed_ms >= PROCESS_INTERVAL_MS
                or buffer_duration_ms >= MAX_BUFFER_DURATION_MS
            )

            if has_enough_audio and is_due:
                await self._process_buffer(participant)

        except Exception as e:
            # Best effort: report the failure as an error event instead of
            # raising into the caller's audio pipeline.
            logger.exception("Error buffering audio for faster-whisper")
            self._emit_error_event(
                e, context="buffering_audio", participant=participant
            )

    async def _process_buffer(self, participant: Optional[Participant] = None):
        """
        Transcribe the current audio buffer and emit transcript events.

        Emits one partial transcript per non-empty segment, then one final
        transcript containing the joined text. Transcription failures are
        logged and reported through an error event rather than raised.

        Args:
            participant: Optional participant metadata
        """
        # Swap in a fresh buffer before awaiting the transcription, so audio
        # that arrives in the meantime is appended to the new buffer.
        buffer_to_process = self._audio_buffer
        self._audio_buffer = PcmData(
            sample_rate=RATE, channels=1, format=AudioFormat.F32
        )
        self._last_process_time = time.time()

        # Ensure it's 16kHz and f32 format
        audio_array = buffer_to_process.resample(RATE).to_float32().samples
        if audio_array.size == 0:
            return

        # perf_counter is monotonic, so the measured latency cannot be skewed
        # by wall-clock adjustments.
        started = time.perf_counter()
        try:
            segments, info = await self._transcribe(audio_array=audio_array)
        except Exception as e:
            logger.error(
                "Error processing audio buffer with faster-whisper", exc_info=e
            )
            self._emit_error_event(e, context="transcription", participant=participant)
            return
        processing_time_ms = (time.perf_counter() - started) * 1000

        # Create default participant if none provided
        if participant is None:
            participant = Participant(original=None, user_id="unknown")

        # Metadata shared by every event emitted for this buffer
        language = getattr(info, "language", self.language)
        audio_duration_ms = buffer_to_process.duration_ms
        model_name = f"faster-whisper-{self.model_size}"

        def _build_response(confidence: Optional[float]) -> TranscriptResponse:
            return TranscriptResponse(
                confidence=confidence,
                language=language,
                processing_time_ms=processing_time_ms,
                audio_duration_ms=audio_duration_ms,
                model_name=model_name,
            )

        text_parts = []
        for segment in segments:
            text = segment.text.strip()
            if not text:
                continue
            text_parts.append(text)

            # no_speech_prob is the probability that the segment contains NO
            # speech, so the confidence that it is speech is its complement.
            no_speech_prob = getattr(segment, "no_speech_prob", None)
            confidence = None if no_speech_prob is None else 1.0 - no_speech_prob

            # Emit partial transcript for each segment
            self._emit_partial_transcript_event(
                text, participant, _build_response(confidence)
            )

        # Emit final transcript for the complete buffer
        if text_parts:
            full_text = " ".join(text_parts).strip()
            # faster-whisper doesn't provide an overall confidence
            self._emit_transcript_event(
                full_text, participant, _build_response(None)
            )

    async def close(self):
        """Close the STT and cleanup resources."""
        await super().close()
        # Do not block the event loop waiting for an in-flight transcription.
        self._executor.shutdown(wait=False)

    async def _transcribe(
        self, audio_array: NDArray
    ) -> tuple[list[Segment], TranscriptionInfo]:
        """
        Run faster-whisper on ``audio_array`` in the worker thread.

        Args:
            audio_array: Audio samples to transcribe (the caller passes
                16 kHz float32 samples)

        Returns:
            The fully materialized list of segments and the transcription info.

        Raises:
            ValueError: If the Whisper model has not been loaded yet.
        """
        if self.whisper is None:
            raise ValueError("Whisper model not loaded, call warmup() first")

        whisper = self.whisper  # Type narrowing for closure

        def _worker():
            segments, info = whisper.transcribe(
                audio_array,
                language=self.language,
                beam_size=1,
                # faster-whisper's built-in VAD filter is disabled: the whole
                # buffer is transcribed as-is.
                vad_filter=False,
            )
            # `segments` is a lazy generator; consume it here so the actual
            # decoding happens in the background thread, not on the event loop.
            return list(segments), info

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(self._executor, _worker)