vision-agents-plugins-wizper 0.1.9__tar.gz → 0.1.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of vision-agents-plugins-wizper might be problematic. Click here for more details.
- {vision_agents_plugins_wizper-0.1.9 → vision_agents_plugins_wizper-0.1.12}/.gitignore +1 -0
- {vision_agents_plugins_wizper-0.1.9 → vision_agents_plugins_wizper-0.1.12}/PKG-INFO +1 -1
- {vision_agents_plugins_wizper-0.1.9 → vision_agents_plugins_wizper-0.1.12}/vision_agents/plugins/wizper/stt.py +37 -80
- {vision_agents_plugins_wizper-0.1.9 → vision_agents_plugins_wizper-0.1.12}/README.md +0 -0
- {vision_agents_plugins_wizper-0.1.9 → vision_agents_plugins_wizper-0.1.12}/pyproject.toml +0 -0
- {vision_agents_plugins_wizper-0.1.9 → vision_agents_plugins_wizper-0.1.12}/vision_agents/plugins/wizper/__init__.py +0 -0
|
@@ -4,40 +4,22 @@ Fal Wizper STT Plugin for Stream
|
|
|
4
4
|
Provides real-time audio transcription and translation using fal-ai/wizper (Whisper v3).
|
|
5
5
|
This plugin integrates with Stream's audio processing pipeline to provide high-quality
|
|
6
6
|
speech-to-text capabilities.
|
|
7
|
-
|
|
8
|
-
Example usage:
|
|
9
|
-
from vision_agents.plugins import fal
|
|
10
|
-
|
|
11
|
-
# For transcription
|
|
12
|
-
stt = fal.STT(task="transcribe")
|
|
13
|
-
|
|
14
|
-
# For translation to Portuguese
|
|
15
|
-
stt = fal.STT(task="translate", target_language="pt")
|
|
16
|
-
|
|
17
|
-
@stt.on("transcript")
|
|
18
|
-
async def on_transcript(text: str, user: Any, metadata: dict):
|
|
19
|
-
print(f"Transcript: {text}")
|
|
20
|
-
|
|
21
|
-
@stt.on("error")
|
|
22
|
-
async def on_error(error: str):
|
|
23
|
-
print(f"Error: {error}")
|
|
24
7
|
"""
|
|
25
8
|
|
|
26
|
-
import io
|
|
9
|
+
import logging
|
|
27
10
|
import os
|
|
28
11
|
import tempfile
|
|
29
|
-
import time
|
|
30
|
-
import logging
|
|
31
12
|
from pathlib import Path
|
|
32
|
-
from typing import Any, Dict, List, Optional, Tuple, Union, TYPE_CHECKING
|
|
33
|
-
|
|
34
|
-
if TYPE_CHECKING:
|
|
35
|
-
from vision_agents.core.edge.types import Participant
|
|
36
|
-
import wave
|
|
13
|
+
from typing import TYPE_CHECKING, Optional
|
|
37
14
|
|
|
38
15
|
import fal_client
|
|
39
16
|
from getstream.video.rtc.track_util import PcmData
|
|
17
|
+
|
|
40
18
|
from vision_agents.core import stt
|
|
19
|
+
from vision_agents.core.stt import TranscriptResponse
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from vision_agents.core.edge.types import Participant
|
|
41
23
|
|
|
42
24
|
logger = logging.getLogger(__name__)
|
|
43
25
|
|
|
@@ -58,74 +40,50 @@ class STT(stt.STT):
|
|
|
58
40
|
def __init__(
|
|
59
41
|
self,
|
|
60
42
|
task: str = "transcribe",
|
|
61
|
-
target_language: str
|
|
62
|
-
sample_rate: int = 48000,
|
|
43
|
+
target_language: Optional[str] = None,
|
|
63
44
|
client: Optional[fal_client.AsyncClient] = None,
|
|
64
45
|
):
|
|
65
46
|
"""
|
|
66
|
-
Initialize
|
|
47
|
+
Initialize Wizper STT.
|
|
67
48
|
|
|
68
49
|
Args:
|
|
69
50
|
task: "transcribe" or "translate"
|
|
70
51
|
target_language: Target language code (e.g., "pt" for Portuguese)
|
|
71
|
-
|
|
52
|
+
client: Optional fal_client.AsyncClient instance for testing
|
|
72
53
|
"""
|
|
73
|
-
super().__init__(
|
|
54
|
+
super().__init__(provider_name="wizper")
|
|
74
55
|
self.task = task
|
|
56
|
+
self.sample_rate = 48000
|
|
75
57
|
self.target_language = target_language
|
|
76
|
-
self.last_activity_time = time.time()
|
|
77
|
-
self._is_closed = False
|
|
78
58
|
self._fal_client = client if client is not None else fal_client.AsyncClient()
|
|
79
59
|
|
|
80
|
-
def
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
pcm_data: PCM audio data from Stream's audio pipeline
|
|
86
|
-
|
|
87
|
-
Returns:
|
|
88
|
-
WAV format audio data as bytes
|
|
89
|
-
"""
|
|
90
|
-
wav_buffer = io.BytesIO()
|
|
91
|
-
|
|
92
|
-
with wave.open(wav_buffer, "wb") as wav_file:
|
|
93
|
-
wav_file.setnchannels(1) # Mono
|
|
94
|
-
wav_file.setsampwidth(2) # 16-bit
|
|
95
|
-
wav_file.setframerate(self.sample_rate)
|
|
96
|
-
wav_file.writeframes(pcm_data.samples.tobytes())
|
|
97
|
-
|
|
98
|
-
wav_buffer.seek(0)
|
|
99
|
-
return wav_buffer.read()
|
|
100
|
-
|
|
101
|
-
async def _process_audio_impl(
|
|
102
|
-
self, pcm_data: PcmData, user_metadata: Optional[Union[Dict[str, Any], "Participant"]] = None
|
|
103
|
-
) -> Optional[List[Tuple[bool, str, Dict[str, Any]]]]:
|
|
60
|
+
async def process_audio(
|
|
61
|
+
self,
|
|
62
|
+
pcm_data: PcmData,
|
|
63
|
+
participant: Optional["Participant"] = None,
|
|
64
|
+
):
|
|
104
65
|
"""
|
|
105
|
-
Process
|
|
106
|
-
|
|
107
|
-
This method is typically called by VAD (Voice Activity Detection) systems
|
|
108
|
-
when speech segments are detected.
|
|
66
|
+
Process audio through fal-ai/wizper for transcription.
|
|
109
67
|
|
|
110
68
|
Args:
|
|
111
|
-
|
|
112
|
-
|
|
69
|
+
pcm_data: The PCM audio data to process
|
|
70
|
+
participant: Optional participant metadata
|
|
113
71
|
"""
|
|
114
|
-
if self._is_closed:
|
|
115
|
-
logger.
|
|
116
|
-
return
|
|
72
|
+
if self.closed:
|
|
73
|
+
logger.warning("Wizper STT is closed, ignoring audio")
|
|
74
|
+
return
|
|
117
75
|
|
|
118
76
|
if pcm_data.samples.size == 0:
|
|
119
77
|
logger.debug("No audio data to process")
|
|
120
|
-
return None
|
|
78
|
+
return
|
|
121
79
|
|
|
122
80
|
try:
|
|
123
81
|
logger.debug(
|
|
124
82
|
"Sending speech audio to fal-ai/wizper",
|
|
125
83
|
extra={"audio_bytes": pcm_data.samples.nbytes},
|
|
126
84
|
)
|
|
127
|
-
# Convert PCM to WAV format for upload
|
|
128
|
-
wav_data =
|
|
85
|
+
# Convert PCM to WAV format for upload using shared PcmData method
|
|
86
|
+
wav_data = pcm_data.to_wav_bytes()
|
|
129
87
|
|
|
130
88
|
# Create temporary file for upload
|
|
131
89
|
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
|
|
@@ -153,9 +111,10 @@ class STT(stt.STT):
|
|
|
153
111
|
)
|
|
154
112
|
if "text" in result:
|
|
155
113
|
text = result["text"].strip()
|
|
156
|
-
if text:
|
|
114
|
+
if text and participant is not None:
|
|
115
|
+
response_metadata = TranscriptResponse()
|
|
157
116
|
self._emit_transcript_event(
|
|
158
|
-
text,
|
|
117
|
+
text, participant, response_metadata
|
|
159
118
|
)
|
|
160
119
|
finally:
|
|
161
120
|
# Clean up temporary file
|
|
@@ -164,17 +123,15 @@ class STT(stt.STT):
|
|
|
164
123
|
except OSError:
|
|
165
124
|
pass
|
|
166
125
|
|
|
167
|
-
# Return None for asynchronous mode - events are emitted when they arrive
|
|
168
|
-
return None
|
|
169
|
-
|
|
170
126
|
except Exception as e:
|
|
171
|
-
logger.error(f"
|
|
172
|
-
self._emit_error_event(e, "
|
|
173
|
-
return None
|
|
127
|
+
logger.error(f"Wizper processing error: {str(e)}")
|
|
128
|
+
self._emit_error_event(e, "Wizper processing")
|
|
174
129
|
|
|
175
130
|
async def close(self):
|
|
176
|
-
"""Close the STT service and release any resources."""
|
|
177
|
-
if self._is_closed:
|
|
131
|
+
"""Close the Wizper STT service and release any resources."""
|
|
132
|
+
if self.closed:
|
|
133
|
+
logger.debug("Wizper STT service already closed")
|
|
178
134
|
return
|
|
179
|
-
|
|
180
|
-
logger.info("
|
|
135
|
+
|
|
136
|
+
logger.info("Closing Wizper STT service")
|
|
137
|
+
await super().close()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|