vision-agents-plugins-wizper 0.0.17__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of vision-agents-plugins-wizper might be problematic. Click here for more details.
- vision_agents_plugins_wizper-0.0.17/.gitignore +32 -0
- vision_agents_plugins_wizper-0.0.17/PKG-INFO +15 -0
- vision_agents_plugins_wizper-0.0.17/README.md +1 -0
- vision_agents_plugins_wizper-0.0.17/pyproject.toml +41 -0
- vision_agents_plugins_wizper-0.0.17/vision_agents/plugins/wizper/__init__.py +5 -0
- vision_agents_plugins_wizper-0.0.17/vision_agents/plugins/wizper/stt.py +180 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
*/__pycache__
|
|
2
|
+
*/chat/__pycache__
|
|
3
|
+
*/video/__pycache__
|
|
4
|
+
*/chat/sync/__pycache__
|
|
5
|
+
*/chat/async_/__pycache__
|
|
6
|
+
*/sync/__pycache__
|
|
7
|
+
*/async_/__pycache__
|
|
8
|
+
*/video/sync/__pycache__
|
|
9
|
+
*/model/__pycache__/
|
|
10
|
+
*/cli/__pycache__
|
|
11
|
+
*/cli/__pycache__
|
|
12
|
+
.env
|
|
13
|
+
.venv
|
|
14
|
+
.vscode/settings.json
|
|
15
|
+
*.pyc
|
|
16
|
+
dist/*
|
|
17
|
+
dist/*
|
|
18
|
+
*.log
|
|
19
|
+
.python-version
|
|
20
|
+
pyvenv.cfg
|
|
21
|
+
.idea*
|
|
22
|
+
bin/*
|
|
23
|
+
lib/*
|
|
24
|
+
shell.nix
|
|
25
|
+
pyrightconfig.json
|
|
26
|
+
.DS_Store
|
|
27
|
+
|
|
28
|
+
*.egg-info/
|
|
29
|
+
*.egg
|
|
30
|
+
*.pt
|
|
31
|
+
*.kef
|
|
32
|
+
.env.bak
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vision-agents-plugins-wizper
|
|
3
|
+
Version: 0.0.17
|
|
4
|
+
Summary: Wizper plugin for Vision Agents
|
|
5
|
+
Project-URL: Documentation, https://visionagents.ai/
|
|
6
|
+
Project-URL: Website, https://visionagents.ai/
|
|
7
|
+
Project-URL: Source, https://github.com/GetStream/Vision-Agents
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
Keywords: AI,agents,fal,turn detection,voice agents
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Requires-Dist: fal-client>=0.7.0
|
|
12
|
+
Requires-Dist: vision-agents
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
Wizper for vision agents
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
Wizper for vision agents
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling", "hatch-vcs"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "vision-agents-plugins-wizper"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "Wizper plugin for Vision Agents"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
keywords = ["fal", "turn detection", "AI", "voice agents", "agents"]
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
license = "MIT"
|
|
13
|
+
dependencies = [
|
|
14
|
+
"vision-agents",
|
|
15
|
+
"fal-client>=0.7.0",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[project.urls]
|
|
19
|
+
Documentation = "https://visionagents.ai/"
|
|
20
|
+
Website = "https://visionagents.ai/"
|
|
21
|
+
Source = "https://github.com/GetStream/Vision-Agents"
|
|
22
|
+
|
|
23
|
+
[tool.hatch.version]
|
|
24
|
+
source = "vcs"
|
|
25
|
+
raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }
|
|
26
|
+
|
|
27
|
+
[tool.hatch.build.targets.wheel]
|
|
28
|
+
packages = ["."]
|
|
29
|
+
|
|
30
|
+
[tool.hatch.build.targets.sdist]
|
|
31
|
+
include = ["/vision_agents"]
|
|
32
|
+
|
|
33
|
+
[tool.uv.sources]
|
|
34
|
+
vision-agents = { workspace = true }
|
|
35
|
+
|
|
36
|
+
[dependency-groups]
|
|
37
|
+
dev = [
|
|
38
|
+
"pytest>=8.4.1",
|
|
39
|
+
"pytest-asyncio>=1.0.0",
|
|
40
|
+
"numpy>=2.2.6,<2.3",
|
|
41
|
+
]
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Fal Wizper STT Plugin for Stream
|
|
3
|
+
|
|
4
|
+
Provides real-time audio transcription and translation using fal-ai/wizper (Whisper v3).
|
|
5
|
+
This plugin integrates with Stream's audio processing pipeline to provide high-quality
|
|
6
|
+
speech-to-text capabilities.
|
|
7
|
+
|
|
8
|
+
Example usage:
|
|
9
|
+
from vision_agents.plugins import fal
|
|
10
|
+
|
|
11
|
+
# For transcription
|
|
12
|
+
stt = fal.STT(task="transcribe")
|
|
13
|
+
|
|
14
|
+
# For translation to Portuguese
|
|
15
|
+
stt = fal.STT(task="translate", target_language="pt")
|
|
16
|
+
|
|
17
|
+
@stt.on("transcript")
|
|
18
|
+
async def on_transcript(text: str, user: Any, metadata: dict):
|
|
19
|
+
print(f"Transcript: {text}")
|
|
20
|
+
|
|
21
|
+
@stt.on("error")
|
|
22
|
+
async def on_error(error: str):
|
|
23
|
+
print(f"Error: {error}")
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
import io
|
|
27
|
+
import os
|
|
28
|
+
import tempfile
|
|
29
|
+
import time
|
|
30
|
+
import logging
|
|
31
|
+
from pathlib import Path
|
|
32
|
+
from typing import Any, Dict, Optional, List, Tuple, Union, TYPE_CHECKING
|
|
33
|
+
|
|
34
|
+
if TYPE_CHECKING:
|
|
35
|
+
from vision_agents.core.edge.types import Participant
|
|
36
|
+
import wave
|
|
37
|
+
|
|
38
|
+
import fal_client
|
|
39
|
+
from getstream.video.rtc.track_util import PcmData
|
|
40
|
+
from vision_agents.core import stt
|
|
41
|
+
|
|
42
|
+
logger = logging.getLogger(__name__)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class STT(stt.STT):
    """
    Audio transcription and translation using fal-ai/wizper (Whisper v3).

    This plugin provides real-time speech-to-text capabilities using the fal-ai/wizper
    service, which is based on OpenAI's Whisper v3 model. It supports both transcription
    and translation tasks.

    Attributes:
        task: The task type - either "transcribe" or "translate"
        target_language: Target language code for translation (e.g., "pt" for Portuguese)
    """

    def __init__(
        self,
        task: str = "transcribe",
        target_language: str | None = None,
        sample_rate: int = 48000,
        client: Optional[fal_client.AsyncClient] = None,
    ):
        """
        Initialize the Wizper STT plugin.

        Args:
            task: "transcribe" or "translate".
            target_language: Target language code (e.g., "pt" for Portuguese).
            sample_rate: Sample rate of the incoming audio in Hz.
            client: Optional pre-configured fal ``AsyncClient`` (useful for
                testing); a fresh client is created when omitted.
        """
        super().__init__(sample_rate=sample_rate)
        self.task = task
        self.target_language = target_language
        self.last_activity_time = time.time()
        self._is_closed = False
        self._fal_client = client if client is not None else fal_client.AsyncClient()

    def _pcm_to_wav_bytes(self, pcm_data: PcmData) -> bytes:
        """
        Convert raw PCM samples to an in-memory WAV file.

        Args:
            pcm_data: PCM audio data from Stream's audio pipeline; samples are
                written as 16-bit mono at ``self.sample_rate`` — assumes the
                pipeline delivers int16 mono audio (TODO confirm upstream).

        Returns:
            WAV format audio data as bytes.
        """
        wav_buffer = io.BytesIO()

        with wave.open(wav_buffer, "wb") as wav_file:
            wav_file.setnchannels(1)  # Mono
            wav_file.setsampwidth(2)  # 16-bit
            wav_file.setframerate(self.sample_rate)
            wav_file.writeframes(pcm_data.samples.tobytes())

        # getvalue() returns the full buffer without needing a seek/read pair.
        return wav_buffer.getvalue()

    async def _process_audio_impl(
        self,
        pcm_data: PcmData,
        user_metadata: Optional[Union[Dict[str, Any], "Participant"]] = None,
    ) -> Optional[List[Tuple[bool, str, Dict[str, Any]]]]:
        """
        Process accumulated speech audio through fal-ai/wizper.

        This method is typically called by VAD (Voice Activity Detection)
        systems when speech segments are detected. Transcripts are delivered
        asynchronously via emitted "transcript" events, not the return value.

        Args:
            pcm_data: Accumulated speech audio from Stream's audio pipeline.
            user_metadata: Metadata about the speaking participant, forwarded
                to the emitted transcript event.

        Returns:
            Always ``None``; results are emitted as events when they arrive.
        """
        if self._is_closed:
            logger.debug("connection is closed, ignoring audio")
            return None

        if pcm_data.samples.size == 0:
            logger.debug("No audio data to process")
            return None

        try:
            logger.debug(
                "Sending speech audio to fal-ai/wizper",
                extra={"audio_bytes": pcm_data.samples.nbytes},
            )
            # Convert PCM to WAV format for upload
            wav_data = self._pcm_to_wav_bytes(pcm_data)

            # The fal client uploads from a path, so stage the WAV in a
            # temporary file (delete=False: unlinked manually below so the
            # upload can reopen it on platforms that lock open files).
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                temp_file.write(wav_data)
                temp_file.flush()
                temp_file_path = temp_file.name

            try:
                input_params: Dict[str, Any] = {
                    "task": "transcribe",  # TODO: make this dynamic, currently there's a bug in the fal-ai/wizper service where it only works with "transcribe"
                    "chunk_level": "segment",
                    "version": "3",
                }
                # Add language for translation
                if self.target_language is not None:
                    input_params["language"] = self.target_language

                # Upload file and get URL
                audio_url = await self._fal_client.upload_file(Path(temp_file_path))
                input_params["audio_url"] = audio_url

                # Use regular subscribe since streaming isn't supported
                result = await self._fal_client.subscribe(
                    "fal-ai/wizper", arguments=input_params
                )
                if "text" in result:
                    text = result["text"].strip()
                    if text:
                        self._emit_transcript_event(
                            text, user_metadata, {"chunks": result.get("chunks", [])}
                        )
            finally:
                # Clean up temporary file; best-effort, the file may already
                # be gone or locked.
                try:
                    os.unlink(temp_file_path)
                except OSError:
                    pass

            # Return None for asynchronous mode - events are emitted when they arrive
            return None

        except Exception as e:
            # logger.exception captures the full traceback, unlike the
            # previous f-string logger.error call which discarded it.
            logger.exception("FalWizper processing error")
            self._emit_error_event(e, "FalWizper processing")
            return None

    async def close(self):
        """Close the STT service and release any resources. Idempotent."""
        if self._is_closed:
            return
        self._is_closed = True
        logger.info("FalWizperSTT closed")
|