vision-agents-plugins-wizper 0.2.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of vision-agents-plugins-wizper might be problematic. Click here for more details.
- vision_agents_plugins_wizper-0.2.10/.gitignore +90 -0
- vision_agents_plugins_wizper-0.2.10/PKG-INFO +15 -0
- vision_agents_plugins_wizper-0.2.10/README.md +1 -0
- vision_agents_plugins_wizper-0.2.10/pyproject.toml +41 -0
- vision_agents_plugins_wizper-0.2.10/vision_agents/plugins/wizper/__init__.py +5 -0
- vision_agents_plugins_wizper-0.2.10/vision_agents/plugins/wizper/stt.py +154 -0
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.cursor/*
|
|
7
|
+
# Distribution / packaging
|
|
8
|
+
.Python
|
|
9
|
+
build/
|
|
10
|
+
dist/
|
|
11
|
+
downloads/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
eggs/
|
|
14
|
+
.eggs/
|
|
15
|
+
lib64/
|
|
16
|
+
parts/
|
|
17
|
+
sdist/
|
|
18
|
+
var/
|
|
19
|
+
wheels/
|
|
20
|
+
share/python-wheels/
|
|
21
|
+
pip-wheel-metadata/
|
|
22
|
+
MANIFEST
|
|
23
|
+
*.egg-info/
|
|
24
|
+
*.egg
|
|
25
|
+
|
|
26
|
+
# Installer logs
|
|
27
|
+
pip-log.txt
|
|
28
|
+
pip-delete-this-directory.txt
|
|
29
|
+
|
|
30
|
+
# Unit test / coverage reports
|
|
31
|
+
htmlcov/
|
|
32
|
+
.tox/
|
|
33
|
+
.nox/
|
|
34
|
+
.coverage
|
|
35
|
+
.coverage.*
|
|
36
|
+
.cache
|
|
37
|
+
coverage.xml
|
|
38
|
+
nosetests.xml
|
|
39
|
+
*.cover
|
|
40
|
+
*.py,cover
|
|
41
|
+
.hypothesis/
|
|
42
|
+
.pytest_cache/
|
|
43
|
+
|
|
44
|
+
# Type checker / lint caches
|
|
45
|
+
.mypy_cache/
|
|
46
|
+
.dmypy.json
|
|
47
|
+
dmypy.json
|
|
48
|
+
.pytype/
|
|
49
|
+
.pyre/
|
|
50
|
+
.ruff_cache/
|
|
51
|
+
|
|
52
|
+
# Environments
|
|
53
|
+
.venv
|
|
54
|
+
env/
|
|
55
|
+
venv/
|
|
56
|
+
ENV/
|
|
57
|
+
env.bak/
|
|
58
|
+
venv.bak/
|
|
59
|
+
.env
|
|
60
|
+
.env.local
|
|
61
|
+
.env.*.local
|
|
62
|
+
.env.bak
|
|
63
|
+
pyvenv.cfg
|
|
64
|
+
.python-version
|
|
65
|
+
|
|
66
|
+
# Editors / IDEs
|
|
67
|
+
.vscode/
|
|
68
|
+
.idea/
|
|
69
|
+
|
|
70
|
+
# Jupyter Notebook
|
|
71
|
+
.ipynb_checkpoints/
|
|
72
|
+
|
|
73
|
+
# OS / Misc
|
|
74
|
+
.DS_Store
|
|
75
|
+
*.log
|
|
76
|
+
|
|
77
|
+
# Tooling & repo-specific
|
|
78
|
+
pyrightconfig.json
|
|
79
|
+
shell.nix
|
|
80
|
+
bin/*
|
|
81
|
+
lib/*
|
|
82
|
+
stream-py/
|
|
83
|
+
|
|
84
|
+
# Artifacts / assets
|
|
85
|
+
*.pt
|
|
86
|
+
*.kef
|
|
87
|
+
*.onnx
|
|
88
|
+
profile.html
|
|
89
|
+
|
|
90
|
+
/opencode.json
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vision-agents-plugins-wizper
|
|
3
|
+
Version: 0.2.10
|
|
4
|
+
Summary: Wizper plugin for Vision Agents
|
|
5
|
+
Project-URL: Documentation, https://visionagents.ai/
|
|
6
|
+
Project-URL: Website, https://visionagents.ai/
|
|
7
|
+
Project-URL: Source, https://github.com/GetStream/Vision-Agents
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
Keywords: AI,agents,fal,turn detection,voice agents
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Requires-Dist: fal-client>=0.7.0
|
|
12
|
+
Requires-Dist: vision-agents
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
Wizper for vision agents
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
Wizper for vision agents
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling", "hatch-vcs"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "vision-agents-plugins-wizper"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "Wizper plugin for Vision Agents"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
keywords = ["fal", "turn detection", "AI", "voice agents", "agents"]
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
license = "MIT"
|
|
13
|
+
dependencies = [
|
|
14
|
+
"vision-agents",
|
|
15
|
+
"fal-client>=0.7.0",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[project.urls]
|
|
19
|
+
Documentation = "https://visionagents.ai/"
|
|
20
|
+
Website = "https://visionagents.ai/"
|
|
21
|
+
Source = "https://github.com/GetStream/Vision-Agents"
|
|
22
|
+
|
|
23
|
+
[tool.hatch.version]
|
|
24
|
+
source = "vcs"
|
|
25
|
+
raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }
|
|
26
|
+
|
|
27
|
+
[tool.hatch.build.targets.wheel]
|
|
28
|
+
packages = ["."]
|
|
29
|
+
|
|
30
|
+
[tool.hatch.build.targets.sdist]
|
|
31
|
+
include = ["/vision_agents"]
|
|
32
|
+
|
|
33
|
+
[tool.uv.sources]
|
|
34
|
+
vision-agents = { workspace = true }
|
|
35
|
+
|
|
36
|
+
[dependency-groups]
|
|
37
|
+
dev = [
|
|
38
|
+
"pytest>=8.4.1",
|
|
39
|
+
"pytest-asyncio>=1.0.0",
|
|
40
|
+
"numpy>=2.2.6,<2.3",
|
|
41
|
+
]
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Fal Wizper STT Plugin for Stream
|
|
3
|
+
|
|
4
|
+
Provides real-time audio transcription and translation using fal-ai/wizper (Whisper v3).
|
|
5
|
+
This plugin integrates with Stream's audio processing pipeline to provide high-quality
|
|
6
|
+
speech-to-text capabilities.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import aiofiles
|
|
10
|
+
import asyncio
|
|
11
|
+
import logging
|
|
12
|
+
import os
|
|
13
|
+
import tempfile
|
|
14
|
+
import time
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import TYPE_CHECKING, Optional
|
|
17
|
+
|
|
18
|
+
import fal_client
|
|
19
|
+
from getstream.video.rtc.track_util import PcmData
|
|
20
|
+
|
|
21
|
+
from vision_agents.core import stt
|
|
22
|
+
from vision_agents.core.stt import TranscriptResponse
|
|
23
|
+
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
from vision_agents.core.edge.types import Participant
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class STT(stt.STT):
    """
    Audio transcription and translation using fal-ai/wizper (Whisper v3).

    This plugin provides speech-to-text capabilities using the fal-ai/wizper
    service, which is based on OpenAI's Whisper v3 model. Each call to
    ``process_audio`` uploads one WAV file and waits for the full result;
    the service is not used in streaming mode.

    Attributes:
        task: The task type - either "transcribe" or "translate".
            NOTE: the value is stored but the request currently always sends
            "transcribe" (see the TODO in ``process_audio``).
        target_language: Language code forwarded as the ``language`` request
            parameter when set (e.g., "pt" for Portuguese).
        sample_rate: Set to 48000 at construction time.
    """

    def __init__(
        self,
        task: str = "transcribe",
        target_language: Optional[str] = None,
        client: Optional[fal_client.AsyncClient] = None,
    ) -> None:
        """
        Initialize Wizper STT.

        Args:
            task: "transcribe" or "translate"
            target_language: Target language code (e.g., "pt" for Portuguese)
            client: Optional fal_client.AsyncClient instance for testing;
                a new ``fal_client.AsyncClient()`` is created when omitted.
        """
        super().__init__(provider_name="wizper")
        self.task = task
        self.sample_rate = 48000
        self.target_language = target_language
        self._fal_client = client if client is not None else fal_client.AsyncClient()

    async def process_audio(
        self,
        pcm_data: PcmData,
        participant: Optional["Participant"] = None,
    ) -> None:
        """
        Process audio through fal-ai/wizper for transcription.

        The PCM data is converted to WAV, written to a private temporary
        file, uploaded through the fal client and transcribed. A transcript
        event is emitted when the service returns non-empty text. Any
        exception is logged and reported through ``_emit_error_event``
        rather than propagated to the caller.

        Args:
            pcm_data: The PCM audio data to process
            participant: Optional participant metadata
        """
        if self.closed:
            logger.warning("Wizper STT is closed, ignoring audio")
            return

        if pcm_data.samples.size == 0:
            logger.debug("No audio data to process")
            return

        # Stays None until the temporary file actually exists, so the
        # cleanup in ``finally`` only runs when there is something to delete.
        temp_file_path: Optional[str] = None
        try:
            start_time = time.perf_counter()
            logger.debug(
                "Sending speech audio to fal-ai/wizper",
                extra={"audio_bytes": pcm_data.samples.nbytes},
            )
            # Convert PCM to WAV format for upload using shared PcmData method
            wav_data = pcm_data.to_wav_bytes()

            # mkstemp creates the file atomically with a unique, unpredictable
            # name and owner-only permissions; a hand-built name in the shared
            # temp directory could collide or be pre-created by another user.
            # Run it in a worker thread to avoid blocking the event loop.
            fd, temp_file_path = await asyncio.to_thread(
                tempfile.mkstemp, prefix="wizper_", suffix=".wav"
            )
            # Only the path is needed; the content is written asynchronously.
            os.close(fd)

            async with aiofiles.open(temp_file_path, "wb") as f:
                await f.write(wav_data)

            input_params = {
                "task": "transcribe",  # TODO: make this dynamic, currently there's a bug in the fal-ai/wizper service where it only works with "transcribe"
                "chunk_level": "segment",
                "version": "3",
            }
            # Forward the configured language, if any
            if self.target_language is not None:
                input_params["language"] = self.target_language

            # Upload file and get URL
            audio_url = await self._fal_client.upload_file(Path(temp_file_path))
            input_params["audio_url"] = audio_url

            # Use regular subscribe since streaming isn't supported
            result = await self._fal_client.subscribe(
                "fal-ai/wizper", arguments=input_params
            )
            if "text" in result:
                text = result["text"].strip()
                if text:
                    # Create a default participant if none provided
                    if participant is None:
                        from vision_agents.core.edge.types import Participant

                        participant = Participant(
                            original=None, user_id="test-user"
                        )

                    processing_time_ms = (time.perf_counter() - start_time) * 1000
                    response_metadata = TranscriptResponse(
                        processing_time_ms=processing_time_ms,
                        model_name="wizper-v3",
                    )
                    self._emit_transcript_event(
                        text, participant, response_metadata
                    )
        except Exception as e:
            # Boundary handler: record the traceback and surface the failure
            # as an error event instead of raising into the audio pipeline.
            logger.exception("Wizper processing error: %s", e)
            self._emit_error_event(e, "Wizper processing")
        finally:
            # Clean up the temporary file on every path, including a failed
            # write or upload (in a thread to avoid blocking the event loop).
            if temp_file_path is not None:
                try:
                    await asyncio.to_thread(os.unlink, temp_file_path)
                except OSError as cleanup_error:
                    # Best effort: a leftover temp file must not fail the call.
                    logger.debug(
                        "Could not remove temporary file %s: %s",
                        temp_file_path,
                        cleanup_error,
                    )

    async def close(self) -> None:
        """Close the Wizper STT service and release any resources."""
        if self.closed:
            logger.debug("Wizper STT service already closed")
            return

        logger.info("Closing Wizper STT service")
        await super().close()
|