vision-agents-plugins-wizper 0.0.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of vision-agents-plugins-wizper might be problematic; review the release details below.

.gitignore ADDED
@@ -0,0 +1,32 @@
1
+ */__pycache__
2
+ */chat/__pycache__
3
+ */video/__pycache__
4
+ */chat/sync/__pycache__
5
+ */chat/async_/__pycache__
6
+ */sync/__pycache__
7
+ */async_/__pycache__
8
+ */video/sync/__pycache__
9
+ */model/__pycache__/
10
+ */cli/__pycache__
11
+ */cli/__pycache__
12
+ .env
13
+ .venv
14
+ .vscode/settings.json
15
+ *.pyc
16
+ dist/*
17
+ dist/*
18
+ *.log
19
+ .python-version
20
+ pyvenv.cfg
21
+ .idea*
22
+ bin/*
23
+ lib/*
24
+ shell.nix
25
+ pyrightconfig.json
26
+ .DS_Store
27
+
28
+ *.egg-info/
29
+ *.egg
30
+ *.pt
31
+ *.kef
32
+ .env.bak
PKG-INFO ADDED
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.4
2
+ Name: vision-agents-plugins-wizper
3
+ Version: 0.0.17
4
+ Summary: Wizper plugin for Vision Agents
5
+ Project-URL: Documentation, https://visionagents.ai/
6
+ Project-URL: Website, https://visionagents.ai/
7
+ Project-URL: Source, https://github.com/GetStream/Vision-Agents
8
+ License-Expression: MIT
9
+ Keywords: AI,agents,fal,turn detection,voice agents
10
+ Requires-Python: >=3.10
11
+ Requires-Dist: fal-client>=0.7.0
12
+ Requires-Dist: vision-agents
13
+ Description-Content-Type: text/markdown
14
+
15
+ Wizper for vision agents
README.md ADDED
@@ -0,0 +1 @@
1
+ Wizper for vision agents
pyproject.toml ADDED
@@ -0,0 +1,41 @@
1
+ [build-system]
2
+ requires = ["hatchling", "hatch-vcs"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "vision-agents-plugins-wizper"
7
+ dynamic = ["version"]
8
+ description = "Wizper plugin for Vision Agents"
9
+ readme = "README.md"
10
+ keywords = ["fal", "turn detection", "AI", "voice agents", "agents"]
11
+ requires-python = ">=3.10"
12
+ license = "MIT"
13
+ dependencies = [
14
+ "vision-agents",
15
+ "fal-client>=0.7.0",
16
+ ]
17
+
18
+ [project.urls]
19
+ Documentation = "https://visionagents.ai/"
20
+ Website = "https://visionagents.ai/"
21
+ Source = "https://github.com/GetStream/Vision-Agents"
22
+
23
+ [tool.hatch.version]
24
+ source = "vcs"
25
+ raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }
26
+
27
+ [tool.hatch.build.targets.wheel]
28
+ packages = ["."]
29
+
30
+ [tool.hatch.build.targets.sdist]
31
+ include = ["/vision_agents"]
32
+
33
+ [tool.uv.sources]
34
+ vision-agents = { workspace = true }
35
+
36
+ [dependency-groups]
37
+ dev = [
38
+ "pytest>=8.4.1",
39
+ "pytest-asyncio>=1.0.0",
40
+ "numpy>=2.2.6,<2.3",
41
+ ]
@@ -0,0 +1,5 @@
1
+ from .stt import STT
2
+
3
+ # Re-export under the new namespace for convenience
4
+
5
+ __all__ = ["STT"]
@@ -0,0 +1,180 @@
1
+ """
2
+ Fal Wizper STT Plugin for Stream
3
+
4
+ Provides real-time audio transcription and translation using fal-ai/wizper (Whisper v3).
5
+ This plugin integrates with Stream's audio processing pipeline to provide high-quality
6
+ speech-to-text capabilities.
7
+
8
+ Example usage:
9
+ from vision_agents.plugins import fal
10
+
11
+ # For transcription
12
+ stt = fal.STT(task="transcribe")
13
+
14
+ # For translation to Portuguese
15
+ stt = fal.STT(task="translate", target_language="pt")
16
+
17
+ @stt.on("transcript")
18
+ async def on_transcript(text: str, user: Any, metadata: dict):
19
+ print(f"Transcript: {text}")
20
+
21
+ @stt.on("error")
22
+ async def on_error(error: str):
23
+ print(f"Error: {error}")
24
+ """
25
+
26
+ import io
27
+ import os
28
+ import tempfile
29
+ import time
30
+ import logging
31
+ from pathlib import Path
32
+ from typing import Any, Dict, Optional, List, Tuple, Union, TYPE_CHECKING
33
+
34
+ if TYPE_CHECKING:
35
+ from vision_agents.core.edge.types import Participant
36
+ import wave
37
+
38
+ import fal_client
39
+ from getstream.video.rtc.track_util import PcmData
40
+ from vision_agents.core import stt
41
+
42
+ logger = logging.getLogger(__name__)
43
+
44
+
45
class STT(stt.STT):
    """
    Audio transcription and translation using fal-ai/wizper (Whisper v3).

    This plugin provides real-time speech-to-text capabilities using the fal-ai/wizper
    service, which is based on OpenAI's Whisper v3 model. It supports both transcription
    and translation tasks.

    Attributes:
        task: The task type - either "transcribe" or "translate". NOTE: the
            outgoing request currently always sends "transcribe" (see the TODO in
            ``_process_audio_impl``); the attribute is stored for forward
            compatibility.
        target_language: Target language code for translation (e.g., "pt" for Portuguese)
    """

    def __init__(
        self,
        task: str = "transcribe",
        target_language: str | None = None,
        sample_rate: int = 48000,
        client: Optional[fal_client.AsyncClient] = None,
    ):
        """
        Initialize FalWizperSTT.

        Args:
            task: "transcribe" or "translate". Currently not forwarded to the
                service (a service-side bug forces "transcribe"); kept so the
                interface is stable once the bug is fixed.
            target_language: Target language code (e.g., "pt" for Portuguese)
            sample_rate: Sample rate of the audio in Hz.
            client: Optional pre-configured fal AsyncClient (useful for testing);
                a default client is created when omitted.
        """
        super().__init__(sample_rate=sample_rate)
        self.task = task
        self.target_language = target_language
        # Set once at construction; not refreshed afterwards in this implementation.
        self.last_activity_time = time.time()
        self._is_closed = False
        self._fal_client = client if client is not None else fal_client.AsyncClient()

    def _pcm_to_wav_bytes(self, pcm_data: PcmData) -> bytes:
        """
        Convert PCM data to WAV format bytes (16-bit mono at ``self.sample_rate``).

        Args:
            pcm_data: PCM audio data from Stream's audio pipeline

        Returns:
            WAV format audio data as bytes (header plus frames)
        """
        wav_buffer = io.BytesIO()

        with wave.open(wav_buffer, "wb") as wav_file:
            wav_file.setnchannels(1)  # Mono
            wav_file.setsampwidth(2)  # 16-bit
            wav_file.setframerate(self.sample_rate)
            wav_file.writeframes(pcm_data.samples.tobytes())

        # getvalue() returns the whole buffer directly; no seek/read pair needed.
        return wav_buffer.getvalue()

    async def _process_audio_impl(
        self, pcm_data: PcmData, user_metadata: Optional[Union[Dict[str, Any], "Participant"]] = None
    ) -> Optional[List[Tuple[bool, str, Dict[str, Any]]]]:
        """
        Process accumulated speech audio through fal-ai/wizper.

        This method is typically called by VAD (Voice Activity Detection) systems
        when speech segments are detected. Results are delivered via transcript
        events rather than the return value.

        Args:
            pcm_data: Accumulated speech audio from the pipeline.
            user_metadata: User/participant metadata from the Stream call,
                forwarded to the emitted transcript event.

        Returns:
            Always None; transcript and error events are emitted asynchronously.
        """
        if self._is_closed:
            logger.debug("connection is closed, ignoring audio")
            return None

        if pcm_data.samples.size == 0:
            logger.debug("No audio data to process")
            return None

        try:
            logger.debug(
                "Sending speech audio to fal-ai/wizper",
                extra={"audio_bytes": pcm_data.samples.nbytes},
            )
            # Convert PCM to WAV format for upload
            wav_data = self._pcm_to_wav_bytes(pcm_data)

            # The fal uploader wants a file on disk; stage the WAV in a temp file.
            # delete=False so the closed file can be re-opened by the uploader,
            # then we remove it ourselves in the finally block below.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                temp_file.write(wav_data)
                temp_file.flush()
                temp_file_path = temp_file.name

            try:
                input_params: Dict[str, Any] = {
                    "task": "transcribe",  # TODO: make this dynamic, currently there's a bug in the fal-ai/wizper service where it only works with "transcribe"
                    "chunk_level": "segment",
                    "version": "3",
                }
                # Add language for translation
                if self.target_language is not None:
                    input_params["language"] = self.target_language

                # Upload file and get URL
                audio_url = await self._fal_client.upload_file(Path(temp_file_path))
                input_params["audio_url"] = audio_url

                # Use regular subscribe since streaming isn't supported
                result = await self._fal_client.subscribe(
                    "fal-ai/wizper", arguments=input_params
                )
                if "text" in result:
                    text = result["text"].strip()
                    if text:
                        self._emit_transcript_event(
                            text, user_metadata, {"chunks": result.get("chunks", [])}
                        )
            finally:
                # Clean up temporary file (best effort)
                try:
                    os.unlink(temp_file_path)
                except OSError:
                    pass

            # Return None for asynchronous mode - events are emitted when they arrive
            return None

        except Exception as e:
            # Lazy %-style args so the message is only formatted when emitted.
            logger.error("FalWizper processing error: %s", e)
            self._emit_error_event(e, "FalWizper processing")
            return None

    async def close(self):
        """Close the STT service and release any resources. Idempotent."""
        if self._is_closed:
            return
        self._is_closed = True
        logger.info("FalWizperSTT closed")
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.4
2
+ Name: vision-agents-plugins-wizper
3
+ Version: 0.0.17
4
+ Summary: Wizper plugin for Vision Agents
5
+ Project-URL: Documentation, https://visionagents.ai/
6
+ Project-URL: Website, https://visionagents.ai/
7
+ Project-URL: Source, https://github.com/GetStream/Vision-Agents
8
+ License-Expression: MIT
9
+ Keywords: AI,agents,fal,turn detection,voice agents
10
+ Requires-Python: >=3.10
11
+ Requires-Dist: fal-client>=0.7.0
12
+ Requires-Dist: vision-agents
13
+ Description-Content-Type: text/markdown
14
+
15
+ Wizper for vision agents
@@ -0,0 +1,9 @@
1
+ ./.gitignore,sha256=LiiMFm4RUXduFZI42AL85GrllvISRUwTt4t3lrUDGUE,408
2
+ ./PKG-INFO,sha256=WruHzrEAIpbxPodXi66z2-1VN1D8dknBCufm10gnYnw,505
3
+ ./README.md,sha256=7MDH68Ywzj2WKm3QAFCUvupBHxrTdjtAL0WqrqaCHFc,24
4
+ ./pyproject.toml,sha256=eSC8A7YqeCWd9_VfDiwOc9z3KPGzSPbf9IIqk_aWES0,959
5
+ ./vision_agents/plugins/wizper/__init__.py,sha256=aRLgDFc3zq4tNj3G9kmM4zJzSpO7hYqGujz3zTTPsMk,93
6
+ ./vision_agents/plugins/wizper/stt.py,sha256=dRJV4klITwZ1SKbe-MPiC_s7idDJGBTDfmGQlh5E6ss,6112
7
+ vision_agents_plugins_wizper-0.0.17.dist-info/METADATA,sha256=WruHzrEAIpbxPodXi66z2-1VN1D8dknBCufm10gnYnw,505
8
+ vision_agents_plugins_wizper-0.0.17.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
9
+ vision_agents_plugins_wizper-0.0.17.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.27.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any