vision-agents-plugins-wizper 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,90 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .cursor/*
7
+ # Distribution / packaging
8
+ .Python
9
+ build/
10
+ dist/
11
+ downloads/
12
+ develop-eggs/
13
+ eggs/
14
+ .eggs/
15
+ lib64/
16
+ parts/
17
+ sdist/
18
+ var/
19
+ wheels/
20
+ share/python-wheels/
21
+ pip-wheel-metadata/
22
+ MANIFEST
23
+ *.egg-info/
24
+ *.egg
25
+
26
+ # Installer logs
27
+ pip-log.txt
28
+ pip-delete-this-directory.txt
29
+
30
+ # Unit test / coverage reports
31
+ htmlcov/
32
+ .tox/
33
+ .nox/
34
+ .coverage
35
+ .coverage.*
36
+ .cache
37
+ coverage.xml
38
+ nosetests.xml
39
+ *.cover
40
+ *.py,cover
41
+ .hypothesis/
42
+ .pytest_cache/
43
+
44
+ # Type checker / lint caches
45
+ .mypy_cache/
46
+ .dmypy.json
47
+ dmypy.json
48
+ .pytype/
49
+ .pyre/
50
+ .ruff_cache/
51
+
52
+ # Environments
53
+ .venv
54
+ env/
55
+ venv/
56
+ ENV/
57
+ env.bak/
58
+ venv.bak/
59
+ .env
60
+ .env.local
61
+ .env.*.local
62
+ .env.bak
63
+ pyvenv.cfg
64
+ .python-version
65
+
66
+ # Editors / IDEs
67
+ .vscode/
68
+ .idea/
69
+
70
+ # Jupyter Notebook
71
+ .ipynb_checkpoints/
72
+
73
+ # OS / Misc
74
+ .DS_Store
75
+ *.log
76
+
77
+ # Tooling & repo-specific
78
+ pyrightconfig.json
79
+ shell.nix
80
+ bin/*
81
+ lib/*
82
+ stream-py/
83
+
84
+ # Artifacts / assets
85
+ *.pt
86
+ *.kef
87
+ *.onnx
88
+ profile.html
89
+
90
+ /opencode.json
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.4
2
+ Name: vision-agents-plugins-wizper
3
+ Version: 0.2.3
4
+ Summary: Wizper plugin for Vision Agents
5
+ Project-URL: Documentation, https://visionagents.ai/
6
+ Project-URL: Website, https://visionagents.ai/
7
+ Project-URL: Source, https://github.com/GetStream/Vision-Agents
8
+ License-Expression: MIT
9
+ Keywords: AI,agents,fal,turn detection,voice agents
10
+ Requires-Python: >=3.10
11
+ Requires-Dist: fal-client>=0.7.0
12
+ Requires-Dist: vision-agents
13
+ Description-Content-Type: text/markdown
14
+
15
+ Wizper for vision agents
@@ -0,0 +1 @@
1
+ Wizper for vision agents
@@ -0,0 +1,41 @@
1
+ [build-system]
2
+ requires = ["hatchling", "hatch-vcs"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "vision-agents-plugins-wizper"
7
+ dynamic = ["version"]
8
+ description = "Wizper plugin for Vision Agents"
9
+ readme = "README.md"
10
+ keywords = ["fal", "turn detection", "AI", "voice agents", "agents"]
11
+ requires-python = ">=3.10"
12
+ license = "MIT"
13
+ dependencies = [
14
+ "vision-agents",
15
+ "fal-client>=0.7.0",
16
+ ]
17
+
18
+ [project.urls]
19
+ Documentation = "https://visionagents.ai/"
20
+ Website = "https://visionagents.ai/"
21
+ Source = "https://github.com/GetStream/Vision-Agents"
22
+
23
+ [tool.hatch.version]
24
+ source = "vcs"
25
+ raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }
26
+
27
+ [tool.hatch.build.targets.wheel]
28
+ packages = ["."]
29
+
30
+ [tool.hatch.build.targets.sdist]
31
+ include = ["/vision_agents"]
32
+
33
+ [tool.uv.sources]
34
+ vision-agents = { workspace = true }
35
+
36
+ [dependency-groups]
37
+ dev = [
38
+ "pytest>=8.4.1",
39
+ "pytest-asyncio>=1.0.0",
40
+ "numpy>=2.2.6,<2.3",
41
+ ]
@@ -0,0 +1,5 @@
1
+ from .stt import STT
2
+
3
+ # Public API of the wizper plugin package: only the STT class is exported.
4
+
5
+ __all__ = ["STT"]
@@ -0,0 +1,148 @@
1
+ """
2
+ Fal Wizper STT Plugin for Stream
3
+
4
+ Provides real-time audio transcription and translation using fal-ai/wizper (Whisper v3).
5
+ This plugin integrates with Stream's audio processing pipeline to provide high-quality
6
+ speech-to-text capabilities.
7
+ """
8
+
9
+ import aiofiles
10
+ import asyncio
11
+ import logging
12
+ import os
13
+ import tempfile
14
+ from pathlib import Path
15
+ from typing import TYPE_CHECKING, Optional
16
+
17
+ import fal_client
18
+ from getstream.video.rtc.track_util import PcmData
19
+
20
+ from vision_agents.core import stt
21
+ from vision_agents.core.stt import TranscriptResponse
22
+
23
+ if TYPE_CHECKING:
24
+ from vision_agents.core.edge.types import Participant
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
class STT(stt.STT):
    """
    Audio transcription using fal-ai/wizper (Whisper v3).

    Each call to ``process_audio`` converts the incoming PCM buffer to WAV,
    writes it to a private temporary file, uploads that file through the fal
    client, runs the ``fal-ai/wizper`` endpoint on the uploaded URL and emits a
    transcript event when the service returns non-empty text.

    Attributes:
        task: The task type requested by the caller - either "transcribe" or
            "translate". NOTE: it is stored but not sent to the service yet;
            requests are always issued with "transcribe" (see the TODO in
            ``process_audio``).
        target_language: Optional language code (e.g., "pt" for Portuguese).
            When set it is forwarded to the service as ``language``.
        sample_rate: Set to 48000 at construction time.
    """

    def __init__(
        self,
        task: str = "transcribe",
        target_language: Optional[str] = None,
        client: Optional[fal_client.AsyncClient] = None,
    ) -> None:
        """
        Initialize Wizper STT.

        Args:
            task: "transcribe" or "translate"
            target_language: Target language code (e.g., "pt" for Portuguese)
            client: Optional fal_client.AsyncClient instance for testing; a
                default ``fal_client.AsyncClient()`` is created when omitted
        """
        super().__init__(provider_name="wizper")
        self.task = task
        self.sample_rate = 48000
        self.target_language = target_language
        self._fal_client = client if client is not None else fal_client.AsyncClient()

    async def process_audio(
        self,
        pcm_data: PcmData,
        participant: Optional["Participant"] = None,
    ) -> None:
        """
        Process audio through fal-ai/wizper for transcription.

        Nothing is returned: a successful, non-empty transcription is reported
        through ``_emit_transcript_event`` and any failure is logged and
        reported through ``_emit_error_event`` instead of being raised.

        Args:
            pcm_data: The PCM audio data to process
            participant: Optional participant metadata; when omitted a
                placeholder participant with user_id "test-user" is used
        """
        if self.closed:
            logger.warning("Wizper STT is closed, ignoring audio")
            return

        if pcm_data.samples.size == 0:
            logger.debug("No audio data to process")
            return

        try:
            logger.debug(
                "Sending speech audio to fal-ai/wizper",
                extra={"audio_bytes": pcm_data.samples.nbytes},
            )
            # Convert PCM to WAV format for upload using shared PcmData method
            wav_data = pcm_data.to_wav_bytes()

            # mkstemp creates a uniquely named file atomically with owner-only
            # permissions. A hand-built name based on pid/id() is predictable
            # and can collide once an id() value is reused after garbage
            # collection.
            fd, temp_file_path = await asyncio.to_thread(
                tempfile.mkstemp, suffix=".wav", prefix="wizper_"
            )

            # Everything that touches the file lives inside this try so the
            # file is removed even when closing the descriptor or writing fails.
            try:
                # The file is re-opened by path below; only the name is needed.
                os.close(fd)

                async with aiofiles.open(temp_file_path, "wb") as f:
                    await f.write(wav_data)

                input_params = {
                    "task": "transcribe",  # TODO: make this dynamic, currently there's a bug in the fal-ai/wizper service where it only works with "transcribe"
                    "chunk_level": "segment",
                    "version": "3",
                }
                # Forward the configured language when one was provided
                if self.target_language is not None:
                    input_params["language"] = self.target_language

                # Upload file and get URL
                audio_url = await self._fal_client.upload_file(Path(temp_file_path))
                input_params["audio_url"] = audio_url

                # Use regular subscribe since streaming isn't supported
                result = await self._fal_client.subscribe(
                    "fal-ai/wizper", arguments=input_params
                )
                if "text" in result:
                    text = result["text"].strip()
                    if text:
                        # Create a default participant if none provided
                        if participant is None:
                            # Imported here because the module-level import is
                            # only performed under TYPE_CHECKING.
                            from vision_agents.core.edge.types import Participant

                            participant = Participant(
                                original=None, user_id="test-user"
                            )

                        response_metadata = TranscriptResponse()
                        self._emit_transcript_event(
                            text, participant, response_metadata
                        )
            finally:
                # Best-effort cleanup of the temporary file, run in a worker
                # thread so the event loop is not blocked.
                try:
                    await asyncio.to_thread(os.unlink, temp_file_path)
                except OSError:
                    pass

        except Exception as e:
            logger.error("Wizper processing error: %s", e)
            self._emit_error_event(e, "Wizper processing")

    async def close(self) -> None:
        """Close the Wizper STT service and release any resources.

        Calling it again after the service is closed only logs a debug
        message; the parent ``close`` is not invoked a second time.
        """
        if self.closed:
            logger.debug("Wizper STT service already closed")
            return

        logger.info("Closing Wizper STT service")
        await super().close()