vision-agents-plugins-wizper 0.1.9__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of vision-agents-plugins-wizper might be problematic. Click here for more details.

.gitignore CHANGED
@@ -84,3 +84,4 @@ stream-py/
84
84
  # Artifacts / assets
85
85
  *.pt
86
86
  *.kef
87
+ *.onnx
PKG-INFO CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: vision-agents-plugins-wizper
3
- Version: 0.1.9
3
+ Version: 0.1.12
4
4
  Summary: Wizper plugin for Vision Agents
5
5
  Project-URL: Documentation, https://visionagents.ai/
6
6
  Project-URL: Website, https://visionagents.ai/
@@ -4,40 +4,22 @@ Fal Wizper STT Plugin for Stream
4
4
  Provides real-time audio transcription and translation using fal-ai/wizper (Whisper v3).
5
5
  This plugin integrates with Stream's audio processing pipeline to provide high-quality
6
6
  speech-to-text capabilities.
7
-
8
- Example usage:
9
- from vision_agents.plugins import fal
10
-
11
- # For transcription
12
- stt = fal.STT(task="transcribe")
13
-
14
- # For translation to Portuguese
15
- stt = fal.STT(task="translate", target_language="pt")
16
-
17
- @stt.on("transcript")
18
- async def on_transcript(text: str, user: Any, metadata: dict):
19
- print(f"Transcript: {text}")
20
-
21
- @stt.on("error")
22
- async def on_error(error: str):
23
- print(f"Error: {error}")
24
7
  """
25
8
 
26
- import io
9
+ import logging
27
10
  import os
28
11
  import tempfile
29
- import time
30
- import logging
31
12
  from pathlib import Path
32
- from typing import Any, Dict, Optional, List, Tuple, Union, TYPE_CHECKING
33
-
34
- if TYPE_CHECKING:
35
- from vision_agents.core.edge.types import Participant
36
- import wave
13
+ from typing import TYPE_CHECKING, Optional
37
14
 
38
15
  import fal_client
39
16
  from getstream.video.rtc.track_util import PcmData
17
+
40
18
  from vision_agents.core import stt
19
+ from vision_agents.core.stt import TranscriptResponse
20
+
21
+ if TYPE_CHECKING:
22
+ from vision_agents.core.edge.types import Participant
41
23
 
42
24
  logger = logging.getLogger(__name__)
43
25
 
@@ -58,74 +40,50 @@ class STT(stt.STT):
58
40
  def __init__(
59
41
  self,
60
42
  task: str = "transcribe",
61
- target_language: str | None = None,
62
- sample_rate: int = 48000,
43
+ target_language: Optional[str] = None,
63
44
  client: Optional[fal_client.AsyncClient] = None,
64
45
  ):
65
46
  """
66
- Initialize FalWizperSTT.
47
+ Initialize Wizper STT.
67
48
 
68
49
  Args:
69
50
  task: "transcribe" or "translate"
70
51
  target_language: Target language code (e.g., "pt" for Portuguese)
71
- sample_rate: Sample rate of the audio in Hz.
52
+ client: Optional fal_client.AsyncClient instance for testing
72
53
  """
73
- super().__init__(sample_rate=sample_rate)
54
+ super().__init__(provider_name="wizper")
74
55
  self.task = task
56
+ self.sample_rate = 48000
75
57
  self.target_language = target_language
76
- self.last_activity_time = time.time()
77
- self._is_closed = False
78
58
  self._fal_client = client if client is not None else fal_client.AsyncClient()
79
59
 
80
- def _pcm_to_wav_bytes(self, pcm_data: PcmData) -> bytes:
81
- """
82
- Convert PCM data to WAV format bytes.
83
-
84
- Args:
85
- pcm_data: PCM audio data from Stream's audio pipeline
86
-
87
- Returns:
88
- WAV format audio data as bytes
89
- """
90
- wav_buffer = io.BytesIO()
91
-
92
- with wave.open(wav_buffer, "wb") as wav_file:
93
- wav_file.setnchannels(1) # Mono
94
- wav_file.setsampwidth(2) # 16-bit
95
- wav_file.setframerate(self.sample_rate)
96
- wav_file.writeframes(pcm_data.samples.tobytes())
97
-
98
- wav_buffer.seek(0)
99
- return wav_buffer.read()
100
-
101
- async def _process_audio_impl(
102
- self, pcm_data: PcmData, user_metadata: Optional[Union[Dict[str, Any], "Participant"]] = None
103
- ) -> Optional[List[Tuple[bool, str, Dict[str, Any]]]]:
60
+ async def process_audio(
61
+ self,
62
+ pcm_data: PcmData,
63
+ participant: Optional["Participant"] = None,
64
+ ):
104
65
  """
105
- Process accumulated speech audio through fal-ai/wizper.
106
-
107
- This method is typically called by VAD (Voice Activity Detection) systems
108
- when speech segments are detected.
66
+ Process audio through fal-ai/wizper for transcription.
109
67
 
110
68
  Args:
111
- speech_audio: Accumulated speech audio as numpy array
112
- user: User metadata from the Stream call
69
+ pcm_data: The PCM audio data to process
70
+ participant: Optional participant metadata
113
71
  """
114
- if self._is_closed:
115
- logger.debug("connection is closed, ignoring audio")
116
- return None
72
+ if self.closed:
73
+ logger.warning("Wizper STT is closed, ignoring audio")
74
+ return
117
75
 
118
76
  if pcm_data.samples.size == 0:
119
77
  logger.debug("No audio data to process")
120
- return None
78
+ return
121
79
 
122
80
  try:
123
81
  logger.debug(
124
82
  "Sending speech audio to fal-ai/wizper",
125
83
  extra={"audio_bytes": pcm_data.samples.nbytes},
126
84
  )
127
- # Convert PCM to WAV format for upload
128
- wav_data = self._pcm_to_wav_bytes(pcm_data)
85
+ # Convert PCM to WAV format for upload using shared PcmData method
86
+ wav_data = pcm_data.to_wav_bytes()
129
87
 
130
88
  # Create temporary file for upload
131
89
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
@@ -153,9 +111,10 @@ class STT(stt.STT):
153
111
  )
154
112
  if "text" in result:
155
113
  text = result["text"].strip()
156
- if text:
114
+ if text and participant is not None:
115
+ response_metadata = TranscriptResponse()
157
116
  self._emit_transcript_event(
158
- text, user_metadata, {"chunks": result.get("chunks", [])}
117
+ text, participant, response_metadata
159
118
  )
160
119
  finally:
161
120
  # Clean up temporary file
@@ -164,17 +123,15 @@ class STT(stt.STT):
164
123
  except OSError:
165
124
  pass
166
125
 
167
- # Return None for asynchronous mode - events are emitted when they arrive
168
- return None
169
-
170
126
  except Exception as e:
171
- logger.error(f"FalWizper processing error: {str(e)}")
172
- self._emit_error_event(e, "FalWizper processing")
173
- return None
127
+ logger.error(f"Wizper processing error: {str(e)}")
128
+ self._emit_error_event(e, "Wizper processing")
174
129
 
175
130
  async def close(self):
176
- """Close the STT service and release any resources."""
177
- if self._is_closed:
131
+ """Close the Wizper STT service and release any resources."""
132
+ if self.closed:
133
+ logger.debug("Wizper STT service already closed")
178
134
  return
179
- self._is_closed = True
180
- logger.info("FalWizperSTT closed")
135
+
136
+ logger.info("Closing Wizper STT service")
137
+ await super().close()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: vision-agents-plugins-wizper
3
- Version: 0.1.9
3
+ Version: 0.1.12
4
4
  Summary: Wizper plugin for Vision Agents
5
5
  Project-URL: Documentation, https://visionagents.ai/
6
6
  Project-URL: Website, https://visionagents.ai/
@@ -0,0 +1,9 @@
1
+ ./.gitignore,sha256=ye7v72rmcYcz93U_u9IyYUvYJKEXGElBsTevPVyASo0,923
2
+ ./PKG-INFO,sha256=eBdcwmO3PZHGtalyBIPFycUERCl28ZBjYB97saPWLWw,505
3
+ ./README.md,sha256=7MDH68Ywzj2WKm3QAFCUvupBHxrTdjtAL0WqrqaCHFc,24
4
+ ./pyproject.toml,sha256=eSC8A7YqeCWd9_VfDiwOc9z3KPGzSPbf9IIqk_aWES0,959
5
+ ./vision_agents/plugins/wizper/__init__.py,sha256=aRLgDFc3zq4tNj3G9kmM4zJzSpO7hYqGujz3zTTPsMk,93
6
+ ./vision_agents/plugins/wizper/stt.py,sha256=d2XLQQufSL4qruSumff1gj2aA_dssbA0DUN4RZNNIwA,4778
7
+ vision_agents_plugins_wizper-0.1.12.dist-info/METADATA,sha256=eBdcwmO3PZHGtalyBIPFycUERCl28ZBjYB97saPWLWw,505
8
+ vision_agents_plugins_wizper-0.1.12.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
9
+ vision_agents_plugins_wizper-0.1.12.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- ./.gitignore,sha256=S6wPCu4rBDB_yyTYoXbMIR-pn4OPv6b3Ulnx1n5RWvo,916
2
- ./PKG-INFO,sha256=HvLKhmkrhIgfliC_MaGQFgbJ886NL8gkxg28tqrzQIE,504
3
- ./README.md,sha256=7MDH68Ywzj2WKm3QAFCUvupBHxrTdjtAL0WqrqaCHFc,24
4
- ./pyproject.toml,sha256=eSC8A7YqeCWd9_VfDiwOc9z3KPGzSPbf9IIqk_aWES0,959
5
- ./vision_agents/plugins/wizper/__init__.py,sha256=aRLgDFc3zq4tNj3G9kmM4zJzSpO7hYqGujz3zTTPsMk,93
6
- ./vision_agents/plugins/wizper/stt.py,sha256=dRJV4klITwZ1SKbe-MPiC_s7idDJGBTDfmGQlh5E6ss,6112
7
- vision_agents_plugins_wizper-0.1.9.dist-info/METADATA,sha256=HvLKhmkrhIgfliC_MaGQFgbJ886NL8gkxg28tqrzQIE,504
8
- vision_agents_plugins_wizper-0.1.9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
9
- vision_agents_plugins_wizper-0.1.9.dist-info/RECORD,,