vision-agents-plugins-wizper 0.1.8__tar.gz → 0.1.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of vision-agents-plugins-wizper has been flagged as potentially problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vision-agents-plugins-wizper
-Version: 0.1.8
+Version: 0.1.11
 Summary: Wizper plugin for Vision Agents
 Project-URL: Documentation, https://visionagents.ai/
 Project-URL: Website, https://visionagents.ai/
@@ -24,20 +24,21 @@ Example usage:
 """
 
 import io
+import logging
 import os
 import tempfile
-import time
-import logging
 from pathlib import Path
-from typing import Any, Dict, Optional, List, Tuple, Union, TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from vision_agents.core.edge.types import Participant
+from typing import TYPE_CHECKING, Optional
 import wave
 
 import fal_client
 from getstream.video.rtc.track_util import PcmData
+
 from vision_agents.core import stt
+from vision_agents.core.stt import TranscriptResponse
+
+if TYPE_CHECKING:
+    from vision_agents.core.edge.types import Participant
 
 logger = logging.getLogger(__name__)
 
@@ -58,23 +59,21 @@ class STT(stt.STT):
     def __init__(
         self,
         task: str = "transcribe",
-        target_language: str | None = None,
-        sample_rate: int = 48000,
+        target_language: Optional[str] = None,
         client: Optional[fal_client.AsyncClient] = None,
     ):
         """
-        Initialize FalWizperSTT.
+        Initialize Wizper STT.
 
         Args:
             task: "transcribe" or "translate"
             target_language: Target language code (e.g., "pt" for Portuguese)
-            sample_rate: Sample rate of the audio in Hz.
+            client: Optional fal_client.AsyncClient instance for testing
         """
-        super().__init__(sample_rate=sample_rate)
+        super().__init__(provider_name="wizper")
         self.task = task
+        self.sample_rate = 48000
         self.target_language = target_language
-        self.last_activity_time = time.time()
-        self._is_closed = False
         self._fal_client = client if client is not None else fal_client.AsyncClient()
 
     def _pcm_to_wav_bytes(self, pcm_data: PcmData) -> bytes:
@@ -98,26 +97,25 @@ class STT(stt.STT):
         wav_buffer.seek(0)
         return wav_buffer.read()
 
-    async def _process_audio_impl(
-        self, pcm_data: PcmData, user_metadata: Optional[Union[Dict[str, Any], "Participant"]] = None
-    ) -> Optional[List[Tuple[bool, str, Dict[str, Any]]]]:
+    async def process_audio(
+        self,
+        pcm_data: PcmData,
+        participant: Optional["Participant"] = None,
+    ):
         """
-        Process accumulated speech audio through fal-ai/wizper.
-
-        This method is typically called by VAD (Voice Activity Detection) systems
-        when speech segments are detected.
+        Process audio through fal-ai/wizper for transcription.
 
         Args:
-            speech_audio: Accumulated speech audio as numpy array
-            user: User metadata from the Stream call
+            pcm_data: The PCM audio data to process
+            participant: Optional participant metadata
         """
-        if self._is_closed:
-            logger.debug("connection is closed, ignoring audio")
-            return None
+        if self.closed:
+            logger.warning("Wizper STT is closed, ignoring audio")
+            return
 
         if pcm_data.samples.size == 0:
             logger.debug("No audio data to process")
-            return None
+            return
 
         try:
             logger.debug(
@@ -154,8 +152,9 @@ class STT(stt.STT):
                 if "text" in result:
                     text = result["text"].strip()
                     if text:
+                        response_metadata = TranscriptResponse()
                         self._emit_transcript_event(
-                            text, user_metadata, {"chunks": result.get("chunks", [])}
+                            text, participant, response_metadata
                         )
             finally:
                 # Clean up temporary file
@@ -164,17 +163,15 @@ class STT(stt.STT):
                 except OSError:
                     pass
 
-            # Return None for asynchronous mode - events are emitted when they arrive
-            return None
-
         except Exception as e:
-            logger.error(f"FalWizper processing error: {str(e)}")
-            self._emit_error_event(e, "FalWizper processing")
-            return None
+            logger.error(f"Wizper processing error: {str(e)}")
+            self._emit_error_event(e, "Wizper processing")
 
     async def close(self):
-        """Close the STT service and release any resources."""
-        if self._is_closed:
+        """Close the Wizper STT service and release any resources."""
+        if self.closed:
+            logger.debug("Wizper STT service already closed")
             return
-        self._is_closed = True
-        logger.info("FalWizperSTT closed")
+
+        logger.info("Closing Wizper STT service")
+        await super().close()
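For reference, a minimal usage sketch of the 0.1.11 API shown above. The vision_agents.plugins.wizper import path and the PcmData constructor arguments are assumptions not confirmed by this diff; in 0.1.11 transcripts are emitted as events rather than returned from process_audio().

import asyncio

import numpy as np
from getstream.video.rtc.track_util import PcmData
from vision_agents.plugins import wizper  # assumed import path


async def main() -> None:
    # Construct the plugin; 0.1.11 fixes the sample rate at 48 kHz internally.
    transcriber = wizper.STT(task="transcribe")
    try:
        # One second of silence; the PcmData keyword arguments are an assumption.
        samples = np.zeros(48000, dtype=np.int16)
        pcm = PcmData(samples=samples, sample_rate=48000)

        # process_audio() emits transcript/error events instead of returning results.
        await transcriber.process_audio(pcm, participant=None)
    finally:
        # close() is idempotent in 0.1.11 and defers to the base class.
        await transcriber.close()


asyncio.run(main())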