PyPI - dv-pipecat-ai - Versions diffs - 0.0.82.dev68__py3-none-any.whl → 0.0.82.dev69__py3-none-any.whl - Mend

dv-pipecat-ai 0.0.82.dev68__py3-none-any.whl → 0.0.82.dev69__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dv-pipecat-ai might be problematic. Click here for more details.

Files changed (6) hide show

{dv_pipecat_ai-0.0.82.dev68.dist-info → dv_pipecat_ai-0.0.82.dev69.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dv-pipecat-ai
-Version: 0.0.82.dev68
+Version: 0.0.82.dev69
 Summary: An open source framework for voice (and multimodal) assistants
 License-Expression: BSD-2-Clause
 Project-URL: Source, https://github.com/pipecat-ai/pipecat

{dv_pipecat_ai-0.0.82.dev68.dist-info → dv_pipecat_ai-0.0.82.dev69.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,4 @@
-dv_pipecat_ai-0.0.82.dev68.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
+dv_pipecat_ai-0.0.82.dev69.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
 pipecat/__init__.py,sha256=j0Xm6adxHhd7D06dIyyPV_GlBYLlBnTAERVvD_jAARQ,861
 pipecat/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -290,7 +290,7 @@ pipecat/services/sambanova/__init__.py,sha256=oTXExLic-qTcsfsiWmssf3Elclf3IIWoN4
 pipecat/services/sambanova/llm.py,sha256=5XVfPLEk__W8ykFqLdV95ZUhlGGkAaJwmbciLdZYtTc,8976
 pipecat/services/sambanova/stt.py,sha256=ZZgEZ7WQjLFHbCko-3LNTtVajjtfUvbtVLtFcaNadVQ,2536
 pipecat/services/sarvam/__init__.py,sha256=B4TN_tTHV9fWg0aSoPvfQlXISA0nJaQ9-u08I9UWvH4,280
-pipecat/services/sarvam/stt.py,sha256=cSrQaDpixNQh4tl8r2xRNREHjKKcyLmrFDLa-Lp4Hl4,15465
+pipecat/services/sarvam/stt.py,sha256=p9Iq4loMwnftNZ_S0WoFSoX7iBbRKyja6RsVWbpj508,19314
 pipecat/services/sarvam/tts.py,sha256=K-AtWE1Q0ZZwshLP-7sCDmOSIWhuKOj91BCCE4N9XAk,25010
 pipecat/services/simli/__init__.py,sha256=cbDcqOaGsEgKbGYKpJ1Vv7LN4ZjOWA04sE84WW5vgQI,257
 pipecat/services/simli/video.py,sha256=fVMYsCE5epH9rTdhN_tyPPJw7W6TCMHCOe2akKHWduw,8330
@@ -378,7 +378,7 @@ pipecat/utils/tracing/service_decorators.py,sha256=HwDCqLGijhYD3F8nxDuQmEw-YkRw0
 pipecat/utils/tracing/setup.py,sha256=7TEgPNpq6M8lww8OQvf0P9FzYc5A30xICGklVA-fua0,2892
 pipecat/utils/tracing/turn_context_provider.py,sha256=ikon3plFOx0XbMrH6DdeHttNpb-U0gzMZIm3bWLc9eI,2485
 pipecat/utils/tracing/turn_trace_observer.py,sha256=dma16SBJpYSOE58YDWy89QzHyQFc_9gQZszKeWixuwc,9725
-dv_pipecat_ai-0.0.82.dev68.dist-info/METADATA,sha256=tRV7JwvNl-emWJwrua577U-gfTxxMtB2RY_ZeI4Qpro,32692
-dv_pipecat_ai-0.0.82.dev68.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dv_pipecat_ai-0.0.82.dev68.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
-dv_pipecat_ai-0.0.82.dev68.dist-info/RECORD,,
+dv_pipecat_ai-0.0.82.dev69.dist-info/METADATA,sha256=2Zcf_ZuOSm039KmMpmr76DGUK20UdkgRKaw4dp6y8xA,32692
+dv_pipecat_ai-0.0.82.dev69.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dv_pipecat_ai-0.0.82.dev69.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
+dv_pipecat_ai-0.0.82.dev69.dist-info/RECORD,,

pipecat/services/sarvam/stt.py CHANGED Viewed

@@ -31,6 +31,9 @@ from pipecat.utils.tracing.service_decorators import traced_stt
 try:
     import websockets
     from sarvamai import AsyncSarvamAI
+    from sarvamai.speech_to_text_streaming.socket_client import (
+        AsyncSpeechToTextStreamingSocketClient,
+    )
     from sarvamai.speech_to_text_translate_streaming.socket_client import (
         AsyncSpeechToTextTranslateStreamingSocketClient,
     )
@@ -41,11 +44,11 @@ except ModuleNotFoundError as e:
     raise Exception(f"Missing module: {e}")
-def language_to_sarvam_language(language: Language) -> str:
-    """Convert Language enum to Sarvam language code.
+def language_to_sarvam_language(language) -> str:
+    """Convert Language enum or string to Sarvam language code.
     Args:
-        language: The Language enum to convert.
+        language: The Language enum or language code string to convert.
     Returns:
         The corresponding Sarvam language code string.
@@ -53,6 +56,30 @@ def language_to_sarvam_language(language: Language) -> str:
     Raises:
         ValueError: If the language is not supported by Sarvam.
     """
+    # If already a string in the right format, return it
+    if isinstance(language, str):
+        if "-" in language:  # Already in format like "hi-IN"
+            return language
+        # Convert short codes to full format
+        lang_map = {
+            "hi": "hi-IN",
+            "bn": "bn-IN",
+            "gu": "gu-IN",
+            "kn": "kn-IN",
+            "ml": "ml-IN",
+            "mr": "mr-IN",
+            "ta": "ta-IN",
+            "te": "te-IN",
+            "pa": "pa-IN",
+            "or": "od-IN",
+            "as": "as-IN",
+            "en": "en-IN",
+        }
+        if language.lower() in lang_map:
+            return lang_map[language.lower()]
+        raise ValueError(f"Unsupported language string: {language}")
+    # Handle Language enum
     match language:
         case Language.BN_IN:
             return "bn-IN"
@@ -133,6 +160,13 @@ class SarvamSTTService(STTService):
     """Sarvam speech-to-text service.
     Provides real-time speech recognition using Sarvam's WebSocket API.
+    Supports both Saarika (transcription) and Saaras (translation) models.
+    Models:
+        - Saarika (saarika:v2.5): Transcription in a single language
+        - Saaras (saaras:v2.5): Translation from source language to target language
+    The service automatically selects the correct endpoint based on the model name.
     """
     def __init__(
@@ -253,6 +287,7 @@ class SarvamSTTService(STTService):
             # Convert audio bytes to base64 for Sarvam API
             audio_base64 = base64.b64encode(audio).decode("utf-8")
+            # Sarvam requires 'audio/wav' encoding (even for raw PCM data)
             message = {
                 "audio": {
                     "data": audio_base64,
@@ -273,33 +308,47 @@ class SarvamSTTService(STTService):
     async def _connect(self):
         """Connect to Sarvam WebSocket API directly."""
-        logger.debug("Connecting to Sarvam")
+        logger.debug(f"Connecting to Sarvam with model: {self._model}")
         try:
-            # Build WebSocket URL and headers manually
-            ws_url = (
-                self._client._client_wrapper.get_environment().production
-                + "/speech-to-text-translate/ws"
-            )
+            base_url = self._client._client_wrapper.get_environment().production
+            # Choose endpoint and socket class based on model
+            if self._model.startswith("saarika"):
+                # Saarika = Transcription endpoint
+                path = "/speech-to-text/ws"
+                query_params = {
+                    "language-code": language_to_sarvam_language(self._language),
+                    "model": self._model,
+                    "vad_signals": "true",
+                }
+                socket_cls = AsyncSpeechToTextStreamingSocketClient
+                logger.debug(
+                    f"Using Saarika transcription endpoint with language: {self._language}"
+                )
+            else:
+                # Saaras = Translation endpoint
+                path = "/speech-to-text-translate/ws"
+                query_params = {
+                    "model": self._model,
+                    "vad_signals": "true",
+                }
+                socket_cls = AsyncSpeechToTextTranslateStreamingSocketClient
+                logger.debug("Using Saaras translation endpoint")
-            # Add query parameters
-            query_params = {"model": self._model, "vad_signals": "true"}
-            query_string = urlencode(query_params)
-            ws_url = ws_url + f"?{query_string}"
+            ws_url = f"{base_url}{path}?{urlencode(query_params)}"
             # Get headers
             headers = self._client._client_wrapper.get_headers()
             headers["Api-Subscription-Key"] = self._api_key
-            # Connect to WebSocket directly
+            # Connect to WebSocket
             self._websocket_connection = await websockets.connect(
                 ws_url, additional_headers=headers
             )
             # Create the socket client wrapper
-            self._websocket = AsyncSpeechToTextTranslateStreamingSocketClient(
-                websocket=self._websocket_connection
-            )
+            self._websocket = socket_cls(websocket=self._websocket_connection)
             # Start listening for messages
             self._listening_task = asyncio.create_task(self._listen_for_messages())
@@ -309,7 +358,10 @@ class SarvamSTTService(STTService):
         except websockets.exceptions.InvalidStatusCode as e:
             error_msg = f"Failed to connect to Sarvam: HTTP {e.status_code}"
             if e.status_code == 403:
-                error_msg += f" - Access denied. Your API key may not have access to model '{self._model}'. Available models: saaras:v2, saaras:v2.5"
+                if self._model.startswith("saarika"):
+                    error_msg += f" - Access denied. Check: 1) API key has Saarika access, 2) Model '{self._model}' exists (try saarika:v2.5), 3) Using correct endpoint (transcription)"
+                else:
+                    error_msg += f" - Access denied. Check: 1) API key has Saaras access, 2) Model '{self._model}' exists (try saaras:v2.5), 3) Using correct endpoint (translation)"
             elif e.status_code == 401:
                 error_msg += " - Invalid API key"
             logger.error(error_msg)
@@ -370,21 +422,60 @@ class SarvamSTTService(STTService):
     async def _handle_response(self, response):
         """Handle transcription response from Sarvam.
+        Handles both Saarika (transcription) and Saaras (translation) message formats.
         Args:
             response: The response object from Sarvam WebSocket.
         """
         logger.debug(f"Received response: {response}")
         try:
-            if response["type"] == "error":
+            msg_type = response.get("type")
+            # Error handling
+            if msg_type == "error":
                 error_msg = response.get("data", {}).get("message", "Unknown error")
                 logger.error(f"Sarvam API error: {error_msg}")
                 await self.push_error(ErrorFrame(f"Sarvam API error: {error_msg}"))
-                # Close connection on error
                 await self._disconnect()
                 return
-            if response["type"] == "events":
+            # Modern Saarika/Saaras message format
+            if msg_type == "speech_start":
+                await self.start_metrics()
+                logger.debug("User started speaking")
+                await self._call_event_handler("on_speech_started")
+                return
+            if msg_type == "speech_end":
+                logger.debug("User stopped speaking")
+                await self._call_event_handler("on_speech_ended")
+                return
+            if msg_type == "transcript":
+                await self.stop_ttfb_metrics()
+                # Handle both Saarika (text) and Saaras (text + text_translated)
+                transcript = response.get("text") or response.get("text_translated") or ""
+                language_code = (
+                    response.get("source_language_code") or response.get("language_code") or "hi-IN"
+                )
+                language = self._map_language_code_to_enum(language_code)
+                if transcript.strip():
+                    await self.push_frame(
+                        TranscriptionFrame(
+                            transcript,
+                            self._user_id,
+                            time_now_iso8601(),
+                            language,
+                            result=response,
+                        )
+                    )
+                await self.stop_processing_metrics()
+                return
+            # Legacy format (backward compatibility)
+            if msg_type == "events":
                 parsed = EventResponse(**response)
                 signal = parsed.data.signal_type
                 timestamp = parsed.data.occured_at
@@ -397,14 +488,13 @@ class SarvamSTTService(STTService):
                 elif signal == VADSignal.END:
                     logger.debug("User stopped speaking")
                     await self._call_event_handler("on_speech_ended")
+                return
-            elif response["type"] == "data":
+            if msg_type == "data":
                 await self.stop_ttfb_metrics()
                 parsed = TranscriptionResponse(**response)
                 transcript = parsed.data.transcript
-                language_code = parsed.data.language_code
-                if language_code is None:
-                    language_code = "hi-IN"
+                language_code = parsed.data.language_code or "hi-IN"
                 language = self._map_language_code_to_enum(language_code)
                 if transcript and transcript.strip():
@@ -417,8 +507,8 @@ class SarvamSTTService(STTService):
                             result=response,
                         )
                     )
                 await self.stop_processing_metrics()
+                return
         except Exception as e:
             logger.error(f"Error handling Sarvam response: {e}")

{dv_pipecat_ai-0.0.82.dev68.dist-info → dv_pipecat_ai-0.0.82.dev69.dist-info}/WHEEL RENAMED Viewed

File without changes

{dv_pipecat_ai-0.0.82.dev68.dist-info → dv_pipecat_ai-0.0.82.dev69.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{dv_pipecat_ai-0.0.82.dev68.dist-info → dv_pipecat_ai-0.0.82.dev69.dist-info}/top_level.txt RENAMED Viewed

File without changes

dv-pipecat-ai 0.0.82.dev68__py3-none-any.whl → 0.0.82.dev69__py3-none-any.whl

Potentially problematic release.

dv-pipecat-ai 0.0.82.dev68__py3-none-any.whl → 0.0.82.dev69__py3-none-any.whl